From b01cc1b0eae0dea19257b29347116505fbedf679 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 26 Apr 2010 19:03:05 -0700 Subject: x86: Convert remaining x86 clocksources to clocksource_register_hz/khz This converts the remaining x86 clocksources to use clocksource_register_hz/khz. CC: jacob.jun.pan@intel.com CC: Glauber Costa CC: Dimitri Sivanich CC: Rusty Russell CC: Jeremy Fitzhardinge CC: Chris McDermott CC: Thomas Gleixner Tested-by: Konrad Rzeszutek Wilk [xen] Signed-off-by: John Stultz --- arch/x86/kernel/apb_timer.c | 10 +--------- arch/x86/kernel/i8253.c | 6 +----- arch/x86/kernel/kvmclock.c | 6 +----- arch/x86/lguest/boot.c | 4 +--- arch/x86/platform/uv/uv_time.c | 6 +----- arch/x86/xen/time.c | 6 +----- 6 files changed, 6 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 51ef31a89be..29ebf5a3b19 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = { .rating = APBT_CLOCKSOURCE_RATING, .read = apbt_read_clocksource, .mask = APBT_MASK, - .shift = APBT_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = apbt_restart_clocksource, }; @@ -595,14 +594,7 @@ static int apbt_clocksource_register(void) if (t1 == apbt_read_clocksource(&clocksource_apbt)) panic("APBT counter not counting. APBT disabled\n"); - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); + clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000); return 0; } diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd3159744..212fe6590aa 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -188,8 +188,6 @@ static struct clocksource pit_cs = { .rating = 110, .read = pit_read, .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, }; static int __init init_pit_clocksource(void) @@ -205,9 +203,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_register_hz(&pit_cs, CLOCK_TICK_RATE); } arch_initcall(init_pit_clocksource); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f98d3eafe07..6389a6bca11 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -26,8 +26,6 @@ #include #include -#define KVM_SCALE 22 - static int kvmclock = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; @@ -120,8 +118,6 @@ static struct clocksource kvm_clock = { .read = kvm_clock_get_cycles, .rating = 400, .mask = CLOCKSOURCE_MASK(64), - .mult = 1 << KVM_SCALE, - .shift = KVM_SCALE, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -203,7 +199,7 @@ void __init kvmclock_init(void) machine_ops.crash_shutdown = kvm_crash_shutdown; #endif kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index eba687f0cc0..5b96fd95bda 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -913,8 +913,6 @@ static struct clocksource lguest_clock = { .rating = 200, .read = lguest_clock_read, .mask = 
CLOCKSOURCE_MASK(64), - .mult = 1 << 22, - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -997,7 +995,7 @@ static void lguest_time_init(void) /* Set up the timer interrupt (0) to go to our simple timer routine */ set_irq_handler(0, lguest_time_irq); - clocksource_register(&lguest_clock); + clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c index 9daf5d1af9f..0eb90184515 100644 --- a/arch/x86/platform/uv/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = { .rating = 400, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -372,14 +371,11 @@ static __init int uv_rtc_setup_clock(void) if (!is_uv_system()) return -ENODEV; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, - clocksource_uv.shift); - /* If single blade, prefer tsc */ if (uv_num_possible_blades() == 1) clocksource_uv.rating = 250; - rc = clocksource_register(&clocksource_uv); + rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second); if (rc) printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); else diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 067759e3d6a..04e11597a8c 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -26,8 +26,6 @@ #include "xen-ops.h" -#define XEN_SHIFT 22 - /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) @@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = { .rating = 400, .read = xen_clocksource_get_cycles, .mask = ~0, - .mult = 1< Date: Sun, 13 Mar 2011 15:10:17 +0000 Subject: x86, olpc: Use device tree for platform identification Make OLPC fully depend on device tree, and use it to identify the OLPC platform details. Some nodes are exposed as platform devices where we plan to use device tree for device probing. Signed-off-by: Daniel Drake Acked-by: Grant Likely LKML-Reference: <20110313151017.C255F9D401E@zog.reactivated.net> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/olpc_ofw.h | 9 +++---- arch/x86/platform/olpc/Makefile | 4 +--- arch/x86/platform/olpc/olpc.c | 51 +++++++++++++++++++++------------------- arch/x86/platform/olpc/olpc_dt.c | 19 +++++++++++++++ 5 files changed, 51 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b4c2e9c6762..471221bcc4f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2071,7 +2071,7 @@ config OLPC depends on !X86_PAE select GPIOLIB select OF - select OF_PROMTREE if PROC_DEVICETREE + select OF_PROMTREE ---help--- Add support for detecting the unique features of the OLPC XO hardware. 
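The nodes listed in the of_ids table added to olpc_dt.c below are exposed as platform devices, so a consumer driver elsewhere in the tree can bind to them purely by compatible string. A minimal sketch of such a consumer, assuming the stock platform-driver API of this kernel generation (the driver name and the empty probe body are illustrative, not part of this patch):

#include <linux/module.h>
#include <linux/of.h>
#include <linux/platform_device.h>

static int __devinit xo1_battery_probe(struct platform_device *pdev)
{
	/* pdev->dev.of_node points at the matched "olpc,xo1-battery" node */
	return 0;
}

static const struct of_device_id xo1_battery_ids[] = {
	{ .compatible = "olpc,xo1-battery" },
	{},
};

static struct platform_driver xo1_battery_driver = {
	.driver = {
		.name		= "xo1-battery",
		.owner		= THIS_MODULE,
		.of_match_table	= xo1_battery_ids,
	},
	.probe = xo1_battery_probe,
};

static int __init xo1_battery_init(void)
{
	return platform_driver_register(&xo1_battery_driver);
}
module_init(xo1_battery_init);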
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h index c5d3a5abbb9..24487712e0b 100644 --- a/arch/x86/include/asm/olpc_ofw.h +++ b/arch/x86/include/asm/olpc_ofw.h @@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void); /* check if OFW was detected during boot */ extern bool olpc_ofw_present(void); +extern void olpc_dt_build_devicetree(void); + #else /* !CONFIG_OLPC */ static inline void olpc_ofw_detect(void) { } static inline void setup_olpc_ofw_pgd(void) { } -#endif /* !CONFIG_OLPC */ - -#ifdef CONFIG_OF_PROMTREE -extern void olpc_dt_build_devicetree(void); -#else static inline void olpc_dt_build_devicetree(void) { } -#endif +#endif /* !CONFIG_OLPC */ #endif /* _ASM_X86_OLPC_OFW_H */ diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile index c2a8cab65e5..81c5e2165c2 100644 --- a/arch/x86/platform/olpc/Makefile +++ b/arch/x86/platform/olpc/Makefile @@ -1,4 +1,2 @@ -obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o -obj-$(CONFIG_OLPC) += olpc_ofw.o -obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c index edaf3fe8dc5..0060fd59ea0 100644 --- a/arch/x86/platform/olpc/olpc.c +++ b/arch/x86/platform/olpc/olpc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -187,41 +188,43 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -static bool __init check_ofw_architecture(void) +static bool __init check_ofw_architecture(struct device_node *root) { - size_t propsize; - char olpc_arch[5]; - const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 }; - void *res[] = { &propsize }; + const char *olpc_arch; + int propsize; - if (olpc_ofw("getprop", args, res)) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return false; - } + olpc_arch = of_get_property(root, "architecture", &propsize); return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; } -static u32 __init get_board_revision(void) +static u32 __init get_board_revision(struct device_node *root) { - size_t propsize; - __be32 rev; - const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; - void *res[] = { &propsize }; - - if (olpc_ofw("getprop", args, res) || propsize != 4) { - printk(KERN_ERR "ofw: getprop call failed!\n"); - return cpu_to_be32(0); - } - return be32_to_cpu(rev); + int propsize; + const __be32 *rev; + + rev = of_get_property(root, "board-revision-int", &propsize); + if (propsize != 4) + return 0; + + return be32_to_cpu(*rev); } static bool __init platform_detect(void) { - if (!check_ofw_architecture()) + struct device_node *root = of_find_node_by_path("/"); + bool success; + + if (!root) return false; - olpc_platform_info.flags |= OLPC_F_PRESENT; - olpc_platform_info.boardrev = get_board_revision(); - return true; + + success = check_ofw_architecture(root); + if (success) { + olpc_platform_info.boardrev = get_board_revision(root); + olpc_platform_info.flags |= OLPC_F_PRESENT; + } + + of_node_put(root); + return success; } static int __init add_xo1_platform_devices(void) diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index dab87464753..4ce208f885e 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -19,7 +19,9 @@ #include #include #include +#include #include +#include #include static phandle __init olpc_dt_getsibling(phandle node) @@ -181,3 +183,20 @@ void __init olpc_dt_build_devicetree(void) pr_info("PROM DT: Built device tree 
with %u bytes of memory.\n", prom_early_allocated); } + +/* A list of DT node/bus matches that we want to expose as platform devices */ +static struct of_device_id __initdata of_ids[] = { + { .compatible = "olpc,xo1-battery" }, + { .compatible = "olpc,xo1-dcon" }, + { .compatible = "olpc,xo1-rtc" }, + {}, +}; + +static int __init olpc_create_platform_devices(void) +{ + if (machine_is_olpc()) + return of_platform_bus_probe(NULL, of_ids, NULL); + else + return 0; +} +device_initcall(olpc_create_platform_devices); -- cgit v1.2.3-70-g09d2 From 5d94e81f69d4b1d1102d3ab557ce0a817c11fbbb Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 8 Mar 2011 10:36:19 -0800 Subject: x86: Introduce pci_map_biosrom() The isci driver needs to retrieve its preboot OROM image which contains necessary runtime parameters like platform specific sas addresses and phy configuration. There is no ROM BAR associated with this area, instead we will need to scan legacy expansion ROM space. 1/ Promote the probe_roms_32 implementation to x86-64 2/ Add a facility to find and map an adapter rom by pci device (according to PCI Firmware Specification Revision 3.0) Signed-off-by: Dave Jiang LKML-Reference: <20110308183226.6246.90354.stgit@localhost6.localdomain6> Signed-off-by: Dan Williams Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/probe_roms.h | 8 ++ arch/x86/include/asm/setup.h | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/head32.c | 1 - arch/x86/kernel/probe_roms.c | 267 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/probe_roms_32.c | 166 ------------------------ arch/x86/kernel/x86_init.c | 2 +- 7 files changed, 278 insertions(+), 170 deletions(-) create mode 100644 arch/x86/include/asm/probe_roms.h create mode 100644 arch/x86/kernel/probe_roms.c delete mode 100644 arch/x86/kernel/probe_roms_32.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h new file mode 100644 index 00000000000..4950a0b1d09 --- /dev/null +++ b/arch/x86/include/asm/probe_roms.h @@ -0,0 +1,8 @@ +#ifndef _PROBE_ROMS_H_ +#define _PROBE_ROMS_H_ +struct pci_dev; + +extern void __iomem *pci_map_biosrom(struct pci_dev *pdev); +extern void pci_unmap_biosrom(void __iomem *rom); +extern size_t pci_biosrom_size(struct pci_dev *pdev); +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19a08a..03d3a32ace2 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align); type *name; \ RESERVE_BRK(name, sizeof(type) * entries) +extern void probe_roms(void); #ifdef __i386__ void __init i386_start_kernel(void); -extern void probe_roms(void); #else void __init x86_64_start_kernel(char *real_mode); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 62445ba2f8a..f33d738c8da 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -36,7 +36,7 @@ obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 7f138b3c3c5..eab4940c730 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -23,7 +23,6 
@@ static void __init i386_default_early_setup(void) { /* Initialize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c new file mode 100644 index 00000000000..ba0a4cce53b --- /dev/null +++ b/arch/x86/kernel/probe_roms.c @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? 
drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms_32.c deleted file mode 100644 index 071e7fea42e..00000000000 --- a/arch/x86/kernel/probe_roms_32.c +++ /dev/null @@ -1,166 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -#define ROMSIGNATURE 0xaa55 - -static int __init romsignature(const unsigned char *rom) -{ - const unsigned short * const ptr = (const unsigned short *)rom; - unsigned short sig; - - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; -} - -static int __init romchecksum(const unsigned char *rom, unsigned long length) -{ - unsigned char sum, c; - - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) - sum += c; - return !length && !sum; -} - -void __init probe_roms(void) -{ - const unsigned char *rom; - unsigned long start, length, upper; - unsigned char c; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) 
- continue; - - video_rom_resource.start = start; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c11514e9128..6eee0828e32 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { } struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, -- cgit v1.2.3-70-g09d2 From 9f1f1bfd8d7e579f07dbe56d6f93bd594da43b3d Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Wed, 23 Mar 2011 21:31:40 +0600 Subject: x86, mpparse: Remove unnecessary variable 'ret' isn't used by check_slot(), gets initialized but has no real use, so remove it. Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6f789a887c0..ef32d4c09c6 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -722,14 +722,12 @@ inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { - int ret = 0; - if (!mpc_new_phys || count <= mpc_new_length) { WARN(1, "update_mptable: No spare slots (length: %x)\n", count); return -1; } - return ret; + return 0; } static int __init replace_intsrc_all(struct mpc_table *mpc, -- cgit v1.2.3-70-g09d2 From 1d321881afb955bba1e0a8f50d3a04e111fcb581 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 28 Mar 2011 00:46:11 +0400 Subject: perf, x86: P4 PMU - clean up the code a bit No change on the functional level, just align the table properly. 
Signed-off-by: Cyrill Gorcunov Cc: Lin Ming LKML-Reference: <4D8FA213.5050108@openvz.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index c2520e178d3..8ff882fdb1c 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = { .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, .escr_emask = - P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), .cntr = { {12, 13, 16}, {14, 15, 17} }, }, [P4_EVENT_X87_ASSIST] = { -- cgit v1.2.3-70-g09d2 From 349c004e3d31fda23ad225b61861be38047fff16 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 12 Mar 2011 12:50:10 +0100 Subject: x86: A fast way to check capabilities of the current cpu Add this_cpu_has() which determines if the current cpu has a certain ability using a segment prefix and a bit test operation. For that we need to add bit operations to x86's percpu.h. Many uses of cpu_has use a pointer passed to a function to determine the current flags. That is no longer necessary after this patch. However, this patch only converts the straightforward cases where cpu_has is used with this_cpu_ptr. The rest is work for later. -tj: Rolled up patch to add x86_ prefix and use percpu_read() instead of percpu_read_stable(). Signed-off-by: Christoph Lameter Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- arch/x86/include/asm/cpufeature.h | 13 +++++++++---- arch/x86/include/asm/percpu.h | 27 +++++++++++++++++++++++++++ arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/smpboot.c | 4 ++-- 5 files changed, 41 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087cf2..50c0d30e676 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -207,8 +207,7 @@ extern const char * const x86_power_flags[32]; #define test_cpu_cap(c, bit) \ test_bit(bit, (unsigned long *)((c)->x86_capability)) -#define cpu_has(c, bit) \ - (__builtin_constant_p(bit) && \ +#define REQUIRED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ @@ -218,10 +217,16 @@ extern const char * const x86_power_flags[32]; (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ - (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ ? 1 : \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) + +#define cpu_has(c, bit) \ + (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 
1 : \ + x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability)) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b4398d8..76042d98159 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -542,6 +542,33 @@ do { \ old__; \ }) +static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, + const unsigned long __percpu *addr) +{ + unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; + + return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0; +} + +static inline int x86_this_cpu_variable_test_bit(int nr, + const unsigned long __percpu *addr) +{ + int oldbit; + + asm volatile("bt "__percpu_arg(2)",%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#define x86_this_cpu_test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? x86_this_cpu_constant_test_bit((nr), (addr)) \ + : x86_this_cpu_variable_test_bit((nr), (addr))) + + #include /* We can use this directly for local CPU (faster). */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01eff77..2bc503bf9e9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -505,7 +505,7 @@ static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { + if (this_cpu_has(X86_FEATURE_ARAT)) { lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; /* Make LAPIC timer preferrable over percpu HPET */ lapic_clockevent.rating = 150; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d46cbe46b7a..88a90a977f8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -449,7 +449,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); @@ -465,7 +465,7 @@ static void mwait_idle(void) if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); trace_cpu_idle(1, smp_processor_id()); - if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) + if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); __monitor((void *)¤t_thread_info()->flags, 0, 0); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c2871d3c71b..a3c430bdfb6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void) void *mwait_ptr; struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) + if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) return; - if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) + if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) return; -- cgit v1.2.3-70-g09d2 From fe5042138b6fc60edde3b60025078884c2eb71ac Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 12 Mar 2011 12:50:46 +0100 Subject: x86: Use this_cpu_has for thermal_interrupt current cpu It is more effective to use a segment prefix instead of 
calculating the address of the current cpu area and then testing flags. Signed-off-by: Christoph Lameter Acked-by: Tejun Heo Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97..6b0f4cde7a2 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -355,7 +355,6 @@ static void notify_thresholds(__u64 msr_val) static void intel_thermal_interrupt(void) { __u64 msr_val; - struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); @@ -367,19 +366,19 @@ static void intel_thermal_interrupt(void) CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, CORE_LEVEL) != 0) mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); - if (cpu_has(c, X86_FEATURE_PTS)) { + if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, PACKAGE_LEVEL) != 0) mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); - if (cpu_has(c, X86_FEATURE_PLN)) + if (this_cpu_has(X86_FEATURE_PLN)) if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, -- cgit v1.2.3-70-g09d2 From 052936080c8fb2f791103995b21bd4018c8df886 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Apr 2011 11:15:12 +0200 Subject: x86-64, NUMA: Remove custom phys_to_nid() implementation phys_to_nid() maps a physical address to a NUMA node id. This is implemented by building a perfect hash in compute_hash_shift() during initialization. However, with the SPARSE memory model, the nid is encoded in page flags. The perfect hash implementation was for the DISCONTIG memory model which got removed years ago by b263295dbf (x86: 64-bit, make sparsemem vmemmap the only memory model). So, the perfect hash ends up being used only during initialization when the core SPARSE code already provides a perfectly acceptable generic early_pfn_to_nid() implementation. Drop phys_to_nid() and use the generic early_pfn_to_nid() instead. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Acked-by: Yinghai Lu Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner --- arch/x86/Kconfig | 4 -- arch/x86/include/asm/mmzone_64.h | 23 -------- arch/x86/mm/numa_64.c | 123 +-------------------------------------- 3 files changed, 1 insertion(+), 149 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..b034814bf97 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1703,10 +1703,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE def_bool y depends on MEMORY_HOTPLUG -config HAVE_ARCH_EARLY_PFN_TO_NID - def_bool X86_64 - depends on NUMA - config USE_PERCPU_NUMA_NODE_ID def_bool y depends on NUMA diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index 288b96f815a..b3f88d7867c 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -4,36 +4,13 @@ #ifndef _ASM_X86_MMZONE_64_H #define _ASM_X86_MMZONE_64_H - #ifdef CONFIG_NUMA #include - #include -/* Simple perfect hash to map physical addresses to node numbers */ -struct memnode { - int shift; - unsigned int mapsize; - s16 *map; - s16 embedded_map[64 - 8]; -} ____cacheline_aligned; /* total size = 128 bytes */ -extern struct memnode memnode; -#define memnode_shift memnode.shift -#define memnodemap memnode.map -#define memnodemapsize memnode.mapsize - extern struct pglist_data *node_data[]; -static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) -{ - unsigned nid; - VIRTUAL_BUG_ON(!memnodemap); - nid = memnodemap[addr >> memnode_shift]; - VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); - return nid; -} - #define NODE_DATA(nid) (node_data[nid]) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index e8c00cc7203..3951ee6eade 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -28,125 +28,10 @@ EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -struct memnode memnode; - -static unsigned long __initdata nodemap_addr; -static unsigned long __initdata nodemap_size; - static struct numa_meminfo numa_meminfo __initdata; - static int numa_distance_cnt; static u8 *numa_distance; -/* - * Given a shift value, try to populate memnodemap[] - * Returns : - * 1 if OK - * 0 if memnodmap[] too small (of shift too small) - * -1 if node overlap or lost ram (shift too big) - */ -static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift) -{ - unsigned long addr, end; - int i, res = -1; - - memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); - for (i = 0; i < mi->nr_blks; i++) { - addr = mi->blk[i].start; - end = mi->blk[i].end; - if (addr >= end) - continue; - if ((end >> shift) >= memnodemapsize) - return 0; - do { - if (memnodemap[addr >> shift] != NUMA_NO_NODE) - return -1; - memnodemap[addr >> shift] = mi->blk[i].nid; - addr += (1UL << shift); - } while (addr < end); - res = 1; - } - return res; -} - -static int __init allocate_cachealigned_memnodemap(void) -{ - unsigned long addr; - - memnodemap = memnode.embedded_map; - if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) - return 0; - - addr = 0x8000; - nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); - nodemap_addr = memblock_find_in_range(addr, get_max_mapped(), - nodemap_size, L1_CACHE_BYTES); - if (nodemap_addr == MEMBLOCK_ERROR) { - printk(KERN_ERR - "NUMA: Unable to allocate Memory to Node hash map\n"); - nodemap_addr = nodemap_size = 0; - return -1; - } - memnodemap = phys_to_virt(nodemap_addr); - memblock_x86_reserve_range(nodemap_addr, nodemap_addr + 
nodemap_size, "MEMNODEMAP"); - - printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", - nodemap_addr, nodemap_addr + nodemap_size); - return 0; -} - -/* - * The LSB of all start and end addresses in the node map is the value of the - * maximum possible shift. - */ -static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi) -{ - int i, nodes_used = 0; - unsigned long start, end; - unsigned long bitfield = 0, memtop = 0; - - for (i = 0; i < mi->nr_blks; i++) { - start = mi->blk[i].start; - end = mi->blk[i].end; - if (start >= end) - continue; - bitfield |= start; - nodes_used++; - if (end > memtop) - memtop = end; - } - if (nodes_used <= 1) - i = 63; - else - i = find_first_bit(&bitfield, sizeof(unsigned long)*8); - memnodemapsize = (memtop >> i)+1; - return i; -} - -static int __init compute_hash_shift(const struct numa_meminfo *mi) -{ - int shift; - - shift = extract_lsb_from_nodes(mi); - if (allocate_cachealigned_memnodemap()) - return -1; - printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", - shift); - - if (populate_memnodemap(mi, shift) != 1) { - printk(KERN_INFO "Your memory is not aligned you need to " - "rebuild your kernel with a bigger NODEMAPSIZE " - "shift=%d\n", shift); - return -1; - } - return shift; -} - -int __meminit __early_pfn_to_nid(unsigned long pfn) -{ - return phys_to_nid(pfn << PAGE_SHIFT); -} - static void * __init early_node_mem(int nodeid, unsigned long start, unsigned long end, unsigned long size, unsigned long align) @@ -270,7 +155,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, nodedata_phys + pgdat_size - 1); - nid = phys_to_nid(nodedata_phys); + nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT); if (nid != nodeid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); @@ -527,12 +412,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - memnode_shift = compute_hash_shift(mi); - if (memnode_shift < 0) { - printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n"); - return -EINVAL; - } - for (i = 0; i < mi->nr_blks; i++) memblock_x86_register_active_regions(mi->blk[i].nid, mi->blk[i].start >> PAGE_SHIFT, -- cgit v1.2.3-70-g09d2 From 3b16651f806d35b5c404f2525fbce76afa3c9297 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Apr 2011 11:15:12 +0200 Subject: x86: Clean up memory model related configs in arch/x86/Kconfig * Remove bogus dependency on ARCH_SELECT_MEMORY_MODEL from ARCH_FLATMEM_ENABLE. ENABLE configs don't interfere with SELECT_MEMORY_MODEL. They just need to indicate whether the specific memory model is supported. * Relocate HAVE_ARCH_ALLOC_REMAP, ARCH_PROC_KCORE_TEXT and ARCH_SPARSEMEM_DEFAULT so that memory model related configs are together in consistent order. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Cc: Ingo Molnar Cc: Yinghai Lu Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner --- arch/x86/Kconfig | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b034814bf97..8db4fbf30b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1223,6 +1223,10 @@ config HAVE_ARCH_BOOTMEM def_bool y depends on X86_32 && NUMA +config HAVE_ARCH_ALLOC_REMAP + def_bool y + depends on X86_32 && NUMA + config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM @@ -1231,13 +1235,9 @@ config NEED_NODE_MEMMAP_SIZE def_bool y depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_FLATMEM_ENABLE def_bool y - depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA + depends on X86_32 && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -1247,20 +1247,16 @@ config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA && X86_32 -config ARCH_PROC_KCORE_TEXT - def_bool y - depends on X86_64 && PROC_KCORE - -config ARCH_SPARSEMEM_DEFAULT - def_bool y - depends on X86_64 - config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_64 + config ARCH_SELECT_MEMORY_MODEL def_bool y depends on ARCH_SPARSEMEM_ENABLE @@ -1269,6 +1265,10 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ARCH_PROC_KCORE_TEXT + def_bool y + depends on X86_64 && PROC_KCORE + config ILLEGAL_POINTER_VALUE hex default 0 if X86_32 -- cgit v1.2.3-70-g09d2 From 711b8c87a5fe6de78e90411cb67b506babfef74a Mon Sep 17 00:00:00 2001 From: Florian Mickler Date: Mon, 4 Apr 2011 01:17:40 +0200 Subject: x86-64, NUMA: Remove unused variable In case !CONFIG_ACPI_NUMA and !CONFIG_AMD_NUMA gcc emits a warning about the unused variable ret. As that variable is in fact not needed I choose to remove it. Signed-off-by: Florian Mickler LKML-Reference: <1301843624-22364-1-git-send-email-florian@mickler.org> Signed-off-by: Tejun Heo --- arch/x86/mm/numa_64.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3951ee6eade..13f5b068e8c 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -505,17 +505,13 @@ static int __init numa_init(int (*init_func)(void)) void __init initmem_init(void) { - int ret; - if (!numa_off) { #ifdef CONFIG_ACPI_NUMA - ret = numa_init(x86_acpi_numa_init); - if (!ret) + if (!numa_init(x86_acpi_numa_init)) return; #endif #ifdef CONFIG_AMD_NUMA - ret = numa_init(amd_numa_init); - if (!ret) + if (!numa_init(amd_numa_init)) return; #endif } -- cgit v1.2.3-70-g09d2 From 64d21fc194e12bdf7347019bf10325a4b3d77e7b Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Sat, 2 Apr 2011 13:17:48 +0600 Subject: x86, mpparse: Put check_slot() into .init section check_slot() is only called from replace_intsrc_all() - which is in the .init section. So, put check_slot into the .init section as well, so it can be freed after system boot. 
Signed-off-by: Rakib Mullick LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 5a532ce646b..f1b27181de0 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -715,7 +715,7 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) } } -static int +static int __init check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) { int ret = 0; -- cgit v1.2.3-70-g09d2 From d430d3d7e646eb1eac2bb4aa244a644312e67c76 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 16 Mar 2011 17:29:47 -0400 Subject: jump label: Introduce static_branch() interface Introduce: static __always_inline bool static_branch(struct jump_label_key *key); instead of the old JUMP_LABEL(key, label) macro. In this way, jump labels become really easy to use: Define: struct jump_label_key jump_key; Can be used as: if (static_branch(&jump_key)) do unlikely code enable/disable via: jump_label_inc(&jump_key); jump_label_dec(&jump_key); that's it! For the jump labels disabled case, the static_branch() becomes an atomic_read(), and jump_label_inc()/dec() are simply atomic_inc(), atomic_dec() operations. We show testing results for this change below. Thanks to H. Peter Anvin for suggesting the 'static_branch()' construct. Since we now require a 'struct jump_label_key *key', we can store a pointer into the jump table addresses. In this way, we can enable/disable jump labels in basically constant time. This change allows us to completely remove the previous hashtable scheme. Thanks to Peter Zijlstra for this re-write. Testing: I ran a series of 'tbench 20' runs 5 times (with reboots) for 3 configurations, where tracepoints were disabled. jump label configured in avg: 815.6 jump label *not* configured in (using atomic reads) avg: 800.1 jump label *not* configured in (regular reads) avg: 803.4 Signed-off-by: Peter Zijlstra LKML-Reference: <20110316212947.GA8792@redhat.com> Signed-off-by: Jason Baron Suggested-by: H. Peter Anvin Tested-by: David Daney Acked-by: Ralf Baechle Acked-by: David S. 
Miller Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/mips/include/asm/jump_label.h | 22 +- arch/sparc/include/asm/jump_label.h | 25 +- arch/x86/include/asm/alternative.h | 3 +- arch/x86/include/asm/jump_label.h | 26 +- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/module.c | 1 + include/asm-generic/vmlinux.lds.h | 14 +- include/linux/dynamic_debug.h | 2 - include/linux/jump_label.h | 89 +++--- include/linux/jump_label_ref.h | 44 --- include/linux/perf_event.h | 26 +- include/linux/tracepoint.h | 22 +- kernel/jump_label.c | 539 +++++++++++++++--------------------- kernel/perf_event.c | 4 +- kernel/tracepoint.c | 23 +- 15 files changed, 356 insertions(+), 486 deletions(-) delete mode 100644 include/linux/jump_label_ref.h (limited to 'arch/x86') diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 7622ccf7507..1881b316ca4 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,16 +20,18 @@ #define WORD_INSN ".word" #endif -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\tnop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t" \ - WORD_INSN " 1b, %l[" #label "], %0\n\t" \ - ".popsection\n\t" \ - : : "i" (key) : : label); \ - } while (0) - +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\tnop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + WORD_INSN " 1b, %l[l_yes], %0\n\t" + ".popsection\n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index 427d4684e0d..fc73a82366f 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,17 +7,20 @@ #define JUMP_LABEL_NOP_SIZE 4 -#define JUMP_LABEL(key, label) \ - do { \ - asm goto("1:\n\t" \ - "nop\n\t" \ - "nop\n\t" \ - ".pushsection __jump_table, \"a\"\n\t"\ - ".align 4\n\t" \ - ".word 1b, %l[" #label "], %c0\n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label);\ - } while (0) +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:\n\t" + "nop\n\t" + "nop\n\t" + ".pushsection __jump_table, \"aw\"\n\t" + ".align 4\n\t" + ".word 1b, %l[l_yes], %c0\n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13009d1af99..8cdd1e24797 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -4,7 +4,6 @@ #include #include #include -#include #include /* @@ -191,7 +190,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_JUMP_LABEL) #define IDEAL_NOP_SIZE_5 5 extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; extern void arch_init_ideal_nop5(void); diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 574dbc22893..f217cee8653 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -5,20 +5,24 @@ #include #include +#include #define JUMP_LABEL_NOP_SIZE 5 -# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" - -# define JUMP_LABEL(key, 
label) \ - do { \ - asm goto("1:" \ - JUMP_LABEL_INITIAL_NOP \ - ".pushsection __jump_table, \"aw\" \n\t"\ - _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ - ".popsection \n\t" \ - : : "i" (key) : : label); \ - } while (0) +#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" + +static __always_inline bool arch_static_branch(struct jump_label_key *key) +{ + asm goto("1:" + JUMP_LABEL_INITIAL_NOP + ".pushsection __jump_table, \"aw\" \n\t" + _ASM_PTR "1b, %l[l_yes], %c0 \n\t" + ".popsection \n\t" + : : "i" (key) : : l_yes); + return false; +l_yes: + return true; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a234677e21..651454b0c81 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -679,7 +679,7 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_JUMP_LABEL) #ifdef CONFIG_X86_64 unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index ab23f1ad4bf..52f256f2cc8 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 32c45e5fe0a..79522166d7f 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -170,6 +170,10 @@ STRUCT_ALIGN(); \ *(__tracepoints) \ /* implement dynamic printk debug */ \ + . = ALIGN(8); \ + VMLINUX_SYMBOL(__start___jump_table) = .; \ + *(__jump_table) \ + VMLINUX_SYMBOL(__stop___jump_table) = .; \ . = ALIGN(8); \ VMLINUX_SYMBOL(__start___verbose) = .; \ *(__verbose) \ @@ -228,8 +232,6 @@ \ BUG_TABLE \ \ - JUMP_TABLE \ - \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -589,14 +591,6 @@ #define BUG_TABLE #endif -#define JUMP_TABLE \ - . = ALIGN(8); \ - __jump_table : AT(ADDR(__jump_table) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__start___jump_table) = .; \ - *(__jump_table) \ - VMLINUX_SYMBOL(__stop___jump_table) = .; \ - } - #ifdef CONFIG_PM_TRACE #define TRACEDATA \ . = ALIGN(4); \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 0c9653f11c1..e747ecd48e1 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -1,8 +1,6 @@ #ifndef _DYNAMIC_DEBUG_H #define _DYNAMIC_DEBUG_H -#include - /* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They * use independent hash functions, to reduce the chance of false positives. 
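Before the jump_label.h changes below, a minimal usage sketch of the new interface, built only from the calls this patch introduces (the key name and the stub functions are illustrative):

#include <linux/jump_label.h>

static struct jump_label_key my_key = JUMP_LABEL_INIT;	/* starts disabled */

static void do_unlikely_work(void)
{
	/* placeholder for the rarely-enabled path */
}

static void hot_path(void)
{
	/* compiles to a patchable nop (or an atomic_read() fallback) */
	if (static_branch(&my_key))
		do_unlikely_work();
}

static void slow_path_enable(void)
{
	jump_label_inc(&my_key);	/* first increment patches the branch in */
}

static void slow_path_disable(void)
{
	jump_label_dec(&my_key);	/* last decrement patches it back out */
}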
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7880f18e4b8..83e745f3ead 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -1,20 +1,43 @@ #ifndef _LINUX_JUMP_LABEL_H #define _LINUX_JUMP_LABEL_H +#include +#include + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) + +struct jump_label_key { + atomic_t enabled; + struct jump_entry *entries; +#ifdef CONFIG_MODULES + struct jump_label_mod *next; +#endif +}; + # include # define HAVE_JUMP_LABEL #endif enum jump_label_type { + JUMP_LABEL_DISABLE = 0, JUMP_LABEL_ENABLE, - JUMP_LABEL_DISABLE }; struct module; #ifdef HAVE_JUMP_LABEL +#ifdef CONFIG_MODULES +#define JUMP_LABEL_INIT {{ 0 }, NULL, NULL} +#else +#define JUMP_LABEL_INIT {{ 0 }, NULL} +#endif + +static __always_inline bool static_branch(struct jump_label_key *key) +{ + return arch_static_branch(key); +} + extern struct jump_entry __start___jump_table[]; extern struct jump_entry __stop___jump_table[]; @@ -23,37 +46,37 @@ extern void jump_label_unlock(void); extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_text_poke_early(jump_label_t addr); -extern void jump_label_update(unsigned long key, enum jump_label_type type); -extern void jump_label_apply_nops(struct module *mod); extern int jump_label_text_reserved(void *start, void *end); +extern void jump_label_inc(struct jump_label_key *key); +extern void jump_label_dec(struct jump_label_key *key); +extern bool jump_label_enabled(struct jump_label_key *key); +extern void jump_label_apply_nops(struct module *mod); -#define jump_label_enable(key) \ - jump_label_update((unsigned long)key, JUMP_LABEL_ENABLE); +#else -#define jump_label_disable(key) \ - jump_label_update((unsigned long)key, JUMP_LABEL_DISABLE); +#include -#else +#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} -#define JUMP_LABEL(key, label) \ -do { \ - if (unlikely(*key)) \ - goto label; \ -} while (0) +struct jump_label_key { + atomic_t enabled; +}; -#define jump_label_enable(cond_var) \ -do { \ - *(cond_var) = 1; \ -} while (0) +static __always_inline bool static_branch(struct jump_label_key *key) +{ + if (unlikely(atomic_read(&key->enabled))) + return true; + return false; +} -#define jump_label_disable(cond_var) \ -do { \ - *(cond_var) = 0; \ -} while (0) +static inline void jump_label_inc(struct jump_label_key *key) +{ + atomic_inc(&key->enabled); +} -static inline int jump_label_apply_nops(struct module *mod) +static inline void jump_label_dec(struct jump_label_key *key) { - return 0; + atomic_dec(&key->enabled); } static inline int jump_label_text_reserved(void *start, void *end) @@ -64,16 +87,16 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -#endif +static inline bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} -#define COND_STMT(key, stmt) \ -do { \ - __label__ jl_enabled; \ - JUMP_LABEL(key, jl_enabled); \ - if (0) { \ -jl_enabled: \ - stmt; \ - } \ -} while (0) +static inline int jump_label_apply_nops(struct module *mod) +{ + return 0; +} + +#endif #endif diff --git a/include/linux/jump_label_ref.h b/include/linux/jump_label_ref.h deleted file mode 100644 index e5d012ad92c..00000000000 --- a/include/linux/jump_label_ref.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _LINUX_JUMP_LABEL_REF_H -#define _LINUX_JUMP_LABEL_REF_H - -#include -#include - -#ifdef HAVE_JUMP_LABEL - -static inline void 
jump_label_inc(atomic_t *key) -{ - if (atomic_add_return(1, key) == 1) - jump_label_enable(key); -} - -static inline void jump_label_dec(atomic_t *key) -{ - if (atomic_dec_and_test(key)) - jump_label_disable(key); -} - -#else /* !HAVE_JUMP_LABEL */ - -static inline void jump_label_inc(atomic_t *key) -{ - atomic_inc(key); -} - -static inline void jump_label_dec(atomic_t *key) -{ - atomic_dec(key); -} - -#undef JUMP_LABEL -#define JUMP_LABEL(key, label) \ -do { \ - if (unlikely(__builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(key), atomic_t *), \ - atomic_read((atomic_t *)(key)), *(key)))) \ - goto label; \ -} while (0) - -#endif /* HAVE_JUMP_LABEL */ - -#endif /* _LINUX_JUMP_LABEL_REF_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 311b4dc785a..730b7821690 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -505,7 +505,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1034,7 +1034,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); @@ -1063,22 +1063,21 @@ perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - JUMP_LABEL(&perf_swevent_enabled[event_id], have_event); - return; - -have_event: - if (!regs) { - perf_fetch_caller_regs(&hot_regs); - regs = &hot_regs; + if (static_branch(&perf_swevent_enabled[event_id])) { + if (!regs) { + perf_fetch_caller_regs(&hot_regs); + regs = &hot_regs; + } + __perf_sw_event(event_id, nr, nmi, regs, addr); } - __perf_sw_event(event_id, nr, nmi, regs, addr); } -extern atomic_t perf_sched_events; +extern struct jump_label_key perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *task) { - COND_STMT(&perf_sched_events, __perf_event_task_sched_in(task)); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_in(task); } static inline @@ -1086,7 +1085,8 @@ void perf_event_task_sched_out(struct task_struct *task, struct task_struct *nex { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - COND_STMT(&perf_sched_events, __perf_event_task_sched_out(task, next)); + if (static_branch(&perf_sched_events)) + __perf_event_task_sched_out(task, next); } extern void perf_event_mmap(struct vm_area_struct *vma); diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 97c84a58efb..d530a4460a0 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - int state; /* State. */ + struct jump_label_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -146,9 +146,7 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - JUMP_LABEL(&__tracepoint_##name.state, do_trace); \ - return; \ -do_trace: \ + if (static_branch(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -176,14 +174,14 @@ do_trace: \ * structures, so we create an array of pointers that will be used for iteration * on the tracepoints. 
*/ -#define DEFINE_TRACE_FN(name, reg, unreg) \ - static const char __tpstrtab_##name[] \ - __attribute__((section("__tracepoints_strings"))) = #name; \ - struct tracepoint __tracepoint_##name \ - __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, 0, reg, unreg, NULL }; \ - static struct tracepoint * const __tracepoint_ptr_##name __used \ - __attribute__((section("__tracepoints_ptrs"))) = \ +#define DEFINE_TRACE_FN(name, reg, unreg) \ + static const char __tpstrtab_##name[] \ + __attribute__((section("__tracepoints_strings"))) = #name; \ + struct tracepoint __tracepoint_##name \ + __attribute__((section("__tracepoints"))) = \ + { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + static struct tracepoint * const __tracepoint_ptr_##name __used \ + __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; #define DEFINE_TRACE(name) \ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 3b79bd93833..74d1c099fbd 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,43 +2,23 @@ * jump label support * * Copyright (C) 2009 Jason Baron + * Copyright (C) 2011 Peter Zijlstra * */ -#include #include #include #include #include -#include #include #include #include +#include #ifdef HAVE_JUMP_LABEL -#define JUMP_LABEL_HASH_BITS 6 -#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) -static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; - /* mutex to protect coming/going of the the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); -struct jump_label_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - /* hang modules off here */ - struct hlist_head modules; - unsigned long key; -}; - -struct jump_label_module_entry { - struct hlist_node hlist; - struct jump_entry *table; - int nr_entries; - struct module *mod; -}; - void jump_label_lock(void) { mutex_lock(&jump_label_mutex); @@ -49,6 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } +bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} + static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; @@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b) } static void -sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; @@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static struct jump_label_entry *get_jump_label_entry(jump_label_t key) -{ - struct hlist_head *head; - struct hlist_node *node; - struct jump_label_entry *e; - u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); - - head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (key == e->key) - return e; - } - return NULL; -} +static void jump_label_update(struct jump_label_key *key, int enable); -static struct jump_label_entry * -add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +void jump_label_inc(struct jump_label_key *key) { - struct hlist_head *head; - struct jump_label_entry *e; - u32 hash; - - e = get_jump_label_entry(key); - if (e) - return ERR_PTR(-EEXIST); - - e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - - hash = jhash((void *)&key, sizeof(jump_label_t), 0); - head = 
&jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; - e->key = key; - e->table = table; - e->nr_entries = nr_entries; - INIT_HLIST_HEAD(&(e->modules)); - hlist_add_head(&e->hlist, head); - return e; -} + if (atomic_inc_not_zero(&key->enabled)) + return; -static int -build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) -{ - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - int count; - - sort_jump_label_entries(start, stop); - iter = start; - while (iter < stop) { - entry = get_jump_label_entry(iter->key); - if (!entry) { - iter_begin = iter; - count = 0; - while ((iter < stop) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - entry = add_jump_label_entry(iter_begin->key, - count, iter_begin); - if (IS_ERR(entry)) - return PTR_ERR(entry); - } else { - WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); - return -1; - } - } - return 0; + jump_label_lock(); + if (atomic_add_return(1, &key->enabled) == 1) + jump_label_update(key, JUMP_LABEL_ENABLE); + jump_label_unlock(); } -/*** - * jump_label_update - update jump label text - * @key - key value associated with a a jump label - * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE - * - * Will enable/disable the jump for jump label @key, depending on the - * value of @type. - * - */ - -void jump_label_update(unsigned long key, enum jump_label_type type) +void jump_label_dec(struct jump_label_key *key) { - struct jump_entry *iter; - struct jump_label_entry *entry; - struct hlist_node *module_node; - struct jump_label_module_entry *e_module; - int count; + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + return; - jump_label_lock(); - entry = get_jump_label_entry((jump_label_t)key); - if (entry) { - count = entry->nr_entries; - iter = entry->table; - while (count--) { - if (kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - /* eanble/disable jump labels in modules */ - hlist_for_each_entry(e_module, module_node, &(entry->modules), - hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (iter->key && - kernel_text_address(iter->code)) - arch_jump_label_transform(iter, type); - iter++; - } - } - } + jump_label_update(key, JUMP_LABEL_DISABLE); jump_label_unlock(); } @@ -197,77 +89,33 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end) return 0; } -#ifdef CONFIG_MODULES - -static int module_conflict(void *start, void *end) +static int __jump_label_text_reserved(struct jump_entry *iter_start, + struct jump_entry *iter_stop, void *start, void *end) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; struct jump_entry *iter; - int i, count; - int conflict = 0; - - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } - iter++; - } - } - } - } -out: - return conflict; -} - -#endif - -/*** - * jump_label_text_reserved - check if addr range is reserved - * @start: start text addr - * @end: end text addr - * - * checks if the text addr located between @start and @end - * overlaps with any of the 
jump label patch addresses. Code - * that wants to modify kernel text should first verify that - * it does not overlap with any of the jump label addresses. - * Caller must hold jump_label_mutex. - * - * returns 1 if there is an overlap, 0 otherwise - */ -int jump_label_text_reserved(void *start, void *end) -{ - struct jump_entry *iter; - struct jump_entry *iter_start = __start___jump_table; - struct jump_entry *iter_stop = __start___jump_table; - int conflict = 0; iter = iter_start; while (iter < iter_stop) { - if (addr_conflict(iter, start, end)) { - conflict = 1; - goto out; - } + if (addr_conflict(iter, start, end)) + return 1; iter++; } - /* now check modules */ -#ifdef CONFIG_MODULES - conflict = module_conflict(start, end); -#endif -out: - return conflict; + return 0; +} + +static void __jump_label_update(struct jump_label_key *key, + struct jump_entry *entry, int enable) +{ + for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { + /* + * entry->code set to 0 invalidates module init text sections + * kernel_text_address() verifies we are not in core kernel + * init code, see jump_label_invalidate_module_init(). + */ + if (entry->code && kernel_text_address(entry->code)) + arch_jump_label_transform(entry, enable); + } } /* @@ -277,142 +125,173 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr) { } -static __init int init_jump_label(void) +static __init int jump_label_init(void) { - int ret; struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; + struct jump_label_key *key = NULL; struct jump_entry *iter; jump_label_lock(); - ret = build_jump_label_hashtable(__start___jump_table, - __stop___jump_table); - iter = iter_start; - while (iter < iter_stop) { + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { arch_jump_label_text_poke_early(iter->code); - iter++; + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + atomic_set(&key->enabled, 0); + key->entries = iter; +#ifdef CONFIG_MODULES + key->next = NULL; +#endif } jump_label_unlock(); - return ret; + + return 0; } -early_initcall(init_jump_label); +early_initcall(jump_label_init); #ifdef CONFIG_MODULES -static struct jump_label_module_entry * -add_jump_label_module_entry(struct jump_label_entry *entry, - struct jump_entry *iter_begin, - int count, struct module *mod) +struct jump_label_mod { + struct jump_label_mod *next; + struct jump_entry *entries; + struct module *mod; +}; + +static int __jump_label_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + + mod = __module_text_address((unsigned long)start); + if (!mod) + return 0; + + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + + return __jump_label_text_reserved(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries, + start, end); +} + +static void __jump_label_mod_update(struct jump_label_key *key, int enable) +{ + struct jump_label_mod *mod = key->next; + + while (mod) { + __jump_label_update(key, mod->entries, enable); + mod = mod->next; + } +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. 
+ */ +void jump_label_apply_nops(struct module *mod) { - struct jump_label_module_entry *e; - - e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); - if (!e) - return ERR_PTR(-ENOMEM); - e->mod = mod; - e->nr_entries = count; - e->table = iter_begin; - hlist_add_head(&e->hlist, &entry->modules); - return e; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (iter_start == iter_stop) + return; + + for (iter = iter_start; iter < iter_stop; iter++) + arch_jump_label_text_poke_early(iter->code); } -static int add_jump_label_module(struct module *mod) +static int jump_label_add_module(struct module *mod) { - struct jump_entry *iter, *iter_begin; - struct jump_label_entry *entry; - struct jump_label_module_entry *module_entry; - int count; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm; /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) + if (iter_start == iter_stop) return 0; - sort_jump_label_entries(mod->jump_entries, - mod->jump_entries + mod->num_jump_entries); - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - entry = get_jump_label_entry(iter->key); - iter_begin = iter; - count = 0; - while ((iter < mod->jump_entries + mod->num_jump_entries) && - (iter->key == iter_begin->key)) { - iter++; - count++; - } - if (!entry) { - entry = add_jump_label_entry(iter_begin->key, 0, NULL); - if (IS_ERR(entry)) - return PTR_ERR(entry); + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) { + atomic_set(&key->enabled, 0); + key->entries = iter; + key->next = NULL; + continue; } - module_entry = add_jump_label_module_entry(entry, iter_begin, - count, mod); - if (IS_ERR(module_entry)) - return PTR_ERR(module_entry); + + jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + if (!jlm) + return -ENOMEM; + + jlm->mod = mod; + jlm->entries = iter; + jlm->next = key->next; + key->next = jlm; + + if (jump_label_enabled(key)) + __jump_label_update(key, iter, JUMP_LABEL_ENABLE); } + return 0; } -static void remove_jump_label_module(struct module *mod) +static void jump_label_del_module(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; - int i; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm, **prev; - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) + continue; + + prev = &key->next; + jlm = key->next; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; 
- hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod == mod) { - hlist_del(&e_module->hlist); - kfree(e_module); - } - } - if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { - hlist_del(&e->hlist); - kfree(e); - } + while (jlm && jlm->mod != mod) { + prev = &jlm->next; + jlm = jlm->next; + } + + if (jlm) { + *prev = jlm->next; + kfree(jlm); } } } -static void remove_jump_label_module_init(struct module *mod) +static void jump_label_invalidate_module_init(struct module *mod) { - struct hlist_head *head; - struct hlist_node *node, *node_next, *module_node, *module_node_next; - struct jump_label_entry *e; - struct jump_label_module_entry *e_module; + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - int i, count; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { - head = &jump_label_table[i]; - hlist_for_each_entry_safe(e, node, node_next, head, hlist) { - hlist_for_each_entry_safe(e_module, module_node, - module_node_next, - &(e->modules), hlist) { - if (e_module->mod != mod) - continue; - count = e_module->nr_entries; - iter = e_module->table; - while (count--) { - if (within_module_init(iter->code, mod)) - iter->key = 0; - iter++; - } - } - } + for (iter = iter_start; iter < iter_stop; iter++) { + if (within_module_init(iter->code, mod)) + iter->code = 0; } } @@ -426,59 +305,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, switch (val) { case MODULE_STATE_COMING: jump_label_lock(); - ret = add_jump_label_module(mod); + ret = jump_label_add_module(mod); if (ret) - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_GOING: jump_label_lock(); - remove_jump_label_module(mod); + jump_label_del_module(mod); jump_label_unlock(); break; case MODULE_STATE_LIVE: jump_label_lock(); - remove_jump_label_module_init(mod); + jump_label_invalidate_module_init(mod); jump_label_unlock(); break; } - return ret; -} -/*** - * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() - * @mod: module to patch - * - * Allow for run-time selection of the optimal nops. Before the module - * loads patch these with arch_get_jump_label_nop(), which is specified by - * the arch specific jump label code. 
- */ -void jump_label_apply_nops(struct module *mod) -{ - struct jump_entry *iter; - - /* if the module doesn't have jump label entries, just return */ - if (!mod->num_jump_entries) - return; - - iter = mod->jump_entries; - while (iter < mod->jump_entries + mod->num_jump_entries) { - arch_jump_label_text_poke_early(iter->code); - iter++; - } + return notifier_from_errno(ret); } struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, - .priority = 0, + .priority = 1, /* higher than tracepoints */ }; -static __init int init_jump_label_module(void) +static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } -early_initcall(init_jump_label_module); +early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. + * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + int ret = __jump_label_text_reserved(__start___jump_table, + __stop___jump_table, start, end); + + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = __jump_label_mod_text_reserved(start, end); +#endif + return ret; +} + +static void jump_label_update(struct jump_label_key *key, int enable) +{ + struct jump_entry *entry = key->entries; + + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, enable); + +#ifdef CONFIG_MODULES + __jump_label_mod_update(key, enable); +#endif +} + #endif diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c75925c4d1e..d665e92fbd4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -125,7 +125,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -atomic_t perf_sched_events __read_mostly; +struct jump_label_key perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -5417,7 +5417,7 @@ fail: return err; } -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 68187af4889..b219f1449c5 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !elem->state && active) + if (elem->regfunc && !jump_label_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && elem->state && !active) + else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. 
*/ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (!elem->state && active) { - jump_label_enable(&elem->state); - elem->state = active; - } else if (elem->state && !active) { - jump_label_disable(&elem->state); - elem->state = active; - } + if (active && !jump_label_enabled(&elem->key)) + jump_label_inc(&elem->key); + else if (!active && jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); } /* @@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && elem->state) + if (elem->unregfunc && jump_label_enabled(&elem->key)) elem->unregfunc(); - if (elem->state) { - jump_label_disable(&elem->state); - elem->state = 0; - } + if (jump_label_enabled(&elem->key)) + jump_label_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } -- cgit v1.2.3-70-g09d2 From ef64789413c73f32faa5e5f1bc393e5843b0aa51 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 16 Mar 2011 15:58:27 -0400 Subject: jump label: Add _ASM_ALIGN for x86 and x86_64 The linker should not be adding holes to word size aligned pointers, but out of paranoia we are explicitly specifying that alignment. I have not seen any holes in the jump label section in practice. Signed-off-by: Jason Baron LKML-Reference: Acked-by: Peter Zijlstra Acked-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- arch/x86/include/asm/jump_label.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index f217cee8653..a32b18ce6ea 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -16,6 +16,7 @@ static __always_inline bool arch_static_branch(struct jump_label_key *key) asm goto("1:" JUMP_LABEL_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" + _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" ".popsection \n\t" : : "i" (key) : : l_yes); -- cgit v1.2.3-70-g09d2 From 660e34cebf0a11d54f2d5dd8838607452355f321 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 4 Apr 2011 13:55:05 -0400 Subject: x86: Reorder reboot method preferences We have a never ending stream of 'reboot quirks' for new boxes that will not reboot properly under Linux (they will hang on reboot). The reason is widespread 'Windows compatible' assumption of modern x86 hardware, which expects the following reboot sequence: - hitting the ACPI reboot vector (if available) - trying the keyboard controller - hitting the ACPI reboot vector again - then giving the keyboard controller one last go This sequence expectation gets more and more embedded in modern hardware, which often lacks a keyboard controller and may even lock up if the legacy io ports are hit - and which hardware is often not tested with Linux during development. The end result is that reboot works under Windows-alike OSs but not under Linux. Rework our reboot process to meet this hardware externality a little better and match this assumption of newer x86 hardware. In addition to the ACPI,kbd,ACPI,kbd sequence we'll still fall through to attempting a legacy triple fault if nothing else works - and keep trying that and the kbd reset. 
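In sketch form, the attempted order is (illustrative C only; try_acpi_reboot(), try_kbd_reset() and try_triple_fault() are hypothetical stand-ins for the reboot_type state machine patched below, and each would return only if the machine is still running):

static void try_acpi_reboot(void)  { /* write the reset value to the FADT reset register, if advertised */ }
static void try_kbd_reset(void)    { /* outb(0xfe, 0x64): pulse the CPU reset line via the keyboard controller */ }
static void try_triple_fault(void) { /* load an empty IDT and take a fault */ }

static void reboot_order_sketch(void)
{
	int attempt;

	/* steps 1-4: ACPI, keyboard controller, ACPI again, keyboard again */
	for (attempt = 0; attempt < 2; attempt++) {
		try_acpi_reboot();
		try_kbd_reset();
	}

	/* still alive: keep alternating a forced triple fault and kbd resets */
	for (;;) {
		try_triple_fault();
		try_kbd_reset();
	}
}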
Signed-off-by: Matthew Garrett [ this commit will also save special casing Oaktrail boards ] Acked-by: Alan Cox Cc: Linus Torvalds Cc: Andrew Morton Cc: Leann Ogasawara Cc: Dave Jones Cc: Len Brown LKML-Reference: <1301939705-2404-1-git-send-email-mjg@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 08c44b08bf5..0c016f72769 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void) { } +/* + * Windows compatible x86 hardware expects the following on reboot: + * + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void) outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); -- cgit v1.2.3-70-g09d2 From ded467374a34eb80020c2213456b1d9ca946b88c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 10:53:48 +0200 Subject: x86/amd-iommu: Move compl-wait command building to own function This patch introduces a separate function for building completion-wait commands. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 57ca7778722..eebd504519c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -383,6 +383,13 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void build_completion_wait(struct iommu_cmd *cmd) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = CMD_COMPL_WAIT_INT_MASK; + CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held.
@@ -458,9 +465,7 @@ static int __iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + build_completion_wait(&cmd); return __iommu_queue_command(iommu, &cmd); } -- cgit v1.2.3-70-g09d2 From 94fe79e2f100bfcd8e7689cbf8838634779b80a2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 11:07:21 +0200 Subject: x86/amd-iommu: Move inv-dte command building to own function This patch moves command building for the invalidate-dte command into its own function. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index eebd504519c..4e5631a433a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -390,6 +390,13 @@ static void build_completion_wait(struct iommu_cmd *cmd) CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); } +static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. @@ -533,10 +540,7 @@ static int iommu_flush_device(struct device *dev) devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - /* Build command */ - memset(&cmd, 0, sizeof(cmd)); - CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); - cmd.data[0] = devid; + build_inv_dte(&cmd, devid); return iommu_queue_command(iommu, &cmd); } -- cgit v1.2.3-70-g09d2 From 11b6402c6673b530fac9920c5640c75e99fee956 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 11:49:28 +0200 Subject: x86/amd-iommu: Cleanup inv_pages command handling This patch reworks the processing of invalidate-pages commands to the IOMMU. The function building the command is extended so we can get rid of another function. It was also renamed to match the other function names. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 83 ++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4e5631a433a..f8ec28ea331 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -397,6 +397,37 @@ static void build_inv_dte(struct iommu_cmd *cmd, u16 devid) CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY); } +static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + size_t size, u16 domid, int pde) +{ + u64 pages; + int s; + + pages = iommu_num_pages(address, size, PAGE_SIZE); + s = 0; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + address &= PAGE_MASK; + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held.
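To make the flush-all decision above concrete, here is a standalone sketch (userspace C; page_count() is a simplified, hypothetical stand-in for the kernel's iommu_num_pages(), and FLUSH_ALL_ADDR is an illustrative placeholder for CMD_INV_IOMMU_ALL_PAGES_ADDRESS, not its real value):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE	4096ULL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define FLUSH_ALL_ADDR	(~0ULL & PAGE_MASK)	/* placeholder sentinel */

/* hypothetical approximation of iommu_num_pages(): pages spanned by [addr, addr+size) */
static uint64_t page_count(uint64_t addr, uint64_t size)
{
	uint64_t first = addr & PAGE_MASK;
	uint64_t last = (addr + size - 1) & PAGE_MASK;

	return (last - first) / PAGE_SIZE + 1;
}

int main(void)
{
	uint64_t address = 0x1000;
	uint64_t size = 8192;	/* spans two pages: triggers the flush-all path */
	int s = 0;

	if (page_count(address, size) > 1) {
		address = FLUSH_ALL_ADDR;	/* flush every TLB entry of the domain */
		s = 1;				/* and set the size bit in data[2] */
	}
	address &= PAGE_MASK;

	printf("address=%#llx size-bit=%d\n", (unsigned long long)address, s);
	return 0;
}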
@@ -545,37 +576,6 @@ static int iommu_flush_device(struct device *dev) return iommu_queue_command(iommu, &cmd); } -static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, - u16 domid, int pde, int s) -{ - memset(cmd, 0, sizeof(*cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); - cmd->data[1] |= domid; - cmd->data[2] = lower_32_bits(address); - cmd->data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; -} - -/* - * Generic command send function for invalidaing TLB entries - */ -static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, - u64 address, u16 domid, int pde, int s) -{ - struct iommu_cmd cmd; - int ret; - - __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); - - ret = iommu_queue_command(iommu, &cmd); - - return ret; -} - /* * TLB invalidation function which is called from the mapping functions. * It invalidates a single PTE if the range to flush is within a single @@ -584,20 +584,10 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, static void __iommu_flush_pages(struct protection_domain *domain, u64 address, size_t size, int pde) { - int s = 0, i; - unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); - - address &= PAGE_MASK; - - if (pages > 1) { - /* - * If we have to flush more than one page, flush all - * TLB entries for this domain - */ - address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; - s = 1; - } + struct iommu_cmd cmd; + int ret = 0, i; + build_inv_iommu_pages(&cmd, address, size, domain->id, pde); for (i = 0; i < amd_iommus_present; ++i) { if (!domain->dev_iommu[i]) @@ -607,11 +597,10 @@ static void __iommu_flush_pages(struct protection_domain *domain, * Devices of this domain are behind this IOMMU * We need a TLB flush */ - iommu_queue_inv_iommu_pages(amd_iommus[i], address, - domain->id, pde, s); + ret |= iommu_queue_command(amd_iommus[i], &cmd); } - return; + WARN_ON(ret); } static void iommu_flush_pages(struct protection_domain *domain, -- cgit v1.2.3-70-g09d2 From 3fe14ab541cd9b0d1f243afb7556046f12c8743c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:47 +0200 Subject: x86-32, numa: Fix failure condition check in alloc_remap() node_remap_{start|end}_vaddr[] describe [start, end) ranges; however, alloc_remap() incorrectly failed when the current allocation + size equaled the end but it should fail only when it goes over. Fix it. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-2-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index bde3906420d..84aac47c388 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -200,7 +200,7 @@ void *alloc_remap(int nid, unsigned long size) size = ALIGN(size, L1_CACHE_BYTES); - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) return NULL; node_remap_alloc_vaddr[nid] += size; -- cgit v1.2.3-70-g09d2 From a6c24f7a705d939ddd2fcaa443fa3d8e852b933d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:48 +0200 Subject: x86-32, numa: Align pgdat size while initializing alloc_remap When pgdat is reserved in init_remap_allocator(), PAGE_SIZE aligned size will be used. Match the size alignment in initialization to avoid allocation failure down the road. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-3-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 84aac47c388..50e82507eab 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -287,7 +287,8 @@ static __init unsigned long calculate_numa_remap_pages(void) node_end_pfn[nid] = max_pfn; /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); + size = node_remap_size[nid]; + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); /* convert size to large (pmd size) pages, rounding up */ size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; -- cgit v1.2.3-70-g09d2 From 5b8443b25c0f323ec190d094e4b441957b02664e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:49 +0200 Subject: x86-32, numa: Remove redundant top-down alloc code from remap initialization memblock_find_in_range() now does top-down allocation by default, so there's no reason for its callers to explicitly implement it by gradually lowering the start address. Remove redundant top-down allocation logic from initmem_init() and calculate_numa_remap_pages(). Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-4-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H.
Peter Anvin --- arch/x86/mm/numa_32.c | 43 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 50e82507eab..60701a5e0de 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -270,8 +270,7 @@ static __init unsigned long calculate_numa_remap_pages(void) unsigned long size, reserve_pages = 0; for_each_online_node(nid) { - u64 node_kva_target; - u64 node_kva_final; + u64 node_kva; /* * The acpi/srat node info can show hot-add memroy zones @@ -295,19 +294,11 @@ static __init unsigned long calculate_numa_remap_pages(void) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = memblock_find_in_range(node_kva_target, + node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, ((u64)node_end_pfn[nid])<<PAGE_SHIFT, - ((u64)size)<<PAGE_SHIFT, - LARGE_PAGE_BYTES); - node_kva_target -= LARGE_PAGE_BYTES; - } while (node_kva_final == MEMBLOCK_ERROR && - (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == MEMBLOCK_ERROR) + ((u64)size)<<PAGE_SHIFT, + LARGE_PAGE_BYTES); + + if (node_kva == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); node_remap_size[nid] = size; node_remap_offset[nid] = reserve_pages; reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_kva_final>>PAGE_SHIFT); + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -328,11 +319,11 @@ static __init unsigned long calculate_numa_remap_pages(void) * to use it as free. * So memblock_x86_reserve_range here, hope we don't run out of that array */ - memblock_x86_reserve_range(node_kva_final, - node_kva_final+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); + memblock_x86_reserve_range(node_kva, + node_kva+(((u64)size)<<PAGE_SHIFT), + "KVA RAM"); - node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT; + node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; } printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); @@ -356,7 +347,6 @@ static void init_remap_allocator(int nid) void __init initmem_init(void) { int nid; - long kva_target_pfn; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -371,15 +361,10 @@ void __init initmem_init(void) kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT, - max_low_pfn<<PAGE_SHIFT, - kva_pages<<PAGE_SHIFT, - PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn); - + kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + kva_pages << PAGE_SHIFT, + PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT; if (kva_start_pfn == MEMBLOCK_ERROR) panic("Can not get kva space\n"); -- cgit v1.2.3-70-g09d2 From 5510db9c1be111528ce46c57f0bec1c9dce258f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:50 +0200 Subject: x86-32, numa: Reorganize calculate_numa_remap_page() Separate the outer node walking loop and per-node logic from calculate_numa_remap_pages(). The outer loop is collapsed into initmem_init() and the per-node logic is moved into a new function - init_alloc_remap(). The new function name is confusing with the existing init_remap_allocator() and the behavior of the function isn't very clean either at this point, but this is to prepare for further cleanups and it will become prettier. This function doesn't introduce any behavior change. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-5-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H.
Peter Anvin --- arch/x86/mm/numa_32.c | 125 +++++++++++++++++++++++++------------------------- 1 file changed, 62 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 60701a5e0de..5039e9b21d9 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -264,70 +264,64 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long calculate_numa_remap_pages(void) +static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { - int nid; - unsigned long size, reserve_pages = 0; + unsigned long size; + u64 node_kva; - for_each_online_node(nid) { - u64 node_kva; - - /* - * The acpi/srat node info can show hot-add memroy zones - * where memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid]; - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - ((u64)node_end_pfn[nid])<<PAGE_SHIFT, - ((u64)size)<<PAGE_SHIFT, - LARGE_PAGE_BYTES); - - if (node_kva == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); - - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_kva >> PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. - * So memblock_x86_reserve_range here, hope we don't run out of that array - */ - memblock_x86_reserve_range(node_kva, - node_kva+(((u64)size)<<PAGE_SHIFT), - "KVA RAM"); - - node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", reserve_pages); - return reserve_pages; + /* + * The acpi/srat node info can show hot-add memroy zones where + * memory could be added but not currently present. + */ + printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", + nid, node_start_pfn[nid], node_end_pfn[nid]); + if (node_start_pfn[nid] > max_pfn) + return 0; + if (!node_end_pfn[nid]) + return 0; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. */ + size = node_remap_size[nid]; + size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)size << PAGE_SHIFT, + LARGE_PAGE_BYTES); + if (node_kva == MEMBLOCK_ERROR) + panic("Can not get kva ram\n"); + + node_remap_size[nid] = size; + node_remap_offset[nid] = offset; + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", + size, nid, node_kva >> PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later.
+ * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then + * max_low_pfn but we could have some hole in high memory, + * and it will only check page_is_ram(pfn) && + * !page_is_reserved_early(pfn) to decide to use it as free. + * So memblock_x86_reserve_range here, hope we don't run out + * of that array + */ + memblock_x86_reserve_range(node_kva, + node_kva + ((u64)size << PAGE_SHIFT), + "KVA RAM"); + + node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; + + return size; } static void init_remap_allocator(int nid) @@ -346,6 +340,7 @@ static void init_remap_allocator(int nid) void __init initmem_init(void) { + unsigned long reserve_pages = 0; int nid; /* @@ -359,7 +354,11 @@ void __init initmem_init(void) get_memcfg_numa(); numa_init_array(); - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); + for_each_online_node(nid) + reserve_pages += init_alloc_remap(nid, reserve_pages); + kva_pages = roundup(reserve_pages, PTRS_PER_PTE); + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", + reserve_pages); kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT, -- cgit v1.2.3-70-g09d2 From c4d4f577d49c441ab4f1bb6068247dafb366e635 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:51 +0200 Subject: x86-32, numa: Rename @node_kva to @node_pa in init_alloc_remap() init_alloc_remap() is about to do more and using _kva suffix for physical address becomes confusing because the function will be handling both physical and virtual addresses. Rename @node_kva to @node_pa. This is trivial rename and doesn't cause any behavior difference. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-6-git-send-email-tj@kernel.org Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 5039e9b21d9..30933fec8f7 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -267,7 +267,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { unsigned long size; - u64 node_kva; + u64 node_pa; /* * The acpi/srat node info can show hot-add memroy zones where @@ -291,17 +291,17 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) /* now the roundup is correct, convert to PAGE_SIZE pages */ size = size * PTRS_PER_PTE; - node_kva = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - (u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)size << PAGE_SHIFT, - LARGE_PAGE_BYTES); - if (node_kva == MEMBLOCK_ERROR) + node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, + (u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)size << PAGE_SHIFT, + LARGE_PAGE_BYTES); + if (node_pa == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); node_remap_size[nid] = size; node_remap_offset[nid] = offset; printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_kva >> PAGE_SHIFT); + size, nid, node_pa >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -315,11 +315,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) * So memblock_x86_reserve_range here, hope we don't run out * of that array */ - memblock_x86_reserve_range(node_kva, - node_kva + ((u64)size << PAGE_SHIFT), + memblock_x86_reserve_range(node_pa, node_pa + ((u64)size << PAGE_SHIFT), "KVA RAM"); - node_remap_start_pfn[nid] = node_kva >> PAGE_SHIFT; + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; return size; } -- cgit v1.2.3-70-g09d2 From af7c1a6e8374e05aab4a98ce4d2fb07b66506a02 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:52 +0200 Subject: x86-32, numa: Make @size in init_aloc_remap() represent bytes @size variable in init_alloc_remap() is confusing in that it starts as number of bytes as its name implies and then becomes number of pages. Make it consistently represent bytes. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-7-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 30933fec8f7..99310d26fe3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -286,22 +286,19 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size = node_remap_size[nid]; size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; + /* align to large page */ + size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)size << PAGE_SHIFT, - LARGE_PAGE_BYTES); + size, LARGE_PAGE_BYTES); if (node_pa == MEMBLOCK_ERROR) panic("Can not get kva ram\n"); - node_remap_size[nid] = size; + node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size, nid, node_pa >> PAGE_SHIFT); + size >> PAGE_SHIFT, nid, node_pa >> PAGE_SHIFT); /* * prevent kva address below max_low_pfn want it on system @@ -315,12 +312,11 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) * So memblock_x86_reserve_range here, hope we don't run out * of that array */ - memblock_x86_reserve_range(node_pa, node_pa + ((u64)size << PAGE_SHIFT), - "KVA RAM"); + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - return size; + return size >> PAGE_SHIFT; } static void init_remap_allocator(int nid) -- cgit v1.2.3-70-g09d2 From 7210cf9217937e470a9acbc113a590f476b9c047 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:53 +0200 Subject: x86-32, numa: Calculate remap size in common code Only pgdat and memmap use remap area and there isn't much benefit in allowing per-node override. In addition, the use of node_remap_size[] is confusing in that it contains number of bytes before remap initialization and then number of pages afterwards. Move remap size calculation for memap from specific NUMA config implementations to init_alloc_remap() and make node_remap_size[] static. The only behavior difference is that, before this patch, numaq_32 didn't consider max_pfn when calculating the memmap size but it's enforced after this patch, which is the right thing to do. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-8-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/topology.h | 1 - arch/x86/kernel/apic/numaq_32.c | 4 ---- arch/x86/mm/numa_32.c | 10 ++++------ arch/x86/mm/srat_32.c | 1 - 4 files changed, 4 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 910a7084f7f..8dba76972fd 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -95,7 +95,6 @@ extern void setup_node_to_cpumask_map(void); #ifdef CONFIG_X86_32 extern unsigned long node_start_pfn[]; extern unsigned long node_end_pfn[]; -extern unsigned long node_remap_size[]; #define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) # define SD_CACHE_NICE_TRIES 1 diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 6273eee5134..0aced70815f 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -93,10 +93,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) node_end_pfn[node]); memory_present(node, node_start_pfn[node], node_end_pfn[node]); - - node_remap_size[node] = node_memmap_size_bytes(node, - node_start_pfn[node], - node_end_pfn[node]); } /* diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 99310d26fe3..9a7336550f0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,7 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -unsigned long node_remap_size[MAX_NUMNODES]; +static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -129,7 +129,6 @@ int __init get_memcfg_numa_flat(void) node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); @@ -282,11 +281,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid]; + /* calculate the necessary space aligned to large page size */ + size = node_memmap_size_bytes(nid, node_start_pfn[nid], + min(node_end_pfn[nid], max_pfn)); size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - /* align to large page */ size = ALIGN(size, LARGE_PAGE_BYTES); node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 48651c6f657..1b9e82c96dc 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -276,7 +276,6 @@ int __init get_memcfg_from_srat(void) unsigned long end = min(node_end_pfn[nid], max_pfn); memory_present(nid, start, end); - node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); } return 1; out_fail: -- cgit v1.2.3-70-g09d2 From 82044c328d6f6b22882c2a936e487e6d2240817a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:54 +0200 Subject: x86-32, numa: Make init_alloc_remap() less panicky Remap allocator failure isn't fatal. The callers are required to fall back to regular early memory allocation mechanisms on failure anyway, so there's no reason to panic on remap init failure. Whining and returning are enough. 
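The caller-side pattern this enables can be sketched as follows (illustrative only; node_alloc_sketch() is a hypothetical wrapper, while alloc_remap() and alloc_bootmem_node() are the real kernel interfaces involved):

static void *node_alloc_sketch(int nid, unsigned long size)
{
	/* optional fast path: node-local remap space, NULL if unavailable */
	void *p = alloc_remap(nid, size);

	if (!p)	/* graceful fallback instead of a panic */
		p = alloc_bootmem_node(NODE_DATA(nid), size);

	return p;
}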
Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-9-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 9a7336550f0..c127543372f 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -290,8 +290,11 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, size, LARGE_PAGE_BYTES); - if (node_pa == MEMBLOCK_ERROR) - panic("Can not get kva ram\n"); + if (node_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", + size, nid); + return 0; + } node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; -- cgit v1.2.3-70-g09d2 From 0e9f93c1c04c8ab10cc564df54a7ad0f83c67796 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:55 +0200 Subject: x86-32, numa: Move lowmem address space reservation to init_alloc_remap() Remap alloc init is done in the following stages. 1. init_alloc_remap() calculates how much memory is necessary for each node and reserves node local memory. 2. initmem_init() collects how much each node needs and reserves a single contiguous lowmem area which can contain all. 3. init_remap_allocator() initializes allocator parameters from the determined lowmem address and per-node offsets. 4. Actual remap happens. There is no reason for the lowmem remap area to be reserved as a single contiguous area at one go. They don't interact with each other and the memblock allocator will put them side-by-side anyway. This patch breaks up the single lowmem address reservation and put per-node lowmem address reservation into init_alloc_remap() and initializes allocator parameters directly in the function as all the addresses are determined there. This merges steps 2 and 3 into 1. While at it, remove now largely irrelevant comments in init_alloc_remap(). This change causes the following behavior changes. * Remap lowmem areas are allocated in smaller per-node chunks. * Remap lowmem area reservation failure fail future remap allocations instead of panicking. * Remap allocator initialization is less verbose. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-10-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 82 ++++++++++++++++----------------------------------- 1 file changed, 25 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index c127543372f..12bb34c434e 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -108,9 +108,6 @@ static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static unsigned long kva_start_pfn; -static unsigned long kva_pages; - int __cpuinit numa_cpu_node(int cpu) { return apic->x86_32_numa_cpu_node(cpu); @@ -266,7 +263,8 @@ void resume_map_numa_kva(pgd_t *pgd_base) static __init unsigned long init_alloc_remap(int nid, unsigned long offset) { unsigned long size; - u64 node_pa; + u64 node_pa, remap_pa; + void *remap_va; /* * The acpi/srat node info can show hot-add memroy zones where @@ -287,6 +285,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); size = ALIGN(size, LARGE_PAGE_BYTES); + /* allocate node memory and the lowmem remap area */ node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, (u64)node_end_pfn[nid] << PAGE_SHIFT, size, LARGE_PAGE_BYTES); @@ -295,45 +294,35 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) size, nid); return 0; } + memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + + remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, + max_low_pfn << PAGE_SHIFT, + size, LARGE_PAGE_BYTES); + if (remap_pa == MEMBLOCK_ERROR) { + pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", + size, nid); + memblock_x86_free_range(node_pa, node_pa + size); + return 0; + } + memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + remap_va = phys_to_virt(remap_pa); + /* initialize remap allocator parameters */ + node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; node_remap_offset[nid] = offset; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of node %d at %llx\n", - size >> PAGE_SHIFT, nid, node_pa >> PAGE_SHIFT); - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then - * max_low_pfn but we could have some hole in high memory, - * and it will only check page_is_ram(pfn) && - * !page_is_reserved_early(pfn) to decide to use it as free. 
- * So memblock_x86_reserve_range here, hope we don't run out - * of that array - */ - memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + node_remap_start_vaddr[nid] = remap_va; + node_remap_end_vaddr[nid] = remap_va + size; + node_remap_alloc_vaddr[nid] = remap_va + ALIGN(sizeof(pg_data_t), PAGE_SIZE); - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; + printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", + nid, node_pa, node_pa + size, remap_va, remap_va + size); return size >> PAGE_SHIFT; } -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); -} - void __init initmem_init(void) { unsigned long reserve_pages = 0; @@ -352,25 +341,7 @@ void __init initmem_init(void) for_each_online_node(nid) reserve_pages += init_alloc_remap(nid, reserve_pages); - kva_pages = roundup(reserve_pages, PTRS_PER_PTE); - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - - kva_start_pfn = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - kva_pages << PAGE_SHIFT, - PTRS_PER_PTE << PAGE_SHIFT) >> PAGE_SHIFT; - if (kva_start_pfn == MEMBLOCK_ERROR) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - - /* avoid clash with initrd */ - memblock_x86_reserve_range(kva_start_pfn< max_low_pfn) @@ -390,11 +361,8 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - + for_each_online_node(nid) allocate_pgdat(nid); - } remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", -- cgit v1.2.3-70-g09d2 From 2a286344f06d6341740b284494379373e87648f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:56 +0200 Subject: x86-32, numa: Move remapping for remap allocator into init_alloc_remap() There's no reason to perform the actual remapping separately. Collapse remap_numa_kva() into init_alloc_remap() and, while at it, make it less verbose. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-11-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 12bb34c434e..53ec13a17b9 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -205,26 +205,6 @@ void *alloc_remap(int nid, unsigned long size) return allocation; } -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn<> PAGE_SHIFT; pfn += PTRS_PER_PTE) + set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), + (node_pa >> PAGE_SHIFT) + pfn, + PAGE_KERNEL_LARGE); + /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; @@ -363,7 +349,6 @@ void __init initmem_init(void) (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) allocate_pgdat(nid); - remap_numa_kva(); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); -- cgit v1.2.3-70-g09d2 From b2e3e4fa3eee752b893687783f2a427106c93423 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:57 +0200 Subject: x86-32, numa: Make pgdat allocation use alloc_remap() pgdat allocation is handled differently from other remap allocations - it's reserved during initialization. There's no reason to handle this any differently. The remap allocator is initialized for every node, and if init fails the allocation will fail and pgdat allocation can fall back to generic code like anyone else. Remove special init-time pgdat reservation and make allocate_pgdat() use alloc_remap() like everyone else. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-12-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 53ec13a17b9..0184a9f5a34 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -160,9 +160,8 @@ static void __init allocate_pgdat(int nid) { char buf[16]; - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { + NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); + if (!NODE_DATA(nid)) { unsigned long pgdat_phys; pgdat_phys = memblock_find_in_range(min_low_pfn< [%p-%p)\n", nid, node_pa, node_pa + size, remap_va, remap_va + size); -- cgit v1.2.3-70-g09d2 From 1d85b61baf0334dd6bb88261bec42b808204d694 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:58 +0200 Subject: x86-32, numa: Remove now useless node_remap_offset[] With lowmem address reservation moved into init_alloc_remap(), node_remap_offset[] is no longer useful. Remove it and related offset handling code. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-13-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H.
Peter Anvin --- arch/x86/mm/numa_32.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 0184a9f5a34..960ea7bc0ac 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -187,7 +187,6 @@ static void __init allocate_pgdat(int nid) static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; void *alloc_remap(int nid, unsigned long size) { @@ -239,7 +238,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif -static __init unsigned long init_alloc_remap(int nid, unsigned long offset) +static __init void init_alloc_remap(int nid) { unsigned long size, pfn; u64 node_pa, remap_pa; @@ -252,9 +251,9 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", nid, node_start_pfn[nid], node_end_pfn[nid]); if (node_start_pfn[nid] > max_pfn) - return 0; + return; if (!node_end_pfn[nid]) - return 0; + return; if (node_end_pfn[nid] > max_pfn) node_end_pfn[nid] = max_pfn; @@ -271,7 +270,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) if (node_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); - return 0; + return; } memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); @@ -282,7 +281,7 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", size, nid); memblock_x86_free_range(node_pa, node_pa + size); - return 0; + return; } memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); remap_va = phys_to_virt(remap_pa); @@ -296,7 +295,6 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; node_remap_size[nid] = size >> PAGE_SHIFT; - node_remap_offset[nid] = offset; node_remap_start_vaddr[nid] = remap_va; node_remap_end_vaddr[nid] = remap_va + size; @@ -304,13 +302,10 @@ static __init unsigned long init_alloc_remap(int nid, unsigned long offset) printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", nid, node_pa, node_pa + size, remap_va, remap_va + size); - - return size >> PAGE_SHIFT; } void __init initmem_init(void) { - unsigned long reserve_pages = 0; int nid; /* @@ -325,7 +320,7 @@ void __init initmem_init(void) numa_init_array(); for_each_online_node(nid) - reserve_pages += init_alloc_remap(nid, reserve_pages); + init_alloc_remap(nid); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; -- cgit v1.2.3-70-g09d2 From 198bd06bbfde2984027e91f64c55eb19a7034a27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:23:59 +0200 Subject: x86-32, numa: Remove redundant node_remap_size[] Remap area size can be determined from node_remap_start_vaddr[] and node_remap_end_vaddr[] making node_remap_size[] redundant. Remove it. While at it, make resume_map_numa_kva() use @nr_pages for number of pages instead of @size. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-14-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 960ea7bc0ac..f325e6fab75 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -104,7 +104,6 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -static unsigned long node_remap_size[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); @@ -214,15 +213,16 @@ void resume_map_numa_kva(pgd_t *pgd_base) int node; for_each_online_node(node) { - unsigned long start_va, start_pfn, size, pfn; + unsigned long start_va, start_pfn, nr_pages, pfn; start_va = (unsigned long)node_remap_start_vaddr[node]; start_pfn = node_remap_start_pfn[node]; - size = node_remap_size[node]; + nr_pages = (node_remap_end_vaddr[node] - + node_remap_start_vaddr[node]) >> PAGE_SHIFT; printk(KERN_DEBUG "%s: node %d\n", __func__, node); - for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { + for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); pgd_t *pgd = pgd_base + pgd_index(vaddr); pud_t *pud = pud_offset(pgd, vaddr); @@ -294,8 +294,6 @@ static __init void init_alloc_remap(int nid) /* initialize remap allocator parameters */ node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_size[nid] = size >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; node_remap_end_vaddr[nid] = remap_va + size; node_remap_alloc_vaddr[nid] = remap_va; -- cgit v1.2.3-70-g09d2 From 993ba1585cbb03fab012e41d1a5d24330a283b31 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Apr 2011 00:24:00 +0200 Subject: x86-32, numa: Update remap allocator comments Now that remap allocator is cleaned up, update comments such that they are in docbook function description format and reflect the actual implementation. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1301955840-7246-15-git-send-email-tj@kernel.org Acked-by: Yinghai Lu Cc: David Rientjes Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_32.c | 56 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f325e6fab75..c757c0a3b52 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -176,17 +176,31 @@ static void __init allocate_pgdat(int nid) } /* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. + * Remap memory allocator */ static unsigned long node_remap_start_pfn[MAX_NUMNODES]; static void *node_remap_end_vaddr[MAX_NUMNODES]; static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +/** + * alloc_remap - Allocate remapped memory + * @nid: NUMA node to allocate memory from + * @size: The size of allocation + * + * Allocate @size bytes from the remap area of NUMA node @nid. The + * size of the remap area is predetermined by init_alloc_remap() and + * only the callers considered there should call this function. For + * more info, please read the comment on top of init_alloc_remap(). 
+ * + * The caller must be ready to handle allocation failure from this + * function and fall back to regular memory allocator in such cases. + * + * CONTEXT: + * Single CPU early boot context. + * + * RETURNS: + * Pointer to the allocated memory on success, %NULL on failure. + */ void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; @@ -238,6 +252,28 @@ void resume_map_numa_kva(pgd_t *pgd_base) } #endif +/** + * init_alloc_remap - Initialize remap allocator for a NUMA node + * @nid: NUMA node to initizlie remap allocator for + * + * NUMA nodes may end up without any lowmem. As allocating pgdat and + * memmap on a different node with lowmem is inefficient, a special + * remap allocator is implemented which can be used by alloc_remap(). + * + * For each node, the amount of memory which will be necessary for + * pgdat and memmap is calculated and two memory areas of the size are + * allocated - one in the node and the other in lowmem; then, the area + * in the node is remapped to the lowmem area. + * + * As pgdat and memmap must be allocated in lowmem anyway, this + * doesn't waste lowmem address space; however, the actual lowmem + * which gets remapped over is wasted. The amount shouldn't be + * problematic on machines this feature will be used. + * + * Initialization failure isn't fatal. alloc_remap() is used + * opportunistically and the callers will fall back to other memory + * allocation mechanisms on failure. + */ static __init void init_alloc_remap(int nid) { unsigned long size, pfn; @@ -306,14 +342,6 @@ void __init initmem_init(void) { int nid; - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ - get_memcfg_numa(); numa_init_array(); -- cgit v1.2.3-70-g09d2 From 815b33fdc279d34ab40a8bfe1866623a4cc5669b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 17:26:49 +0200 Subject: x86/amd-iommu: Cleanup completion-wait handling This patch cleans up the implementation of completion-wait command sending. It also switches the completion indicator from the MMIO bit to a memory store which can be checked without IOMMU locking. As a side effect this patch makes the __iommu_queue_command function obsolete and so it is removed too. 
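The new scheme, in outline (a condensed sketch of what the diff below implements):

	struct iommu_cmd cmd;
	volatile u64 sem = 0;	/* completion indicator in ordinary memory */
	int i = 0;

	/* ask the IOMMU to store a value to &sem when it reaches the command */
	build_completion_wait(&cmd, (u64)&sem);
	iommu_queue_command(iommu, &cmd);

	/* poll the memory location instead of the MMIO status bit; no
	 * IOMMU lock is needed while spinning here */
	while (sem == 0 && i < LOOP_TIMEOUT) {
		udelay(1);
		i += 1;
	}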
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 107 ++++++++++++-------------------------------- 1 file changed, 28 insertions(+), 79 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f8ec28ea331..073c64b1994 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,7 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define EXIT_LOOP_COUNT 10000000 +#define LOOP_TIMEOUT 100000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -383,10 +384,14 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ -static void build_completion_wait(struct iommu_cmd *cmd) +static void build_completion_wait(struct iommu_cmd *cmd, u64 address) { + WARN_ON(address & 0x7ULL); + memset(cmd, 0, sizeof(*cmd)); - cmd->data[0] = CMD_COMPL_WAIT_INT_MASK; + cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; + cmd->data[1] = upper_32_bits(__pa(address)); + cmd->data[2] = 1; CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); } @@ -432,12 +437,14 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. */ -static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { + unsigned long flags; u32 tail, head; u8 *target; WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + spin_lock_irqsave(&iommu->lock, flags); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); @@ -446,99 +453,41 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) if (tail == head) return -ENOMEM; writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - - return 0; -} - -/* - * General queuing function for commands. Takes iommu->lock and calls - * __iommu_queue_command(). 
- */ -static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) -{ - unsigned long flags; - int ret; - - spin_lock_irqsave(&iommu->lock, flags); - ret = __iommu_queue_command(iommu, cmd); - if (!ret) - iommu->need_sync = true; + iommu->need_sync = true; spin_unlock_irqrestore(&iommu->lock, flags); - return ret; -} - -/* - * This function waits until an IOMMU has completed a completion - * wait command - */ -static void __iommu_wait_for_completion(struct amd_iommu *iommu) -{ - int ready = 0; - unsigned status = 0; - unsigned long i = 0; - - INC_STATS_COUNTER(compl_wait); - - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; - } - - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - - if (unlikely(i == EXIT_LOOP_COUNT)) - iommu->reset_in_progress = true; + return 0; } /* * This function queues a completion wait command into the command * buffer of an IOMMU */ -static int __iommu_completion_wait(struct amd_iommu *iommu) -{ - struct iommu_cmd cmd; - - build_completion_wait(&cmd); - - return __iommu_queue_command(iommu, &cmd); -} - -/* - * This function is called whenever we need to ensure that the IOMMU has - * completed execution of all commands we sent. It sends a - * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs - * us about that by writing a value to a physical address we pass with - * the command. - */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret = 0; - unsigned long flags; - - spin_lock_irqsave(&iommu->lock, flags); + struct iommu_cmd cmd; + volatile u64 sem = 0; + int ret, i = 0; if (!iommu->need_sync) - goto out; - - ret = __iommu_completion_wait(iommu); + return 0; - iommu->need_sync = false; + build_completion_wait(&cmd, (u64)&sem); + ret = iommu_queue_command(iommu, &cmd); if (ret) - goto out; - - __iommu_wait_for_completion(iommu); + return ret; -out: - spin_unlock_irqrestore(&iommu->lock, flags); + while (sem == 0 && i < LOOP_TIMEOUT) { + udelay(1); + i += 1; + } - if (iommu->reset_in_progress) + if (i == LOOP_TIMEOUT) { + pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); + iommu->reset_in_progress = true; reset_iommu_command_buffer(iommu); + } return 0; } -- cgit v1.2.3-70-g09d2 From 61985a040f17c03b09a2772508ee02729571365b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 17:46:49 +0200 Subject: x86/amd-iommu: Remove command buffer resetting logic The logic to reset the command buffer caused more problems than it solved. It kicked in when the IOMMU hardware stopped executing commands, but the underlying reasons for that are usually not fixed by simply resetting the command buffer. So the code can be removed to reduce complexity.
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 --- arch/x86/kernel/amd_iommu.c | 20 +------------------- 2 files changed, 1 insertion(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index e3509fc303b..878ae008eb0 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -409,9 +409,6 @@ struct amd_iommu { /* if one, we need to send a completion wait command */ bool need_sync; - /* becomes true if a command buffer reset is running */ - bool reset_in_progress; - /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 073c64b1994..0147c5c87aa 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -58,7 +58,6 @@ struct iommu_cmd { u32 data[4]; }; -static void reset_iommu_command_buffer(struct amd_iommu *iommu); static void update_domain(struct protection_domain *domain); /**************************************************************************** @@ -323,8 +322,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt) break; case EVENT_TYPE_ILL_CMD: printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); - iommu->reset_in_progress = true; - reset_iommu_command_buffer(iommu); dump_command(address); break; case EVENT_TYPE_CMD_HARD_ERR: @@ -485,8 +482,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (i == LOOP_TIMEOUT) { pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); - iommu->reset_in_progress = true; - reset_iommu_command_buffer(iommu); + ret = -EIO; } return 0; @@ -628,20 +624,6 @@ void amd_iommu_flush_all_domains(void) spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); } -static void reset_iommu_command_buffer(struct amd_iommu *iommu) -{ - pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); - - if (iommu->reset_in_progress) - panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); - - amd_iommu_reset_cmd_buffer(iommu); - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); - - iommu->reset_in_progress = false; -} - /**************************************************************************** * * The functions below are used the create the page table mappings for -- cgit v1.2.3-70-g09d2 From 17b124bf1463582005d662d4dd95f037ad863c57 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 18:01:35 +0200 Subject: x86/amd-iommu: Rename iommu_flush* to domain_flush* These functions all operate on protection domains and not on single IOMMUs. Represent that in their name. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 87 +++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0147c5c87aa..9d66b2092ae 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -488,22 +488,6 @@ static int iommu_completion_wait(struct amd_iommu *iommu) return 0; } -static void iommu_flush_complete(struct protection_domain *domain) -{ - int i; - - for (i = 0; i < amd_iommus_present; ++i) { - if (!domain->dev_iommu[i]) - continue; - - /* - * Devices of this domain are behind this IOMMU - * We need to wait for completion of all commands.
- */ - iommu_completion_wait(amd_iommus[i]); - } -} - /* * Command send function for invalidating a device table entry */ @@ -526,8 +510,8 @@ static int iommu_flush_device(struct device *dev) * It invalidates a single PTE if the range to flush is within a single * page. Otherwise it flushes the whole TLB of the IOMMU. */ -static void __iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size, int pde) +static void __domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size, int pde) { struct iommu_cmd cmd; int ret = 0, i; @@ -548,29 +532,45 @@ static void __iommu_flush_pages(struct protection_domain *domain, WARN_ON(ret); } -static void iommu_flush_pages(struct protection_domain *domain, - u64 address, size_t size) +static void domain_flush_pages(struct protection_domain *domain, + u64 address, size_t size) { - __iommu_flush_pages(domain, address, size, 0); + __domain_flush_pages(domain, address, size, 0); } /* Flush the whole IO/TLB for a given protection domain */ -static void iommu_flush_tlb(struct protection_domain *domain) +static void domain_flush_tlb(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); } /* Flush the whole IO/TLB for a given protection domain - including PDE */ -static void iommu_flush_tlb_pde(struct protection_domain *domain) +static void domain_flush_tlb_pde(struct protection_domain *domain) { - __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); + __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); +} + +static void domain_flush_complete(struct protection_domain *domain) +{ + int i; + + for (i = 0; i < amd_iommus_present; ++i) { + if (!domain->dev_iommu[i]) + continue; + + /* + * Devices of this domain are behind this IOMMU + * We need to wait for completion of all commands. + */ + iommu_completion_wait(amd_iommus[i]); + } } /* * This function flushes the DTEs for all devices in domain */ -static void iommu_flush_domain_devices(struct protection_domain *domain) +static void domain_flush_devices(struct protection_domain *domain) { struct iommu_dev_data *dev_data; unsigned long flags; @@ -591,8 +591,8 @@ static void iommu_flush_all_domain_devices(void) spin_lock_irqsave(&amd_iommu_pd_lock, flags); list_for_each_entry(domain, &amd_iommu_pd_list, list) { - iommu_flush_domain_devices(domain); - iommu_flush_complete(domain); + domain_flush_devices(domain); + domain_flush_complete(domain); } spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); @@ -616,8 +616,8 @@ void amd_iommu_flush_all_domains(void) list_for_each_entry(domain, &amd_iommu_pd_list, list) { spin_lock(&domain->lock); - iommu_flush_tlb_pde(domain); - iommu_flush_complete(domain); + domain_flush_tlb_pde(domain); + domain_flush_complete(domain); spin_unlock(&domain->lock); } @@ -1480,7 +1480,7 @@ static int attach_device(struct device *dev, * left the caches in the IOMMU dirty. So we have to flush * here to evict all dirty stuff. 
*/ - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return ret; } @@ -1693,8 +1693,9 @@ static void update_domain(struct protection_domain *domain) return; update_device_table(domain); - iommu_flush_domain_devices(domain); - iommu_flush_tlb_pde(domain); + + domain_flush_devices(domain); + domain_flush_tlb_pde(domain); domain->updated = false; } @@ -1853,10 +1854,10 @@ retry: ADD_STATS_COUNTER(alloced_io_mem, size); if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { - iommu_flush_tlb(&dma_dom->domain); + domain_flush_tlb(&dma_dom->domain); dma_dom->need_flush = false; } else if (unlikely(amd_iommu_np_cache)) - iommu_flush_pages(&dma_dom->domain, address, size); + domain_flush_pages(&dma_dom->domain, address, size); out: return address; @@ -1905,7 +1906,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom, dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(&dma_dom->domain, flush_addr, size); + domain_flush_pages(&dma_dom->domain, flush_addr, size); dma_dom->need_flush = false; } } @@ -1941,7 +1942,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page, if (addr == DMA_ERROR_CODE) goto out; - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -1968,7 +1969,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, __unmap_single(domain->priv, dma_addr, size, dir); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2033,7 +2034,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - iommu_flush_complete(domain); + domain_flush_complete(domain); out: spin_unlock_irqrestore(&domain->lock, flags); @@ -2079,7 +2080,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); } @@ -2129,7 +2130,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out_free; } - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2161,7 +2162,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_complete(domain); + domain_flush_complete(domain); spin_unlock_irqrestore(&domain->lock, flags); @@ -2471,7 +2472,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, unmap_size = iommu_unmap_page(domain, iova, page_size); mutex_unlock(&domain->api_lock); - iommu_flush_tlb_pde(domain); + domain_flush_tlb_pde(domain); return get_order(unmap_size); } -- cgit v1.2.3-70-g09d2 From ac0ea6e92b2227c86fe4f7f9eb429071d617a25d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 18:38:20 +0200 Subject: x86/amd-iommu: Improve handling of full command buffer This patch improves the handling of commands when the IOMMU command buffer is nearly full. In this case it issues a completion wait command and waits until the IOMMU has processed it before it continues queuing new commands.
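The heart of the change is the free-space computation on the command ring, shown here in isolation (a condensed sketch of the logic in the diff below):

	next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
	left      = (head - next_tail) % iommu->cmd_buf_size;

	if (left <= 2) {
		/* ring nearly full: queue a COMPLETION_WAIT behind the
		 * pending commands, drop the lock, wait for the IOMMU
		 * to catch up, then retry from the top */
	}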
Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 88 +++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9d66b2092ae..75c7f8c3fe1 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -381,6 +381,39 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static int wait_on_sem(volatile u64 *sem) +{ + int i = 0; + + while (*sem == 0 && i < LOOP_TIMEOUT) { + udelay(1); + i += 1; + } + + if (i == LOOP_TIMEOUT) { + pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); + return -EIO; + } + + return 0; +} + +static void copy_cmd_to_buffer(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + u32 tail) +{ + u8 *target; + + target = iommu->cmd_buf + tail; + tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + + /* Copy command to buffer */ + memcpy(target, cmd, sizeof(*cmd)); + + /* Tell the IOMMU about it */ + writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); +} + static void build_completion_wait(struct iommu_cmd *cmd, u64 address) { WARN_ON(address & 0x7ULL); @@ -432,25 +465,44 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, /* * Writes the command to the IOMMUs command buffer and informs the - * hardware about the new command. Must be called with iommu->lock held. + * hardware about the new command. */ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { + u32 left, tail, head, next_tail; unsigned long flags; - u32 tail, head; - u8 *target; WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); + +again: spin_lock_irqsave(&iommu->lock, flags); - tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - target = iommu->cmd_buf + tail; - memcpy_toio(target, cmd, sizeof(*cmd)); - tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; - head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); - if (tail == head) - return -ENOMEM; - writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + left = (head - next_tail) % iommu->cmd_buf_size; + + if (left <= 2) { + struct iommu_cmd sync_cmd; + volatile u64 sem = 0; + int ret; + + build_completion_wait(&sync_cmd, (u64)&sem); + copy_cmd_to_buffer(iommu, &sync_cmd, tail); + + spin_unlock_irqrestore(&iommu->lock, flags); + + if ((ret = wait_on_sem(&sem)) != 0) + return ret; + + goto again; + } + + copy_cmd_to_buffer(iommu, cmd, tail); + + /* We need to sync now to make sure all commands are processed */ iommu->need_sync = true; + spin_unlock_irqrestore(&iommu->lock, flags); return 0; @@ -464,7 +516,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; volatile u64 sem = 0; - int ret, i = 0; + int ret; if (!iommu->need_sync) return 0; @@ -475,17 +527,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (ret) return ret; - while (sem == 0 && i < LOOP_TIMEOUT) { - udelay(1); - i += 1; - } - - if (i == LOOP_TIMEOUT) { - pr_alert("AMD-Vi: Completion-Wait loop timed out\n"); - ret = -EIO; - } - - return 0; + return wait_on_sem(&sem); } /* -- cgit v1.2.3-70-g09d2 From d8c13085775c72e2d46edc54ed0c803c3a944ddb Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 6 Apr 2011 18:51:26 +0200 Subject: x86/amd-iommu: Rename iommu_flush_device 
This function operates on a struct device, so give it a name that represents that. As a side effect a new function is introduced which operates on an IOMMU and a device id. It will be used again in a later patch. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 75c7f8c3fe1..3557f223f40 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -530,21 +530,27 @@ static int iommu_completion_wait(struct amd_iommu *iommu) return wait_on_sem(&sem); } +static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) +{ + struct iommu_cmd cmd; + + build_inv_dte(&cmd, devid); + + return iommu_queue_command(iommu, &cmd); +} + /* * Command send function for invalidating a device table entry */ -static int iommu_flush_device(struct device *dev) +static int device_flush_dte(struct device *dev) { struct amd_iommu *iommu; - struct iommu_cmd cmd; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - build_inv_dte(&cmd, devid); - - return iommu_queue_command(iommu, &cmd); + return iommu_flush_dte(iommu, devid); } /* @@ -620,7 +626,7 @@ static void domain_flush_devices(struct protection_domain *domain) spin_lock_irqsave(&domain->lock, flags); list_for_each_entry(dev_data, &domain->dev_list, list) - iommu_flush_device(dev_data->dev); + device_flush_dte(dev_data->dev); spin_unlock_irqrestore(&domain->lock, flags); } @@ -1424,7 +1430,7 @@ static void do_attach(struct device *dev, struct protection_domain *domain) domain->dev_cnt += 1; /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } static void do_detach(struct device *dev) @@ -1447,7 +1453,7 @@ static void do_detach(struct device *dev) clear_dte_entry(devid); /* Flush the DTE entry */ - iommu_flush_device(dev); + device_flush_dte(dev); } /* @@ -1663,7 +1669,7 @@ static int device_change_notifier(struct notifier_block *nb, goto out; } - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); out: @@ -2448,7 +2454,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, if (!iommu) return; - iommu_flush_device(dev); + device_flush_dte(dev); iommu_completion_wait(iommu); } -- cgit v1.2.3-70-g09d2 From 7d0c5cc5be73f7ce26fdcca7b8ec2203f661eb93 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 7 Apr 2011 08:16:10 +0200 Subject: x86/amd-iommu: Flush all internal TLBs when IOMMUs are enabled The old code only flushed a DTE or a domain TLB before it was actually used by the IOMMU driver. While this is efficient and works when done right, it is more likely to introduce new bugs when the code changes (which has happened in the past). This patch adds code to flush all DTEs and all domain TLBs in each IOMMU right after it is enabled (at boot and after resume). This reduces the complexity of the driver and makes it less likely to introduce stale-TLB bugs in the future.
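Conceptually, the flush-at-enable code added below boils down to this (condensed from the diff that follows):

	/* flush every device table entry and every domain TLB once,
	 * right after the IOMMU has been enabled */
	for (devid = 0; devid <= 0xffff; ++devid)
		iommu_flush_dte(iommu, devid);

	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
		struct iommu_cmd cmd;
		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
				      dom_id, 1);
		iommu_queue_command(iommu, &cmd);
	}

	iommu_completion_wait(iommu);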
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 2 - arch/x86/kernel/amd_iommu.c | 75 +++++++++++++++------------------- arch/x86/kernel/amd_iommu_init.c | 11 ++++- 3 files changed, 43 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 916bc8111a0..1223c0fe03f 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -24,8 +24,6 @@ struct amd_iommu; extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); -extern void amd_iommu_flush_all_domains(void); -extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); extern int amd_iommu_init_devices(void); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 3557f223f40..bcf58ea55cf 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -539,6 +539,40 @@ static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +static void iommu_flush_dte_all(struct amd_iommu *iommu) +{ + u32 devid; + + for (devid = 0; devid <= 0xffff; ++devid) + iommu_flush_dte(iommu, devid); + + iommu_completion_wait(iommu); +} + +/* + * This function uses heavy locking and may disable irqs for some time. But + * this is no issue because it is only called during resume. + */ +static void iommu_flush_tlb_all(struct amd_iommu *iommu) +{ + u32 dom_id; + + for (dom_id = 0; dom_id <= 0xffff; ++dom_id) { + struct iommu_cmd cmd; + build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + dom_id, 1); + iommu_queue_command(iommu, &cmd); + } + + iommu_completion_wait(iommu); +} + +void iommu_flush_all_caches(struct amd_iommu *iommu) +{ + iommu_flush_dte_all(iommu); + iommu_flush_tlb_all(iommu); +} + /* * Command send function for invalidating a device table entry */ @@ -631,47 +665,6 @@ static void domain_flush_devices(struct protection_domain *domain) spin_unlock_irqrestore(&domain->lock, flags); } -static void iommu_flush_all_domain_devices(void) -{ - struct protection_domain *domain; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - domain_flush_devices(domain); - domain_flush_complete(domain); - } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - -void amd_iommu_flush_all_devices(void) -{ - iommu_flush_all_domain_devices(); -} - -/* - * This function uses heavy locking and may disable irqs for some time. But - * this is no issue because it is only called during resume. 
- */ -void amd_iommu_flush_all_domains(void) -{ - struct protection_domain *domain; - unsigned long flags; - - spin_lock_irqsave(&amd_iommu_pd_lock, flags); - - list_for_each_entry(domain, &amd_iommu_pd_list, list) { - spin_lock(&domain->lock); - domain_flush_tlb_pde(domain); - domain_flush_complete(domain); - spin_unlock(&domain->lock); - } - - spin_unlock_irqrestore(&amd_iommu_pd_lock, flags); -} - /**************************************************************************** * * The functions below are used the create the page table mappings for diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 246d727b65b..8848dda808e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -180,6 +180,12 @@ static u32 dev_table_size; /* size of the device table */ static u32 alias_table_size; /* size of the alias table */ static u32 rlookup_table_size; /* size if the rlookup table */ +/* + * This function flushes all internal caches of + * the IOMMU used by this driver. + */ +extern void iommu_flush_all_caches(struct amd_iommu *iommu); + static inline void update_last_devid(u16 devid) { if (devid > amd_iommu_last_bdf) @@ -1244,6 +1250,7 @@ static void enable_iommus(void) iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); + iommu_flush_all_caches(iommu); } } @@ -1274,8 +1281,8 @@ static void amd_iommu_resume(void) * we have to flush after the IOMMUs are enabled because a * disabled IOMMU will never execute the commands we send */ - amd_iommu_flush_all_devices(); - amd_iommu_flush_all_domains(); + for_each_iommu(iommu) + iommu_flush_all_caches(iommu); } static int amd_iommu_suspend(void) -- cgit v1.2.3-70-g09d2 From ce9c99af8d4b3b0b9463654fd252d8640d804dc3 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 8 Apr 2011 07:42:29 +0100 Subject: x86, cpu: Move AMD Elan Kconfig under "Processor family" Currently the option resides under X86_EXTENDED_PLATFORM due to historical nonstandard A20M# handling. However that is no longer the case and so Elan can be treated as part of the standard processor choice Kconfig option. Signed-off-by: Ian Campbell Link: http://lkml.kernel.org/r/1302245177.31620.47.camel@localhost.localdomain Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 11 ----------- arch/x86/Kconfig.cpu | 16 ++++++++++------ arch/x86/Makefile_32.cpu | 2 +- arch/x86/include/asm/module.h | 2 +- arch/x86/kernel/cpu/cpufreq/Kconfig | 4 ++-- 5 files changed, 14 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..f00a3f377cc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -365,17 +365,6 @@ config X86_UV # Following is an alphabetically sorted list of 32 bit extended platforms # Please maintain the alphabetic order if and when there are additions -config X86_ELAN - bool "AMD Elan" - depends on X86_32 - depends on X86_EXTENDED_PLATFORM - ---help--- - Select this for an AMD Elan processor. - - Do not use this option for K6/Athlon/Opteron processors! - - If unsure, choose "PC-compatible" instead. 
- config X86_INTEL_CE bool "CE4100 TV platform" depends on PCI diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index d161e939df6..6a7cfdf8ff6 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -1,6 +1,4 @@ # Put here option for CPU selection and depending optimization -if !X86_ELAN - choice prompt "Processor family" default M686 if X86_32 @@ -203,6 +201,14 @@ config MWINCHIP3D stores for this CPU, which can increase performance of some operations. +config MELAN + bool "AMD Elan" + depends on X86_32 + ---help--- + Select this for an AMD Elan processor. + + Do not use this option for K6/Athlon/Opteron processors! + config MGEODEGX1 bool "GeodeGX1" depends on X86_32 @@ -292,8 +298,6 @@ config X86_GENERIC This is really intended for distributors who need more generic optimizations. -endif - # # Define implied options from the CPU selection here config X86_INTERNODE_CACHE_SHIFT @@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 + default "4" if MELAN || M486 || M386 || MGEODEGX1 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX config X86_XADD @@ -358,7 +362,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index f2ee1abb1df..86cee7b749e 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march= $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) # AMD Elan support -cflags-$(CONFIG_X86_ELAN) += -march=i486 +cflags-$(CONFIG_MELAN) += -march=i486 # Geode GX1 support cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 67763c5d8b4..9eae7752ae9 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -35,7 +35,7 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_X86_ELAN +#elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE #define MODULE_PROC_FAMILY "CRUSOE " diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index 870e6cc6ad2..0ab9b22557c 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -43,7 +43,7 @@ config X86_ACPI_CPUFREQ config ELAN_CPUFREQ tristate "AMD Elan SC400 and SC410" select CPU_FREQ_TABLE - depends on X86_ELAN + depends on MELAN ---help--- This adds the CPUFreq driver for AMD Elan SC400 and SC410 processors. @@ -59,7 +59,7 @@ config ELAN_CPUFREQ config SC520_CPUFREQ tristate "AMD Elan SC520" select CPU_FREQ_TABLE - depends on X86_ELAN + depends on MELAN ---help--- This adds the CPUFreq driver for AMD Elan SC520 processor. 
-- cgit v1.2.3-70-g09d2 From 9844b4e5dd1932e175a23d84ce09702bdf4b5689 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 09:22:56 +0200 Subject: x86/amd-iommu: Select PCI_IOV with AMD IOMMU driver In order to support ATS in the AMD IOMMU driver this patch makes sure that the generic support for ATS is compiled in. Signed-off-by: Joerg Roedel --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..8cc29da2d68 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -690,6 +690,7 @@ config AMD_IOMMU bool "AMD IOMMU support" select SWIOTLB select PCI_MSI + select PCI_IOV depends on X86_64 && PCI && ACPI ---help--- With this option you can enable support for AMD IOMMU hardware in -- cgit v1.2.3-70-g09d2 From cb41ed85efa01e633388314c03a4f3004c6b783b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 11:00:53 +0200 Subject: x86/amd-iommu: Flush device IOTLB if ATS is enabled This patch implements a function to flush the IOTLB on devices supporting ATS and makes sure that this TLB is also flushed if necessary. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 3 +- arch/x86/kernel/amd_iommu.c | 74 +++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 878ae008eb0..f5d184e7d5b 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -113,7 +113,8 @@ /* command specific defines */ #define CMD_COMPL_WAIT 0x01 #define CMD_INV_DEV_ENTRY 0x02 -#define CMD_INV_IOMMU_PAGES 0x03 +#define CMD_INV_IOMMU_PAGES 0x03 +#define CMD_INV_IOTLB_PAGES 0x04 #define CMD_COMPL_WAIT_STORE_MASK 0x01 #define CMD_COMPL_WAIT_INT_MASK 0x02 diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index bcf58ea55cf..f3ce4338dad 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -18,6 +18,7 @@ */ #include +#include #include #include #include @@ -463,6 +464,37 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } +static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, + u64 address, size_t size) +{ + u64 pages; + int s; + + pages = iommu_num_pages(address, size, PAGE_SIZE); + s = 0; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + address &= PAGE_MASK; + + memset(cmd, 0, sizeof(*cmd)); + cmd->data[0] = devid; + cmd->data[0] |= (qdep & 0xff) << 24; + cmd->data[1] = devid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); + if (s) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. 
@@ -573,18 +605,48 @@ void iommu_flush_all_caches(struct amd_iommu *iommu) iommu_flush_tlb_all(iommu); } +/* + * Command send function for flushing on-device TLB + */ +static int device_flush_iotlb(struct device *dev, u64 address, size_t size) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct amd_iommu *iommu; + struct iommu_cmd cmd; + u16 devid; + int qdep; + + qdep = pci_ats_queue_depth(pdev); + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + + build_inv_iotlb_pages(&cmd, devid, qdep, address, size); + + return iommu_queue_command(iommu, &cmd); +} + /* * Command send function for invalidating a device table entry */ static int device_flush_dte(struct device *dev) { struct amd_iommu *iommu; + struct pci_dev *pdev; u16 devid; + int ret; + pdev = to_pci_dev(dev); devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; - return iommu_flush_dte(iommu, devid); + ret = iommu_flush_dte(iommu, devid); + if (ret) + return ret; + + if (pci_ats_enabled(pdev)) + ret = device_flush_iotlb(dev, 0, ~0UL); + + return ret; } /* @@ -595,6 +657,7 @@ static int device_flush_dte(struct device *dev) static void __domain_flush_pages(struct protection_domain *domain, u64 address, size_t size, int pde) { + struct iommu_dev_data *dev_data; struct iommu_cmd cmd; int ret = 0, i; @@ -611,6 +674,15 @@ static void __domain_flush_pages(struct protection_domain *domain, ret |= iommu_queue_command(amd_iommus[i], &cmd); } + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); + + if (!pci_ats_enabled(pdev)) + continue; + + ret |= device_flush_iotlb(dev_data->dev, address, size); + } + WARN_ON(ret); } -- cgit v1.2.3-70-g09d2 From 60f723b4117507c05c8b0b5c8b98ecc12a76878e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 12:50:24 +0200 Subject: x86/amd-iommu: Add flag to indicate IOTLB support This patch adds a flag to the AMD IOMMU driver to indicate that all IOMMUs present in the system support device IOTLBs. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu_init.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index f5d184e7d5b..cb811c96554 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -250,6 +250,8 @@ extern bool amd_iommu_dump; /* global flag if IOMMUs cache non-present entries */ extern bool amd_iommu_np_cache; +/* Only true if all IOMMUs support device IOTLBs */ +extern bool amd_iommu_iotlb_sup; /* * Make iterating over all IOMMUs easier diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8848dda808e..b6c634f3dc0 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -137,6 +137,7 @@ int amd_iommus_present; /* IOMMUs have a non-present cache? 
*/ bool amd_iommu_np_cache __read_mostly; +bool amd_iommu_iotlb_sup __read_mostly = true; /* * The ACPI table parsing functions set this variable on an error @@ -673,6 +674,9 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB))) + amd_iommu_iotlb_sup = false; + if (!is_rd890_iommu(iommu->dev)) return; -- cgit v1.2.3-70-g09d2 From fd7b5535e10ce820f030842da3f289f80ec0d4f3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 5 Apr 2011 15:31:08 +0200 Subject: x86/amd-iommu: Add ATS enable/disable code This patch adds the necessary code to the AMD IOMMU driver for enabling and disabling the ATS capability on a device and to setup the IOMMU data structures correctly. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 2 ++ arch/x86/kernel/amd_iommu.c | 34 ++++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index cb811c96554..7434377b2ab 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -216,6 +216,8 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) +#define DTE_FLAG_IOTLB 0x01 + #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f3ce4338dad..e4791f66aa3 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1452,17 +1452,22 @@ static bool dma_ops_domain(struct protection_domain *domain) return domain->flags & PD_DMA_OPS_MASK; } -static void set_dte_entry(u16 devid, struct protection_domain *domain) +static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { u64 pte_root = virt_to_phys(domain->pt_root); + u32 flags = 0; pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + if (ats) + flags |= DTE_FLAG_IOTLB; + + amd_iommu_dev_table[devid].data[3] |= flags; + amd_iommu_dev_table[devid].data[2] = domain->id; + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); } static void clear_dte_entry(u16 devid) @@ -1479,16 +1484,22 @@ static void do_attach(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; + bool ats = false; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); + + if (amd_iommu_iotlb_sup) + ats = pci_ats_enabled(pdev); /* Update data structures */ dev_data->domain = domain; list_add(&dev_data->list, &domain->dev_list); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, ats); /* Do reference counting */ domain->dev_iommu[iommu->index] += 1; @@ -1502,11 +1513,13 @@ static void do_detach(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; + struct pci_dev *pdev; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; 
dev_data = get_dev_data(dev); + pdev = to_pci_dev(dev); /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1581,9 +1594,13 @@ out_unlock: static int attach_device(struct device *dev, struct protection_domain *domain) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; int ret; + if (amd_iommu_iotlb_sup) + pci_enable_ats(pdev, PAGE_SHIFT); + write_lock_irqsave(&amd_iommu_devtable_lock, flags); ret = __attach_device(dev, domain); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); @@ -1640,12 +1657,16 @@ static void __detach_device(struct device *dev) */ static void detach_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); unsigned long flags; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev)) + pci_disable_ats(pdev); } /* @@ -1795,8 +1816,9 @@ static void update_device_table(struct protection_domain *domain) struct iommu_dev_data *dev_data; list_for_each_entry(dev_data, &domain->dev_list, list) { + struct pci_dev *pdev = to_pci_dev(dev_data->dev); u16 devid = get_device_id(dev_data->dev); - set_dte_entry(devid, domain); + set_dte_entry(devid, domain, pci_ats_enabled(pdev)); } } -- cgit v1.2.3-70-g09d2 From d99ddec3eee0be8a43b2c1ff624b9dfaaa26b959 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 11 Apr 2011 11:03:18 +0200 Subject: x86/amd-iommu: Add extended feature detection This patch adds detection of the extended features of an AMD IOMMU. The available features are printed to dmesg on boot. Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 10 +++++++++- arch/x86/include/asm/amd_iommu_types.h | 17 +++++++++++++++++ arch/x86/kernel/amd_iommu_init.c | 24 ++++++++++++++++++++++-- 3 files changed, 48 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index 1223c0fe03f..a4ae6c3875e 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -19,7 +19,7 @@ #ifndef _ASM_X86_AMD_IOMMU_PROTO_H #define _ASM_X86_AMD_IOMMU_PROTO_H -struct amd_iommu; +#include extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); @@ -42,4 +42,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev) (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); } +static inline bool iommu_feature(struct amd_iommu *iommu, u64 f) +{ + if (!(iommu->cap & (1 << IOMMU_CAP_EFR))) + return false; + + return !!(iommu->features & f); +} + #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 878ae008eb0..5c24e465234 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -68,12 +68,25 @@ #define MMIO_CONTROL_OFFSET 0x0018 #define MMIO_EXCL_BASE_OFFSET 0x0020 #define MMIO_EXCL_LIMIT_OFFSET 0x0028 +#define MMIO_EXT_FEATURES 0x0030 #define MMIO_CMD_HEAD_OFFSET 0x2000 #define MMIO_CMD_TAIL_OFFSET 0x2008 #define MMIO_EVT_HEAD_OFFSET 0x2010 #define MMIO_EVT_TAIL_OFFSET 0x2018 #define MMIO_STATUS_OFFSET 0x2020 + +/* Extended Feature Bits */ +#define FEATURE_PREFETCH (1ULL<<0) +#define FEATURE_PPR (1ULL<<1) +#define FEATURE_X2APIC (1ULL<<2) +#define FEATURE_NX (1ULL<<3) +#define FEATURE_GT (1ULL<<4) +#define FEATURE_IA (1ULL<<6) +#define FEATURE_GA (1ULL<<7) +#define FEATURE_HE (1ULL<<8) +#define 
FEATURE_PC (1ULL<<9) + /* MMIO status bits */ #define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 @@ -227,6 +240,7 @@ /* IOMMU capabilities */ #define IOMMU_CAP_IOTLB 24 #define IOMMU_CAP_NPCACHE 26 +#define IOMMU_CAP_EFR 27 #define MAX_DOMAIN_ID 65536 @@ -371,6 +385,9 @@ struct amd_iommu { /* flags read from acpi table */ u8 acpi_flags; + /* Extended features */ + u64 features; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8848dda808e..047905dc3e1 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -299,9 +299,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", + static const char * const feat_str[] = { + "PreF", "PPR", "X2APIC", "NX", "GT", "[5]", + "IA", "GA", "HE", "PC", NULL + }; + int i; + + printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx", dev_name(&iommu->dev->dev), iommu->cap_ptr); + if (iommu->cap & (1 << IOMMU_CAP_EFR)) { + printk(KERN_CONT " extended features: "); + for (i = 0; feat_str[i]; ++i) + if (iommu_feature(iommu, (1ULL << i))) + printk(KERN_CONT " %s", feat_str[i]); + } + printk(KERN_CONT "\n"); + iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } @@ -657,7 +671,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; - u32 range, misc; + u32 range, misc, low, high; int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, @@ -673,6 +687,12 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + /* read extended feature bits */ + low = readl(iommu->mmio_base + MMIO_EXT_FEATURES); + high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4); + + iommu->features = ((u64)high << 32) | low; + if (!is_rd890_iommu(iommu->dev)) return; -- cgit v1.2.3-70-g09d2 From 58fc7f1419560efa9c426b829c195050e0147d7f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 11 Apr 2011 11:13:24 +0200 Subject: x86/amd-iommu: Add support for invalidate_all command This patch adds support for the invalidate_all command present in new versions of the AMD IOMMU. 
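
As a rough illustration of the dispatch pattern this enables -- a minimal sketch, not the kernel code; the struct layout and helper names are hypothetical stand-ins, though the FEATURE_IA bit position matches the header above:

#include <stdio.h>

#define FEATURE_IA (1ULL << 6)	/* "invalidate all" extended feature bit */

struct iommu { unsigned long long features; };

/* Hypothetical stand-ins for the real flush primitives. */
static void queue_invalidate_all(struct iommu *i) { printf("INVALIDATE_ALL\n"); }
static void flush_dte_all(struct iommu *i)        { printf("flush all DTEs\n"); }
static void flush_tlb_all(struct iommu *i)        { printf("flush IO/TLB\n"); }

/* Prefer a single INVALIDATE_ALL command when the hardware advertises
 * the capability; otherwise fall back to separate DTE and TLB flushes. */
static void flush_all_caches(struct iommu *i)
{
	if (i->features & FEATURE_IA)
		queue_invalidate_all(i);
	else {
		flush_dte_all(i);
		flush_tlb_all(i);
	}
}
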
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_types.h | 1 + arch/x86/kernel/amd_iommu.c | 24 ++++++++++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index 5c24e465234..df62d26ed2a 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -127,6 +127,7 @@ #define CMD_COMPL_WAIT 0x01 #define CMD_INV_DEV_ENTRY 0x02 #define CMD_INV_IOMMU_PAGES 0x03 +#define CMD_INV_ALL 0x08 #define CMD_COMPL_WAIT_STORE_MASK 0x01 #define CMD_COMPL_WAIT_INT_MASK 0x02 diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index bcf58ea55cf..d6192bcf9f0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -463,6 +463,12 @@ static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; } +static void build_inv_all(struct iommu_cmd *cmd) +{ + memset(cmd, 0, sizeof(*cmd)); + CMD_SET_TYPE(cmd, CMD_INV_ALL); +} + /* * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. @@ -567,10 +573,24 @@ static void iommu_flush_tlb_all(struct amd_iommu *iommu) iommu_completion_wait(iommu); } +static void iommu_flush_all(struct amd_iommu *iommu) +{ + struct iommu_cmd cmd; + + build_inv_all(&cmd); + + iommu_queue_command(iommu, &cmd); + iommu_completion_wait(iommu); +} + void iommu_flush_all_caches(struct amd_iommu *iommu) { - iommu_flush_dte_all(iommu); - iommu_flush_tlb_all(iommu); + if (iommu_feature(iommu, FEATURE_IA)) { + iommu_flush_all(iommu); + } else { + iommu_flush_dte_all(iommu); + iommu_flush_tlb_all(iommu); + } } /* -- cgit v1.2.3-70-g09d2 From 184748cc50b2dceb8287f9fb657eda48ff8fcfe7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:39 +0200 Subject: sched: Provide scheduler_ipi() callback in response to smp_send_reschedule() For future rework of try_to_wake_up() we'd like to push part of that function onto the CPU the task is actually going to run on. In order to do so we need a generic callback from the existing scheduler IPI. This patch introduces such a generic callback: scheduler_ipi() and implements it as a NOP. BenH notes: PowerPC might use this IPI on offline CPUs under rare conditions! 
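
To make the shape of the conversion concrete, a hedged sketch follows (a generic handler form, not any one architecture's code; it assumes the usual kernel interrupt types from linux/interrupt.h):

/* The generic hook; deliberately a NOP in this patch so that later
 * scheduler rework can attach real work to it. */
static inline void scheduler_ipi(void) { }

/* Typical per-arch reschedule-IPI handler after conversion: call the
 * hook explicitly instead of relying on the interrupt return path. */
static irqreturn_t resched_interrupt(int irq, void *dev_id)
{
	scheduler_ipi();
	return IRQ_HANDLED;
}
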
Acked-by: Russell King Acked-by: Martin Schwidefsky Acked-by: Chris Metcalf Acked-by: Jesper Nilsson Acked-by: Benjamin Herrenschmidt Signed-off-by: Ralf Baechle Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.744338123@chello.nl --- arch/alpha/kernel/smp.c | 3 +-- arch/arm/kernel/smp.c | 5 +---- arch/blackfin/mach-common/smp.c | 3 +++ arch/cris/arch-v32/kernel/smp.c | 13 ++++++++----- arch/ia64/kernel/irq_ia64.c | 2 ++ arch/ia64/xen/irq_xen.c | 10 +++++++++- arch/m32r/kernel/smp.c | 4 +--- arch/mips/cavium-octeon/smp.c | 2 ++ arch/mips/kernel/smtc.c | 2 +- arch/mips/mti-malta/malta-int.c | 2 ++ arch/mips/pmc-sierra/yosemite/smp.c | 4 ++++ arch/mips/sgi-ip27/ip27-irq.c | 2 ++ arch/mips/sibyte/bcm1480/smp.c | 7 +++---- arch/mips/sibyte/sb1250/smp.c | 7 +++---- arch/mn10300/kernel/smp.c | 5 +---- arch/parisc/kernel/smp.c | 5 +---- arch/powerpc/kernel/smp.c | 4 ++-- arch/s390/kernel/smp.c | 6 +++--- arch/sh/kernel/smp.c | 2 ++ arch/sparc/kernel/smp_32.c | 4 +++- arch/sparc/kernel/smp_64.c | 1 + arch/tile/kernel/smp.c | 6 +----- arch/um/kernel/smp.c | 2 +- arch/x86/kernel/smp.c | 5 ++--- arch/x86/xen/smp.c | 5 ++--- include/linux/sched.h | 2 ++ 26 files changed, 63 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4..5a621c6d22a 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8fe05ad932e..7a561eb731e 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 6e17a265c4d..326bb86f4d2 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -164,6 +164,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d..66cc75657e2 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 
5b704740f16..782c3a357f2 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf..b279e142c63 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b299..fc10b39893d 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. - * Rescheduling is processed at the exit of interrupt - * operation. * * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21cc8d..76923eeb58b 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5..cedac463374 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061f0ea..7d93e6fbfa5 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -309,6 +309,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b34..2608752898c 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ 
void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577..b18b04e4857 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992e..d667875be56 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128..38e7f6bd792 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a219..83fb2791223 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef..828305f19cf 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. 
- */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14be4b..9f9c204bef6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -116,7 +116,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db83f9..63c7d9ff220 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. */ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b4511..6207561ea34 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb7085..f95690c167b 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -125,7 +125,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { - /* See sparc64 */ + /* + * XXX missing reschedule IPI, see scheduler_ipi() + */ } void smp_send_stop(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c2323..9478da7fdb3 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef8..c52224d5ed4 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. 
- */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9..eefb107d2d7 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228..013e7eba83b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed9..762b46ab14d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c027e92..758e27afcda 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2189,8 +2189,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +static inline void scheduler_ipi(void) { } extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.2.3-70-g09d2 From c55fa78b13b32d3f19e19cd0c8b9378fdc09e521 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 8 Nov 2010 14:13:35 -0500 Subject: xen/pci: Add xen_[find|register|unregister]_device_domain_owner functions. When the Xen PCI backend is told to enable or disable MSI/MSI-X functions, the initial domain performs these operations. The initial domain needs to know which domain (guest) is going to use the PCI device so when it makes the appropriate hypercall to retrieve the MSI/MSI-X vector it will also assign the PCI device to the appropriate domain (guest). This boils down to us needing a mechanism to find, set and unset the domain id that will be using the device. [v2: EXPORT_SYMBOL -> EXPORT_SYMBOL_GPL.]
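
A hedged usage sketch of the new API (the wrapper functions, their caller, and the error handling here are illustrative, not code from this patch):

/* Record the owning guest before handing it the device... */
static int assign_device_to_guest(struct pci_dev *pdev, domid_t guest_domid)
{
	int err = xen_register_device_domain_owner(pdev, guest_domid);
	if (err)	/* -EEXIST: some guest already owns this device */
		return err;
	/* ...hand the device over to the guest here... */
	return 0;
}

/* ...and drop the record on teardown. */
static void deassign_device(struct pci_dev *pdev)
{
	xen_unregister_device_domain_owner(pdev);
}
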
Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/pci.h | 16 +++++++++ arch/x86/pci/xen.c | 73 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h index aa862098916..4fbda9a3f33 100644 --- a/arch/x86/include/asm/xen/pci.h +++ b/arch/x86/include/asm/xen/pci.h @@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void) #endif #if defined(CONFIG_XEN_DOM0) void __init xen_setup_pirqs(void); +int xen_find_device_domain_owner(struct pci_dev *dev); +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain); +int xen_unregister_device_domain_owner(struct pci_dev *dev); #else static inline void __init xen_setup_pirqs(void) { } +static inline int xen_find_device_domain_owner(struct pci_dev *dev) +{ + return -1; +} +static inline int xen_register_device_domain_owner(struct pci_dev *dev, + uint16_t domain) +{ + return -1; +} +static inline int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + return -1; +} #endif #if defined(CONFIG_PCI_MSI) diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index e37b407a0ee..6075f2d6533 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -461,3 +461,76 @@ void __init xen_setup_pirqs(void) } } #endif + +struct xen_device_domain_owner { + domid_t domain; + struct pci_dev *dev; + struct list_head list; +}; + +static DEFINE_SPINLOCK(dev_domain_list_spinlock); +static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list); + +static struct xen_device_domain_owner *find_device(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + list_for_each_entry(owner, &dev_domain_list, list) { + if (owner->dev == dev) + return owner; + } + return NULL; +} + +int xen_find_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + int domain = -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (owner) + domain = owner->domain; + spin_unlock(&dev_domain_list_spinlock); + return domain; +} +EXPORT_SYMBOL_GPL(xen_find_device_domain_owner); + +int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain) +{ + struct xen_device_domain_owner *owner; + + owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL); + if (!owner) + return -ENODEV; + + spin_lock(&dev_domain_list_spinlock); + if (find_device(dev)) { + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return -EEXIST; + } + owner->domain = domain; + owner->dev = dev; + list_add_tail(&owner->list, &dev_domain_list); + spin_unlock(&dev_domain_list_spinlock); + return 0; +} +EXPORT_SYMBOL_GPL(xen_register_device_domain_owner); + +int xen_unregister_device_domain_owner(struct pci_dev *dev) +{ + struct xen_device_domain_owner *owner; + + spin_lock(&dev_domain_list_spinlock); + owner = find_device(dev); + if (!owner) { + spin_unlock(&dev_domain_list_spinlock); + return -ENODEV; + } + list_del(&owner->list); + spin_unlock(&dev_domain_list_spinlock); + kfree(owner); + return 0; +} +EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); -- cgit v1.2.3-70-g09d2 From beafbdc1df02877612dc9039c1de0639921fddec Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 14 Apr 2011 11:17:36 -0400 Subject: xen/irq: Check if the PCI device is owned by a domain different than DOMID_SELF. We check if there is a domain owner for the PCI device. In case of failure (meaning no domain has registered for this device) we make DOMID_SELF the owner. 
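
The lookup-with-fallback boils down to the fragment below (a sketch of the logic this patch adds to xen_initdom_setup_msi_irqs; surrounding code elided):

int ret;
domid_t domid;

ret = xen_find_device_domain_owner(dev);
/* N.B. domid_t is only 16 bits, so -ENODEV cast to it becomes 0xFFED;
 * test the signed return value before narrowing. */
domid = (ret < 0) ? DOMID_SELF : (domid_t)ret;
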
Signed-off-by: Konrad Rzeszutek Wilk [v2: deal with rebasing on v2.6.37-1] [v3: deal with rebasing on stable/irq.cleanup] [v4: deal with rebasing on stable/irq.ween_of_nr_irqs] [v5: deal with rebasing on v2.6.39-rc3] Signed-off-by: Jeremy Fitzhardinge Acked-by: Xiantao Zhang --- arch/x86/pci/xen.c | 21 ++++++++++++++++----- drivers/xen/events.c | 12 ++++++++---- include/xen/events.h | 3 ++- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 6075f2d6533..393981feb12 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -108,7 +108,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) } irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, (type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi"); + "msi-x" : "msi", + DOMID_SELF); if (irq < 0) goto error; dev_dbg(&dev->dev, @@ -148,7 +149,8 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, (type == PCI_CAP_ID_MSIX) ? "pcifront-msi-x" : - "pcifront-msi"); + "pcifront-msi", + DOMID_SELF); if (irq < 0) goto free; i++; @@ -190,9 +192,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) list_for_each_entry(msidesc, &dev->msi_list, list) { struct physdev_map_pirq map_irq; + domid_t domid; + + domid = ret = xen_find_device_domain_owner(dev); + /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED, + * hence check ret value for < 0. */ + if (ret < 0) + domid = DOMID_SELF; memset(&map_irq, 0, sizeof(map_irq)); - map_irq.domid = DOMID_SELF; + map_irq.domid = domid; map_irq.type = MAP_PIRQ_TYPE_MSI; map_irq.index = -1; map_irq.pirq = -1; @@ -215,14 +224,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (ret) { - dev_warn(&dev->dev, "xen map irq failed %d\n", ret); + dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n", + ret, domid); goto out; } ret = xen_bind_pirq_msi_to_irq(dev, msidesc, map_irq.pirq, map_irq.index, (type == PCI_CAP_ID_MSIX) ? - "msi-x" : "msi"); + "msi-x" : "msi", + domid); if (ret < 0) goto out; } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 42d6c930cc8..ac0e2282635 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -101,6 +101,7 @@ struct irq_info unsigned short gsi; unsigned char vector; unsigned char flags; + uint16_t domid; } pirq; } u; }; @@ -184,6 +185,7 @@ static void xen_irq_info_pirq_init(unsigned irq, unsigned short pirq, unsigned short gsi, unsigned short vector, + uint16_t domid, unsigned char flags) { struct irq_info *info = info_for_irq(irq); @@ -193,6 +195,7 @@ static void xen_irq_info_pirq_init(unsigned irq, info->u.pirq.pirq = pirq; info->u.pirq.gsi = gsi; info->u.pirq.vector = vector; + info->u.pirq.domid = domid; info->u.pirq.flags = flags; } @@ -655,7 +658,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, goto out; } - xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, + xen_irq_info_pirq_init(irq, 0, pirq, gsi, irq_op.vector, DOMID_SELF, shareable ? 
PIRQ_SHAREABLE : 0); out: @@ -680,7 +683,8 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name) + int pirq, int vector, const char *name, + domid_t domid) { int irq, ret; @@ -693,7 +697,7 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_level_irq, name); - xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, 0); + xen_irq_info_pirq_init(irq, 0, pirq, 0, vector, domid, 0); ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -722,7 +726,7 @@ int xen_destroy_irq(int irq) if (xen_initial_domain()) { unmap_irq.pirq = info->u.pirq.pirq; - unmap_irq.domid = DOMID_SELF; + unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); if (rc) { printk(KERN_WARNING "unmap irq failed %d\n", rc); diff --git a/include/xen/events.h b/include/xen/events.h index f1b87ad48ac..9aecc0b5a0e 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -85,7 +85,8 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc); /* Bind an PSI pirq to an irq. */ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, int vector, const char *name); + int pirq, int vector, const char *name, + domid_t domid); #endif /* De-allocates the above mentioned physical interrupt. */ -- cgit v1.2.3-70-g09d2 From cf8d91633ddef9e816ccbf3da833c79ce508988d Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 28 Feb 2011 17:58:48 -0500 Subject: xen/p2m/m2p/gnttab: Support GNTMAP_host_map in the M2P override. We only supported the M2P (and P2M) override for GNTMAP_contains_pte type mappings, meaning that grant operations would "contain the machine address of the PTE to update". If the flag is unset, then the grant operation "contains a host virtual address". The latter case means that the Hypervisor takes care of updating our page table (specifically the PTE entry) with the guest's MFN. As such we should not try to do anything with the PTE. Previous to this patch we would try to clear the PTE, which resulted in the Xen hypervisor being upset with us: (XEN) mm.c:1066:d0 Attempt to implicitly unmap a granted PTE c0100000ccc59067 (XEN) domain_crash called from mm.c:1067 (XEN) Domain 0 (vcpu#0) crashed on cpu#3: (XEN) ----[ Xen-4.0-110228 x86_64 debug=y Not tainted ]---- and crashing us. This patch allows us to inhibit the PTE clearing in the PV guest if GNTMAP_contains_pte is not set. On the m2p_remove_override path we provide the same parameter. Sadly, in the grant-table driver we do not have a mechanism to tell m2p_remove_override whether to clear the PTE or not. Since the grant-table driver is used by user-space, we can safely assume that it operates only on PTEs. Hence the implementation returns -EOPNOTSUPP for !GNTMAP_contains_pte mappings. In the future we can implement support for this. It will require some extra accounting structure to keep track of the page[i] and the flag.
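
From a caller's point of view the new parameter works as below (sketch only; the loop over map operations and the error paths are elided, and "map" stands in for a struct gnttab_map_grant_ref):

/* Only GNTMAP_contains_pte mappings own the PTE, so only those may ask
 * the override layer to zap it. */
bool clear_pte = !!(map->flags & GNTMAP_contains_pte);

ret = m2p_add_override(mfn, page, clear_pte);
if (ret)
	return ret;

/* ...and symmetrically on teardown: */
ret = m2p_remove_override(page, clear_pte);
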
[v1: Added documentation details, made it return -EOPNOTSUPP instead of trying to do a half-way implementation] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/xen/page.h | 5 +++-- arch/x86/xen/p2m.c | 10 ++++------ drivers/xen/grant-table.c | 31 ++++++++++++++++++++++++------- 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c61934fbf22..64a619d47d3 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern unsigned long set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e); -extern int m2p_add_override(unsigned long mfn, struct page *page); -extern int m2p_remove_override(struct page *page); +extern int m2p_add_override(unsigned long mfn, struct page *page, + bool clear_pte); +extern int m2p_remove_override(struct page *page, bool clear_pte); extern struct page *m2p_find_override(unsigned long mfn); extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141eb0de8b0..2d2b32af3a1 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -650,7 +650,7 @@ static unsigned long mfn_hash(unsigned long mfn) } /* Add an MFN override for a particular page */ -int m2p_add_override(unsigned long mfn, struct page *page) +int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) { unsigned long flags; unsigned long pfn; @@ -662,7 +662,6 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (!PageHighMem(page)) { address = (unsigned long)__va(pfn << PAGE_SHIFT); ptep = lookup_address(address, &level); - if (WARN(ptep == NULL || level != PG_LEVEL_4K, "m2p_add_override: pfn %lx not mapped", pfn)) return -EINVAL; @@ -674,10 +673,9 @@ int m2p_add_override(unsigned long mfn, struct page *page) if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) return -ENOMEM; - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); - spin_lock_irqsave(&m2p_override_lock, flags); list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); @@ -685,7 +683,7 @@ int m2p_add_override(unsigned long mfn, struct page *page) return 0; } -int m2p_remove_override(struct page *page) +int m2p_remove_override(struct page *page, bool clear_pte) { unsigned long flags; unsigned long mfn; @@ -713,7 +711,7 @@ int m2p_remove_override(struct page *page) spin_unlock_irqrestore(&m2p_override_lock, flags); set_phys_to_machine(pfn, page->index); - if (!PageHighMem(page)) + if (clear_pte && !PageHighMem(page)) set_pte_at(&init_mm, address, ptep, pfn_pte(pfn, PAGE_KERNEL)); /* No tlb flush necessary because the caller already diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3745a318def..fd725cde6ad 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -466,13 +466,30 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, if (map_ops[i].status) continue; - /* m2p override only supported for GNTMAP_contains_pte mappings */ - if (!(map_ops[i].flags & GNTMAP_contains_pte)) - continue; - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + + if (map_ops[i].flags & GNTMAP_contains_pte) { + pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = 
pte_mfn(*pte); - ret = m2p_add_override(mfn, pages[i]); + mfn = pte_mfn(*pte); + } else { + /* If you really wanted to do this: + * mfn = PFN_DOWN(map_ops[i].dev_bus_addr); + * + * The reason we do not implement it is b/c on the + * unmap path (gnttab_unmap_refs) we have no means of + * checking whether the page is !GNTMAP_contains_pte. + * + * That is without some extra data-structure to carry + * the struct page, bool clear_pte, and list_head next + * tuples and deal with allocation/delallocation, etc. + * + * The users of this API set the GNTMAP_contains_pte + * flag so lets just return not supported until it + * becomes neccessary to implement. + */ + return -EOPNOTSUPP; + } + ret = m2p_add_override(mfn, pages[i], + map_ops[i].flags & GNTMAP_contains_pte); if (ret) return ret; } @@ -494,7 +511,7 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, return ret; for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i]); + ret = m2p_remove_override(pages[i], true /* clear the PTE */); if (ret) return ret; } -- cgit v1.2.3-70-g09d2 From c387aa3a1a910ce00b86f3a85082d24f144db256 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 18 Apr 2011 15:45:43 +0200 Subject: x86, gart: Don't enforce GART aperture lower-bound by alignment This patch changes the allocation of the GART aperture to enforce only natural alignment instead of aligning it on 512MB. This big alignment was used to force the GART aperture to be over 512MB. This is enforced by using 512MB as the lower-bound address in the allocation range. [ hpa: The actual number 512 MiB needs to be revisited, too. ] Cc: Yinghai Lu Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1303134346-5805-2-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 73fb469908c..3d2661ca654 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -30,6 +30,22 @@ #include #include +/* + * Using 512M as goal, in case kexec will load kernel_big + * that will do the on-position decompress, and could overlap with + * with the gart aperture that is used. + * Sequence: + * kernel_small + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kernel_small (gart area become e820_reserved) + * ==> kexec (with kdump trigger path or gart still enabled) + * ==> kerne_big (uncompressed size will be big than 64M or 128M) + * So don't use 512M below as gart iommu, leave the space for kernel + * code for safe. + */ +#define GART_MIN_ADDR (512ULL << 20) +#define GART_MAX_ADDR (1ULL << 32) + int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; int gart_iommu_aperture_allowed __initdata; @@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void) * memory. Unfortunately we cannot move it up because that would * make the IOMMU useless. */ - /* - * using 512M as goal, in case kexec will load kernel_big - * that will do the on position decompress, and could overlap with - * that position with gart that is used. 
- * sequende: - * kernel_small - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kernel_small(gart area become e820_reserved) - * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) - * ==> kerne_big (uncompressed size will be big than 64M or 128M) - * so don't use 512M below as gart iommu, leave the space for kernel - * code for safe - */ - addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); - if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { + addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, + aper_size, aper_size); + if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); -- cgit v1.2.3-70-g09d2 From b1e7734f024c9ce4393016a97c8d821e1f18d9b4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Apr 2011 15:18:02 -0700 Subject: x86, percpu: Use ASM_NOP4 instead of hardcoding P6_NOP4 For use in assembly constants, use the ASM_NOP* defines. Signed-off-by: H. Peter Anvin Cc: Christoph Lameter Cc: Tejun Heo Link: http://lkml.kernel.org/r/1303166160-10315-2-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/percpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index d475b4398d8..751e7f3f705 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -517,7 +517,7 @@ do { \ typeof(o2) __o2 = o2; \ typeof(o2) __n2 = n2; \ typeof(o2) __dummy; \ - alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ + alternative_io("call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP4, \ "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ -- cgit v1.2.3-70-g09d2 From dc326fca2b640fc41aed7c015d0f456935a66255 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Apr 2011 15:19:51 -0700 Subject: x86, cpu: Clean up and unify the NOP selection infrastructure Clean up and unify the NOP selection infrastructure: - Make the atomic 5-byte NOP a part of the selection system. - Pick NOPs once during early boot and then be done with it. Signed-off-by: H. 
Peter Anvin Cc: Tejun Heo Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Link: http://lkml.kernel.org/r/1303166160-10315-3-git-send-email-hpa@linux.intel.com --- arch/x86/include/asm/alternative.h | 8 -- arch/x86/include/asm/nops.h | 146 ++++++++++++++++------------- arch/x86/kernel/alternative.c | 182 ++++++++++++++++++++----------------- arch/x86/kernel/ftrace.c | 4 +- arch/x86/kernel/jump_label.c | 5 +- arch/x86/kernel/setup.c | 6 +- 6 files changed, 190 insertions(+), 161 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 13009d1af99..7da168225a9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -191,12 +191,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) -#define IDEAL_NOP_SIZE_5 5 -extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; -extern void arch_init_ideal_nop5(void); -#else -static inline void arch_init_ideal_nop5(void) {} -#endif - #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index af788496020..405b4032a60 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -1,7 +1,13 @@ #ifndef _ASM_X86_NOPS_H #define _ASM_X86_NOPS_H -/* Define nops for use with alternative() */ +/* + * Define nops for use with alternative() and for tracing. + * + * *_NOP5_ATOMIC must be a single instruction. + */ + +#define NOP_DS_PREFIX 0x3e /* generic versions from gas 1: nop @@ -13,14 +19,15 @@ 6: leal 0x00000000(%esi),%esi 7: leal 0x00000000(,%esi,1),%esi */ -#define GENERIC_NOP1 ".byte 0x90\n" -#define GENERIC_NOP2 ".byte 0x89,0xf6\n" -#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" -#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" -#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 -#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" -#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 +#define GENERIC_NOP1 0x90 +#define GENERIC_NOP2 0x89,0xf6 +#define GENERIC_NOP3 0x8d,0x76,0x00 +#define GENERIC_NOP4 0x8d,0x74,0x26,0x00 +#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4 +#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00 +#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00 +#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7 +#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4 /* Opteron 64bit nops 1: nop @@ -29,13 +36,14 @@ 4: osp osp osp nop */ #define K8_NOP1 GENERIC_NOP1 -#define K8_NOP2 ".byte 0x66,0x90\n" -#define K8_NOP3 ".byte 0x66,0x66,0x90\n" -#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" -#define K8_NOP5 K8_NOP3 K8_NOP2 -#define K8_NOP6 K8_NOP3 K8_NOP3 -#define K8_NOP7 K8_NOP4 K8_NOP3 -#define K8_NOP8 K8_NOP4 K8_NOP4 +#define K8_NOP2 0x66,K8_NOP1 +#define K8_NOP3 0x66,K8_NOP2 +#define K8_NOP4 0x66,K8_NOP3 +#define K8_NOP5 K8_NOP3,K8_NOP2 +#define K8_NOP6 K8_NOP3,K8_NOP3 +#define K8_NOP7 K8_NOP4,K8_NOP3 +#define K8_NOP8 K8_NOP4,K8_NOP4 +#define K8_NOP5_ATOMIC 0x66,K8_NOP4 /* K7 nops uses eax dependencies (arbitrary choice) @@ -47,13 +55,14 @@ 7: leal 0x00000000(,%eax,1),%eax */ #define K7_NOP1 GENERIC_NOP1 -#define K7_NOP2 ".byte 0x8b,0xc0\n" -#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" -#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" -#define K7_NOP5 K7_NOP4 ASM_NOP1 -#define K7_NOP6 
".byte 0x8d,0x80,0,0,0,0\n" -#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" -#define K7_NOP8 K7_NOP7 ASM_NOP1 +#define K7_NOP2 0x8b,0xc0 +#define K7_NOP3 0x8d,0x04,0x20 +#define K7_NOP4 0x8d,0x44,0x20,0x00 +#define K7_NOP5 K7_NOP4,K7_NOP1 +#define K7_NOP6 0x8d,0x80,0,0,0,0 +#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0 +#define K7_NOP8 K7_NOP7,K7_NOP1 +#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4 /* P6 nops uses eax dependencies (Intel-recommended choice) @@ -69,52 +78,65 @@ There is kernel code that depends on this. */ #define P6_NOP1 GENERIC_NOP1 -#define P6_NOP2 ".byte 0x66,0x90\n" -#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" -#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" -#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" -#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" -#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" +#define P6_NOP2 0x66,0x90 +#define P6_NOP3 0x0f,0x1f,0x00 +#define P6_NOP4 0x0f,0x1f,0x40,0 +#define P6_NOP5 0x0f,0x1f,0x44,0x00,0 +#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0 +#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0 +#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0 +#define P6_NOP5_ATOMIC P6_NOP5 + +#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n" #if defined(CONFIG_MK7) -#define ASM_NOP1 K7_NOP1 -#define ASM_NOP2 K7_NOP2 -#define ASM_NOP3 K7_NOP3 -#define ASM_NOP4 K7_NOP4 -#define ASM_NOP5 K7_NOP5 -#define ASM_NOP6 K7_NOP6 -#define ASM_NOP7 K7_NOP7 -#define ASM_NOP8 K7_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC) #elif defined(CONFIG_X86_P6_NOP) -#define ASM_NOP1 P6_NOP1 -#define ASM_NOP2 P6_NOP2 -#define ASM_NOP3 P6_NOP3 -#define ASM_NOP4 P6_NOP4 -#define ASM_NOP5 P6_NOP5 -#define ASM_NOP6 P6_NOP6 -#define ASM_NOP7 P6_NOP7 -#define ASM_NOP8 P6_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC) #elif defined(CONFIG_X86_64) -#define ASM_NOP1 K8_NOP1 -#define ASM_NOP2 K8_NOP2 -#define ASM_NOP3 K8_NOP3 -#define ASM_NOP4 K8_NOP4 -#define ASM_NOP5 K8_NOP5 -#define ASM_NOP6 K8_NOP6 -#define ASM_NOP7 K8_NOP7 -#define ASM_NOP8 K8_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6) +#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC) #else -#define ASM_NOP1 GENERIC_NOP1 -#define ASM_NOP2 GENERIC_NOP2 -#define ASM_NOP3 GENERIC_NOP3 -#define ASM_NOP4 GENERIC_NOP4 -#define ASM_NOP5 GENERIC_NOP5 -#define ASM_NOP6 GENERIC_NOP6 -#define ASM_NOP7 GENERIC_NOP7 -#define ASM_NOP8 GENERIC_NOP8 +#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1) +#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2) +#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3) +#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4) +#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5) +#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6) 
+#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7) +#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8) +#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC) #endif #define ASM_NOP_MAX 8 +#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */ + +#ifndef __ASSEMBLY__ +extern const unsigned char * const *ideal_nops; +extern void arch_init_ideal_nops(void); +#endif #endif /* _ASM_X86_NOPS_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a234677e21..846f61eb89c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt); #define DPRINTK(fmt, args...) if (debug_alternative) \ printk(KERN_DEBUG fmt, args) +/* + * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes + * that correspond to that nop. Getting from one nop to the next, we + * add to the array the offset that is equal to the sum of all sizes of + * nops preceding the one we are after. + * + * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the + * nice symmetry of sizes of the previous nops. + */ #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) -/* Use inline assembly to define this because the nops are defined - as inline assembly strings in the include files and we cannot - get them easily into strings. */ -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " - GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 - GENERIC_NOP7 GENERIC_NOP8 - "\t.previous"); -extern const unsigned char intelnops[]; -static const unsigned char *const __initconst_or_module -intel_nops[ASM_NOP_MAX+1] = { +static const unsigned char intelnops[] = +{ + GENERIC_NOP1, + GENERIC_NOP2, + GENERIC_NOP3, + GENERIC_NOP4, + GENERIC_NOP5, + GENERIC_NOP6, + GENERIC_NOP7, + GENERIC_NOP8, + GENERIC_NOP5_ATOMIC +}; +static const unsigned char * const intel_nops[ASM_NOP_MAX+2] = +{ NULL, intelnops, intelnops + 1, @@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = { intelnops + 1 + 2 + 3 + 4 + 5, intelnops + 1 + 2 + 3 + 4 + 5 + 6, intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef K8_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " - K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 - K8_NOP7 K8_NOP8 - "\t.previous"); -extern const unsigned char k8nops[]; -static const unsigned char *const __initconst_or_module -k8_nops[ASM_NOP_MAX+1] = { +static const unsigned char k8nops[] = +{ + K8_NOP1, + K8_NOP2, + K8_NOP3, + K8_NOP4, + K8_NOP5, + K8_NOP6, + K8_NOP7, + K8_NOP8, + K8_NOP5_ATOMIC +}; +static const unsigned char * const k8_nops[ASM_NOP_MAX+2] = +{ NULL, k8nops, k8nops + 1, @@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = { k8nops + 1 + 2 + 3 + 4 + 5, k8nops + 1 + 2 + 3 + 4 + 5 + 6, k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #if defined(K7_NOP1) && !defined(CONFIG_X86_64) -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " - K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 - K7_NOP7 K7_NOP8 - "\t.previous"); -extern const unsigned char k7nops[]; -static const unsigned char *const __initconst_or_module -k7_nops[ASM_NOP_MAX+1] = { +static const unsigned char k7nops[] = +{ + K7_NOP1, + K7_NOP2, + K7_NOP3, + K7_NOP4, + K7_NOP5, + K7_NOP6, + K7_NOP7, + K7_NOP8, + K7_NOP5_ATOMIC +}; +static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = +{ NULL, k7nops, k7nops + 1, @@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = { k7nops + 1 + 2 + 3 + 4 + 5, k7nops + 1 + 2 + 3 + 4 + 
5 + 6, k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif #ifdef P6_NOP1 -asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " - P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 - P6_NOP7 P6_NOP8 - "\t.previous"); -extern const unsigned char p6nops[]; -static const unsigned char *const __initconst_or_module -p6_nops[ASM_NOP_MAX+1] = { +static const unsigned char __initconst_or_module p6nops[] = +{ + P6_NOP1, + P6_NOP2, + P6_NOP3, + P6_NOP4, + P6_NOP5, + P6_NOP6, + P6_NOP7, + P6_NOP8, + P6_NOP5_ATOMIC +}; +static const unsigned char * const p6_nops[ASM_NOP_MAX+2] = +{ NULL, p6nops, p6nops + 1, @@ -147,47 +184,53 @@ p6_nops[ASM_NOP_MAX+1] = { p6nops + 1 + 2 + 3 + 4 + 5, p6nops + 1 + 2 + 3 + 4 + 5 + 6, p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, + p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, }; #endif +/* Initialize these to a safe default */ #ifdef CONFIG_X86_64 +const unsigned char * const *ideal_nops = p6_nops; +#else +const unsigned char * const *ideal_nops = intel_nops; +#endif -extern char __vsyscall_0; -static const unsigned char *const *__init_or_module find_nop_table(void) +void __init arch_init_ideal_nops(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return k8_nops; -} - -#else /* CONFIG_X86_64 */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (boot_cpu_has(X86_FEATURE_NOPL)) { + ideal_nops = p6_nops; + } else { +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + ideal_nops = intel_nops; +#endif + } -static const unsigned char *const *__init_or_module find_nop_table(void) -{ - if (boot_cpu_has(X86_FEATURE_K8)) - return k8_nops; - else if (boot_cpu_has(X86_FEATURE_K7)) - return k7_nops; - else if (boot_cpu_has(X86_FEATURE_NOPL)) - return p6_nops; - else - return intel_nops; + default: +#ifdef CONFIG_X86_64 + ideal_nops = k8_nops; +#else + if (boot_cpu_has(X86_FEATURE_K8)) + ideal_nops = k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + ideal_nops = k7_nops; + else + ideal_nops = intel_nops; +#endif + } } -#endif /* CONFIG_X86_64 */ - /* Use this to add nops to a buffer, then text_poke the whole buffer. */ static void __init_or_module add_nops(void *insns, unsigned int len) { - const unsigned char *const *noptable = find_nop_table(); - while (len > 0) { unsigned int noplen = len; if (noplen > ASM_NOP_MAX) noplen = ASM_NOP_MAX; - memcpy(insns, noptable[noplen], noplen); + memcpy(insns, ideal_nops[noplen], noplen); insns += noplen; len -= noplen; } @@ -195,6 +238,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; +extern char __vsyscall_0; void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -678,29 +722,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) wrote_text = 0; __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); } - -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) - -#ifdef CONFIG_X86_64 -unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; -#else -unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; -#endif - -void __init arch_init_ideal_nop5(void) -{ - /* - * There is no good nop for all x86 archs. This selection - * algorithm should be unified with the one in find_nop_table(), - * but this should be good enough for now. 
- * - * For cases other than the ones below, use the safe (as in - * always functional) defaults above. - */ -#ifdef CONFIG_X86_64 - /* Don't use these on 32 bits due to broken virtualizers */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - memcpy(ideal_nop5, p6_nops[5], 5); -#endif -} -#endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index a93742a5746..0ba15a6cc57 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -260,9 +260,9 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) return mod_code_status; } -static unsigned char *ftrace_nop_replace(void) +static const unsigned char *ftrace_nop_replace(void) { - return ideal_nop5; + return ideal_nops[NOP_ATOMIC5]; } static int diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index 961b6b30ba9..3fee346ef54 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry, code.offset = entry->target - (entry->code + JUMP_LABEL_NOP_SIZE); } else - memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); + memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); get_online_cpus(); mutex_lock(&text_mutex); text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); @@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry, void arch_jump_label_text_poke_early(jump_label_t addr) { - text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); + text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], + JUMP_LABEL_NOP_SIZE); } #endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 5a0484a95ad..390a663894d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { - unsigned long flags; - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -1036,9 +1034,7 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - local_irq_save(flags); - arch_init_ideal_nop5(); - local_irq_restore(flags); + arch_init_ideal_nops(); } #ifdef CONFIG_X86_32 -- cgit v1.2.3-70-g09d2 From d8d9766c8c29f71c37bc4b74cc9fcf6a192c9bfd Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Apr 2011 15:31:57 -0700 Subject: x86, cpu: Change NOP selection for certain Intel CPUs Due to a decoder implementation quirk, some specific Intel CPUs actually perform better with the "k8_nops" than with the SDM-recommended NOPs. For runtime-selected NOPs, if we detect those specific CPUs then use the k8_nops instead of the ones we would normally use. Signed-off-by: H. Peter Anvin Cc: Tejun Heo Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Link: http://lkml.kernel.org/r/1303166160-10315-4-git-send-email-hpa@linux.intel.com --- arch/x86/kernel/alternative.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 846f61eb89c..c0501ea6b63 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -199,7 +199,19 @@ void __init arch_init_ideal_nops(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - if (boot_cpu_has(X86_FEATURE_NOPL)) { + /* + * Due to a decoder implementation quirk, some + * specific Intel CPUs actually perform better with + * the "k8_nops" than with the SDM-recommended NOPs. 
+ */ + if (boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model >= 0x0f && + boot_cpu_data.x86_model != 0x1c && + boot_cpu_data.x86_model != 0x26 && + boot_cpu_data.x86_model != 0x27 && + boot_cpu_data.x86_model < 0x30) { + ideal_nops = k8_nops; + } else if (boot_cpu_has(X86_FEATURE_NOPL)) { ideal_nops = p6_nops; } else { #ifdef CONFIG_X86_64 -- cgit v1.2.3-70-g09d2 From c8e5910edf8bbe2e5c6c35a4ef2a578cc7893b25 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Sat, 16 Apr 2011 02:27:55 +0200 Subject: perf, x86: Use ALTERNATIVE() to check for X86_FEATURE_PERFCTR_CORE Using ALTERNATIVE() when checking for X86_FEATURE_PERFCTR_CORE avoids an extra pointer chase and data cache hit. Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1302913676-14352-4-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index eed3673a865..224a84f7080 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,7 @@ #include #include #include +#include #if 0 #undef wrmsrl @@ -363,12 +364,18 @@ again: return new_raw_count; } -/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */ static inline int x86_pmu_addr_offset(int index) { - if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - return index << 1; - return index; + int offset; + + /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */ + alternative_io(ASM_NOP2, + "shll $1, %%eax", + X86_FEATURE_PERFCTR_CORE, + "=a" (offset), + "a" (index)); + + return offset; } static inline unsigned int x86_pmu_config_addr(int index) -- cgit v1.2.3-70-g09d2 From 2b398bd9f8f73be706b41adcbb240ce95793049a Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Thu, 14 Apr 2011 14:36:08 +0800 Subject: x86, apic: Print verbose error interrupt reason on apic=debug End users worry about the error interrupt printout we generate currently: pr_debug("APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); ... and would like to know the reason why error interrupts are generated. This patch prints out more detailed debug information. Another practical problem is that dynamic debug is not initialized yet when the APIC initializes, so the pr_debug() will not output the error interrupt debug information on bootup. In this patch, we use apic_printk(APIC_DEBUG, ...), so the apic=debug boot option will print verbose error interrupts during bootup.
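For reference, the new printout is driven entirely by the per-bit reason table: walk the low eight ESR bits and print a string per set bit. A stand-alone sketch of the same decode in plain C (the input value below is hypothetical, purely for illustration):

#include <stdio.h>

/* Reason strings indexed by APIC ESR bit, mirroring the table in the patch. */
static const char * const esr_reason[8] = {
    "Send CS error",            /* bit 0 */
    "Receive CS error",         /* bit 1 */
    "Send accept error",        /* bit 2 */
    "Receive accept error",     /* bit 3 */
    "Redirectable IPI",         /* bit 4 */
    "Send illegal vector",      /* bit 5 */
    "Received illegal vector",  /* bit 6 */
    "Illegal register address", /* bit 7 */
};

static void decode_esr(unsigned int v)
{
    unsigned int i;

    printf("APIC error: %02x", v & 0xff);
    for (i = 0; i < 8; i++)
        if (v & (1u << i))
            printf(" : %s", esr_reason[i]);
    printf("\n");
}

int main(void)
{
    decode_esr(0x44);   /* hypothetical ESR value: bits 2 and 6 set */
    return 0;
}

The kernel-side version in the diff below does the same walk, only through apic_printk() so it honors the apic=debug level.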
Signed-off-by: Youquan Song Cc: Joe Perches Cc: hpa@linux.intel.com Cc: suresh.b.siddha@intel.com Cc: yong.y.wang@linux.intel.com Cc: jbaron@redhat.com Cc: trenn@suse.de Cc: kent.liu@intel.com Cc: chaohong.guo@intel.com Link: http://lkml.kernel.org/r/1302762968-24380-2-git-send-email-youquan.song@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index fabf01eff77..ae147126b7b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1812,30 +1812,41 @@ void smp_spurious_interrupt(struct pt_regs *regs) */ void smp_error_interrupt(struct pt_regs *regs) { - u32 v, v1; + u32 v0, v1; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; exit_idle(); irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); + v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - /* - * Here is what the APIC error bits mean: - * 0: Send CS error - * 1: Receive CS error - * 2: Send accept error - * 3: Receive accept error - * 4: Reserved - * 5: Send illegal vector - * 6: Received illegal vector - * 7: Illegal register address - */ - pr_debug("APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", + smp_processor_id(), v0 , v1); + + v1 = v1 & 0xff; + while (v1) { + if (v1 & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v1 >>= 1; + }; + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + irq_exit(); } -- cgit v1.2.3-70-g09d2 From 7b70bd3441437b7bc04fc9d321e17c8ed0e8f958 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 18 Apr 2011 16:00:21 +0200 Subject: x86, MCE: Do not taint when handling correctable errors Correctable errors are considered something rather normal on modern hardware these days. Even more importantly, correctable errors mean exactly that - they've been corrected by the hardware - and there's no need to taint the kernel since execution hasn't been compromised so far. Also, drop tainting in the thermal throttling code for a similar reason: crossing a thermal threshold does not mean corruption. 
Signed-off-by: Borislav Petkov Acked-by: Tony Luck Acked-by: Nagananda Chumbalkar Cc: Prarit Bhargava Cc: Russ Anderson Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1303135222-17118-1-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 1 - arch/x86/kernel/cpu/mcheck/therm_throt.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3385ea26f68..68e230327d6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -590,7 +590,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - add_taint(TAINT_MACHINE_CHECK); } /* diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6f8c5e9da97..5846a797b97 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level) this_cpu, level == CORE_LEVEL ? "Core" : "Package", state->count); - - add_taint(TAINT_MACHINE_CHECK); return 1; } if (old_event) { @@ -393,7 +391,6 @@ static void unexpected_thermal_interrupt(void) { printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -- cgit v1.2.3-70-g09d2 From 8a91707d0a1a49193e23cb2d243632f2289feb24 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Wed, 20 Apr 2011 11:54:10 -0400 Subject: xen/p2m: Add EXPORT_SYMBOL_GPL to the M2P override functions. If the backends, which use these two functions, are compiled as a module we need these two functions to be exported. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 2d2b32af3a1..c851397e657 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -682,7 +682,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte) return 0; } - +EXPORT_SYMBOL_GPL(m2p_add_override); int m2p_remove_override(struct page *page, bool clear_pte) { unsigned long flags; @@ -719,6 +719,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) return 0; } +EXPORT_SYMBOL_GPL(m2p_remove_override); struct page *m2p_find_override(unsigned long mfn) { -- cgit v1.2.3-70-g09d2 From dffa4b2f62ff28c982144c7033001b1ece4d3532 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Apr 2011 12:23:49 +0200 Subject: x86, mce: Drop the default decoding notifier The default notifier doesn't make a lot of sense to call in the correctable errors case. Drop it and emit the mcelog decoding hint only in the uncorrectable errors case and when no notifier is registered. Also, limit issuing the "mcelog --ascii" message in the rare case when we dump unreported CEs before panicking. While at it, remove unused old x86_mce_decode_callback from the header. 
Signed-off-by: Borislav Petkov Signed-off-by: Prarit Bhargava Cc: Tony Luck Cc: Nagananda Chumbalkar Cc: Russ Anderson Link: http://lkml.kernel.org/r/20110420102349.GB1361@aftab Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 24 +++++++----------------- 2 files changed, 7 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index eb16e94ae04..021979a6e23 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif -extern void (*x86_mce_decode_callback)(struct mce *m); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 68e230327d6..ff1ae9b6464 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -105,20 +105,6 @@ static int cpu_missing; ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); -static int default_decode_mce(struct notifier_block *nb, unsigned long val, - void *data) -{ - pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); - pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); - - return NOTIFY_STOP; -} - -static struct notifier_block mce_dec_nb = { - .notifier_call = default_decode_mce, - .priority = -1, -}; - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -212,6 +198,8 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { + int ret = 0; + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); @@ -239,7 +227,11 @@ static void print_mce(struct mce *m) * Print out human-readable details about the MCE error, * (if the CPU has an implementation for that) */ - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret == NOTIFY_STOP) + return; + + pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -1721,8 +1713,6 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { - atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb); - mcheck_intel_therm_init(); return 0; -- cgit v1.2.3-70-g09d2 From 103b3934817a7c42fba6e1ef76ecb390a2837d40 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 21 Apr 2011 11:03:20 -0400 Subject: perf, x86: P4 PMU -- Use perf_sample_data_init helper Instead of opencoded assignments better to use perf_sample_data_init helper. 
Tested-by: Lin Ming Signed-off-by: Cyrill Gorcunov Signed-off-by: Don Zickus Cc: Cyrill Gorcunov Link: http://lkml.kernel.org/r/1303398203-2918-2-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 8ff882fdb1c..ae31e9698d9 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -912,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); -- cgit v1.2.3-70-g09d2 From fa7b69475a6c192853949ba496dd9c37b497b548 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Fri, 22 Apr 2011 10:08:52 -0700 Subject: perf events, x86, P4: Fix typo in comment Signed-off-by: Justin P. Mattock Acked-by: Cyrill Gorcunov Cc: trivial@kernel.org Link: http://lkml.kernel.org/r/1303492132-3004-1-git-send-email-justinmattock@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index ae31e9698d9..f4c1da2f935 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1187,7 +1187,7 @@ static __init int p4_pmu_init(void) { unsigned int low, high; - /* If we get stripped -- indexig fails */ + /* If we get stripped -- indexing fails */ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); -- cgit v1.2.3-70-g09d2 From 15d6aba24d88231415f4e7e091c0f1e60c3e6fd5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 24 Apr 2011 11:11:51 +0300 Subject: x86: Demacro CONFIG_PARAVIRT cpu accessors Recently, we had a build failure on !CONFIG_PARAVIRT due to a callback ->wbinvd() clashing with a macro wbinvd(). While we worked around the issue, avoid it in the future by changing the macro (and a few surrounding ones) to an inline function. Signed-off-by: Avi Kivity Cc: "H. 
Peter Anvin" Link: http://lkml.kernel.org/r/1303632711-21662-1-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/system.h | 85 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 12569e691ce..c2ff2a1d845 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -303,24 +303,81 @@ static inline void native_wbinvd(void) #ifdef CONFIG_PARAVIRT #include #else -#define read_cr0() (native_read_cr0()) -#define write_cr0(x) (native_write_cr0(x)) -#define read_cr2() (native_read_cr2()) -#define write_cr2(x) (native_write_cr2(x)) -#define read_cr3() (native_read_cr3()) -#define write_cr3(x) (native_write_cr3(x)) -#define read_cr4() (native_read_cr4()) -#define read_cr4_safe() (native_read_cr4_safe()) -#define write_cr4(x) (native_write_cr4(x)) -#define wbinvd() (native_wbinvd()) + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + #ifdef CONFIG_X86_64 -#define read_cr8() (native_read_cr8()) -#define write_cr8(x) (native_write_cr8(x)) -#define load_gs_index native_load_gs_index + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + #endif /* Clear the 'TS' bit */ -#define clts() (native_clts()) +static inline void clts(void) +{ + native_clts(); +} #endif/* CONFIG_PARAVIRT */ -- cgit v1.2.3-70-g09d2 From 94403f8863d0d1d2005291b2ef0719c2534aa303 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 Apr 2011 08:18:31 +0200 Subject: perf events: Add stalled cycles generic event - PERF_COUNT_HW_STALLED_CYCLES The new PERF_COUNT_HW_STALLED_CYCLES event tries to approximate cycles the CPU does nothing useful, because it is stalled on a cache-miss or some other condition. 
Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-fue11vymwqsoo5to72jxxjyl@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 +++ include/linux/perf_event.h | 1 + tools/perf/util/parse-events.c | 1 + tools/perf/util/python.c | 1 + 4 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9ae4a2aa739..efa2704c9df 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,6 +1413,9 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + /* Install the stalled-cycles event: 0xff: All reasons, 0xa2: Resource stalls */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0xffa2; + if (ebx & 0x40) { /* * Erratum AAJ80 detected, we work it around by using diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee9f1e78280..ac636dd20a0 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,6 +52,7 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES = 7, PERF_COUNT_HW_MAX, /* non-ABI */ }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 952b4ae3d95..1869e4c646d 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -38,6 +38,7 @@ static struct event_symbol event_symbols[] = { { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, + { CHW(STALLED_CYCLES), "stalled-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index f5e38451fdc..406f613ee61 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -798,6 +798,7 @@ static struct { { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, + { "COUNT_HW_STALLED_CYCLES", PERF_COUNT_HW_STALLED_CYCLES }, { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, -- cgit v1.2.3-70-g09d2 From 5c543e3c442d6382db127152c7096ca6a55283de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 12:02:04 +0200 Subject: perf events, x86: Mark constrant tables read mostly Various constraint tables were not marked read-mostly. 
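As a reminder of what the annotation buys: __read_mostly moves the tables into the read-mostly data section, so they stop sharing cache lines with frequently written data and stay clean in each CPU's cache. A toy sketch with illustrative names:

#include <linux/cache.h>

/* Written once at init, read in every hot path: keep it with other
 * read-mostly data so it cannot false-share a line with hot stores. */
static int example_threshold __read_mostly = 128;

/* Updated constantly: stays in regular .data. */
static unsigned long example_event_count;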
Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-wpqwwvmhxucy5e718wnamjiv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index efa2704c9df..067a48b13a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -36,7 +36,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -47,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -70,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -86,19 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_nehalem_extra_regs[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] = +static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -110,7 +110,7 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_snb_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -123,21 +123,21 @@ static struct event_constraint intel_snb_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_westmere_extra_regs[] = +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] = +static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), INTEL_EVENT_CONSTRAINT(0xbb, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct 
event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ -- cgit v1.2.3-70-g09d2 From 8a850cadca0e387c87a0911a61e99fd66aeb57ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 11:16:44 +0200 Subject: perf event, x86: Use better stalled cycles metric Use the UOPS_EXECUTED.*,c=1,i=1 event on Intel CPUs - it is a rather good indicator of CPU execution stalls, more sensitive and more inclusive than the 0xa2 resource stalls event (which does not count nearly as many stall types). Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn2dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 067a48b13a7..1ea94224f62 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,8 +1413,8 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - /* Install the stalled-cycles event: 0xff: All reasons, 0xa2: Resource stalls */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0xffa2; + /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0x1803fb1; if (ebx & 0x40) { /* -- cgit v1.2.3-70-g09d2 From c7a7b814c9dca9ee01b38e63b4a46de87156d3b6 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Thu, 28 Apr 2011 11:00:30 -0600 Subject: ioremap: Delay sanity check until after a successful mapping While tracking down the reason for an ioremap() failure I was distracted by the WARN_ONCE() in __ioremap_caller(). Performing a WARN_ONCE() sanity check before the mapping is successful seems pointless if the caller sends bad values. A case in point is when the BIOS provides erroneous screen_info values causing vesafb_probe() to request an outrageuous size. The WARN_ONCE is then wasted on bogosity. Move the warning to a point where the mapping has been successfully allocated. Addresses: http://bugs.launchpad.net/bugs/772042 Reviewed-by: Suresh Siddha Signed-off-by: Tim Gardner Link: http://lkml.kernel.org/r/4DB99D2E.9080106@canonical.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0369843511d..be1ef574ce9 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -90,13 +90,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, if (is_ISA_range(phys_addr, last_addr)) return (__force void __iomem *)phys_to_virt(phys_addr); - /* - * Check if the request spans more than any BAR in the iomem resource - * tree. - */ - WARN_ONCE(iomem_map_sanity_check(phys_addr, size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); - /* * Don't allow anybody to remap normal RAM that we're using.. */ @@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, ret_addr = (void __iomem *) (vaddr + offset); mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. 
+ */ + WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), + KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + return ret_addr; err_free_area: free_vm_area(area); -- cgit v1.2.3-70-g09d2 From 8f62242246351b5a4bc0c1f00c0c7003edea128a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 13:19:47 +0200 Subject: perf events: Add generic front-end and back-end stalled cycle event definitions Add two generic hardware events: front-end and back-end stalled cycles. These events measure conditions when the CPU is executing code but its capabilities are not fully utilized. Understanding such situations and analyzing them is an important sub-task of code optimization workflows. Both events limit performance: most front end stalls tend to be caused by branch misprediction or instruction fetch cachemisses, backend stalls can be caused by various resource shortages or inefficient instruction scheduling. Front-end stalls are the more important ones: code cannot run fast if the instruction stream is not being kept up. An over-utilized back-end can cause front-end stalls and thus has to be kept an eye on as well. The exact composition is very program logic and instruction mix dependent. We use the terms 'stall', 'front-end' and 'back-end' loosely and try to use the best available events from specific CPUs that approximate these concepts. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n000io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- include/linux/perf_event.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1ea94224f62..393085b87a2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1414,7 +1414,7 @@ static __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_nehalem_extra_regs; /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; if (ebx & 0x40) { /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ac636dd20a0..4e2d7ae7149 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,7 +52,8 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, - PERF_COUNT_HW_STALLED_CYCLES = 7, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, PERF_COUNT_HW_MAX, /* non-ABI */ }; -- cgit v1.2.3-70-g09d2 From 91fc4cc00099986bc1ba50e1f421c3548cffae42 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:17:19 +0200 Subject: perf, x86: Add new stalled cycles events for Intel and AMD CPUs Extend the Intel and AMD event definitions with generic front-end and back-end stall events. ( These are only approximations - suggestions are welcome for better events. 
) Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n001io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 14 ++++++++------ arch/x86/kernel/cpu/perf_event_intel.c | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index cf4e369cea6..fe29c1d2219 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 393085b87a2..7983b9a9533 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,7 +1413,9 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; if (ebx & 0x40) { -- cgit v1.2.3-70-g09d2 From 301120396b766ae4480e52ece220516a1707822b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Apr 2011 09:14:54 +0200 Subject: perf events, x86: Add Westmere stalled-cycles-frontend/backend events Extend the Intel Westmere PMU driver with definitions for generic front-end and back-end stall events. ( These are only approximations. 
) Reported-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n008io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7983b9a9533..be8363adf4e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1458,6 +1458,12 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + pr_cont("Westmere events, "); break; -- cgit v1.2.3-70-g09d2 From 57d5f9f808b7650a92f31e9cd3acd3f415a22530 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Mon, 14 Mar 2011 23:58:45 -0700 Subject: x86: get_bios_ebda_length() Add a wrapper routine that tells us the length of the EBDA if it is present. This guy also ensures that the returned length doesn't let the EBDA run past the 640KiB mark. Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/bios_ebda.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 3c7521063d3..5174cf0bf59 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -14,6 +14,27 @@ static inline unsigned int get_bios_ebda(void) return address; /* 0 means none */ } +/* + * Return the sanitized length of the EBDA in bytes, if it exists. + */ +static inline unsigned int get_bios_ebda_length(void) +{ + unsigned int address; + unsigned int length; + + address = get_bios_ebda(); + if (!address) + return 0; + + /* EBDA length is byte 0 of the EBDA (stored in KiB) */ + length = *(unsigned char *)phys_to_virt(address); + length <<= 10; + + /* Trim the length if it extends beyond 640KiB */ + length = min_t(unsigned int, (640 * 1024) - address, length); + return length; +} + void reserve_ebda_region(void); #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION -- cgit v1.2.3-70-g09d2 From f548ccd47d608e88d432745091e13f927ced83f7 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Mon, 14 Mar 2011 23:58:50 -0700 Subject: x86: Better comments for get_bios_ebda() Make the comments a bit clearer for get_bios_ebda so that it actually tells us what it is returning. Signed-off-by: Mike Waychison Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/bios_ebda.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h index 5174cf0bf59..aa6a3170ab5 100644 --- a/arch/x86/include/asm/bios_ebda.h +++ b/arch/x86/include/asm/bios_ebda.h @@ -4,11 +4,14 @@ #include /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E. + * Returns physical address of EBDA. Returns 0 if there is no EBDA. */ static inline unsigned int get_bios_ebda(void) { + /* + * There is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E. 
+ */ unsigned int address = *(unsigned short *)phys_to_virt(0x40E); address <<= 4; return address; /* 0 means none */ -- cgit v1.2.3-70-g09d2 From 9de4966a4d218f29c68e96e8e7b4d2840dedec79 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sun, 1 May 2011 14:09:21 +0200 Subject: x86: Fix spelling error in the memcpy() source code comment Signed-off-by: Bart Van Assche Cc: "H. Peter Anvin" Link: http://lkml.kernel.org/r/201105011409.21629.bvanassche@acm.org Signed-off-by: Ingo Molnar --- arch/x86/lib/memcpy_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e35e3..2a560bb2573 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -49,7 +49,7 @@ ENTRY(memcpy) jb .Lhandle_tail /* - * We check whether memory false dependece could occur, + * We check whether memory false dependence could occur, * then jump to corresponding copy mode. */ cmp %dil, %sil -- cgit v1.2.3-70-g09d2 From 9688678a6670c7f0ae3872450a8047c0ad401efb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:51 +0200 Subject: x86-64, NUMA: Simplify hotadd memory handling The only special handling NUMA needs to do for hotadd memory is determining the node for the hotadd memory given the address of it and there's nothing specific to specific config method used. srat_64.c does somewhat elaborate error checking on ACPI_SRAT_MEM_HOT_PLUGGABLE regions, remembers them and implements memory_add_physaddr_to_nid() which determines the node for given hotadd address. This is almost completely redundant. All the information is already available to the generic NUMA code which already performs all the sanity checking and merging. All that's necessary is not using __initdata from numa_meminfo and providing a function which uses it to map address to node. Drop the specific implementation from srat_64.c and add generic memory_add_physaddr_to_nid() in numa_64.c, which is enabled if CONFIG_MEMORY_HOTPLUG is set. Other than dropping the code, srat_64.c doesn't need any change as it already calls numa_add_memblk() for hot pluggable regions which is enough. While at it, change CONFIG_MEMORY_HOTPLUG_SPARSE in srat_64.c to CONFIG_MEMORY_HOTPLUG, for NUMA on x86-64, the two are always the same. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/init_64.c | 8 ------ arch/x86/mm/numa_64.c | 22 ++++++++++++++- arch/x86/mm/srat_64.c | 78 +-------------------------------------------------- 3 files changed, 22 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 79423358728..0404bb3a077 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -679,14 +679,6 @@ int arch_add_memory(int nid, u64 start, u64 size) } EXPORT_SYMBOL_GPL(arch_add_memory); -#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - #endif /* CONFIG_MEMORY_HOTPLUG */ static struct kcore_list kcore_vsyscall; diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a96767cb068..4057b5d4391 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -28,7 +28,12 @@ EXPORT_SYMBOL(node_data); nodemask_t numa_nodes_parsed __initdata; -static struct numa_meminfo numa_meminfo __initdata; +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + static int numa_distance_cnt; static u8 *numa_distance; @@ -540,3 +545,18 @@ int __cpuinit numa_cpu_node(int cpu) return __apicid_to_node[apicid]; return NUMA_NO_NODE; } + +#ifdef CONFIG_MEMORY_HOTPLUG +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 8e9d3394f6d..9994d2cacf7 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -26,8 +26,6 @@ int acpi_numa __initdata; -static struct bootnode nodes_add[MAX_NUMNODES]; - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -37,7 +35,6 @@ static __init void bad_srat(void) { printk(KERN_ERR "SRAT: SRAT not used.\n"); acpi_numa = -1; - memset(nodes_add, 0, sizeof(nodes_add)); } static __init inline int srat_disabled(void) @@ -131,67 +128,11 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE +#ifdef CONFIG_MEMORY_HOTPLUG static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif -/* - * Update nodes_add[] - * This code supports one contiguous hot add area per node - */ -static void __init -update_nodes_add(int node, unsigned long start, unsigned long end) -{ - unsigned long s_pfn = start >> PAGE_SHIFT; - unsigned long e_pfn = end >> PAGE_SHIFT; - int changed = 0; - struct bootnode *nd = &nodes_add[node]; - - /* I had some trouble with strange memory hotadd regions breaking - the boot. Be very strict here and reject anything unexpected. - If you want working memory hotadd write correct SRATs. - - The node size check is a basic sanity check to guard against - mistakes */ - if ((signed long)(end - start) < NODE_MIN_SIZE) { - printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return; - } - - /* This check might be a bit too strict, but I'm keeping it for now. 
*/ - if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) { - printk(KERN_ERR - "SRAT: Hotplug area %lu -> %lu has existing memory\n", - s_pfn, e_pfn); - return; - } - - /* Looks good */ - - if (nd->start == nd->end) { - nd->start = start; - nd->end = end; - changed = 1; - } else { - if (nd->start == end) { - nd->start = start; - changed = 1; - } - if (nd->end == start) { - nd->end = end; - changed = 1; - } - if (!changed) - printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); - } - - if (changed) { - node_set(node, numa_nodes_parsed); - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", - nd->start, nd->end); - } -} /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ void __init @@ -228,9 +169,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, start, end); - - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) - update_nodes_add(node, start, end); } void __init acpi_numa_arch_fixup(void) {} @@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void) return ret; return srat_disabled() ? -EINVAL : 0; } - -#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY) -int memory_add_physaddr_to_nid(u64 start) -{ - int i, ret = 0; - - for_each_node(i) - if (nodes_add[i].start <= start && nodes_add[i].end > start) - ret = i; - - return ret; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif -- cgit v1.2.3-70-g09d2 From ebe685f24eeb85fbdb0f33792f1dabdbf35eff38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:51 +0200 Subject: x86-64, NUMA: trivial cleanups for setup_node_bootmem() Make the following trivial changes in preparation for further updates. * nodeid -> nid, nid -> tnid * use nd_ prefix for nodedata related variables * remove start/end_pfn and use start/end directly Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/numa_64.c | 52 +++++++++++++++++++++++---------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 4057b5d4391..8043d5e7f0d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -128,14 +128,11 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) /* Initialize bootmem allocator for a node */ void __init -setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) +setup_node_bootmem(int nid, unsigned long start, unsigned long end) { - unsigned long start_pfn, last_pfn, nodedata_phys; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - int nid; - - if (!end) - return; + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + unsigned long nd_pa; + int tnid; /* * Don't confuse VM with a node that doesn't have the @@ -146,30 +143,27 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid, - start, end); - - start_pfn = start >> PAGE_SHIFT; - last_pfn = end >> PAGE_SHIFT; + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + nid, start, end); - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, - SMP_CACHE_BYTES); - if (node_data[nodeid] == NULL) + node_data[nid] = early_node_mem(nid, start, end, nd_size, + SMP_CACHE_BYTES); + if (node_data[nid] == NULL) return; - nodedata_phys = __pa(node_data[nodeid]); - memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA"); - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, - nodedata_phys + pgdat_size - 1); - nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT); - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - - memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->node_id = nodeid; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; - - node_set_online(nodeid); + nd_pa = __pa(node_data[nid]); + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); } /** -- cgit v1.2.3-70-g09d2 From acd26d611e60c1a7c2a14269ab99760f779121f4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:51 +0200 Subject: x86-64, NUMA: simplify nodedata allocation With top-down memblock allocation, the allocation range limits in ealry_node_mem() can be simplified - try node-local first, then any node but in any case don't allocate below DMA limit. Remove early_node_mem() and implement simplified allocation directly in setup_node_bootmem(). Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/numa_64.c | 53 +++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 8043d5e7f0d..b4fd25e753c 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -37,38 +37,6 @@ __initdata static int numa_distance_cnt; static u8 *numa_distance; -static void * __init early_node_mem(int nodeid, unsigned long start, - unsigned long end, unsigned long size, - unsigned long align) -{ - unsigned long mem; - - /* - * put it on high as possible - * something will go with NODE_DATA - */ - if (start < (MAX_DMA_PFN< (MAX_DMA32_PFN<> PAGE_SHIFT); if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + node_data[nid] = __va(nd_pa); memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; -- cgit v1.2.3-70-g09d2 From c4b90c11992e61123071977c0e5556e59a70852c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: Automatically set apicid -> node in setup_local_APIC() Some x86-32 NUMA implementations (NUMAQ) don't initialize apicid -> node mapping using set_apicid_to_node() during NUMA init but implement custom apic->x86_32_numa_cpu_node() instead. This patch automatically initializes the default apic -> node mapping table from apic->x86_32_numa_cpu_node() from setup_local_APIC() such that the mapping table is in sync with the actual mapping. As the table isn't used by custom implementations, this doesn't make any difference at this point. This is in preparation of unifying numa_cpu_node() between x86-32 and 64. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/apic/apic.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2bc503bf9e9..a6cd02a9268 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1237,6 +1237,16 @@ void __cpuinit setup_local_APIC(void) /* always use the value from LDR */ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = logical_smp_processor_id(); + + /* + * Some NUMA implementations (NUMAQ) don't initialize apicid to + * node mapping during NUMA init. Now that logical apicid is + * guaranteed to be known, give it another chance. This is already + * a bit too late - percpu allocation has already happened without + * proper NUMA affinity. + */ + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); #endif /* -- cgit v1.2.3-70-g09d2 From 6bd262731bf7559bab8c749786e8652e2df1fb4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: Unify 32/64bit numa_cpu_node() implementation Currently, the only meaningful user of apic->x86_32_numa_cpu_node() is NUMAQ which returns valid mapping only after CPU is initialized during SMP bringup; thus, the previous patch to set apicid -> node in setup_local_APIC() makes __apicid_to_node[] always contain the correct mapping whether custom apic->x86_32_numa_cpu_node() is used or not. So, there is no reason to keep separate 32bit implementation. We can always consult __apicid_to_node[]. Move 64bit implementation from numa_64.c to numa.c and remove 32bit implementation from numa_32.c. 
Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 10 ++++++++++ arch/x86/include/asm/numa_32.h | 6 ------ arch/x86/include/asm/numa_64.h | 3 --- arch/x86/mm/numa.c | 9 +++++++++ arch/x86/mm/numa_32.c | 5 ----- arch/x86/mm/numa_64.c | 9 --------- 6 files changed, 19 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index a50fc9f493b..5982d418c35 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_NUMA_H #define _ASM_X86_NUMA_H +#include + #include #include @@ -22,10 +24,18 @@ static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; } + +extern int __cpuinit numa_cpu_node(int cpu); + #else /* CONFIG_NUMA */ static inline void set_apicid_to_node(int apicid, s16 node) { } + +static inline int numa_cpu_node(int cpu) +{ + return NUMA_NO_NODE; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index c6beed1ef10..242522fe9f8 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -5,12 +5,6 @@ extern int numa_off; extern int pxm_to_nid(int pxm); -#ifdef CONFIG_NUMA -extern int __cpuinit numa_cpu_node(int cpu); -#else /* CONFIG_NUMA */ -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } -#endif /* CONFIG_NUMA */ - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 344eb1790b4..12461ebec70 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -26,7 +26,6 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, extern nodemask_t numa_nodes_parsed __initdata; -extern int __cpuinit numa_cpu_node(int cpu); extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); @@ -35,8 +34,6 @@ extern void __init numa_set_distance(int from, int to, int distance); #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) void numa_emu_cmdline(char *); #endif /* CONFIG_NUMA_EMU */ -#else -static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; } #endif #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 745258dfc4d..e9005c4ea29 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -32,6 +32,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... 
MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; +int __cpuinit numa_cpu_node(int cpu) +{ + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); + + if (apicid != BAD_APICID) + return __apicid_to_node[apicid]; + return NUMA_NO_NODE; +} + cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; EXPORT_SYMBOL(node_to_cpumask_map); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index c757c0a3b52..e0d9716ab38 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -107,11 +107,6 @@ extern unsigned long highend_pfn, highstart_pfn; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -int __cpuinit numa_cpu_node(int cpu) -{ - return apic->x86_32_numa_cpu_node(cpu); -} - /* * FLAT - support for basic PC memory model with discontig enabled, essentially * a single node with all available processors in it with a flat diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index b4fd25e753c..7f83adec048 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -512,15 +512,6 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } -int __cpuinit numa_cpu_node(int cpu) -{ - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -} - #ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 start) { -- cgit v1.2.3-70-g09d2 From 84914ed0ec6787d38e84b510f92ad4ca3a572fd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: Make apic->x86_32_numa_cpu_node() optional NUMAQ is the only meaningful user of this callback and setup_local_APIC() the only callsite. Stop torturing everyone else by making the callback optional and removing all the boilerplate implementations and assignments. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/apic.h | 9 ++++++--- arch/x86/kernel/apic/apic.c | 20 +++----------------- arch/x86/kernel/apic/apic_noop.c | 9 --------- arch/x86/kernel/apic/bigsmp_32.c | 1 - arch/x86/kernel/apic/es7000_32.c | 7 ------- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/apic/summit_32.c | 1 - 7 files changed, 9 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 2b7d573be54..a0c46f06121 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -363,7 +363,12 @@ struct apic { */ int (*x86_32_early_logical_apicid)(int cpu); - /* determine CPU -> NUMA node mapping */ + /* + * Optional method called from setup_local_APIC() after logical + * apicid is guaranteed to be known to initialize apicid -> node + * mapping if NUMA initialization hasn't done so already. Don't + * add new users. + */ int (*x86_32_numa_cpu_node)(int cpu); #endif }; @@ -537,8 +542,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) return cpuid_apic >> index_msb; } -extern int default_x86_32_numa_cpu_node(int cpu); - #endif static inline unsigned int diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a6cd02a9268..0c67b4fc25b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1245,8 +1245,9 @@ void __cpuinit setup_local_APIC(void) * a bit too late - percpu allocation has already happened without * proper NUMA affinity. 
*/ - set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), - apic->x86_32_numa_cpu_node(cpu)); + if (apic->x86_32_numa_cpu_node) + set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu), + apic->x86_32_numa_cpu_node(cpu)); #endif /* @@ -2013,21 +2014,6 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifdef CONFIG_X86_32 -int default_x86_32_numa_cpu_node(int cpu) -{ -#ifdef CONFIG_NUMA - int apicid = early_per_cpu(x86_cpu_to_apicid, cpu); - - if (apicid != BAD_APICID) - return __apicid_to_node[apicid]; - return NUMA_NO_NODE; -#else - return 0; -#endif -} -#endif - /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index f1baa2dc087..775b82bc655 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(cpu_has_apic && !disable_apic); } -#ifdef CONFIG_X86_32 -static int noop_x86_32_numa_cpu_node(int cpu) -{ - /* we're always on node 0 */ - return 0; -} -#endif - struct apic apic_noop = { .name = "noop", .probe = noop_probe, @@ -195,6 +187,5 @@ struct apic apic_noop = { #ifdef CONFIG_X86_32 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node, #endif }; diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 541a2e43165..d84ac5a584b 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -253,5 +253,4 @@ struct apic apic_bigsmp = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 3e9de4854c5..70533de5bd2 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void) nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); } -static int es7000_numa_cpu_node(int cpu) -{ - return 0; -} - static int es7000_cpu_present_to_apicid(int mps_cpu) { if (!mps_cpu) @@ -688,7 +683,6 @@ struct apic __refdata apic_es7000_cluster = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; struct apic __refdata apic_es7000 = { @@ -752,5 +746,4 @@ struct apic __refdata apic_es7000 = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = es7000_early_logical_apicid, - .x86_32_numa_cpu_node = es7000_numa_cpu_node, }; diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index fc84c7b6110..6541e471fd9 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -172,7 +172,6 @@ struct apic apic_default = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; extern struct apic apic_numaq; diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index e4b8059b414..35bcd7d995a 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -551,5 +551,4 @@ struct apic apic_summit = { .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, .x86_32_early_logical_apicid = summit_early_logical_apicid, - .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node, }; -- cgit 
v1.2.3-70-g09d2 From 797390d8554b1e07aabea37d0140933b0412dba0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86-32, NUMA: use sparse_memory_present_with_active_regions() Instead of calling memory_present() for each region from NUMA init, call sparse_memory_present_with_active_regions() from paging_init() similarly to x86-64. For flat and numaq, this results in exactly the same memory_present() calls. For srat, if there are multiple memory chunks for a node, after this change, memory_present() will be called separately for each chunk instead of being called once to encompass the whole range, which doesn't cause any harm and actually is the better behavior. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/kernel/apic/numaq_32.c | 2 -- arch/x86/mm/init_32.c | 1 + arch/x86/mm/numa_32.c | 1 - arch/x86/mm/srat_32.c | 8 +------- 4 files changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 0aced70815f..41b8b29d36f 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -91,8 +91,6 @@ static inline void numaq_register_node(int node, struct sys_cfg_data *scd) memblock_x86_register_active_regions(node, node_start_pfn[node], node_end_pfn[node]); - - memory_present(node, node_start_pfn[node], node_end_pfn[node]); } /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 80088f99419..2cde0a34bed 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -716,6 +716,7 @@ void __init paging_init(void) * NOTE: at this point the bootmem allocator is fully available. */ olpc_dt_build_devicetree(); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); zone_sizes_init(); } diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index e0d9716ab38..f847fa1e02d 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -119,7 +119,6 @@ int __init get_memcfg_numa_flat(void) node_start_pfn[0] = 0; node_end_pfn[0] = max_pfn; memblock_x86_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); /* Indicate there is one node available. */ nodes_clear(node_online_map); diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index ae20046a9e9..6b9bfd78bc3 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -209,7 +209,7 @@ static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_ch int __init get_memcfg_from_srat(void) { - int i, j, nid; + int i, j; if (srat_disabled()) goto out_fail; @@ -273,12 +273,6 @@ int __init get_memcfg_from_srat(void) /* for out of order entries in SRAT */ sort_node_map(); - for_each_online_node(nid) { - unsigned long start = node_start_pfn[nid]; - unsigned long end = min(node_end_pfn[nid], max_pfn); - - memory_present(nid, start, end); - } return 1; out_fail: printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" -- cgit v1.2.3-70-g09d2 From 1201e10a092adc9c88a6ce5f27740cc5cd0d26e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: trivial cleanups * Kill no longer used struct bootnode. * Kill dangling declaration of pxm_to_nid() in numa_32.h. * Make setup_node_bootmem() static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/include/asm/acpi.h | 2 -- arch/x86/include/asm/amd_nb.h | 1 - arch/x86/include/asm/numa_32.h | 2 -- arch/x86/include/asm/numa_64.h | 7 ------- arch/x86/mm/numa_64.c | 2 +- 5 files changed, 1 insertion(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 12e0e7dd869..416d865eae3 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -183,8 +183,6 @@ static inline void disable_acpi(void) { } #define ARCH_HAS_POWER_INIT 1 -struct bootnode; - #ifdef CONFIG_ACPI_NUMA extern int acpi_numa; extern int x86_acpi_numa_init(void); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 331682231bb..67f87f25761 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range { extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; -struct bootnode; extern bool early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 242522fe9f8..7e54b64623a 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -3,8 +3,6 @@ extern int numa_off; -extern int pxm_to_nid(int pxm); - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 12461ebec70..794da6de689 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -3,18 +3,11 @@ #include -struct bootnode { - u64 start; - u64 end; -}; - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; extern unsigned long numa_free_all_bootmem(void); -extern void setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end); #ifdef CONFIG_NUMA /* diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 7f83adec048..287ae798935 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -95,7 +95,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) } /* Initialize bootmem allocator for a node */ -void __init +static void __init setup_node_bootmem(int nid, unsigned long start, unsigned long end) { const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; -- cgit v1.2.3-70-g09d2 From 7b2600f8ee0536bb738f3387cf2c30e8e334e149 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: rename srat_64.c to srat.c Rename srat_64.c to srat.c. This is to prepare for unification of NUMA init paths between 32 and 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/Makefile | 5 +- arch/x86/mm/srat.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/srat_64.c | 184 -------------------------------------------------- 3 files changed, 188 insertions(+), 185 deletions(-) create mode 100644 arch/x86/mm/srat.c delete mode 100644 arch/x86/mm/srat_64.c (limited to 'arch/x86') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3e608edf995..37e7043362a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -24,7 +24,10 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +ifeq ($(CONFIG_ACPI_NUMA),y) +obj-$(CONFIG_X86_64) += srat.o +obj-$(CONFIG_X86_32) += srat_32.o +endif obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c new file mode 100644 index 00000000000..9994d2cacf7 --- /dev/null +++ b/arch/x86/mm/srat.c @@ -0,0 +1,184 @@ +/* + * ACPI 3.0 based NUMA setup + * Copyright 2004 Andi Kleen, SuSE Labs. + * + * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. + * + * Called from acpi_numa_init while reading the SRAT and SLIT tables. + * Assumes all memory regions belonging to a single proximity domain + * are in one chunk. Holes between them will be included in the node. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int acpi_numa __initdata; + +static __init int setup_node(int pxm) +{ + return acpi_map_pxm_to_node(pxm); +} + +static __init void bad_srat(void) +{ + printk(KERN_ERR "SRAT: SRAT not used.\n"); + acpi_numa = -1; +} + +static __init inline int srat_disabled(void) +{ + return acpi_numa < 0; +} + +/* Callback for SLIT parsing */ +void __init acpi_numa_slit_init(struct acpi_table_slit *slit) +{ + int i, j; + + for (i = 0; i < slit->locality_count; i++) + for (j = 0; j < slit->locality_count; j++) + numa_set_distance(pxm_to_node(i), pxm_to_node(j), + slit->entry[slit->locality_count * i + j]); +} + +/* Callback for Proximity Domain -> x2APIC mapping */ +void __init +acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", + pxm, apic_id, node); +} + +/* Callback for Proximity Domain -> LAPIC mapping */ +void __init +acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) +{ + int pxm, node; + int apic_id; + + if (srat_disabled()) + return; + if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { + bad_srat(); + return; + } + if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) + return; + pxm = pa->proximity_domain_lo; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: 
Too many proximity domains %x\n", pxm); + bad_srat(); + return; + } + + if (get_uv_system_type() >= UV_X2APIC) + apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; + else + apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + + set_apicid_to_node(apic_id, node); + node_set(node, numa_nodes_parsed); + acpi_numa = 1; + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", + pxm, apic_id, node); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static inline int save_add_info(void) {return 1;} +#else +static inline int save_add_info(void) {return 0;} +#endif + +/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ +void __init +acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) +{ + unsigned long start, end; + int node, pxm; + + if (srat_disabled()) + return; + if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { + bad_srat(); + return; + } + if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) + return; + + if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) + return; + start = ma->base_address; + end = start + ma->length; + pxm = ma->proximity_domain; + node = setup_node(pxm); + if (node < 0) { + printk(KERN_ERR "SRAT: Too many proximity domains.\n"); + bad_srat(); + return; + } + + if (numa_add_memblk(node, start, end) < 0) { + bad_srat(); + return; + } + + printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + start, end); +} + +void __init acpi_numa_arch_fixup(void) {} + +int __init x86_acpi_numa_init(void) +{ + int ret; + + ret = acpi_numa_init(); + if (ret < 0) + return ret; + return srat_disabled() ? -EINVAL : 0; +} diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c deleted file mode 100644 index 9994d2cacf7..00000000000 --- a/arch/x86/mm/srat_64.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * ACPI 3.0 based NUMA setup - * Copyright 2004 Andi Kleen, SuSE Labs. - * - * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs. - * - * Called from acpi_numa_init while reading the SRAT and SLIT tables. - * Assumes all memory regions belonging to a single proximity domain - * are in one chunk. Holes between them will be included in the node. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int acpi_numa __initdata; - -static __init int setup_node(int pxm) -{ - return acpi_map_pxm_to_node(pxm); -} - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; -} - -static __init inline int srat_disabled(void) -{ - return acpi_numa < 0; -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ - int i, j; - - for (i = 0; i < slit->locality_count; i++) - for (j = 0; j < slit->locality_count; j++) - numa_set_distance(pxm_to_node(i), pxm_to_node(j), - slit->entry[slit->locality_count * i + j]); -} - -/* Callback for Proximity Domain -> x2APIC mapping */ -void __init -acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) -{ - int pxm, node; - int apic_id; - - if (srat_disabled()) - return; - if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - - apic_id = pa->apic_id; - if (apic_id >= MAX_LOCAL_APIC) { - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); - return; - } - set_apicid_to_node(apic_id, node); - node_set(node, numa_nodes_parsed); - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n", - pxm, apic_id, node); -} - -/* Callback for Proximity Domain -> LAPIC mapping */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) -{ - int pxm, node; - int apic_id; - - if (srat_disabled()) - return; - if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; - pxm = pa->proximity_domain_lo; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); - bad_srat(); - return; - } - - if (get_uv_system_type() >= UV_X2APIC) - apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; - else - apic_id = pa->apic_id; - - if (apic_id >= MAX_LOCAL_APIC) { - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); - return; - } - - set_apicid_to_node(apic_id, node); - node_set(node, numa_nodes_parsed); - acpi_numa = 1; - printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n", - pxm, apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static inline int save_add_info(void) {return 1;} -#else -static inline int save_add_info(void) {return 0;} -#endif - -/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) -{ - unsigned long start, end; - int node, pxm; - - if (srat_disabled()) - return; - if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; - - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info()) - return; - start = ma->base_address; - end = start + ma->length; - pxm = ma->proximity_domain; - node = setup_node(pxm); - if (node < 0) { - printk(KERN_ERR "SRAT: Too many proximity domains.\n"); - bad_srat(); - return; - } - - if (numa_add_memblk(node, start, end) 
< 0) { - bad_srat(); - return; - } - - printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, - start, end); -} - -void __init acpi_numa_arch_fixup(void) {} - -int __init x86_acpi_numa_init(void) -{ - int ret; - - ret = acpi_numa_init(); - if (ret < 0) - return ret; - return srat_disabled() ? -EINVAL : 0; -} -- cgit v1.2.3-70-g09d2 From eca9ad313293c41021bfcf23e985a14f6991a121 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:52 +0200 Subject: x86, NUMA: make srat.c 32bit safe Make srat.c 32bit safe by removing the assumption that unsigned long is 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/srat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 9994d2cacf7..81dbfdeb080 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -138,7 +138,7 @@ static inline int save_add_info(void) {return 0;} void __init acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) { - unsigned long start, end; + u64 start, end; int node, pxm; if (srat_disabled()) @@ -167,7 +167,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } - printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, + printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm, start, end); } -- cgit v1.2.3-70-g09d2 From daf4f480ae24270bac06db4293908d36b4834e21 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Move get_memcfg_numa() into numa_32.c There's no reason for get_memcfg_numa() to be implemented inline in mmzone_32.h. Move it to numa_32.c and also make get_memcfg_numa_flat() static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/mmzone_32.h | 18 ------------------ arch/x86/mm/numa_32.c | 11 ++++++++++- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 91df7c51806..73e5745aef3 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -16,28 +16,10 @@ extern struct pglist_data *node_data[]; /* summit or generic arch */ #include -extern int get_memcfg_numa_flat(void); -/* - * This allows any one NUMA architecture to be compiled - * for, and still fall back to the flat function if it - * fails. - */ -static inline void get_memcfg_numa(void) -{ - - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} - extern void resume_map_numa_kva(pgd_t *pgd); #else /* !CONFIG_NUMA */ -#define get_memcfg_numa get_memcfg_numa_flat - static inline void resume_map_numa_kva(pgd_t *pgd) {} #endif /* CONFIG_NUMA */ diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f847fa1e02d..abf1247a4c3 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -112,7 +112,7 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); * a single node with all available processors in it with a flat * memory map. 
*/ -int __init get_memcfg_numa_flat(void) +static int __init get_memcfg_numa_flat(void) { printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); @@ -332,6 +332,15 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static void get_memcfg_numa(void) +{ + if (get_memcfg_numaq()) + return; + if (get_memcfg_from_srat()) + return; + get_memcfg_numa_flat(); +} + void __init initmem_init(void) { int nid; -- cgit v1.2.3-70-g09d2 From e6df595b37c7c033ef7400b4fdd382a2dc4f4131 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Move numa_nodes_parsed to numa.[hc] Move numa_nodes_parsed from numa_64.[hc] to numa.[hc] to prepare for NUMA init path unification. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 1 + arch/x86/include/asm/numa_64.h | 4 ---- arch/x86/mm/numa.c | 1 + arch/x86/mm/numa_64.c | 2 -- 4 files changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 5982d418c35..c24306cd150 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -19,6 +19,7 @@ * numa_cpu_node(). */ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; +extern nodemask_t numa_nodes_parsed __initdata; static inline void set_apicid_to_node(int apicid, s16 node) { diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 794da6de689..e84113f8fff 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#include - #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) extern int numa_off; @@ -17,8 +15,6 @@ extern unsigned long numa_free_all_bootmem(void); */ #define NODE_MIN_SIZE (4*1024*1024) -extern nodemask_t numa_nodes_parsed __initdata; - extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); extern void __init numa_set_distance(int from, int to, int distance); diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index e9005c4ea29..cce174109ca 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -6,6 +6,7 @@ #include int __initdata numa_off; +nodemask_t numa_nodes_parsed __initdata; static __init int numa_setup(char *opt) { diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 287ae798935..70bd8221f92 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -26,8 +26,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -nodemask_t numa_nodes_parsed __initdata; - static struct numa_meminfo numa_meminfo #ifndef CONFIG_MEMORY_HOTPLUG __initdata -- cgit v1.2.3-70-g09d2 From b0d310801a4c1f95b44357e4ebc22a9903e3bf3d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: implement temporary NUMA init shims To help transition to common NUMA init, implement temporary 32bit shims for numa_add_memblk() and numa_set_distance(). numa_add_memblk() registers the memblk and adjusts node_start/end_pfn[]. numa_set_distance() is a noop. These shims allow the 64bit NUMA init functions to be used on 32bit and enable a gradual transition to the common NUMA init path. For a detailed description, please read the descriptions of the commits which make use of the shim functions. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/include/asm/numa.h | 3 +++ arch/x86/include/asm/numa_64.h | 3 --- arch/x86/mm/numa_32.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index c24306cd150..db449c7d89b 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -21,6 +21,9 @@ extern s16 __apicid_to_node[MAX_LOCAL_APIC]; extern nodemask_t numa_nodes_parsed __initdata; +extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); +extern void __init numa_set_distance(int from, int to, int distance); + static inline void set_apicid_to_node(int apicid, s16 node) { __apicid_to_node[apicid] = node; diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index e84113f8fff..506dd050ff3 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -15,9 +15,6 @@ extern unsigned long numa_free_all_bootmem(void); */ #define NODE_MIN_SIZE (4*1024*1024) -extern int __init numa_add_memblk(int nodeid, u64 start, u64 end); -extern void __init numa_set_distance(int from, int to, int distance); - #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE ((u64)32 << 20) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index abf1247a4c3..d0369a56f84 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -414,3 +414,37 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif +/* temporary shim, will go away soon */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + + printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n", + nid, start_pfn, end_pfn); + + if (start >= (u64)max_pfn << PAGE_SHIFT) { + printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", + start_pfn, end_pfn); + return 0; + } + + node_set_online(nid); + memblock_x86_register_active_regions(nid, start_pfn, + min(end_pfn, max_pfn)); + + if (!node_has_online_mem(nid)) { + node_start_pfn[nid] = start_pfn; + node_end_pfn[nid] = end_pfn; + } else { + node_start_pfn[nid] = min(node_start_pfn[nid], start_pfn); + node_end_pfn[nid] = max(node_end_pfn[nid], end_pfn); + } + return 0; +} + +/* temporary shim, will go away soon */ +void __init numa_set_distance(int from, int to, int distance) +{ + /* nada */ +} -- cgit v1.2.3-70-g09d2 From 5acd91ab837c9d066af7345aea6462dc55695db7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Replace srat_32.c with srat.c The SRAT support implementations in srat_32.c and srat.c are generally similar; however, there are some differences. First of all, the 64bit implementation supports more types of SRAT entries: 64bit supports x2apic, affinity, memory and SLIT, while 32bit only supports processor and memory. Most other differences stem from the different initialization protocols employed by the 64bit and 32bit NUMA init paths. On 64bit, * Mappings among PXM, node and apicid are directly done in each SRAT entry callback. * Memory affinity information is passed to numa_add_memblk() which takes care of all interfacing with NUMA init. * Doesn't directly initialize NUMA configurations. All the information is recorded in numa_nodes_parsed and memblks. On 32bit, * Checks numa_off. 
* Things go through one more level of indirection via private tables but eventually end up initializing the same mappings. * node_start/end_pfn[] are initialized and memblock_x86_register_active_regions() is called for each memory chunk. * node_set_online() is called for each online node. * sort_node_map() is called. There are also other minor differences in sanity checking and messages, but taking the 64bit version should be good enough. This patch drops the 32bit-specific implementation and makes the 64bit implementation common for both 32 and 64bit. The init protocol differences are dealt with in two places - the numa_add_memblk() shim added in the previous patch and the new temporary numa_32.c:get_memcfg_from_srat() which wraps the invocation of x86_acpi_numa_init(). The shim numa_add_memblk() handles the following: * node_start/end_pfn[] initialization. * node_set_online() for memory nodes. * Invocation of memblock_x86_register_active_regions(). The shim get_memcfg_from_srat() handles the following: * numa_off check. * node_set_online() for CPU nodes. * sort_node_map() invocation. * Clearing of numa_nodes_parsed and active_ranges on failure. The shims are temporary and will be removed as the generic NUMA init path in 32bit is replaced with the 64bit one. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/mmzone_32.h | 2 - arch/x86/include/asm/srat.h | 39 ------ arch/x86/mm/Makefile | 5 +- arch/x86/mm/numa_32.c | 23 ++++ arch/x86/mm/srat_32.c | 281 --------------------------------------- 5 files changed, 24 insertions(+), 326 deletions(-) delete mode 100644 arch/x86/include/asm/srat.h delete mode 100644 arch/x86/mm/srat_32.c (limited to 'arch/x86') diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73e5745aef3..5e83a416eca 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -13,8 +13,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #include -/* summit or generic arch */ -#include extern void resume_map_numa_kva(pgd_t *pgd); diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h deleted file mode 100644 index b508d639d1a..00000000000 --- a/arch/x86/include/asm/srat.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * Send feedback to Pat Gaughen - */ - -#ifndef _ASM_X86_SRAT_H -#define _ASM_X86_SRAT_H - -#ifdef CONFIG_ACPI_NUMA -extern int get_memcfg_from_srat(void); -#else -static inline int get_memcfg_from_srat(void) -{ - return 0; -} -#endif - -#endif /* _ASM_X86_SRAT_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 37e7043362a..62997be3307 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -24,10 +24,7 @@ obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o -ifeq ($(CONFIG_ACPI_NUMA),y) -obj-$(CONFIG_X86_64) += srat.o -obj-$(CONFIG_X86_32) += srat_32.o -endif +obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index d0369a56f84..8641239a066 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static int get_memcfg_from_srat(void) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + if (numa_off) + return 0; + + if (x86_acpi_numa_init() < 0) { + nodes_clear(numa_nodes_parsed); + remove_all_active_ranges(); + return 0; + } + + for_each_node_mask(nid, numa_nodes_parsed) + node_set_online(nid); + sort_node_map(); + return 1; +#else + return 0; +#endif +} + static void get_memcfg_numa(void) { if (get_memcfg_numaq()) diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c deleted file mode 100644 index 6b9bfd78bc3..00000000000 --- a/arch/x86/mm/srat_32.c +++ /dev/null @@ -1,281 +0,0 @@ -/* - * Some of the code in this file has been gleaned from the 64 bit - * discontigmem support code base. - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * - * Send feedback to Pat Gaughen - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * proximity macros and definitions - */ -#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ -#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ -#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) -#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) -/* bitmap length; _PXM is at most 255 */ -#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) -static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ - -#define MAX_CHUNKS_PER_NODE 3 -#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) -struct node_memory_chunk_s { - unsigned long start_pfn; - unsigned long end_pfn; - u8 pxm; // proximity domain of node - u8 nid; // which cnode contains this chunk? 
- u8 bank; // which mem bank on this node -}; -static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS]; - -static int __initdata num_memory_chunks; /* total number of memory chunks */ -static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC]; - -int acpi_numa __initdata; - -static __init void bad_srat(void) -{ - printk(KERN_ERR "SRAT: SRAT not used.\n"); - acpi_numa = -1; - num_memory_chunks = 0; -} - -static __init inline int srat_disabled(void) -{ - return numa_off || acpi_numa < 0; -} - -/* Identify CPU proximity domains */ -void __init -acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) -{ - if (srat_disabled()) - return; - if (cpu_affinity->header.length != - sizeof(struct acpi_srat_cpu_affinity)) { - bad_srat(); - return; - } - - if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0) - return; /* empty entry */ - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); - - /* don't need to check apic_id here, because it is always 8 bits */ - apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; - - printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", - cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo); -} - -/* - * Identify memory proximity domains and hot-remove capabilities. - * Fill node memory chunk list structure. - */ -void __init -acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) -{ - unsigned long long paddr, size; - unsigned long start_pfn, end_pfn; - u8 pxm; - struct node_memory_chunk_s *p, *q, *pend; - - if (srat_disabled()) - return; - if (memory_affinity->header.length != - sizeof(struct acpi_srat_mem_affinity)) { - bad_srat(); - return; - } - - if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0) - return; /* empty entry */ - - pxm = memory_affinity->proximity_domain & 0xff; - - /* mark this node as "seen" in node bitmap */ - BMAP_SET(pxm_bitmap, pxm); - - /* calculate info for memory chunk structure */ - paddr = memory_affinity->base_address; - size = memory_affinity->length; - - start_pfn = paddr >> PAGE_SHIFT; - end_pfn = (paddr + size) >> PAGE_SHIFT; - - - if (num_memory_chunks >= MAXCHUNKS) { - printk(KERN_WARNING "Too many mem chunks in SRAT." - " Ignoring %lld MBytes at %llx\n", - size/(1024*1024), paddr); - return; - } - - /* Insertion sort based on base address */ - pend = &node_memory_chunk[num_memory_chunks]; - for (p = &node_memory_chunk[0]; p < pend; p++) { - if (start_pfn < p->start_pfn) - break; - } - if (p < pend) { - for (q = pend; q >= p; q--) - *(q + 1) = *q; - } - p->start_pfn = start_pfn; - p->end_pfn = end_pfn; - p->pxm = pxm; - - num_memory_chunks++; - - printk(KERN_DEBUG "Memory range %08lx to %08lx" - " in proximity domain %02x %s\n", - start_pfn, end_pfn, - pxm, - ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? - "enabled and removable" : "enabled" ) ); -} - -/* Callback for SLIT parsing */ -void __init acpi_numa_slit_init(struct acpi_table_slit *slit) -{ -} - -void acpi_numa_arch_fixup(void) -{ -} -/* - * The SRAT table always lists ascending addresses, so can always - * assume that the first "start" address that you see is the real - * start of the node, and that the current "end" address is after - * the previous one. - */ -static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) -{ - /* - * Only add present memory as told by the e820. 
- * There is no guarantee from the SRAT that the memory it - * enumerates is present at boot time because it represents - * *possible* memory hotplug areas the same as normal RAM. - */ - if (memory_chunk->start_pfn >= max_pfn) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - memory_chunk->start_pfn, memory_chunk->end_pfn); - return -1; - } - if (memory_chunk->nid != nid) - return -1; - - if (!node_has_online_mem(nid)) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_start_pfn[nid] > memory_chunk->start_pfn) - node_start_pfn[nid] = memory_chunk->start_pfn; - - if (node_end_pfn[nid] < memory_chunk->end_pfn) - node_end_pfn[nid] = memory_chunk->end_pfn; - - return 0; -} - -int __init get_memcfg_from_srat(void) -{ - int i, j; - - if (srat_disabled()) - goto out_fail; - - if (acpi_numa_init() < 0) - goto out_fail; - - if (num_memory_chunks == 0) { - printk(KERN_DEBUG - "could not find any ACPI SRAT memory areas.\n"); - goto out_fail; - } - - /* Calculate total number of nodes in system from PXM bitmap and create - * a set of sequential node IDs starting at zero. (ACPI doesn't seem - * to specify the range of _PXM values.) - */ - /* - * MCD - we no longer HAVE to number nodes sequentially. PXM domain - * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically - * 32, so we will continue numbering them in this manner until MAX_NUMNODES - * approaches MAX_PXM_DOMAINS for i386. - */ - nodes_clear(node_online_map); - for (i = 0; i < MAX_PXM_DOMAINS; i++) { - if (BMAP_TEST(pxm_bitmap, i)) { - int nid = acpi_map_pxm_to_node(i); - node_set_online(nid); - } - } - BUG_ON(num_online_nodes() == 0); - - /* set cnode id in memory chunk structure */ - for (i = 0; i < num_memory_chunks; i++) - node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm); - - printk(KERN_DEBUG "pxm bitmap: "); - for (i = 0; i < sizeof(pxm_bitmap); i++) { - printk(KERN_CONT "%02x ", pxm_bitmap[i]); - } - printk(KERN_CONT "\n"); - printk(KERN_DEBUG "Number of logical nodes in system = %d\n", - num_online_nodes()); - printk(KERN_DEBUG "Number of memory chunks in system = %d\n", - num_memory_chunks); - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i])); - - for (j = 0; j < num_memory_chunks; j++){ - struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; - printk(KERN_DEBUG - "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", - j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - if (node_read_chunk(chunk->nid, chunk)) - continue; - - memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn, - min(chunk->end_pfn, max_pfn)); - } - /* for out of order entries in SRAT */ - sort_node_map(); - - return 1; -out_fail: - printk(KERN_DEBUG "failed to get NUMA memory information from SRAT" - " table\n"); - return 0; -} -- cgit v1.2.3-70-g09d2 From 299a180aec6a8ee3069cf0fe90d722ac20c1f837 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86-32, NUMA: Update numaq to use new NUMA init protocol Update numaq such that it calls numa_add_memblk() and sets numa_nodes_parsed instead of directly diddling with NUMA states. The original get_memcfg_numaq() is renamed to numaq_numa_init() and a new get_memcfg_numaq() is created in numa_32.c. The shim numa_add_memblk() implementation handles node_start/end_pfn[] and node_set_online() for nodes with memory. The new get_memcfg_numaq() is exactly the same as get_memcfg_from_srat(), other than calling the numaq init function; both instantiate the wrapper pattern sketched below. 
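(Editorial sketch, not part of the patch: wrap_numa_init() below is an invented name used purely for illustration — the series never adds such a helper; each wrapper open-codes this shape around its own init function.)

	static int wrap_numa_init(int (*init_func)(void))
	{
		int nid;

		if (numa_off)
			return 0;

		/* the source fills numa_nodes_parsed and registers memblks */
		if (init_func() < 0) {
			/* on failure, drop any partially registered state */
			nodes_clear(numa_nodes_parsed);
			remove_all_active_ranges();
			return 0;
		}

		/* bring parsed nodes online and order the active range map */
		for_each_node_mask(nid, numa_nodes_parsed)
			node_set_online(nid);
		sort_node_map();
		return 1;
	}

Read this way, get_memcfg_from_srat() is wrap_numa_init(x86_acpi_numa_init) and get_memcfg_numaq() is wrap_numa_init(numaq_numa_init); the "Move NUMA init logic" patch later in this series generalizes the same shape into numa.c's numa_init(init_func).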
The things get_memcfg_numaq() does are not strictly necessary for numaq, but they are added for consistency and to help unify NUMA init handling. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numaq.h | 7 +------ arch/x86/kernel/apic/numaq_32.c | 28 ++++++++++------------------ arch/x86/mm/numa_32.c | 23 +++++++++++++++++++++++ 3 files changed, 34 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 37c516545ec..c3b3c322fd8 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -29,7 +29,7 @@ #ifdef CONFIG_X86_NUMAQ extern int found_numaq; -extern int get_memcfg_numaq(void); +extern int numaq_numa_init(void); extern int pci_numaq_init(void); extern void *xquad_portio; @@ -166,11 +166,6 @@ struct sys_cfg_data { void numaq_tsc_disable(void); -#else -static inline int get_memcfg_numaq(void) -{ - return 0; -} #endif /* CONFIG_X86_NUMAQ */ #endif /* _ASM_X86_NUMAQ_H */ diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 41b8b29d36f..30f13319e24 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -48,8 +48,6 @@ #include #include -#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) - int found_numaq; /* @@ -79,25 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4]; static inline void numaq_register_node(int node, struct sys_cfg_data *scd) { struct eachquadmem *eq = scd->eq + node; + u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20; + u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20; + int ret; - node_set_online(node); - - /* Convert to pages */ - node_start_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size); - - node_end_pfn[node] = - MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); - - memblock_x86_register_active_regions(node, node_start_pfn[node], - node_end_pfn[node]); + node_set(node, numa_nodes_parsed); + ret = numa_add_memblk(node, start, end); + BUG_ON(ret < 0); } /* * Function: smp_dump_qct() * * Description: gets memory layout from the quad config table. This - * function also updates node_online_map with the nodes (quads) present. + * function also updates numa_nodes_parsed with the nodes (quads) present. 
*/ static void __init smp_dump_qct(void) { @@ -106,7 +99,6 @@ static void __init smp_dump_qct(void) scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); - nodes_clear(node_online_map); for_each_node(node) { if (scd->quads_present31_0 & (1 << node)) numaq_register_node(node, scd); @@ -276,14 +268,14 @@ static __init void early_check_numaq(void) } } -int __init get_memcfg_numaq(void) +int __init numaq_numa_init(void) { early_check_numaq(); if (!found_numaq) - return 0; + return -ENOENT; smp_dump_qct(); - return 1; + return 0; } #define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 8641239a066..14135e52cef 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -332,6 +332,29 @@ static __init void init_alloc_remap(int nid) nid, node_pa, node_pa + size, remap_va, remap_va + size); } +static int get_memcfg_numaq(void) +{ +#ifdef CONFIG_X86_NUMAQ + int nid; + + if (numa_off) + return 0; + + if (numaq_numa_init() < 0) { + nodes_clear(numa_nodes_parsed); + remove_all_active_ranges(); + return 0; + } + + for_each_node_mask(nid, numa_nodes_parsed) + node_set_online(nid); + sort_node_map(); + return 1; +#else + return 0; +#endif +} + static int get_memcfg_from_srat(void) { #ifdef CONFIG_ACPI_NUMA -- cgit v1.2.3-70-g09d2 From a4106eae650a4d5d30fcdd36d998edfa5ccb0ec4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Move NUMA init logic from numa_64.c to numa.c Move the generic 64bit NUMA init machinery from numa_64.c to numa.c. * node_data[], numa_meminfo and numa_distance * numa_add_memblk[_to](), numa_remove_memblk[_from]() * numa_set_distance() and friends * numa_init() and all the numa_meminfo handling helpers called from it * dummy_numa_init() * memory_add_physaddr_to_nid() A new function x86_numa_init() is added and the content of numa_64.c::initmem_init() is moved into it. initmem_init() now simply calls x86_numa_init(). Constants and the numa_off declaration are moved from numa_{32|64}.h to numa.h. This is code reorganization and doesn't involve any functional change. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 16 ++ arch/x86/include/asm/numa_32.h | 2 - arch/x86/include/asm/numa_64.h | 19 -- arch/x86/mm/numa.c | 523 ++++++++++++++++++++++++++++++++++++++++- arch/x86/mm/numa_64.c | 503 +--------------------------------------- arch/x86/mm/numa_internal.h | 2 + 6 files changed, 539 insertions(+), 526 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index db449c7d89b..74540865dd8 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,6 +9,16 @@ #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) + +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. 
So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + +extern int numa_off; /* * __apicid_to_node[] stores the raw mapping between physical apicid and @@ -68,4 +78,10 @@ static inline void numa_remove_cpu(int cpu) { } void debug_cpumask_set_cpu(int cpu, int node, bool enable); #endif +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +void numa_emu_cmdline(char *); +#endif /* CONFIG_NUMA_EMU */ + #endif /* _ASM_X86_NUMA_H */ diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h index 7e54b64623a..e7d6b825474 100644 --- a/arch/x86/include/asm/numa_32.h +++ b/arch/x86/include/asm/numa_32.h @@ -1,8 +1,6 @@ #ifndef _ASM_X86_NUMA_32_H #define _ASM_X86_NUMA_32_H -extern int numa_off; - #ifdef CONFIG_HIGHMEM extern void set_highmem_pages_init(void); #else diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 506dd050ff3..0c05f7ae46e 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -1,25 +1,6 @@ #ifndef _ASM_X86_NUMA_64_H #define _ASM_X86_NUMA_64_H -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) - -extern int numa_off; - extern unsigned long numa_free_all_bootmem(void); -#ifdef CONFIG_NUMA -/* - * Too small node sizes may confuse the VM badly. Usually they - * result from BIOS bugs. So dont recognize nodes as standalone - * NUMA entities that have less than this amount of RAM listed: - */ -#define NODE_MIN_SIZE (4*1024*1024) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE ((u64)32 << 20) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -void numa_emu_cmdline(char *); -#endif /* CONFIG_NUMA_EMU */ -#endif - #endif /* _ASM_X86_NUMA_64_H */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index cce174109ca..ed1daba5490 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -1,13 +1,42 @@ /* Common code for 32 and 64-bit NUMA */ -#include -#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include #include +#include + +#include "numa_internal.h" int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; +#ifdef CONFIG_X86_64 +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +static struct numa_meminfo numa_meminfo +#ifndef CONFIG_MEMORY_HOTPLUG +__initdata +#endif +; + +static int numa_distance_cnt; +static u8 *numa_distance; +#endif + static __init int numa_setup(char *opt) { if (!opt) @@ -105,6 +134,392 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } +#ifdef CONFIG_X86_64 +static int __init numa_add_memblk_to(int nid, u64 start, u64 end, + struct numa_meminfo *mi) +{ + /* ignore zero length blks */ + if (start == end) + return 0; + + /* whine about and ignore invalid blks */ + if (start > end || nid < 0 || nid >= MAX_NUMNODES) { + pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", + nid, start, end); + return 0; + } + + if (mi->nr_blks >= NR_NODE_MEMBLKS) { + pr_err("NUMA: too many memblk ranges\n"); + return -EINVAL; + } + + mi->blk[mi->nr_blks].start = start; + mi->blk[mi->nr_blks].end = end; + mi->blk[mi->nr_blks].nid = nid; + mi->nr_blks++; + return 0; +} + +/** + * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo + * @idx: Index of memblk to remove + * @mi: 
numa_meminfo to remove memblk from + * + * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and + * decrementing @mi->nr_blks. + */ +void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) +{ + mi->nr_blks--; + memmove(&mi->blk[idx], &mi->blk[idx + 1], + (mi->nr_blks - idx) * sizeof(mi->blk[0])); +} + +/** + * numa_add_memblk - Add one numa_memblk to numa_meminfo + * @nid: NUMA node ID of the new memblk + * @start: Start address of the new memblk + * @end: End address of the new memblk + * + * Add a new memblk to the default numa_meminfo. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_add_memblk(int nid, u64 start, u64 end) +{ + return numa_add_memblk_to(nid, start, end, &numa_meminfo); +} + +/* Initialize bootmem allocator for a node */ +static void __init +setup_node_bootmem(int nid, unsigned long start, unsigned long end) +{ + const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; + const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + unsigned long nd_pa; + int tnid; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + + start = roundup(start, ZONE_ALIGN); + + printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + nid, start, end); + + /* + * Try to allocate node data on local node and then fall back to + * all nodes. Never allocate in DMA zone. + */ + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + + /* report and initialize */ + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + nd_pa, nd_pa + nd_size - 1); + tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); + if (tnid != nid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); + + node_data[nid] = __va(nd_pa); + memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); + NODE_DATA(nid)->node_id = nid; + NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; + NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; + + node_set_online(nid); +} + +/** + * numa_cleanup_meminfo - Cleanup a numa_meminfo + * @mi: numa_meminfo to clean up + * + * Sanitize @mi by merging and removing unncessary memblks. Also check for + * conflicts and clear unused memblks. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init numa_cleanup_meminfo(struct numa_meminfo *mi) +{ + const u64 low = 0; + const u64 high = (u64)max_pfn << PAGE_SHIFT; + int i, j, k; + + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; + + /* make sure all blocks are inside the limits */ + bi->start = max(bi->start, low); + bi->end = min(bi->end, high); + + /* and there's no empty block */ + if (bi->start >= bi->end) { + numa_remove_memblk_from(i--, mi); + continue; + } + + for (j = i + 1; j < mi->nr_blks; j++) { + struct numa_memblk *bj = &mi->blk[j]; + unsigned long start, end; + + /* + * See whether there are overlapping blocks. Whine + * about but allow overlaps of the same nid. They + * will be merged below. 
+ */ + if (bi->end > bj->start && bi->start < bj->end) { + if (bi->nid != bj->nid) { + pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->nid, bj->start, bj->end); + return -EINVAL; + } + pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", + bi->nid, bi->start, bi->end, + bj->start, bj->end); + } + + /* + * Join together blocks on the same node, holes + * between which don't overlap with memory on other + * nodes. + */ + if (bi->nid != bj->nid) + continue; + start = max(min(bi->start, bj->start), low); + end = min(max(bi->end, bj->end), high); + for (k = 0; k < mi->nr_blks; k++) { + struct numa_memblk *bk = &mi->blk[k]; + + if (bi->nid == bk->nid) + continue; + if (start < bk->end && end > bk->start) + break; + } + if (k < mi->nr_blks) + continue; + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + bi->nid, bi->start, bi->end, bj->start, bj->end, + start, end); + bi->start = start; + bi->end = end; + numa_remove_memblk_from(j--, mi); + } + } + + for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { + mi->blk[i].start = mi->blk[i].end = 0; + mi->blk[i].nid = NUMA_NO_NODE; + } + + return 0; +} + +/* + * Set nodes, which have memory in @mi, in *@nodemask. + */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + +/** + * numa_reset_distance - Reset NUMA distance table + * + * The current table is freed. The next numa_set_distance() call will + * create a new one. + */ +void __init numa_reset_distance(void) +{ + size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); + + /* numa_distance could be 1LU marking allocation failure, test cnt */ + if (numa_distance_cnt) + memblock_x86_free_range(__pa(numa_distance), + __pa(numa_distance) + size); + numa_distance_cnt = 0; + numa_distance = NULL; /* enable table creation */ +} + +static int __init numa_alloc_distance(void) +{ + nodemask_t nodes_parsed; + size_t size; + int i, j, cnt = 0; + u64 phys; + + /* size the new table and allocate it */ + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) + cnt = i; + cnt++; + size = cnt * cnt * sizeof(numa_distance[0]); + + phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + size, PAGE_SIZE); + if (phys == MEMBLOCK_ERROR) { + pr_warning("NUMA: Warning: can't allocate distance table!\n"); + /* don't retry until explicitly reset */ + numa_distance = (void *)1LU; + return -ENOMEM; + } + memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + + numa_distance = __va(phys); + numa_distance_cnt = cnt; + + /* fill with the default distances */ + for (i = 0; i < cnt; i++) + for (j = 0; j < cnt; j++) + numa_distance[i * cnt + j] = i == j ? + LOCAL_DISTANCE : REMOTE_DISTANCE; + printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); + + return 0; +} + +/** + * numa_set_distance - Set NUMA distance from one NUMA to another + * @from: the 'from' node to set distance + * @to: the 'to' node to set distance + * @distance: NUMA distance + * + * Set the distance from node @from to @to to @distance. If distance table + * doesn't exist, one which is large enough to accommodate all the currently + * known nodes will be created. 
+ * + * If such table cannot be allocated, a warning is printed and further + * calls are ignored until the distance table is reset with + * numa_reset_distance(). + * + * If @from or @to is higher than the highest known node at the time of + * table creation or @distance doesn't make sense, the call is ignored. + * This is to allow simplification of specific NUMA config implementations. + */ +void __init numa_set_distance(int from, int to, int distance) +{ + if (!numa_distance && numa_alloc_distance() < 0) + return; + + if (from >= numa_distance_cnt || to >= numa_distance_cnt) { + printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + if ((u8)distance != distance || + (from == to && distance != LOCAL_DISTANCE)) { + pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", + from, to, distance); + return; + } + + numa_distance[from * numa_distance_cnt + to] = distance; +} + +int __node_distance(int from, int to) +{ + if (from >= numa_distance_cnt || to >= numa_distance_cnt) + return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; + return numa_distance[from * numa_distance_cnt + to]; +} +EXPORT_SYMBOL(__node_distance); + +/* + * Sanity check to catch more bad NUMA configurations (they are amazingly + * common). Make sure the nodes cover all memory. + */ +static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) +{ + unsigned long numaram, e820ram; + int i; + + numaram = 0; + for (i = 0; i < mi->nr_blks; i++) { + unsigned long s = mi->blk[i].start >> PAGE_SHIFT; + unsigned long e = mi->blk[i].end >> PAGE_SHIFT; + numaram += e - s; + numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); + if ((long)numaram < 0) + numaram = 0; + } + + e820ram = max_pfn - (memblock_x86_hole_size(0, + max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + (numaram << PAGE_SHIFT) >> 20, + (e820ram << PAGE_SHIFT) >> 20); + return false; + } + return true; +} + +static int __init numa_register_memblks(struct numa_meminfo *mi) +{ + int i, nid; + + /* Account for nodes with cpus and no memory */ + node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); + if (WARN_ON(nodes_empty(node_possible_map))) + return -EINVAL; + + for (i = 0; i < mi->nr_blks; i++) + memblock_x86_register_active_regions(mi->blk[i].nid, + mi->blk[i].start >> PAGE_SHIFT, + mi->blk[i].end >> PAGE_SHIFT); + + /* for out of order entries */ + sort_node_map(); + if (!numa_meminfo_cover_memory(mi)) + return -EINVAL; + + /* Finally register nodes. */ + for_each_node_mask(nid, node_possible_map) { + u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 end = 0; + + for (i = 0; i < mi->nr_blks; i++) { + if (nid != mi->blk[i].nid) + continue; + start = min(mi->blk[i].start, start); + end = max(mi->blk[i].end, end); + } + + if (start < end) + setup_node_bootmem(nid, start, end); + } + + return 0; +} +#endif + /* * There are unfortunately some poorly designed mainboards around that * only connect memory to a single CPU. 
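The distance table registered above is deliberately simple: a flat cnt x cnt array of u8, indexed as from * numa_distance_cnt + to, with out-of-range nodes answered from the LOCAL_DISTANCE/REMOTE_DISTANCE defaults. A standalone userspace sketch of that layout, with hypothetical helper names rather than the kernel API:

	#include <stdio.h>
	#include <stdlib.h>

	#define LOCAL_DISTANCE	10
	#define REMOTE_DISTANCE	20

	static unsigned char *dist;	/* flat cnt x cnt table, like numa_distance */
	static int dist_cnt;		/* like numa_distance_cnt */

	static int dist_alloc(int cnt)
	{
		int i, j;

		dist = malloc((size_t)cnt * cnt);
		if (!dist)
			return -1;
		dist_cnt = cnt;
		/* default fill, as numa_alloc_distance() does */
		for (i = 0; i < cnt; i++)
			for (j = 0; j < cnt; j++)
				dist[i * cnt + j] = (i == j) ? LOCAL_DISTANCE
							     : REMOTE_DISTANCE;
		return 0;
	}

	static int dist_get(int from, int to)
	{
		/* out-of-range nodes get defaults, like __node_distance() */
		if (from >= dist_cnt || to >= dist_cnt)
			return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
		return dist[from * dist_cnt + to];
	}

	int main(void)
	{
		if (dist_alloc(4))
			return 1;
		dist[1 * dist_cnt + 2] = 21;	/* distances may be asymmetric */
		printf("0->0=%d 1->2=%d 2->1=%d\n",
		       dist_get(0, 0), dist_get(1, 2), dist_get(2, 1));
		free(dist);
		return 0;
	}

A byte per entry matches the ACPI SLIT encoding, which also stores distances as single bytes.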
This breaks the 1:1 cpu->node @@ -127,6 +542,93 @@ void __init numa_init_array(void) } } +#ifdef CONFIG_X86_64 +static int __init numa_init(int (*init_func)(void)) +{ + int i; + int ret; + + for (i = 0; i < MAX_LOCAL_APIC; i++) + set_apicid_to_node(i, NUMA_NO_NODE); + + nodes_clear(numa_nodes_parsed); + nodes_clear(node_possible_map); + nodes_clear(node_online_map); + memset(&numa_meminfo, 0, sizeof(numa_meminfo)); + remove_all_active_ranges(); + numa_reset_distance(); + + ret = init_func(); + if (ret < 0) + return ret; + ret = numa_cleanup_meminfo(&numa_meminfo); + if (ret < 0) + return ret; + + numa_emulation(&numa_meminfo, numa_distance_cnt); + + ret = numa_register_memblks(&numa_meminfo); + if (ret < 0) + return ret; + + for (i = 0; i < nr_cpu_ids; i++) { + int nid = early_cpu_to_node(i); + + if (nid == NUMA_NO_NODE) + continue; + if (!node_online(nid)) + numa_clear_node(i); + } + numa_init_array(); + return 0; +} + +/** + * dummy_numa_init - Fallback dummy NUMA init + * + * Used if there's no underlying NUMA architecture, NUMA initialization + * fails, or NUMA is disabled on the command line. + * + * Must online at least one node and add memory blocks that cover all + * allowed memory. This function must not fail. + */ +static int __init dummy_numa_init(void) +{ + printk(KERN_INFO "%s\n", + numa_off ? "NUMA turned off" : "No NUMA configuration found"); + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + 0LU, max_pfn << PAGE_SHIFT); + + node_set(0, numa_nodes_parsed); + numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); + + return 0; +} + +/** + * x86_numa_init - Initialize NUMA + * + * Try each configured NUMA initialization method until one succeeds. The + * last fallback is dummy single node config encomapssing whole memory and + * never fails. + */ +void __init x86_numa_init(void) +{ + if (!numa_off) { +#ifdef CONFIG_ACPI_NUMA + if (!numa_init(x86_acpi_numa_init)) + return; +#endif +#ifdef CONFIG_AMD_NUMA + if (!numa_init(amd_numa_init)) + return; +#endif + } + + numa_init(dummy_numa_init); +} +#endif + static __init int find_near_online_node(int node) { int n, val; @@ -292,3 +794,18 @@ const struct cpumask *cpumask_of_node(int node) EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ + +#if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) +int memory_add_physaddr_to_nid(u64 start) +{ + struct numa_meminfo *mi = &numa_meminfo; + int nid = mi->blk[0].nid; + int i; + + for (i = 0; i < mi->nr_blks; i++) + if (mi->blk[i].start <= start && mi->blk[i].end > start) + nid = mi->blk[i].nid; + return nid; +} +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 70bd8221f92..dd27f401f0a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -2,499 +2,13 @@ * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
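x86_numa_init() above is just a cascade: each probe runs against freshly reset state, and dummy_numa_init() terminates the chain because it cannot fail. The shape of that cascade as a freestanding sketch (the probe stubs are invented for illustration):

	#include <stdio.h>

	/* Each probe returns 0 on success, negative on failure, as the
	 * real *_numa_init() functions do. These stubs are made up.
	 */
	static int acpi_probe(void)  { return -1; }	/* pretend no SRAT */
	static int amd_probe(void)   { return -1; }	/* pretend no northbridge */
	static int dummy_probe(void) { puts("faking one node"); return 0; }

	static int try_probe(int (*init_func)(void))
	{
		/* the kernel resets all NUMA state here before every attempt */
		return init_func();
	}

	int main(void)
	{
		if (!try_probe(acpi_probe))
			return 0;
		if (!try_probe(amd_probe))
			return 0;
		return try_probe(dummy_probe);	/* fallback; must not fail */
	}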
*/ -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include #include "numa_internal.h" -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -static struct numa_meminfo numa_meminfo -#ifndef CONFIG_MEMORY_HOTPLUG -__initdata -#endif -; - -static int numa_distance_cnt; -static u8 *numa_distance; - -static int __init numa_add_memblk_to(int nid, u64 start, u64 end, - struct numa_meminfo *mi) -{ - /* ignore zero length blks */ - if (start == end) - return 0; - - /* whine about and ignore invalid blks */ - if (start > end || nid < 0 || nid >= MAX_NUMNODES) { - pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n", - nid, start, end); - return 0; - } - - if (mi->nr_blks >= NR_NODE_MEMBLKS) { - pr_err("NUMA: too many memblk ranges\n"); - return -EINVAL; - } - - mi->blk[mi->nr_blks].start = start; - mi->blk[mi->nr_blks].end = end; - mi->blk[mi->nr_blks].nid = nid; - mi->nr_blks++; - return 0; -} - -/** - * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo - * @idx: Index of memblk to remove - * @mi: numa_meminfo to remove memblk from - * - * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and - * decrementing @mi->nr_blks. - */ -void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) -{ - mi->nr_blks--; - memmove(&mi->blk[idx], &mi->blk[idx + 1], - (mi->nr_blks - idx) * sizeof(mi->blk[0])); -} - -/** - * numa_add_memblk - Add one numa_memblk to numa_meminfo - * @nid: NUMA node ID of the new memblk - * @start: Start address of the new memblk - * @end: End address of the new memblk - * - * Add a new memblk to the default numa_meminfo. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - return numa_add_memblk_to(nid, start, end, &numa_meminfo); -} - -/* Initialize bootmem allocator for a node */ -static void __init -setup_node_bootmem(int nid, unsigned long start, unsigned long end) -{ - const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; - const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; - const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - unsigned long nd_pa; - int tnid; - - /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", - nid, start, end); - - /* - * Try to allocate node data on local node and then fall back to - * all nodes. Never allocate in DMA zone. 
- */ - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); - return; - } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); - - /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", - nd_pa, nd_pa + nd_size - 1); - tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - - node_data[nid] = __va(nd_pa); - memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - NODE_DATA(nid)->node_id = nid; - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; - - node_set_online(nid); -} - -/** - * numa_cleanup_meminfo - Cleanup a numa_meminfo - * @mi: numa_meminfo to clean up - * - * Sanitize @mi by merging and removing unncessary memblks. Also check for - * conflicts and clear unused memblks. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int __init numa_cleanup_meminfo(struct numa_meminfo *mi) -{ - const u64 low = 0; - const u64 high = (u64)max_pfn << PAGE_SHIFT; - int i, j, k; - - for (i = 0; i < mi->nr_blks; i++) { - struct numa_memblk *bi = &mi->blk[i]; - - /* make sure all blocks are inside the limits */ - bi->start = max(bi->start, low); - bi->end = min(bi->end, high); - - /* and there's no empty block */ - if (bi->start >= bi->end) { - numa_remove_memblk_from(i--, mi); - continue; - } - - for (j = i + 1; j < mi->nr_blks; j++) { - struct numa_memblk *bj = &mi->blk[j]; - unsigned long start, end; - - /* - * See whether there are overlapping blocks. Whine - * about but allow overlaps of the same nid. They - * will be merged below. - */ - if (bi->end > bj->start && bi->start < bj->end) { - if (bi->nid != bj->nid) { - pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->nid, bj->start, bj->end); - return -EINVAL; - } - pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n", - bi->nid, bi->start, bi->end, - bj->start, bj->end); - } - - /* - * Join together blocks on the same node, holes - * between which don't overlap with memory on other - * nodes. - */ - if (bi->nid != bj->nid) - continue; - start = max(min(bi->start, bj->start), low); - end = min(max(bi->end, bj->end), high); - for (k = 0; k < mi->nr_blks; k++) { - struct numa_memblk *bk = &mi->blk[k]; - - if (bi->nid == bk->nid) - continue; - if (start < bk->end && end > bk->start) - break; - } - if (k < mi->nr_blks) - continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", - bi->nid, bi->start, bi->end, bj->start, bj->end, - start, end); - bi->start = start; - bi->end = end; - numa_remove_memblk_from(j--, mi); - } - } - - for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { - mi->blk[i].start = mi->blk[i].end = 0; - mi->blk[i].nid = NUMA_NO_NODE; - } - - return 0; -} - -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - -/** - * numa_reset_distance - Reset NUMA distance table - * - * The current table is freed. 
The next numa_set_distance() call will - * create a new one. - */ -void __init numa_reset_distance(void) -{ - size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]); - - /* numa_distance could be 1LU marking allocation failure, test cnt */ - if (numa_distance_cnt) - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); - numa_distance_cnt = 0; - numa_distance = NULL; /* enable table creation */ -} - -static int __init numa_alloc_distance(void) -{ - nodemask_t nodes_parsed; - size_t size; - int i, j, cnt = 0; - u64 phys; - - /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) - cnt = i; - cnt++; - size = cnt * cnt * sizeof(numa_distance[0]); - - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, - size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { - pr_warning("NUMA: Warning: can't allocate distance table!\n"); - /* don't retry until explicitly reset */ - numa_distance = (void *)1LU; - return -ENOMEM; - } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); - - numa_distance = __va(phys); - numa_distance_cnt = cnt; - - /* fill with the default distances */ - for (i = 0; i < cnt; i++) - for (j = 0; j < cnt; j++) - numa_distance[i * cnt + j] = i == j ? - LOCAL_DISTANCE : REMOTE_DISTANCE; - printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt); - - return 0; -} - -/** - * numa_set_distance - Set NUMA distance from one NUMA to another - * @from: the 'from' node to set distance - * @to: the 'to' node to set distance - * @distance: NUMA distance - * - * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accommodate all the currently - * known nodes will be created. - * - * If such table cannot be allocated, a warning is printed and further - * calls are ignored until the distance table is reset with - * numa_reset_distance(). - * - * If @from or @to is higher than the highest known node at the time of - * table creation or @distance doesn't make sense, the call is ignored. - * This is to allow simplification of specific NUMA config implementations. - */ -void __init numa_set_distance(int from, int to, int distance) -{ - if (!numa_distance && numa_alloc_distance() < 0) - return; - - if (from >= numa_distance_cnt || to >= numa_distance_cnt) { - printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - if ((u8)distance != distance || - (from == to && distance != LOCAL_DISTANCE)) { - pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n", - from, to, distance); - return; - } - - numa_distance[from * numa_distance_cnt + to] = distance; -} - -int __node_distance(int from, int to) -{ - if (from >= numa_distance_cnt || to >= numa_distance_cnt) - return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE; - return numa_distance[from * numa_distance_cnt + to]; -} -EXPORT_SYMBOL(__node_distance); - -/* - * Sanity check to catch more bad NUMA configurations (they are amazingly - * common). Make sure the nodes cover all memory. 
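numa_meminfo_cover_memory() tolerates the NUMA description missing up to 1MB worth of pages relative to e820 before the configuration is rejected. The arithmetic in isolation, assuming 4k pages (freestanding sketch, not the kernel function):

	#include <stdio.h>

	#define PAGE_SHIFT 12	/* assumed 4k pages */

	/* Accept the NUMA layout only if it misses less than 1MB worth of
	 * pages compared to what e820 reports, mirroring the check above.
	 */
	static int covers_memory(long long numaram, long long e820ram)
	{
		return (e820ram - numaram) < (1 << (20 - PAGE_SHIFT));
	}

	int main(void)
	{
		long long e820 = 1 << 20;			/* 4GB in 4k pages */

		printf("%d\n", covers_memory(e820 - 3, e820));	/* 3 stray pages: ok */
		printf("%d\n", covers_memory(e820 / 2, e820));	/* half missing: reject */
		return 0;
	}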
- */ -static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) -{ - unsigned long numaram, e820ram; - int i; - - numaram = 0; - for (i = 0; i < mi->nr_blks; i++) { - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; - numaram += e - s; - numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((long)numaram < 0) - numaram = 0; - } - - e820ram = max_pfn - (memblock_x86_hole_size(0, - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", - (numaram << PAGE_SHIFT) >> 20, - (e820ram << PAGE_SHIFT) >> 20); - return false; - } - return true; -} - -static int __init numa_register_memblks(struct numa_meminfo *mi) -{ - int i, nid; - - /* Account for nodes with cpus and no memory */ - node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); - if (WARN_ON(nodes_empty(node_possible_map))) - return -EINVAL; - - for (i = 0; i < mi->nr_blks; i++) - memblock_x86_register_active_regions(mi->blk[i].nid, - mi->blk[i].start >> PAGE_SHIFT, - mi->blk[i].end >> PAGE_SHIFT); - - /* for out of order entries */ - sort_node_map(); - if (!numa_meminfo_cover_memory(mi)) - return -EINVAL; - - /* Finally register nodes. */ - for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; - u64 end = 0; - - for (i = 0; i < mi->nr_blks; i++) { - if (nid != mi->blk[i].nid) - continue; - start = min(mi->blk[i].start, start); - end = max(mi->blk[i].end, end); - } - - if (start < end) - setup_node_bootmem(nid, start, end); - } - - return 0; -} - -/** - * dummy_numma_init - Fallback dummy NUMA init - * - * Used if there's no underlying NUMA architecture, NUMA initialization - * fails, or NUMA is disabled on the command line. - * - * Must online at least one node and add memory blocks that cover all - * allowed memory. This function must not fail. - */ -static int __init dummy_numa_init(void) -{ - printk(KERN_INFO "%s\n", - numa_off ? 
"NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - 0LU, max_pfn << PAGE_SHIFT); - - node_set(0, numa_nodes_parsed); - numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); - - return 0; -} - -static int __init numa_init(int (*init_func)(void)) -{ - int i; - int ret; - - for (i = 0; i < MAX_LOCAL_APIC; i++) - set_apicid_to_node(i, NUMA_NO_NODE); - - nodes_clear(numa_nodes_parsed); - nodes_clear(node_possible_map); - nodes_clear(node_online_map); - memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); - numa_reset_distance(); - - ret = init_func(); - if (ret < 0) - return ret; - ret = numa_cleanup_meminfo(&numa_meminfo); - if (ret < 0) - return ret; - - numa_emulation(&numa_meminfo, numa_distance_cnt); - - ret = numa_register_memblks(&numa_meminfo); - if (ret < 0) - return ret; - - for (i = 0; i < nr_cpu_ids; i++) { - int nid = early_cpu_to_node(i); - - if (nid == NUMA_NO_NODE) - continue; - if (!node_online(nid)) - numa_clear_node(i); - } - numa_init_array(); - return 0; -} - void __init initmem_init(void) { - if (!numa_off) { -#ifdef CONFIG_ACPI_NUMA - if (!numa_init(x86_acpi_numa_init)) - return; -#endif -#ifdef CONFIG_AMD_NUMA - if (!numa_init(amd_numa_init)) - return; -#endif - } - - numa_init(dummy_numa_init); + x86_numa_init(); } unsigned long __init numa_free_all_bootmem(void) @@ -509,18 +23,3 @@ unsigned long __init numa_free_all_bootmem(void) return pages; } - -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) -{ - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; - int i; - - for (i = 0; i < mi->nr_blks; i++) - if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; - return nid; -} -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ef2d97377d7..ad86ec91e64 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -19,6 +19,8 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi); int __init numa_cleanup_meminfo(struct numa_meminfo *mi); void __init numa_reset_distance(void); +void __init x86_numa_init(void); + #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v1.2.3-70-g09d2 From 744baba0c4072b04664952a89292e4708eaf949a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Enable build of generic NUMA init code on 32bit Generic NUMA init code was moved to numa.c from numa_64.c but is still guaraded by CONFIG_X86_64. This patch removes the compile guard and enables compiling on 32bit. * numa_add_memblk() and numa_set_distance() clash with the shim implementation in numa_32.c and are left out. * memory_add_physaddr_to_nid() clashes with 32bit implementation and is left out. * MAX_DMA_PFN definition in dma.h moved out of !CONFIG_X86_32. * node_data definition in numa_32.c removed in favor of the one in numa.c. There are places where ulong is assumed to be 64bit. The next patch will fix them up. Note that although the code is compiled it isn't used yet and this patch doesn't cause any functional change. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/include/asm/dma.h | 6 +++--- arch/x86/mm/numa.c | 10 ++++------ arch/x86/mm/numa_32.c | 3 --- 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 057099e5fab..d1a314b610f 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -69,6 +69,9 @@ #define MAX_DMA_CHANNELS 8 +/* 16MB ISA DMA zone */ +#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) + #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ @@ -76,9 +79,6 @@ #else -/* 16MB ISA DMA zone */ -#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) - /* 4GB broken PCI/AGP hardware bus master zone */ #define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ed1daba5490..c400f3b2b93 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -23,7 +23,6 @@ int __initdata numa_off; nodemask_t numa_nodes_parsed __initdata; -#ifdef CONFIG_X86_64 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -35,7 +34,6 @@ __initdata static int numa_distance_cnt; static u8 *numa_distance; -#endif static __init int numa_setup(char *opt) { @@ -134,7 +132,6 @@ void __init setup_node_to_cpumask_map(void) pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); } -#ifdef CONFIG_X86_64 static int __init numa_add_memblk_to(int nid, u64 start, u64 end, struct numa_meminfo *mi) { @@ -176,6 +173,7 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +#ifdef CONFIG_X86_64 /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -191,6 +189,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } +#endif /* Initialize bootmem allocator for a node */ static void __init @@ -402,6 +401,7 @@ static int __init numa_alloc_distance(void) return 0; } +#ifdef CONFIG_X86_64 /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -440,6 +440,7 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } +#endif int __node_distance(int from, int to) { @@ -518,7 +519,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) return 0; } -#endif /* * There are unfortunately some poorly designed mainboards around that @@ -542,7 +542,6 @@ void __init numa_init_array(void) } } -#ifdef CONFIG_X86_64 static int __init numa_init(int (*init_func)(void)) { int i; @@ -627,7 +626,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -#endif static __init int find_near_online_node(int node) { diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 14135e52cef..975a76f622b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -41,9 +41,6 @@ #include #include -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - /* * numa interface - we expect the numa architecture specific code to have * populated the following initialisation. -- cgit v1.2.3-70-g09d2 From 38f3e1ca24cc3ec416855e02676f91c898a8a262 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:53 +0200 Subject: x86, NUMA: Remove long 64bit assumption from numa.c Code moved from numa_64.c has assumption that long is 64bit in several places. 
This patch removes the assumption by using {s|u}64 explicitly, using PFN_PHYS() for page number -> addr conversions and adjusting printf formats. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/mm/numa.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c400f3b2b93..b45caa39f7c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -192,13 +192,12 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) #endif /* Initialize bootmem allocator for a node */ -static void __init -setup_node_bootmem(int nid, unsigned long start, unsigned long end) +static void __init setup_node_bootmem(int nid, u64 start, u64 end) { - const u64 nd_low = (u64)MAX_DMA_PFN << PAGE_SHIFT; - const u64 nd_high = (u64)max_pfn_mapped << PAGE_SHIFT; + const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); + const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - unsigned long nd_pa; + u64 nd_pa; int tnid; /* @@ -210,7 +209,7 @@ setup_node_bootmem(int nid, unsigned long start, unsigned long end) start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", + printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", nid, start, end); /* @@ -223,13 +222,13 @@ setup_node_bootmem(int nid, unsigned long start, unsigned long end) nd_pa = memblock_find_in_range(nd_low, nd_high, nd_size, SMP_CACHE_BYTES); if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %lu bytes in node %d\n", nd_size, nid); + pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); return; } memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]\n", nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (tnid != nid) @@ -257,7 +256,7 @@ setup_node_bootmem(int nid, unsigned long start, unsigned long end) int __init numa_cleanup_meminfo(struct numa_meminfo *mi) { const u64 low = 0; - const u64 high = (u64)max_pfn << PAGE_SHIFT; + const u64 high = PFN_PHYS(max_pfn); int i, j, k; for (i = 0; i < mi->nr_blks; i++) { @@ -275,7 +274,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; - unsigned long start, end; + u64 start, end; /* * See whether there are overlapping blocks.
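The point of these conversions is that an open-coded pfn << PAGE_SHIFT is evaluated in the pfn's own type, which on x86_32 is a 32-bit unsigned long, while PFN_PHYS() widens before shifting. A small userspace demonstration of the difference, using uint32_t to stand in for the 32-bit ulong:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT 12
	/* widen before shifting, which is the point of using PFN_PHYS() */
	#define PFN_PHYS(x) ((uint64_t)(x) << PAGE_SHIFT)

	int main(void)
	{
		uint32_t max_pfn = 0x200000;	/* 8GB of RAM in 4k pages */

		/* shifting in 32 bits wraps; 0x200000 << 12 becomes 0 */
		uint32_t broken = max_pfn << PAGE_SHIFT;
		/* widening first keeps the full physical address */
		uint64_t ok = PFN_PHYS(max_pfn);

		printf("broken=%#x ok=%#llx\n", broken, (unsigned long long)ok);
		return 0;
	}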
Whine @@ -313,7 +312,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } if (k < mi->nr_blks) continue; - printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n", + printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n", bi->nid, bi->start, bi->end, bj->start, bj->end, start, end); bi->start = start; @@ -378,7 +377,7 @@ static int __init numa_alloc_distance(void) cnt++; size = cnt * cnt * sizeof(numa_distance[0]); - phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT, + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate distance table!\n"); @@ -456,24 +455,24 @@ EXPORT_SYMBOL(__node_distance); */ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) { - unsigned long numaram, e820ram; + u64 numaram, e820ram; int i; numaram = 0; for (i = 0; i < mi->nr_blks; i++) { - unsigned long s = mi->blk[i].start >> PAGE_SHIFT; - unsigned long e = mi->blk[i].end >> PAGE_SHIFT; + u64 s = mi->blk[i].start >> PAGE_SHIFT; + u64 e = mi->blk[i].end >> PAGE_SHIFT; numaram += e - s; numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e); - if ((long)numaram < 0) + if ((s64)numaram < 0) numaram = 0; } e820ram = max_pfn - (memblock_x86_hole_size(0, - max_pfn << PAGE_SHIFT) >> PAGE_SHIFT); + PFN_PHYS(max_pfn)) >> PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ - if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { - printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n", + if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { + printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", (numaram << PAGE_SHIFT) >> 20, (e820ram << PAGE_SHIFT) >> 20); return false; @@ -503,7 +502,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* Finally register nodes. */ for_each_node_mask(nid, node_possible_map) { - u64 start = (u64)max_pfn << PAGE_SHIFT; + u64 start = PFN_PHYS(max_pfn); u64 end = 0; for (i = 0; i < mi->nr_blks; i++) { @@ -595,11 +594,11 @@ static int __init dummy_numa_init(void) { printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", - 0LU, max_pfn << PAGE_SHIFT); + printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n", + 0LLU, PFN_PHYS(max_pfn)); node_set(0, numa_nodes_parsed); - numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT); + numa_add_memblk(0, 0, PFN_PHYS(max_pfn)); return 0; } -- cgit v1.2.3-70-g09d2 From 99cca492ea8ced305bfd687521ed69fb9e0147aa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:54 +0200 Subject: x86-32, NUMA: Add @start and @end to init_alloc_remap() Instead of dereferencing node_start/end_pfn[] directly, make init_alloc_remap() take @start and @end and let the caller be responsible for making sure the range is sane. This is to prepare for use from unified NUMA init code. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/numa_32.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 975a76f622b..900863204be 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -265,8 +265,10 @@ void resume_map_numa_kva(pgd_t *pgd_base) * opportunistically and the callers will fall back to other memory * allocation mechanisms on failure. */ -static __init void init_alloc_remap(int nid) +static __init void init_alloc_remap(int nid, u64 start, u64 end) { + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; unsigned long size, pfn; u64 node_pa, remap_pa; void *remap_va; @@ -276,24 +278,15 @@ static __init void init_alloc_remap(int nid) * memory could be added but not currently present. */ printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, node_start_pfn[nid], node_end_pfn[nid]); - if (node_start_pfn[nid] > max_pfn) - return; - if (!node_end_pfn[nid]) - return; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; + nid, start_pfn, end_pfn); /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, node_start_pfn[nid], - min(node_end_pfn[nid], max_pfn)); + size = node_memmap_size_bytes(nid, start_pfn, end_pfn); size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); size = ALIGN(size, LARGE_PAGE_BYTES); /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(node_start_pfn[nid] << PAGE_SHIFT, - (u64)node_end_pfn[nid] << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); + node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); if (node_pa == MEMBLOCK_ERROR) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); @@ -391,8 +384,14 @@ void __init initmem_init(void) get_memcfg_numa(); numa_init_array(); - for_each_online_node(nid) - init_alloc_remap(nid); + for_each_online_node(nid) { + u64 start = (u64)node_start_pfn[nid] << PAGE_SHIFT; + u64 end = min((u64)node_end_pfn[nid] << PAGE_SHIFT, + (u64)max_pfn << PAGE_SHIFT); + + if (start < end) + init_alloc_remap(nid, start, end); + } #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; -- cgit v1.2.3-70-g09d2 From 7888e96b264fad27f97f58c0f3a4d20326eaf181 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 14:18:54 +0200 Subject: x86, NUMA: Initialize and use remap allocator from setup_node_bootmem() setup_node_bootmem() is taken from 64bit and doesn't use remap allocator. It's about to be shared with 32bit so add support for it. If NODE_DATA is remapped, it's noted in the debug message and node locality check is skipped as the __pa() of the remapped address doesn't reflect the actual physical address. On 64bit, remap allocator becomes noop and doesn't affect the behavior. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/numa.c | 41 +++++++++++++++++++++++++++-------------- arch/x86/mm/numa_32.c | 2 +- arch/x86/mm/numa_internal.h | 6 ++++++ 3 files changed, 34 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b45caa39f7c..a72317ae74c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -197,7 +197,9 @@ static void __init setup_node_bootmem(int nid, u64 start, u64 end) const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + bool remapped = false; u64 nd_pa; + void *nd; int tnid; /* @@ -207,34 +209,45 @@ static void __init setup_node_bootmem(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; + /* initialize remap allocator before aligning to ZONE_ALIGN */ + init_alloc_remap(nid, start, end); + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n", nid, start, end); /* - * Try to allocate node data on local node and then fall back to - * all nodes. Never allocate in DMA zone. + * Allocate node data. Try remap allocator first, node-local + * memory and then any node. Never allocate in DMA zone. */ - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, + nd = alloc_remap(nid, nd_size); + if (nd) { + nd_pa = __pa(nd); + remapped = true; + } else { + nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { - pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); - return; + if (nd_pa == MEMBLOCK_ERROR) + nd_pa = memblock_find_in_range(nd_low, nd_high, + nd_size, SMP_CACHE_BYTES); + if (nd_pa == MEMBLOCK_ERROR) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; + } + memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); + nd = __va(nd_pa); } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]\n", - nd_pa, nd_pa + nd_size - 1); + printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n", + nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (tnid != nid) + if (!remapped && tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); - node_data[nid] = __va(nd_pa); + node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); NODE_DATA(nid)->node_id = nid; NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 900863204be..fbd558fe10b 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -265,7 +265,7 @@ void resume_map_numa_kva(pgd_t *pgd_base) * opportunistically and the callers will fall back to other memory * allocation mechanisms on failure. 
*/ -static __init void init_alloc_remap(int nid, u64 start, u64 end) +void __init init_alloc_remap(int nid, u64 start, u64 end) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index ad86ec91e64..7178c3afe05 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,6 +21,12 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); +#ifdef CONFIG_X86_64 +static inline void init_alloc_remap(int nid, u64 start, u64 end) { } +#else +void __init init_alloc_remap(int nid, u64 start, u64 end); +#endif + #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v1.2.3-70-g09d2 From bd6709a91a593d8fe35d08da542e9f93bb74a304 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Make 32bit use common NUMA init path With both _numa_init() methods converted and the rest of init code adjusted, numa_32.c now can switch from the 32bit only init code to the common one in numa.c. * Shim get_memcfg_*()'s are dropped and initmem_init() calls x86_numa_init(), which is updated to handle NUMAQ. * All boilerplate operations including node range limiting, pgdat alloc/init are handled by numa_init(). 32bit only implementation is removed. * 32bit numa_add_memblk(), numa_set_distance() and memory_add_physaddr_to_nid() removed and common versions in numa_32.c enabled for 32bit. This change causes the following behavior changes. * NODE_DATA()->node_start_pfn/node_spanned_pages properly initialized for 32bit too. * Much more sanity checks and configuration cleanups. * Proper handling of node distances. * The same NUMA init messages as 64bit. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/include/asm/topology.h | 7 -- arch/x86/mm/numa.c | 10 +- arch/x86/mm/numa_32.c | 231 +--------------------------------------- 3 files changed, 7 insertions(+), 241 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 8dba76972fd..c00692476e9 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -93,18 +93,11 @@ extern void setup_node_to_cpumask_map(void); #define pcibus_to_node(bus) __pcibus_to_node(bus) #ifdef CONFIG_X86_32 -extern unsigned long node_start_pfn[]; -extern unsigned long node_end_pfn[]; -#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid]) - # define SD_CACHE_NICE_TRIES 1 # define SD_IDLE_IDX 1 - #else - # define SD_CACHE_NICE_TRIES 2 # define SD_IDLE_IDX 2 - #endif /* sched_domains SD_NODE_INIT for NUMA machines */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a72317ae74c..56ed714ed24 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -173,7 +173,6 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } -#ifdef CONFIG_X86_64 /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -189,7 +188,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) { return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -#endif /* Initialize bootmem allocator for a node */ static void __init setup_node_bootmem(int nid, u64 start, u64 end) @@ -413,7 +411,6 @@ static int __init numa_alloc_distance(void) return 0; } -#ifdef CONFIG_X86_64 /** * numa_set_distance - Set NUMA distance from one NUMA to another * @from: the 'from' node to set distance @@ -452,7 +449,6 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } -#endif int __node_distance(int from, int to) { @@ -626,6 +622,10 @@ static int __init dummy_numa_init(void) void __init x86_numa_init(void) { if (!numa_off) { +#ifdef CONFIG_X86_NUMAQ + if (!numa_init(numaq_numa_init)) + return; +#endif #ifdef CONFIG_ACPI_NUMA if (!numa_init(x86_acpi_numa_init)) return; @@ -805,7 +805,7 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#if defined(CONFIG_X86_64) && defined(CONFIG_MEMORY_HOTPLUG) +#ifdef CONFIG_MEMORY_HOTPLUG int memory_add_physaddr_to_nid(u64 start) { struct numa_meminfo *mi = &numa_meminfo; diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index fbd558fe10b..849a975d3fa 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -22,36 +22,11 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -#include #include #include -#include -#include -#include -#include #include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. 
- * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; +#include "numa_internal.h" #ifdef CONFIG_DISCONTIGMEM /* @@ -96,7 +71,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, } #endif -extern unsigned long find_max_low_pfn(void); extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) @@ -104,68 +78,6 @@ extern unsigned long highend_pfn, highstart_pfn; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -static int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - memblock_x86_register_active_regions(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. 
- */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - NODE_DATA(nid) = alloc_remap(nid, ALIGN(sizeof(pg_data_t), PAGE_SIZE)); - if (!NODE_DATA(nid)) { - unsigned long pgdat_phys; - pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT, max_pfn_mapped<<PAGE_SHIFT, sizeof(pg_data_t), PAGE_SIZE); - NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - /* * Remap memory allocator */ @@ -322,76 +234,9 @@ void __init init_alloc_remap(int nid, u64 start, u64 end) nid, node_pa, node_pa + size, remap_va, remap_va + size); } -static int get_memcfg_numaq(void) -{ -#ifdef CONFIG_X86_NUMAQ - int nid; - - if (numa_off) - return 0; - - if (numaq_numa_init() < 0) { - nodes_clear(numa_nodes_parsed); - remove_all_active_ranges(); - return 0; - } - - for_each_node_mask(nid, numa_nodes_parsed) - node_set_online(nid); - sort_node_map(); - return 1; -#else - return 0; -#endif -} - -static int get_memcfg_from_srat(void) -{ -#ifdef CONFIG_ACPI_NUMA - int nid; - - if (numa_off) - return 0; - - if (x86_acpi_numa_init() < 0) { - nodes_clear(numa_nodes_parsed); - remove_all_active_ranges(); - return 0; - } - - for_each_node_mask(nid, numa_nodes_parsed) - node_set_online(nid); - sort_node_map(); - return 1; -#else - return 0; -#endif -} - -static void get_memcfg_numa(void) -{ - if (get_memcfg_numaq()) - return; - if (get_memcfg_from_srat()) - return; - get_memcfg_numa_flat(); -} - void __init initmem_init(void) { - int nid; - - get_memcfg_numa(); - numa_init_array(); - - for_each_online_node(nid) { - u64 start = (u64)node_start_pfn[nid] << PAGE_SHIFT; - u64 end = min((u64)node_end_pfn[nid] << PAGE_SHIFT, - (u64)max_pfn << PAGE_SHIFT); - - if (start < end) - init_alloc_remap(nid, start, end); - } + x86_numa_init(); #ifdef CONFIG_HIGHMEM highstart_pfn = highend_pfn = max_pfn; @@ -412,81 +257,9 @@ void __init initmem_init(void) printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) - allocate_pgdat(nid); printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) { - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(nid)->node_id = nid; - } setup_bootmem_allocator(); } - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ?
nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - -/* temporary shim, will go away soon */ -int __init numa_add_memblk(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - - printk(KERN_DEBUG "nid %d start_pfn %08lx end_pfn %08lx\n", - nid, start_pfn, end_pfn); - - if (start >= (u64)max_pfn << PAGE_SHIFT) { - printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", - start_pfn, end_pfn); - return 0; - } - - node_set_online(nid); - memblock_x86_register_active_regions(nid, start_pfn, - min(end_pfn, max_pfn)); - - if (!node_has_online_mem(nid)) { - node_start_pfn[nid] = start_pfn; - node_end_pfn[nid] = end_pfn; - } else { - node_start_pfn[nid] = min(node_start_pfn[nid], start_pfn); - node_end_pfn[nid] = max(node_end_pfn[nid], end_pfn); - } - return 0; -} - -/* temporary shim, will go away soon */ -void __init numa_set_distance(int from, int to, int distance) -{ - /* nada */ -} -- cgit v1.2.3-70-g09d2 From 752d4f372f90a2f6eb562aaffb639957890cbcab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Make numa_init_array() static numa_init_array() no longer has users outside of numa.c. Make it static. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/include/asm/numa.h | 2 -- arch/x86/mm/numa.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 74540865dd8..bfacd2ccf65 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -61,14 +61,12 @@ static inline int numa_cpu_node(int cpu) #ifdef CONFIG_NUMA extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); -extern void __init numa_init_array(void); extern void __init init_cpu_to_node(void); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); #else /* CONFIG_NUMA */ static inline void numa_set_node(int cpu, int node) { } static inline void numa_clear_node(int cpu) { } -static inline void numa_init_array(void) { } static inline void init_cpu_to_node(void) { } static inline void numa_add_cpu(int cpu) { } static inline void numa_remove_cpu(int cpu) { } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 56ed714ed24..ecb5685d7d2 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -535,7 +535,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) * as the number of CPUs is not known yet. We round robin the existing * nodes. */ -void __init numa_init_array(void) +static void __init numa_init_array(void) { int rr, i; -- cgit v1.2.3-70-g09d2 From c6f58878204b0414e00b43f9bcebf754284f95b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Rename amdtopology_64.c to amdtopology.c amdtopology is going to be used by 32bit too drop _64 suffix. This is pure rename. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/mm/Makefile | 2 +- arch/x86/mm/amdtopology.c | 196 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/amdtopology_64.c | 196 ------------------------------------------- 3 files changed, 197 insertions(+), 197 deletions(-) create mode 100644 arch/x86/mm/amdtopology.c delete mode 100644 arch/x86/mm/amdtopology_64.c (limited to 'arch/x86') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 62997be3307..3d11327c9ab 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c new file mode 100644 index 00000000000..0919c26820d --- /dev/null +++ b/arch/x86/mm/amdtopology.c @@ -0,0 +1,196 @@ +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned char __initdata nodeids[8]; + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -ENOENT; +} + +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get the APIC ID of the BSP so can use that to + * create apicid_to_node in amd_scan_nodes() + */ +#ifdef CONFIG_X86_MPPARSE + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +#endif +} + +int __init amd_numa_init(void) +{ + unsigned long start = PFN_PHYS(0); + unsigned long end = PFN_PHYS(max_pfn); + unsigned numnodes; + unsigned long prevbase; + int i, j, nb; + u32 nodeid, reg; + unsigned int bits, cores, apicid_base; + + if (!early_pci_allowed()) + return -EINVAL; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + pr_info("Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -ENOENT; + + pr_info("Number of physical nodes %d\n", numnodes); + + prevbase = 0; + for (i = 0; i < 8; i++) { + unsigned long base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeids[i] = nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + pr_info("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + pr_info("Skipping node entry %d (base %lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + pr_err("Node %d using interleaving mode %lx/%lx\n", + nodeid, (base >> 8) & 3, 
(limit >> 8) & 3); + return -EINVAL; + } + if (node_isset(nodeid, numa_nodes_parsed)) { + pr_info("Node %d already present, skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > end) + limit = end; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + pr_err("Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + pr_err("Node %d bogus settings %lx-%lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + pr_err("Node map not sorted %lx,%lx\n", + prevbase, base); + return -EINVAL; + } + + pr_info("Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + + prevbase = base; + numa_add_memblk(nodeid, base, limit); + node_set(nodeid, numa_nodes_parsed); + } + + if (!nodes_weight(numa_nodes_parsed)) + return -ENOENT; + + /* + * We seem to have valid NUMA configuration. Map apicids to nodes + * using the coreid bits from early_identify_cpu. + */ + bits = boot_cpu_data.x86_coreid_bits; + cores = 1 << bits; + apicid_base = 0; + + /* get the APIC ID of the BSP early for systems with apicid lifting */ + early_get_boot_cpu_id(); + if (boot_cpu_physical_apicid > 0) { + pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for_each_node_mask(i, numa_nodes_parsed) + for (j = apicid_base; j < cores + apicid_base; j++) + set_apicid_to_node((i << bits) + j, i); + + return 0; +} diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c deleted file mode 100644 index 0919c26820d..00000000000 --- a/arch/x86/mm/amdtopology_64.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * AMD NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the AMD northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. 
- */ -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned char __initdata nodeids[8]; - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) - continue; - return num; - } - - return -ENOENT; -} - -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in amd_scan_nodes() - */ -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif -} - -int __init amd_numa_init(void) -{ - unsigned long start = PFN_PHYS(0); - unsigned long end = PFN_PHYS(max_pfn); - unsigned numnodes; - unsigned long prevbase; - int i, j, nb; - u32 nodeid, reg; - unsigned int bits, cores, apicid_base; - - if (!early_pci_allowed()) - return -EINVAL; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - pr_info("Scanning NUMA topology in Northbridge %d\n", nb); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -ENOENT; - - pr_info("Number of physical nodes %d\n", numnodes); - - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base, limit; - - base = read_pci_config(0, nb, 1, 0x40 + i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeids[i] = nodeid = limit & 7; - if ((base & 3) == 0) { - if (i < numnodes) - pr_info("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", - i, base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", - nodeid, (base >> 8) & 3, (limit >> 8) & 3); - return -EINVAL; - } - if (node_isset(nodeid, numa_nodes_parsed)) { - pr_info("Node %d already present, skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > end) - limit = end; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - pr_err("Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", - prevbase, base); - return -EINVAL; - } - - pr_info("Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - prevbase = base; - numa_add_memblk(nodeid, base, limit); - node_set(nodeid, numa_nodes_parsed); - } - - if (!nodes_weight(numa_nodes_parsed)) - return -ENOENT; - - /* - * We seem to have valid NUMA configuration. Map apicids to nodes - * using the coreid bits from early_identify_cpu. 
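The mapping built above hands node i the APIC IDs (i << bits) + j for every core j, so a node can be recovered from an APIC ID by shifting the core-id bits back out. A freestanding sketch (bit width illustrative):

	#include <stdio.h>

	/* Map an APIC ID back to its node given the core-id bit width,
	 * inverting the (i << bits) + j construction used above.
	 */
	static int apicid_to_node(unsigned int apicid, unsigned int bits)
	{
		return apicid >> bits;
	}

	int main(void)
	{
		unsigned int bits = 2;	/* 4 cores per node, illustrative */
		unsigned int apicid;

		for (apicid = 0; apicid < 8; apicid++)
			printf("apicid %u -> node %d\n", apicid,
			       apicid_to_node(apicid, bits));
		return 0;
	}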
- */ - bits = boot_cpu_data.x86_coreid_bits; - cores = 1 << bits; - apicid_base = 0; - - /* get the APIC ID of the BSP early for systems with apicid lifting */ - early_get_boot_cpu_id(); - if (boot_cpu_physical_apicid > 0) { - pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; - } - - for_each_node_mask(i, numa_nodes_parsed) - for (j = apicid_base; j < cores + apicid_base; j++) - set_apicid_to_node((i << bits) + j, i); - - return 0; -} -- cgit v1.2.3-70-g09d2 From 2706a0bf7b02693ed88752df877f10c2206292ff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Enable CONFIG_AMD_NUMA on 32bit too Now that NUMA init path is unified, amdtopology can be enabled on 32bit. Make amdtopology.c safe on 32bit by explicitly using u64 and drop X86_64 dependency from Kconfig. Inclusion of bootmem.h is added for max_pfn declaration. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/Kconfig | 2 +- arch/x86/mm/amdtopology.c | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8db4fbf30b5..50cb68d2901 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1174,7 +1174,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" - depends on X86_64 && NUMA && PCI + depends on NUMA && PCI ---help--- Enable AMD NUMA node topology detection. You should say Y here if you have a multi processor AMD system. This uses an old method to diff --git a/arch/x86/mm/amdtopology.c b/arch/x86/mm/amdtopology.c index 0919c26820d..5247d01329c 100644 --- a/arch/x86/mm/amdtopology.c +++ b/arch/x86/mm/amdtopology.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void) int __init amd_numa_init(void) { - unsigned long start = PFN_PHYS(0); - unsigned long end = PFN_PHYS(max_pfn); + u64 start = PFN_PHYS(0); + u64 end = PFN_PHYS(max_pfn); unsigned numnodes; - unsigned long prevbase; + u64 prevbase; int i, j, nb; u32 nodeid, reg; unsigned int bits, cores, apicid_base; @@ -95,7 +96,7 @@ int __init amd_numa_init(void) prevbase = 0; for (i = 0; i < 8; i++) { - unsigned long base, limit; + u64 base, limit; base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); @@ -107,18 +108,18 @@ int __init amd_numa_init(void) continue; } if (nodeid >= numnodes) { - pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, + pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid, base, limit); continue; } if (!limit) { - pr_info("Skipping node entry %d (base %lx)\n", + pr_info("Skipping node entry %d (base %Lx)\n", i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - pr_err("Node %d using interleaving mode %lx/%lx\n", + pr_err("Node %d using interleaving mode %Lx/%Lx\n", nodeid, (base >> 8) & 3, (limit >> 8) & 3); return -EINVAL; } @@ -150,19 +151,19 @@ int __init amd_numa_init(void) continue; } if (limit < base) { - pr_err("Node %d bogus settings %lx-%lx.\n", + pr_err("Node %d bogus settings %Lx-%Lx.\n", nodeid, base, limit); continue; } /* Could sort here, but pun for now. Should not happen anyroads. 
*/ if (prevbase > base) { - pr_err("Node map not sorted %lx,%lx\n", + pr_err("Node map not sorted %Lx,%Lx\n", prevbase, base); return -EINVAL; } - pr_info("Node %d MemBase %016lx Limit %016lx\n", + pr_info("Node %d MemBase %016Lx Limit %016Lx\n", nodeid, base, limit); prevbase = base; -- cgit v1.2.3-70-g09d2 From 1b7e03ef7570568d2fb9e6640d7006a0edd728f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 2 May 2011 17:24:48 +0200 Subject: x86, NUMA: Enable emulation on 32bit too Now that NUMA init path is unified, NUMA emulation can be enabled on 32bit. Make numa_emulation.c safe on 32bit by doing the following. * Define MAX_DMA32_PFN on 32bit too. * Include bootmem.h for max_pfn declaration. * Use u64 explicitly and always use PFN_PHYS() when converting a page number to an address. * Avoid __udivdi3() generation on 32bit by doing the calculation in number of pages instead in split_nodes_interleave(). And drop X86_64 dependency from Kconfig. Signed-off-by: Tejun Heo Cc: Ingo Molnar Cc: Yinghai Lu Cc: David Rientjes Cc: Thomas Gleixner Cc: "H. Peter Anvin" --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/dma.h | 10 +++------- arch/x86/mm/numa_emulation.c | 16 +++++++++++----- 3 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50cb68d2901..648fca42ae6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1201,7 +1201,7 @@ config NODES_SPAN_OTHER_NODES config NUMA_EMU bool "NUMA emulation" - depends on X86_64 && NUMA + depends on NUMA ---help--- Enable NUMA emulation. A flat machine will be split into virtual nodes when booted with "numa=fake=N", where N is the diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index d1a314b610f..0bdb0c54d9a 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -72,19 +72,15 @@ /* 16MB ISA DMA zone */ #define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) -#ifdef CONFIG_X86_32 +/* 4GB broken PCI/AGP hardware bus master zone */ +#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ #define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000) - #else - -/* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) - /* Compat define for old dma zone */ #define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) - #endif /* 8237 DMA controllers */ diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index de84cc14037..d0ed086b624 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include "numa_internal.h" @@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, nr_nodes = MAX_NUMNODES; } - size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; + /* + * Calculate target node size. x86_32 freaks on __udivdi3() so do + * the division in ulong number of pages and convert back. + */ + size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); + /* * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder.
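The hunk just above is the subtle one: on x86_32, dividing a u64 makes gcc emit a call to the libgcc helper __udivdi3(), which the kernel deliberately does not provide. A minimal standalone sketch of the workaround (the node_size() name and the 4KiB page size are assumptions for illustration, not kernel code):

#include <stdint.h>

/* (uint64_t)span / nr_nodes would pull in __udivdi3() on a 32-bit build.
 * Dividing a 32-bit page count instead keeps the division native; the
 * count fits in 32 bits for spans below 16TiB with 4KiB pages. */
static uint64_t node_size(uint64_t span, unsigned int nr_nodes)
{
	const unsigned int page_shift = 12;	/* assumed 4KiB pages */
	unsigned long pages = (unsigned long)(span >> page_shift);

	return (uint64_t)(pages / nr_nodes) << page_shift;
}

This mirrors the PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes) line in the hunk.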
@@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, */ while (nodes_weight(physnode_mask)) { for_each_node_mask(i, physnode_mask) { - u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN); u64 start, limit, end; int phys_blk; @@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) { static struct numa_meminfo ei __initdata; static struct numa_meminfo pi __initdata; - const u64 max_addr = max_pfn << PAGE_SHIFT; + const u64 max_addr = PFN_PHYS(max_pfn); u8 *phys_dist = NULL; size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); int max_emu_nid, dfl_phys_nid; @@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) if (numa_dist_cnt) { u64 phys; - phys = memblock_find_in_range(0, - (u64)max_pfn_mapped << PAGE_SHIFT, + phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); if (phys == MEMBLOCK_ERROR) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); -- cgit v1.2.3-70-g09d2 From a56bca80db8903bb557b9ac38da68dc5b98ea672 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 May 2011 17:24:49 +0200 Subject: x86, NUMA: Rename setup_node_bootmem() to setup_node_data() After using memblock to replace bootmem, that function now only sets up node_data. Change the name to reflect what it actually does. tj: Minor adjustment to the patch description. Signed-off-by: Yinghai Lu Signed-off-by: Tejun Heo --- arch/x86/mm/numa.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index ecb5685d7d2..9a0ed312b83 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -189,8 +189,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -/* Initialize bootmem allocator for a node */ -static void __init setup_node_bootmem(int nid, u64 start, u64 end) +/* Initialize NODE_DATA for a node on the local memory */ +static void __init setup_node_data(int nid, u64 start, u64 end) { const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); const u64 nd_high = PFN_PHYS(max_pfn_mapped); @@ -522,7 +522,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) } if (start < end) - setup_node_bootmem(nid, start, end); + setup_node_data(nid, start, end); } return 0; -- cgit v1.2.3-70-g09d2 From e5a10c1bd12a5d71bbb6406c1b0dbbc9d8958397 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 May 2011 17:24:49 +0200 Subject: x86, NUMA: Trim numa meminfo with max_pfn in a separate loop While testing the 32bit NUMA unification code from tj, I found that one system with more than 64GiB fails to use NUMA. It turns out we do not trim the numa meminfo correctly against max_pfn when the start address of a node is higher than 64GiB. The bug fix made it to the tip tree. This patch moves the checking and trimming to a separate loop, so we don't need to compare low/high in the following merge loops. It makes the code more readable. It also makes the node merge printouts less strange. On a 512GiB NUMA system running a 32bit kernel, before: > NUMA: Node 0 [0,a0000) + [100000,80000000) -> [0,80000000) > NUMA: Node 0 [0,80000000) + [100000000,1080000000) -> [0,1000000000) after: > NUMA: Node 0 [0,a0000) + [100000,80000000) -> [0,80000000) > NUMA: Node 0 [0,80000000) + [100000000,1000000000) -> [0,1000000000) Signed-off-by: Yinghai Lu [Updated patch description and comment slightly.]
Signed-off-by: Tejun Heo --- arch/x86/mm/numa.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 9a0ed312b83..f5510d889a2 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -270,6 +270,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) const u64 high = PFN_PHYS(max_pfn); int i, j, k; + /* first, trim all entries */ for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; @@ -278,10 +279,13 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->end = min(bi->end, high); /* and there's no empty block */ - if (bi->start >= bi->end) { + if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); - continue; - } + } + + /* merge neighboring / overlapping entries */ + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *bi = &mi->blk[i]; for (j = i + 1; j < mi->nr_blks; j++) { struct numa_memblk *bj = &mi->blk[j]; @@ -311,8 +315,8 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) */ if (bi->nid != bj->nid) continue; - start = max(min(bi->start, bj->start), low); - end = min(max(bi->end, bj->end), high); + start = min(bi->start, bj->start); + end = max(bi->end, bj->end); for (k = 0; k < mi->nr_blks; k++) { struct numa_memblk *bk = &mi->blk[k]; @@ -332,6 +336,7 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) } } + /* clear unused ones */ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) { mi->blk[i].start = mi->blk[i].end = 0; mi->blk[i].nid = NUMA_NO_NODE; -- cgit v1.2.3-70-g09d2 From 904cc1e637a00dba1b58e7752f485f90ebf2a568 Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Tue, 26 Apr 2011 17:05:18 +0000 Subject: [CPUFREQ] Fix _OSC UUID in pcc-cpufreq The UUID needs to be written out the way it is described in Sec 18.5.124 of the ACPI 4.0a Specification. Platform firmware's use of this UUID/_OSC is optional, which is why we didn't notice this bug earlier. Signed-off-by: Naga Chumbalkar Signed-off-by: Dave Jones Cc: stable@kernel.org --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 755a31e0f5b..907c8e637ef 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -39,7 +39,7 @@ #include -#define PCC_VERSION "1.00.00" +#define PCC_VERSION "1.10.00" #define POLL_LOOPS 300 #define CMD_COMPLETE 0x1 @@ -102,7 +102,7 @@ static struct acpi_generic_address doorbell; static u64 doorbell_preserve; static u64 doorbell_write; -static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, +static u8 OSC_UUID[16] = {0x9F, 0x2C, 0x9B, 0x63, 0x91, 0x70, 0x1f, 0x49, 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; struct pcc_cpu { -- cgit v1.2.3-70-g09d2 From 2d06d8c49afdcc9bb35a85039fa50f0fe35bd40e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Sun, 27 Mar 2011 15:04:46 +0200 Subject: [CPUFREQ] use dynamic debug instead of custom infrastructure With dynamic debug having gained the capability to report debug messages also during the boot process, it offers a far better interface for debug messages than the custom cpufreq infrastructure. As a first step, remove the old cpufreq_debug_printk() function and replace it with a call to the generic pr_debug() function. How can dynamic debug be used on cpufreq? You need a kernel which has CONFIG_DYNAMIC_DEBUG enabled.
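The driver-side conversion itself is mechanical; a rough before/after sketch, with the hypothetical "foo-cpufreq" name standing in for the real driver names:

/* Before: each driver carried a private macro routed through the cpufreq
 * debug core, which was only built with CONFIG_CPU_FREQ_DEBUG: */
#define dprintk(msg...) \
	cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "foo-cpufreq", msg)

	dprintk("transition: %u --> %u kHz\n", freqs.old, freqs.new);

/* After: the generic helper; under CONFIG_DYNAMIC_DEBUG each such
 * message can be switched on and off at runtime: */
	pr_debug("transition: %u --> %u kHz\n", freqs.old, freqs.new);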
To enable debugging at runtime, mount debugfs and $ echo -n 'module cpufreq +p' > /sys/kernel/debug/dynamic_debug/control for debugging the complete "cpufreq" module. To achieve the same goal during boot, append ddebug_query="module cpufreq +p" as a boot parameter to the kernel of your choice. For more detailed instructions, please see Documentation/dynamic-debug-howto.txt Signed-off-by: Dominik Brodowski Signed-off-by: Dave Jones --- arch/arm/mach-davinci/cpufreq.c | 4 +- arch/blackfin/mach-common/dpmc.c | 3 - arch/ia64/kernel/cpufreq/acpi-cpufreq.c | 44 +++--- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 45 +++--- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 6 +- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 21 ++- arch/x86/kernel/cpu/cpufreq/longhaul.c | 11 +- arch/x86/kernel/cpu/cpufreq/longrun.c | 17 +-- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 10 +- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 47 +++--- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 33 ++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 100 ++++++------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 2 - arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 6 +- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 23 ++- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 28 ++-- arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 43 +++--- arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 41 +++--- drivers/acpi/processor_perflib.c | 6 +- drivers/cpufreq/Kconfig | 13 -- drivers/cpufreq/cpufreq.c | 180 +++++------------------ drivers/cpufreq/cpufreq_performance.c | 5 +- drivers/cpufreq/cpufreq_powersave.c | 5 +- drivers/cpufreq/cpufreq_userspace.c | 13 +- drivers/cpufreq/freq_table.c | 19 +-- include/linux/cpufreq.h | 19 --- 26 files changed, 270 insertions(+), 474 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/mach-davinci/cpufreq.c b/arch/arm/mach-davinci/cpufreq.c index 0a95be1512b..41669ecc1f9 100644 --- a/arch/arm/mach-davinci/cpufreq.c +++ b/arch/arm/mach-davinci/cpufreq.c @@ -94,9 +94,7 @@ static int davinci_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return ret; - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, - dev_driver_string(cpufreq.dev), - "transition: %u --> %u\n", freqs.old, freqs.new); + dev_dbg(&cpufreq.dev, "transition: %u --> %u\n", freqs.old, freqs.new); ret = cpufreq_frequency_table_target(policy, pdata->freq_table, freqs.new, relation, &idx); diff --git a/arch/blackfin/mach-common/dpmc.c b/arch/blackfin/mach-common/dpmc.c index 382099fd556..5e4112e518a 100644 --- a/arch/blackfin/mach-common/dpmc.c +++ b/arch/blackfin/mach-common/dpmc.c @@ -19,9 +19,6 @@ #define DRIVER_NAME "bfin dpmc" -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, DRIVER_NAME, msg) - struct bfin_dpmc_platform_data *pdata; /** diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c index 22f61526a8e..f09b174244d 100644 --- a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c +++ b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c @@ -23,8 +23,6 @@ #include #include -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "acpi-cpufreq", msg) - MODULE_AUTHOR("Venkatesh Pallipadi"); MODULE_DESCRIPTION("ACPI Processor P-States Driver"); MODULE_LICENSE("GPL"); @@ -47,12 +45,12 @@ processor_set_pstate ( { s64 retval; - dprintk("processor_set_pstate\n"); + pr_debug("processor_set_pstate\n"); retval = ia64_pal_set_pstate((u64)value); if (retval) { - dprintk("Failed to set freq to 0x%x, with error 0x%lx\n", + pr_debug("Failed to set freq to 0x%x, with error 0x%lx\n", value, retval); return -ENODEV; } @@ -67,14 +65,14 @@ processor_get_pstate ( u64 pstate_index = 0; s64 retval; - dprintk("processor_get_pstate\n"); + pr_debug("processor_get_pstate\n"); retval = ia64_pal_get_pstate(&pstate_index, PAL_GET_PSTATE_TYPE_INSTANT); *value = (u32) pstate_index; if (retval) - dprintk("Failed to get current freq with " + pr_debug("Failed to get current freq with " "error 0x%lx, idx 0x%x\n", retval, *value); return (int)retval; @@ -90,7 +88,7 @@ extract_clock ( { unsigned long i; - dprintk("extract_clock\n"); + pr_debug("extract_clock\n"); for (i = 0; i < data->acpi_data.state_count; i++) { if (value == data->acpi_data.states[i].status) @@ -110,7 +108,7 @@ processor_get_freq ( cpumask_t saved_mask; unsigned long clock_freq; - dprintk("processor_get_freq\n"); + pr_debug("processor_get_freq\n"); saved_mask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask_of(cpu)); @@ -148,7 +146,7 @@ processor_set_freq ( cpumask_t saved_mask; int retval; - dprintk("processor_set_freq\n"); + pr_debug("processor_set_freq\n"); saved_mask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask_of(cpu)); @@ -159,16 +157,16 @@ processor_set_freq ( if (state == data->acpi_data.state) { if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", state); + pr_debug("Called after resume, resetting to P%d\n", state); data->resume = 0; } else { - dprintk("Already at target state (P%d)\n", state); + pr_debug("Already at target state (P%d)\n", state); retval = 0; goto migrate_end; } } - dprintk("Transitioning from P%d to P%d\n", + pr_debug("Transitioning from P%d to P%d\n", data->acpi_data.state, state); /* cpufreq frequency struct */ @@ -186,7 +184,7 @@ processor_set_freq ( value = (u32) data->acpi_data.states[state].control; - dprintk("Transitioning to state: 0x%08x\n", value); + pr_debug("Transitioning to state: 0x%08x\n", value); ret = processor_set_pstate(value); if (ret) { @@ -219,7 +217,7 @@ acpi_cpufreq_get ( { struct cpufreq_acpi_io *data = acpi_io_data[cpu]; - dprintk("acpi_cpufreq_get\n"); + pr_debug("acpi_cpufreq_get\n"); return processor_get_freq(data, cpu); } @@ -235,7 +233,7 @@ acpi_cpufreq_target ( unsigned int next_state = 0; unsigned int result = 0; - dprintk("acpi_cpufreq_setpolicy\n"); + pr_debug("acpi_cpufreq_setpolicy\n"); result = cpufreq_frequency_table_target(policy, data->freq_table, target_freq, relation, &next_state); @@ -255,7 +253,7 @@ acpi_cpufreq_verify ( unsigned int result = 0; struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - dprintk("acpi_cpufreq_verify\n"); + pr_debug("acpi_cpufreq_verify\n"); result = cpufreq_frequency_table_verify(policy, data->freq_table); @@ -273,7 +271,7 @@ acpi_cpufreq_cpu_init ( struct cpufreq_acpi_io *data; unsigned int result = 0; - dprintk("acpi_cpufreq_cpu_init\n"); + pr_debug("acpi_cpufreq_cpu_init\n"); data = kzalloc(sizeof(struct cpufreq_acpi_io), GFP_KERNEL); if (!data) @@ -288,7 +286,7 @@ acpi_cpufreq_cpu_init ( /* capability check */ if (data->acpi_data.state_count <= 1) { - dprintk("No 
P-States\n"); + pr_debug("No P-States\n"); result = -ENODEV; goto err_unreg; } @@ -297,7 +295,7 @@ acpi_cpufreq_cpu_init ( ACPI_ADR_SPACE_FIXED_HARDWARE) || (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Unsupported address space [%d, %d]\n", + pr_debug("Unsupported address space [%d, %d]\n", (u32) (data->acpi_data.control_register.space_id), (u32) (data->acpi_data.status_register.space_id)); result = -ENODEV; @@ -348,7 +346,7 @@ acpi_cpufreq_cpu_init ( "activated.\n", cpu); for (i = 0; i < data->acpi_data.state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n", + pr_debug(" %cP%d: %d MHz, %d mW, %d uS, %d uS, 0x%x 0x%x\n", (i == data->acpi_data.state?'*':' '), i, (u32) data->acpi_data.states[i].core_frequency, (u32) data->acpi_data.states[i].power, @@ -383,7 +381,7 @@ acpi_cpufreq_cpu_exit ( { struct cpufreq_acpi_io *data = acpi_io_data[policy->cpu]; - dprintk("acpi_cpufreq_cpu_exit\n"); + pr_debug("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); @@ -418,7 +416,7 @@ static struct cpufreq_driver acpi_cpufreq_driver = { static int __init acpi_cpufreq_init (void) { - dprintk("acpi_cpufreq_init\n"); + pr_debug("acpi_cpufreq_init\n"); return cpufreq_register_driver(&acpi_cpufreq_driver); } @@ -427,7 +425,7 @@ acpi_cpufreq_init (void) static void __exit acpi_cpufreq_exit (void) { - dprintk("acpi_cpufreq_exit\n"); + pr_debug("acpi_cpufreq_exit\n"); cpufreq_unregister_driver(&acpi_cpufreq_driver); return; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index a2baafb2fe6..4e04e127438 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -47,9 +47,6 @@ #include #include "mperf.h" -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "acpi-cpufreq", msg) - MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); MODULE_DESCRIPTION("ACPI Processor P-States Driver"); MODULE_LICENSE("GPL"); @@ -233,7 +230,7 @@ static u32 get_cur_val(const struct cpumask *mask) cmd.mask = mask; drv_read(&cmd); - dprintk("get_cur_val = %u\n", cmd.val); + pr_debug("get_cur_val = %u\n", cmd.val); return cmd.val; } @@ -244,7 +241,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) unsigned int freq; unsigned int cached_freq; - dprintk("get_cur_freq_on_cpu (%d)\n", cpu); + pr_debug("get_cur_freq_on_cpu (%d)\n", cpu); if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) { @@ -261,7 +258,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) data->resume = 1; } - dprintk("cur freq = %u\n", freq); + pr_debug("cur freq = %u\n", freq); return freq; } @@ -293,7 +290,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int i; int result = 0; - dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); + pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); if (unlikely(data == NULL || data->acpi_data == NULL || data->freq_table == NULL)) { @@ -313,11 +310,11 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, next_perf_state = data->freq_table[next_state].index; if (perf->state == next_perf_state) { if (unlikely(data->resume)) { - dprintk("Called after resume, resetting to P%d\n", + pr_debug("Called after resume, resetting to P%d\n", next_perf_state); data->resume = 0; } else { - dprintk("Already at target state (P%d)\n", + pr_debug("Already at target state (P%d)\n", next_perf_state); goto out; } @@ -357,7 +354,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, if (acpi_pstate_strict) { if (!check_freqs(cmd.mask, freqs.new, data)) { - dprintk("acpi_cpufreq_target failed (%d)\n", + pr_debug("acpi_cpufreq_target failed (%d)\n", policy->cpu); result = -EAGAIN; goto out; @@ -378,7 +375,7 @@ static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - dprintk("acpi_cpufreq_verify\n"); + pr_debug("acpi_cpufreq_verify\n"); return cpufreq_frequency_table_verify(policy, data->freq_table); } @@ -433,11 +430,11 @@ static void free_acpi_perf_data(void) static int __init acpi_cpufreq_early_init(void) { unsigned int i; - dprintk("acpi_cpufreq_early_init\n"); + pr_debug("acpi_cpufreq_early_init\n"); acpi_perf_data = alloc_percpu(struct acpi_processor_performance); if (!acpi_perf_data) { - dprintk("Memory allocation error for acpi_perf_data.\n"); + pr_debug("Memory allocation error for acpi_perf_data.\n"); return -ENOMEM; } for_each_possible_cpu(i) { @@ -519,7 +516,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) static int blacklisted; #endif - dprintk("acpi_cpufreq_cpu_init\n"); + pr_debug("acpi_cpufreq_cpu_init\n"); #ifdef CONFIG_SMP if (blacklisted) @@ -566,7 +563,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if (perf->state_count <= 1) { - dprintk("No P-States\n"); + pr_debug("No P-States\n"); result = -ENODEV; goto err_unreg; } @@ -578,11 +575,11 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) switch (perf->control_register.space_id) { case ACPI_ADR_SPACE_SYSTEM_IO: - dprintk("SYSTEM IO addr space\n"); + pr_debug("SYSTEM IO addr space\n"); data->cpu_feature = SYSTEM_IO_CAPABLE; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: - dprintk("HARDWARE 
addr space\n"); + pr_debug("HARDWARE addr space\n"); if (!check_est_cpu(cpu)) { result = -ENODEV; goto err_unreg; @@ -590,7 +587,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; break; default: - dprintk("Unknown addr space %d\n", + pr_debug("Unknown addr space %d\n", (u32) (perf->control_register.space_id)); result = -ENODEV; goto err_unreg; @@ -661,9 +658,9 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (cpu_has(c, X86_FEATURE_APERFMPERF)) acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - dprintk("CPU%u - ACPI performance management activated.\n", cpu); + pr_debug("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) - dprintk(" %cP%d: %d MHz, %d mW, %d uS\n", + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", (i == perf->state ? '*' : ' '), i, (u32) perf->states[i].core_frequency, (u32) perf->states[i].power, @@ -694,7 +691,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - dprintk("acpi_cpufreq_cpu_exit\n"); + pr_debug("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); @@ -712,7 +709,7 @@ static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - dprintk("acpi_cpufreq_resume\n"); + pr_debug("acpi_cpufreq_resume\n"); data->resume = 1; @@ -743,7 +740,7 @@ static int __init acpi_cpufreq_init(void) if (acpi_disabled) return 0; - dprintk("acpi_cpufreq_init\n"); + pr_debug("acpi_cpufreq_init\n"); ret = acpi_cpufreq_early_init(); if (ret) @@ -758,7 +755,7 @@ static int __init acpi_cpufreq_init(void) static void __exit acpi_cpufreq_exit(void) { - dprintk("acpi_cpufreq_exit\n"); + pr_debug("acpi_cpufreq_exit\n"); cpufreq_unregister_driver(&acpi_cpufreq_driver); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 141abebc451..7bac808804f 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -57,8 +57,6 @@ MODULE_PARM_DESC(min_fsb, "Minimum FSB to use, if not defined: current FSB - 50"); #define PFX "cpufreq-nforce2: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "cpufreq-nforce2", msg) /** * nforce2_calc_fsb - calculate FSB @@ -270,7 +268,7 @@ static int nforce2_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - dprintk("Old CPU frequency %d kHz, new %d kHz\n", + pr_debug("Old CPU frequency %d kHz, new %d kHz\n", freqs.old, freqs.new); cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -282,7 +280,7 @@ static int nforce2_target(struct cpufreq_policy *policy, printk(KERN_ERR PFX "Changing FSB to %d failed\n", target_fsb); else - dprintk("Changed FSB successfully to %d\n", + pr_debug("Changed FSB successfully to %d\n", target_fsb); /* Enable IRQs */ diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 32974cf8423..ffe1f2c92ed 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -142,9 +142,6 @@ module_param(max_duration, int, 0444); #define POLICY_MIN_DIV 20 -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "gx-suspmod", msg) - /** * we can detect a core multipiler from dir0_lsb * from GX1 datasheet p.56, @@ -191,7 +188,7 @@ static __init struct pci_dev *gx_detect_chipset(void) /* check if CPU is a MediaGX or a Geode. */ if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - dprintk("error: no MediaGX/Geode processor found!\n"); + pr_debug("error: no MediaGX/Geode processor found!\n"); return NULL; } @@ -201,7 +198,7 @@ static __init struct pci_dev *gx_detect_chipset(void) return gx_pci; } - dprintk("error: no supported chipset found!\n"); + pr_debug("error: no supported chipset found!\n"); return NULL; } @@ -305,14 +302,14 @@ static void gx_set_cpuspeed(unsigned int khz) break; default: local_irq_restore(flags); - dprintk("fatal: try to set unknown chipset.\n"); + pr_debug("fatal: try to set unknown chipset.\n"); return; } } else { suscfg = gx_params->pci_suscfg & ~(SUSMOD); gx_params->off_duration = 0; gx_params->on_duration = 0; - dprintk("suspend modulation disabled: cpu runs 100%% speed.\n"); + pr_debug("suspend modulation disabled: cpu runs 100%% speed.\n"); } gx_write_byte(PCI_MODOFF, gx_params->off_duration); @@ -327,9 +324,9 @@ static void gx_set_cpuspeed(unsigned int khz) cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", + pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", gx_params->on_duration * 32, gx_params->off_duration * 32); - dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); + pr_debug("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); } /**************************************************************** @@ -428,8 +425,8 @@ static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) stock_freq = maxfreq; curfreq = gx_get_cpuspeed(0); - dprintk("cpu max frequency is %d.\n", maxfreq); - dprintk("cpu current frequency is %dkHz.\n", curfreq); + pr_debug("cpu max frequency is %d.\n", maxfreq); + pr_debug("cpu current frequency is %dkHz.\n", curfreq); /* setup basic struct for cpufreq API */ policy->cpu = 0; @@ -475,7 +472,7 @@ static int __init cpufreq_gx_init(void) if (max_duration > 0xff) max_duration = 0xff; - dprintk("geode suspend modulation available.\n"); + pr_debug("geode suspend modulation available.\n"); params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); if (params == NULL) diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index cf48cdd6907..f47d26e2a13 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -77,9 +77,6 @@ static int scale_voltage; static int disable_acpi_c3; static int revid_errata; -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longhaul", msg) - /* Clock ratios multiplied by 10 */ static int mults[32]; @@ -87,7 +84,6 @@ static int eblcr[32]; static int longhaul_version; static struct cpufreq_frequency_table *longhaul_table; -#ifdef CONFIG_CPU_FREQ_DEBUG static char speedbuffer[8]; static char *print_speed(int speed) @@ -106,7 +102,6 @@ static char *print_speed(int speed) return speedbuffer; } -#endif static unsigned int calc_speed(int mult) @@ -275,7 +270,7 @@ static void longhaul_setstate(unsigned int table_index) cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", + pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", fsb, mult/10, mult%10, print_speed(speed/1000)); retry_loop: preempt_disable(); @@ -460,12 +455,12 @@ static int __cpuinit longhaul_get_ranges(void) break; } - dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n", + pr_debug("MinMult:%d.%dx MaxMult:%d.%dx\n", minmult/10, minmult%10, maxmult/10, maxmult%10); highest_speed = calc_speed(maxmult); lowest_speed = calc_speed(minmult); - dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, + pr_debug("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, print_speed(lowest_speed/1000), print_speed(highest_speed/1000)); diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index d9f51367666..34ea359b370 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -15,9 +15,6 @@ #include #include -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "longrun", msg) - static struct cpufreq_driver longrun_driver; /** @@ -40,14 +37,14 @@ static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) u32 msr_lo, msr_hi; rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi); + pr_debug("longrun flags are %x - %x\n", msr_lo, msr_hi); if (msr_lo & 0x01) policy->policy = CPUFREQ_POLICY_PERFORMANCE; else policy->policy = CPUFREQ_POLICY_POWERSAVE; rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi); + pr_debug("longrun ctrl is %x - %x\n", msr_lo, msr_hi); msr_lo &= 0x0000007F; msr_hi &= 0x0000007F; @@ -150,7 +147,7 @@ static unsigned int longrun_get(unsigned int cpu) return 0; cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - dprintk("cpuid eax is %u\n", eax); + pr_debug("cpuid eax is %u\n", eax); return eax * 1000; } @@ -196,7 +193,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); *high_freq = msr_lo * 1000; /* to kHz */ - dprintk("longrun table interface told %u - %u kHz\n", + pr_debug("longrun table interface told %u - %u kHz\n", *low_freq, *high_freq); if (*low_freq > *high_freq) @@ -207,7 +204,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, /* set the upper border to the value determined during TSC init */ *high_freq = (cpu_khz / 1000); *high_freq = *high_freq * 1000; - dprintk("high frequency is %u kHz\n", *high_freq); + pr_debug("high frequency is %u kHz\n", *high_freq); /* get current borders */ rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); @@ -233,7 +230,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, /* restore values */ wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); } - dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax); + pr_debug("percentage is %u %%, freq is %u MHz\n", ecx, eax); /* performance_pctg = (current_freq - 
low_freq)/(high_freq - low_freq) * eqals @@ -249,7 +246,7 @@ static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, edx = ((eax - ebx) * 100) / (100 - ecx); *low_freq = edx * 1000; /* back to kHz */ - dprintk("low frequency is %u kHz\n", *low_freq); + pr_debug("low frequency is %u kHz\n", *low_freq); if (*low_freq > *high_freq) *low_freq = *high_freq; diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 52c93648e49..6be3e0760c2 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -35,8 +35,6 @@ #include "speedstep-lib.h" #define PFX "p4-clockmod: " -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "p4-clockmod", msg) /* * Duty Cycle (3bits), note DC_DISABLE is not specified in @@ -66,7 +64,7 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); if (l & 0x01) - dprintk("CPU#%d currently thermal throttled\n", cpu); + pr_debug("CPU#%d currently thermal throttled\n", cpu); if (has_N44_O17_errata[cpu] && (newstate == DC_25PT || newstate == DC_DFLT)) @@ -74,10 +72,10 @@ static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); if (newstate == DC_DISABLE) { - dprintk("CPU#%d disabling modulation\n", cpu); + pr_debug("CPU#%d disabling modulation\n", cpu); wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); } else { - dprintk("CPU#%d setting duty cycle to %d%%\n", + pr_debug("CPU#%d setting duty cycle to %d%%\n", cpu, ((125 * newstate) / 10)); /* bits 63 - 5 : reserved * bit 4 : enable/disable @@ -217,7 +215,7 @@ static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) case 0x0f11: case 0x0f12: has_N44_O17_errata[policy->cpu] = 1; - dprintk("has errata -- disabling low frequencies\n"); + pr_debug("has errata -- disabling low frequencies\n"); } if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 907c8e637ef..7b0603eb012 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -48,9 +48,6 @@ #define BUF_SZ 4 -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "pcc-cpufreq", msg) - struct pcc_register_resource { u8 descriptor; u16 length; @@ -152,7 +149,7 @@ static unsigned int pcc_get_freq(unsigned int cpu) spin_lock(&pcc_lock); - dprintk("get: get_freq for CPU %d\n", cpu); + pr_debug("get: get_freq for CPU %d\n", cpu); pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); input_buffer = 0x1; @@ -170,7 +167,7 @@ static unsigned int pcc_get_freq(unsigned int cpu) status = ioread16(&pcch_hdr->status); if (status != CMD_COMPLETE) { - dprintk("get: FAILED: for CPU %d, status is %d\n", + pr_debug("get: FAILED: for CPU %d, status is %d\n", cpu, status); goto cmd_incomplete; } @@ -178,14 +175,14 @@ static unsigned int pcc_get_freq(unsigned int cpu) curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) / 100) * 1000); - dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + pr_debug("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%p, contains a value of: 0x%x. 
Speed is: %d MHz\n", cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), output_buffer, curr_freq); freq_limit = (output_buffer >> 8) & 0xff; if (freq_limit != 0xff) { - dprintk("get: frequency for cpu %d is being temporarily" + pr_debug("get: frequency for cpu %d is being temporarily" " capped at %d\n", cpu, curr_freq); } @@ -212,8 +209,8 @@ static int pcc_cpufreq_target(struct cpufreq_policy *policy, cpu = policy->cpu; pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - dprintk("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%x\n", + pr_debug("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%p\n", cpu, target_freq, (pcch_virt_addr + pcc_cpu_data->input_offset)); @@ -234,14 +231,14 @@ static int pcc_cpufreq_target(struct cpufreq_policy *policy, status = ioread16(&pcch_hdr->status); if (status != CMD_COMPLETE) { - dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + pr_debug("target: FAILED for cpu %d, with status: 0x%x\n", cpu, status); goto cmd_incomplete; } iowrite16(0, &pcch_hdr->status); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); spin_unlock(&pcc_lock); return 0; @@ -293,7 +290,7 @@ static int pcc_get_offset(int cpu) memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + pr_debug("pcc_get_offset: for CPU %d: pcc_cpu_data " "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); out_free: @@ -410,7 +407,7 @@ static int __init pcc_cpufreq_probe(void) if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); if (ret) - dprintk("probe: _OSC evaluation did not succeed\n"); + pr_debug("probe: _OSC evaluation did not succeed\n"); /* Firmware's use of _OSC is optional */ ret = 0; } @@ -433,7 +430,7 @@ static int __init pcc_cpufreq_probe(void) mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - dprintk("probe: mem_resource descriptor: 0x%x," + pr_debug("probe: mem_resource descriptor: 0x%x," " length: %d, space_id: %d, resource_usage: %d," " type_specific: %d, granularity: 0x%llx," " minimum: 0x%llx, maximum: 0x%llx," @@ -453,13 +450,13 @@ static int __init pcc_cpufreq_probe(void) pcch_virt_addr = ioremap_nocache(mem_resource->minimum, mem_resource->address_length); if (pcch_virt_addr == NULL) { - dprintk("probe: could not map shared mem region\n"); + pr_debug("probe: could not map shared mem region\n"); goto out_free; } pcch_hdr = pcch_virt_addr; - dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - dprintk("probe: PCCH header is at physical address: 0x%llx," + pr_debug("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + pr_debug("probe: PCCH header is at physical address: 0x%llx," " signature: 0x%x, length: %d bytes, major: %d, minor: %d," " supported features: 0x%x, command field: 0x%x," " status field: 0x%x, nominal latency: %d us\n", @@ -469,7 +466,7 @@ static int __init pcc_cpufreq_probe(void) ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), ioread32(&pcch_hdr->latency)); - dprintk("probe: min time between commands: %d us," + pr_debug("probe: min time between commands: %d us," " max time between commands: %d us," " nominal CPU frequency: %d MHz," " minimum CPU frequency: %d MHz," @@ -494,7 +491,7 @@ static int __init pcc_cpufreq_probe(void) 
doorbell.access_width = 64; doorbell.address = reg_resource->address; - dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + pr_debug("probe: doorbell: space_id is %d, bit_width is %d, " "bit_offset is %d, access_width is %d, address is 0x%llx\n", doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, doorbell.access_width, reg_resource->address); @@ -515,7 +512,7 @@ static int __init pcc_cpufreq_probe(void) doorbell_write = member->integer.value; - dprintk("probe: doorbell_preserve: 0x%llx," + pr_debug("probe: doorbell_preserve: 0x%llx," " doorbell_write: 0x%llx\n", doorbell_preserve, doorbell_write); @@ -550,7 +547,7 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) result = pcc_get_offset(cpu); if (result) { - dprintk("init: PCCP evaluation failed\n"); + pr_debug("init: PCCP evaluation failed\n"); goto out; } @@ -561,12 +558,12 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) policy->cur = pcc_get_freq(cpu); if (!policy->cur) { - dprintk("init: Unable to get current CPU frequency\n"); + pr_debug("init: Unable to get current CPU frequency\n"); result = -EINVAL; goto out; } - dprintk("init: policy->max is %d, policy->min is %d\n", + pr_debug("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); out: return result; @@ -597,7 +594,7 @@ static int __init pcc_cpufreq_init(void) ret = pcc_cpufreq_probe(); if (ret) { - dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n"); return ret; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 4a45fd6e41b..d71d9f37235 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -68,7 +68,6 @@ union powernow_acpi_control_t { }; #endif -#ifdef CONFIG_CPU_FREQ_DEBUG /* divide by 1000 to get VCore voltage in V. */ static const int mobile_vid_table[32] = { 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, @@ -76,7 +75,6 @@ static const int mobile_vid_table[32] = { 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, 1075, 1050, 1025, 1000, 975, 950, 925, 0, }; -#endif /* divide by 10 to get FID. */ static const int fid_codes[32] = { @@ -103,9 +101,6 @@ static unsigned int fsb; static unsigned int latency; static char have_a0; -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "powernow-k7", msg) - static int check_fsb(unsigned int fsbspeed) { int delta; @@ -209,7 +204,7 @@ static int get_ranges(unsigned char *pst) vid = *pst++; powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, fid_codes[fid] % 10, speed/1000, vid, mobile_vid_table[vid]/1000, @@ -367,7 +362,7 @@ static int powernow_acpi_init(void) unsigned int speed, speed_mhz; pc.val = (unsigned long) state->control; - dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", + pr_debug("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", i, (u32) state->core_frequency, (u32) state->power, @@ -401,7 +396,7 @@ static int powernow_acpi_init(void) invalidate_entry(i); } - dprintk(" FID: 0x%x (%d.%dx [%dMHz]) " + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, fid_codes[fid] % 10, speed_mhz, vid, mobile_vid_table[vid]/1000, @@ -409,7 +404,7 @@ static int powernow_acpi_init(void) if (state->core_frequency != speed_mhz) { state->core_frequency = speed_mhz; - dprintk(" Corrected ACPI frequency to %d\n", + pr_debug(" Corrected ACPI frequency to %d\n", speed_mhz); } @@ -453,8 +448,8 @@ static int powernow_acpi_init(void) static void print_pst_entry(struct pst_s *pst, unsigned int j) { - dprintk("PST:%d (@%p)\n", j, pst); - dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", + pr_debug("PST:%d (@%p)\n", j, pst); + pr_debug(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); } @@ -474,20 +469,20 @@ static int powernow_decode_bios(int maxfid, int startvid) p = phys_to_virt(i); if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - dprintk("Found PSB header at %p\n", p); + pr_debug("Found PSB header at %p\n", p); psb = (struct psb_s *) p; - dprintk("Table version: 0x%x\n", psb->tableversion); + pr_debug("Table version: 0x%x\n", psb->tableversion); if (psb->tableversion != 0x12) { printk(KERN_INFO PFX "Sorry, only v1.2 tables" " supported right now\n"); return -ENODEV; } - dprintk("Flags: 0x%x\n", psb->flags); + pr_debug("Flags: 0x%x\n", psb->flags); if ((psb->flags & 1) == 0) - dprintk("Mobile voltage regulator\n"); + pr_debug("Mobile voltage regulator\n"); else - dprintk("Desktop voltage regulator\n"); + pr_debug("Desktop voltage regulator\n"); latency = psb->settlingtime; if (latency < 100) { @@ -497,9 +492,9 @@ static int powernow_decode_bios(int maxfid, int startvid) "Correcting.\n", latency); latency = 100; } - dprintk("Settling Time: %d microseconds.\n", + pr_debug("Settling Time: %d microseconds.\n", psb->settlingtime); - dprintk("Has %d PST tables. (Only dumping ones " + pr_debug("Has %d PST tables. (Only dumping ones " "relevant to this CPU).\n", psb->numpst); @@ -650,7 +645,7 @@ static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) printk(KERN_WARNING PFX "can not determine bus frequency\n"); return -EINVAL; } - dprintk("FSB: %3dMHz\n", fsb/1000); + pr_debug("FSB: %3dMHz\n", fsb/1000); if (dmi_check_system(powernow_dmi_table) || acpi_force) { printk(KERN_INFO PFX "PSB/PST known to be broken. 
" diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 2368e38327b..83479b6fb9a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -139,7 +139,7 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) } do { if (i++ > 10000) { - dprintk("detected change pending stuck\n"); + pr_debug("detected change pending stuck\n"); return 1; } rdmsr(MSR_FIDVID_STATUS, lo, hi); @@ -176,7 +176,7 @@ static void fidvid_msr_init(void) fid = lo & MSR_S_LO_CURRENT_FID; lo = fid | (vid << MSR_C_LO_VID_SHIFT); hi = MSR_C_HI_STP_GNT_BENIGN; - dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); + pr_debug("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); wrmsr(MSR_FIDVID_CTL, lo, hi); } @@ -196,7 +196,7 @@ static int write_new_fid(struct powernow_k8_data *data, u32 fid) lo |= (data->currvid << MSR_C_LO_VID_SHIFT); lo |= MSR_C_LO_INIT_FID_VID; - dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n", + pr_debug("writing fid 0x%x, lo 0x%x, hi 0x%x\n", fid, lo, data->plllock * PLL_LOCK_CONVERSION); do { @@ -244,7 +244,7 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid) lo |= (vid << MSR_C_LO_VID_SHIFT); lo |= MSR_C_LO_INIT_FID_VID; - dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n", + pr_debug("writing vid 0x%x, lo 0x%x, hi 0x%x\n", vid, lo, STOP_GRANT_5NS); do { @@ -325,7 +325,7 @@ static int transition_fid_vid(struct powernow_k8_data *data, return 1; } - dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", + pr_debug("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", smp_processor_id(), data->currfid, data->currvid); return 0; @@ -339,7 +339,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 savefid = data->currfid; u32 maxvid, lo, rvomult = 1; - dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " + pr_debug("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " "reqvid 0x%x, rvo 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqvid, data->rvo); @@ -349,12 +349,12 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, rvosteps *= rvomult; rdmsr(MSR_FIDVID_STATUS, lo, maxvid); maxvid = 0x1f & (maxvid >> 16); - dprintk("ph1 maxvid=0x%x\n", maxvid); + pr_debug("ph1 maxvid=0x%x\n", maxvid); if (reqvid < maxvid) /* lower numbers are higher voltages */ reqvid = maxvid; while (data->currvid > reqvid) { - dprintk("ph1: curr 0x%x, req vid 0x%x\n", + pr_debug("ph1: curr 0x%x, req vid 0x%x\n", data->currvid, reqvid); if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) return 1; @@ -365,7 +365,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, if (data->currvid == maxvid) { rvosteps = 0; } else { - dprintk("ph1: changing vid for rvo, req 0x%x\n", + pr_debug("ph1: changing vid for rvo, req 0x%x\n", data->currvid - 1); if (decrease_vid_code_by_step(data, data->currvid-1, 1)) return 1; @@ -382,7 +382,7 @@ static int core_voltage_pre_transition(struct powernow_k8_data *data, return 1; } - dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph1 complete, currfid 0x%x, currvid 0x%x\n", data->currfid, data->currvid); return 0; @@ -400,7 +400,7 @@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) return 0; } - dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " + pr_debug("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " "reqfid 0x%x\n", smp_processor_id(), data->currfid, data->currvid, reqfid); @@ -457,7 +457,7 
@@ static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) return 1; } - dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph2 complete, currfid 0x%x, currvid 0x%x\n", data->currfid, data->currvid); return 0; @@ -470,7 +470,7 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 savefid = data->currfid; u32 savereqvid = reqvid; - dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", smp_processor_id(), data->currfid, data->currvid); @@ -498,17 +498,17 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, return 1; if (savereqvid != data->currvid) { - dprintk("ph3 failed, currvid 0x%x\n", data->currvid); + pr_debug("ph3 failed, currvid 0x%x\n", data->currvid); return 1; } if (savefid != data->currfid) { - dprintk("ph3 failed, currfid changed 0x%x\n", + pr_debug("ph3 failed, currfid changed 0x%x\n", data->currfid); return 1; } - dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n", + pr_debug("ph3 complete, currfid 0x%x, currvid 0x%x\n", data->currfid, data->currvid); return 0; @@ -707,7 +707,7 @@ static int fill_powernow_table(struct powernow_k8_data *data, return -EIO; } - dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); + pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); data->powernow_table = powernow_table; if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) print_basics(data); @@ -717,7 +717,7 @@ static int fill_powernow_table(struct powernow_k8_data *data, (pst[j].vid == data->currvid)) return 0; - dprintk("currfid/vid do not match PST, ignoring\n"); + pr_debug("currfid/vid do not match PST, ignoring\n"); return 0; } @@ -739,36 +739,36 @@ static int find_psb_table(struct powernow_k8_data *data) if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) continue; - dprintk("found PSB header at 0x%p\n", psb); + pr_debug("found PSB header at 0x%p\n", psb); - dprintk("table vers: 0x%x\n", psb->tableversion); + pr_debug("table vers: 0x%x\n", psb->tableversion); if (psb->tableversion != PSB_VERSION_1_4) { printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); return -ENODEV; } - dprintk("flags: 0x%x\n", psb->flags1); + pr_debug("flags: 0x%x\n", psb->flags1); if (psb->flags1) { printk(KERN_ERR FW_BUG PFX "unknown flags\n"); return -ENODEV; } data->vstable = psb->vstable; - dprintk("voltage stabilization time: %d(*20us)\n", + pr_debug("voltage stabilization time: %d(*20us)\n", data->vstable); - dprintk("flags2: 0x%x\n", psb->flags2); + pr_debug("flags2: 0x%x\n", psb->flags2); data->rvo = psb->flags2 & 3; data->irt = ((psb->flags2) >> 2) & 3; mvs = ((psb->flags2) >> 4) & 3; data->vidmvs = 1 << mvs; data->batps = ((psb->flags2) >> 6) & 3; - dprintk("ramp voltage offset: %d\n", data->rvo); - dprintk("isochronous relief time: %d\n", data->irt); - dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); + pr_debug("ramp voltage offset: %d\n", data->rvo); + pr_debug("isochronous relief time: %d\n", data->irt); + pr_debug("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - dprintk("numpst: 0x%x\n", psb->num_tables); + pr_debug("numpst: 0x%x\n", psb->num_tables); cpst = psb->num_tables; if ((psb->cpuid == 0x00000fc0) || (psb->cpuid == 0x00000fe0)) { @@ -783,13 +783,13 @@ static int find_psb_table(struct powernow_k8_data *data) } data->plllock = psb->plllocktime; - dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - dprintk("maxfid: 0x%x\n", psb->maxfid); - dprintk("maxvid: 0x%x\n", 
psb->maxvid); + pr_debug("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); + pr_debug("maxfid: 0x%x\n", psb->maxfid); + pr_debug("maxvid: 0x%x\n", psb->maxvid); maxvid = psb->maxvid; data->numps = psb->numps; - dprintk("numpstates: 0x%x\n", data->numps); + pr_debug("numpstates: 0x%x\n", data->numps); return fill_powernow_table(data, (struct pst_s *)(psb+1), maxvid); } @@ -834,13 +834,13 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - dprintk("register performance failed: bad ACPI data\n"); + pr_debug("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ if (data->acpi_data.state_count <= 1) { - dprintk("No ACPI P-States\n"); + pr_debug("No ACPI P-States\n"); goto err_out; } @@ -849,7 +849,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - dprintk("Invalid control/status registers (%x - %x)\n", + pr_debug("Invalid control/status registers (%llx - %llx)\n", control, status); goto err_out; } @@ -858,7 +858,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) * (data->acpi_data.state_count + 1)), GFP_KERNEL); if (!powernow_table) { - dprintk("powernow_table memory alloc failure\n"); + pr_debug("powernow_table memory alloc failure\n"); goto err_out; } @@ -928,7 +928,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, } rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); if (!(hi & HW_PSTATE_VALID_MASK)) { - dprintk("invalid pstate %d, ignoring\n", index); + pr_debug("invalid pstate %d, ignoring\n", index); invalidate_entry(powernow_table, i); continue; } @@ -968,7 +968,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, vid = (control >> VID_SHIFT) & VID_MASK; } - dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); + pr_debug(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); index = fid | (vid<<8); powernow_table[i].index = index; @@ -978,7 +978,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify frequency is OK */ if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - dprintk("invalid freq %u kHz, ignoring\n", freq); + pr_debug("invalid freq %u kHz, ignoring\n", freq); invalidate_entry(powernow_table, i); continue; } @@ -986,7 +986,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, /* verify voltage is OK - * BIOSs are using "off" to indicate invalid */ if (vid == VID_OFF) { - dprintk("invalid vid %u, ignoring\n", vid); + pr_debug("invalid vid %u, ignoring\n", vid); invalidate_entry(powernow_table, i); continue; } @@ -1047,7 +1047,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, int res, i; struct cpufreq_freqs freqs; - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); /* fid/vid correctness check for k8 */ /* fid are the lower 8 bits of the index we stored into @@ -1057,18 +1057,18 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, fid = data->powernow_table[index].index & 0xFF; vid = (data->powernow_table[index].index & 0xFF00) >> 8; - dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); + pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); if 
(query_current_values_with_pending_wait(data)) return 1; if ((data->currvid == vid) && (data->currfid == fid)) { - dprintk("target matches current values (fid 0x%x, vid 0x%x)\n", + pr_debug("target matches current values (fid 0x%x, vid 0x%x)\n", fid, vid); return 0; } - dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n", + pr_debug("cpu %d, changing to fid 0x%x, vid 0x%x\n", smp_processor_id(), fid, vid); freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); @@ -1096,7 +1096,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, int res, i; struct cpufreq_freqs freqs; - dprintk("cpu %d transition to index %u\n", smp_processor_id(), index); + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); /* get MSR index for hardware pstate transition */ pstate = index & HW_PSTATE_MASK; @@ -1156,14 +1156,14 @@ static int powernowk8_target(struct cpufreq_policy *pol, goto err_out; } - dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", + pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", pol->cpu, targfreq, pol->min, pol->max, relation); if (query_current_values_with_pending_wait(data)) goto err_out; if (cpu_family != CPU_HW_PSTATE) { - dprintk("targ: curr fid 0x%x, vid 0x%x\n", + pr_debug("targ: curr fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); if ((checkvid != data->currvid) || @@ -1319,7 +1319,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) data->currpstate); else pol->cur = find_khz_freq_from_fid(data->currfid); - dprintk("policy current frequency %d kHz\n", pol->cur); + pr_debug("policy current frequency %d kHz\n", pol->cur); /* min/max the cpu is capable of */ if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { @@ -1337,10 +1337,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); if (cpu_family == CPU_HW_PSTATE) - dprintk("cpu_init done, current pstate 0x%x\n", + pr_debug("cpu_init done, current pstate 0x%x\n", data->currpstate); else - dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", + pr_debug("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); per_cpu(powernow_data, pol->cpu) = data; @@ -1586,7 +1586,7 @@ static int __cpuinit powernowk8_init(void) /* driver entry point for term */ static void __exit powernowk8_exit(void) { - dprintk("exit\n"); + pr_debug("exit\n"); if (boot_cpu_has(X86_FEATURE_CPB)) { msrs_free(msrs); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index df3529b1c02..3744d26cdc2 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -211,8 +211,6 @@ struct pst_s { u8 vid; }; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg) - static int core_voltage_pre_transition(struct powernow_k8_data *data, u32 reqvid, u32 regfid); static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c index 435a996a613..1e205e6b172 100644 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c @@ -29,8 +29,6 @@ static __u8 __iomem *cpuctl; -#define dprintk(msg...) 
cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "sc520_freq", msg) #define PFX "sc520_freq: " static struct cpufreq_frequency_table sc520_freq_table[] = { @@ -66,7 +64,7 @@ static void sc520_freq_set_cpu_state(unsigned int state) cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - dprintk("attempting to set frequency to %i kHz\n", + pr_debug("attempting to set frequency to %i kHz\n", sc520_freq_table[state].frequency); local_irq_disable(); @@ -161,7 +159,7 @@ static int __init sc520_freq_init(void) /* Test if we have the right hardware */ if (c->x86_vendor != X86_VENDOR_AMD || c->x86 != 4 || c->x86_model != 9) { - dprintk("no Elan SC520 processor found!\n"); + pr_debug("no Elan SC520 processor found!\n"); return -ENODEV; } cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 9b1ff37de46..6ea3455def2 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -29,9 +29,6 @@ #define PFX "speedstep-centrino: " #define MAINTAINER "cpufreq@vger.kernel.org" -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) - #define INTEL_MSR_RANGE (0xffff) struct cpu_id @@ -244,7 +241,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) if (model->cpu_id == NULL) { /* No match at all */ - dprintk("no support for CPU model \"%s\": " + pr_debug("no support for CPU model \"%s\": " "send /proc/cpuinfo to " MAINTAINER "\n", cpu->x86_model_id); return -ENOENT; @@ -252,15 +249,15 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) if (model->op_points == NULL) { /* Matched a non-match */ - dprintk("no table support for CPU model \"%s\"\n", + pr_debug("no table support for CPU model \"%s\"\n", cpu->x86_model_id); - dprintk("try using the acpi-cpufreq driver\n"); + pr_debug("try using the acpi-cpufreq driver\n"); return -ENOENT; } per_cpu(centrino_model, policy->cpu) = model; - dprintk("found \"%s\": max frequency: %dkHz\n", + pr_debug("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); return 0; @@ -369,7 +366,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; if (!per_cpu(centrino_cpu, policy->cpu)) { - dprintk("found unsupported CPU with " + pr_debug("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); return -ENODEV; @@ -385,7 +382,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - dprintk("trying to enable Enhanced SpeedStep (%x)\n", l); + pr_debug("trying to enable Enhanced SpeedStep (%x)\n", l); wrmsr(MSR_IA32_MISC_ENABLE, l, h); /* check to see if it stuck */ @@ -402,7 +399,7 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* 10uS transition latency */ policy->cur = freq; - dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); + pr_debug("centrino_cpu_init: cur=%dkHz\n", policy->cur); ret = cpufreq_frequency_table_cpuinfo(policy, per_cpu(centrino_model, policy->cpu)->op_points); @@ -498,7 +495,7 @@ static int centrino_target (struct cpufreq_policy *policy, good_cpu = j; if (good_cpu >= nr_cpu_ids) { - dprintk("couldn't limit to CPUs in this domain\n"); + pr_debug("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { /* We haven't started the transition yet. 
*/ @@ -512,7 +509,7 @@ static int centrino_target (struct cpufreq_policy *policy, if (first_cpu) { rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); if (msr == (oldmsr & 0xffff)) { - dprintk("no change needed - msr was and needs " + pr_debug("no change needed - msr was and needs " "to be %x\n", oldmsr); retval = 0; goto out; @@ -521,7 +518,7 @@ static int centrino_target (struct cpufreq_policy *policy, freqs.old = extract_clock(oldmsr, cpu, 0); freqs.new = extract_clock(msr, cpu, 0); - dprintk("target=%dkHz old=%d new=%d msr=%04x\n", + pr_debug("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); for_each_cpu(k, policy->cpus) { diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 561758e9518..a748ce782fe 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -53,10 +53,6 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { }; -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-ich", msg) - - /** * speedstep_find_register - read the PMBASE address * @@ -80,7 +76,7 @@ static int speedstep_find_register(void) return -ENODEV; } - dprintk("pmbase is 0x%x\n", pmbase); + pr_debug("pmbase is 0x%x\n", pmbase); return 0; } @@ -106,13 +102,13 @@ static void speedstep_set_state(unsigned int state) /* read state */ value = inb(pmbase + 0x50); - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); /* write new state */ value &= 0xFE; value |= state; - dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); + pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); /* Disable bus master arbitration */ pm2_blk = inb(pmbase + 0x20); @@ -132,10 +128,10 @@ static void speedstep_set_state(unsigned int state) /* Enable IRQs */ local_irq_restore(flags); - dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); + pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); if (state == (value & 0x1)) - dprintk("change to %u MHz succeeded\n", + pr_debug("change to %u MHz succeeded\n", speedstep_get_frequency(speedstep_processor) / 1000); else printk(KERN_ERR "cpufreq: change failed - I/O error\n"); @@ -165,7 +161,7 @@ static int speedstep_activate(void) pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); if (!(value & 0x08)) { value |= 0x08; - dprintk("activating SpeedStep (TM) registers\n"); + pr_debug("activating SpeedStep (TM) registers\n"); pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); } @@ -218,7 +214,7 @@ static unsigned int speedstep_detect_chipset(void) return 2; /* 2-M */ if (hostbridge->revision < 5) { - dprintk("hostbridge does not support speedstep\n"); + pr_debug("hostbridge does not support speedstep\n"); speedstep_chipset_dev = NULL; pci_dev_put(hostbridge); return 0; @@ -246,7 +242,7 @@ static unsigned int speedstep_get(unsigned int cpu) if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", speed); + pr_debug("detected %u kHz as current frequency\n", speed); return speed; } @@ -276,7 +272,7 @@ static int speedstep_target(struct cpufreq_policy *policy, freqs.new = speedstep_freqs[newstate].frequency; freqs.cpu = policy->cpu; - dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new); + pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new); /* no transition necessary */ if 
(freqs.old == freqs.new) @@ -351,7 +347,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) if (!speed) return -EIO; - dprintk("currently at %s speed setting - %i MHz\n", + pr_debug("currently at %s speed setting - %i MHz\n", (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? "low" : "high", (speed / 1000)); @@ -405,14 +401,14 @@ static int __init speedstep_init(void) /* detect processor */ speedstep_processor = speedstep_detect_processor(); if (!speedstep_processor) { - dprintk("Intel(R) SpeedStep(TM) capable processor " + pr_debug("Intel(R) SpeedStep(TM) capable processor " "not found\n"); return -ENODEV; } /* detect chipset */ if (!speedstep_detect_chipset()) { - dprintk("Intel(R) SpeedStep(TM) for this chipset not " + pr_debug("Intel(R) SpeedStep(TM) for this chipset not " "(yet) available.\n"); return -ENODEV; } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c index a94ec6be69f..8af2d2fd9d5 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c @@ -18,9 +18,6 @@ #include #include "speedstep-lib.h" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-lib", msg) - #define PFX "speedstep-lib: " #ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK @@ -75,7 +72,7 @@ static unsigned int pentium3_get_frequency(enum speedstep_processor processor) /* read MSR 0x2a - we only need the low 32 bits */ rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + pr_debug("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); msr_tmp = msr_lo; /* decode the FSB */ @@ -89,7 +86,7 @@ static unsigned int pentium3_get_frequency(enum speedstep_processor processor) /* decode the multiplier */ if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - dprintk("workaround for early PIIIs\n"); + pr_debug("workaround for early PIIIs\n"); msr_lo &= 0x03c00000; } else msr_lo &= 0x0bc00000; @@ -100,7 +97,7 @@ static unsigned int pentium3_get_frequency(enum speedstep_processor processor) j++; } - dprintk("speed is %u\n", + pr_debug("speed is %u\n", (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; @@ -112,7 +109,7 @@ static unsigned int pentiumM_get_frequency(void) u32 msr_lo, msr_tmp; rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); + pr_debug("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); /* see table B-2 of 24547212.pdf */ if (msr_lo & 0x00040000) { @@ -122,7 +119,7 @@ static unsigned int pentiumM_get_frequency(void) } msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", + pr_debug("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * 100 * 1000)); return msr_tmp * 100 * 1000; @@ -160,11 +157,11 @@ static unsigned int pentium_core_get_frequency(void) } rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", + pr_debug("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); msr_tmp = (msr_lo >> 22) & 0x1f; - dprintk("bits 22-26 are 0x%x, speed is %u\n", + pr_debug("bits 22-26 are 0x%x, speed is %u\n", msr_tmp, (msr_tmp * fsb)); ret = (msr_tmp * fsb); @@ -190,7 +187,7 @@ static unsigned int pentium4_get_frequency(void) rdmsr(0x2c, msr_lo, msr_hi); - dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); + pr_debug("P4 - MSR_EBC_FREQUENCY_ID: 
0x%x 0x%x\n", msr_lo, msr_hi); /* decode the FSB: see IA-32 Intel (C) Architecture Software * Developer's Manual, Volume 3: System Prgramming Guide, @@ -217,7 +214,7 @@ static unsigned int pentium4_get_frequency(void) /* Multiplier. */ mult = msr_lo >> 24; - dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", + pr_debug("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", fsb, mult, (fsb * mult)); ret = (fsb * mult); @@ -257,7 +254,7 @@ unsigned int speedstep_detect_processor(void) struct cpuinfo_x86 *c = &cpu_data(0); u32 ebx, msr_lo, msr_hi; - dprintk("x86: %x, model: %x\n", c->x86, c->x86_model); + pr_debug("x86: %x, model: %x\n", c->x86, c->x86_model); if ((c->x86_vendor != X86_VENDOR_INTEL) || ((c->x86 != 6) && (c->x86 != 0xF))) @@ -272,7 +269,7 @@ unsigned int speedstep_detect_processor(void) ebx = cpuid_ebx(0x00000001); ebx &= 0x000000FF; - dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); + pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); switch (c->x86_mask) { case 4: @@ -327,7 +324,7 @@ unsigned int speedstep_detect_processor(void) /* cpuid_ebx(1) is 0x04 for desktop PIII, * 0x06 for mobile PIII-M */ ebx = cpuid_ebx(0x00000001); - dprintk("ebx is %x\n", ebx); + pr_debug("ebx is %x\n", ebx); ebx &= 0x000000FF; @@ -344,7 +341,7 @@ unsigned int speedstep_detect_processor(void) /* all mobile PIII Coppermines have FSB 100 MHz * ==> sort out a few desktop PIIIs. */ rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", + pr_debug("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", msr_lo, msr_hi); msr_lo &= 0x00c0000; if (msr_lo != 0x0080000) @@ -357,12 +354,12 @@ unsigned int speedstep_detect_processor(void) * bit 56 or 57 is set */ rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", + pr_debug("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", msr_lo, msr_hi); if ((msr_hi & (1<<18)) && (relaxed_check ? 
1 : (msr_hi & (3<<24)))) { if (c->x86_mask == 0x01) { - dprintk("early PIII version\n"); + pr_debug("early PIII version\n"); return SPEEDSTEP_CPU_PIII_C_EARLY; } else return SPEEDSTEP_CPU_PIII_C; @@ -393,14 +390,14 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) return -EINVAL; - dprintk("trying to determine both speeds\n"); + pr_debug("trying to determine both speeds\n"); /* get current speed */ prev_speed = speedstep_get_frequency(processor); if (!prev_speed) return -EIO; - dprintk("previous speed is %u\n", prev_speed); + pr_debug("previous speed is %u\n", prev_speed); local_irq_save(flags); @@ -412,7 +409,7 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, goto out; } - dprintk("low speed is %u\n", *low_speed); + pr_debug("low speed is %u\n", *low_speed); /* start latency measurement */ if (transition_latency) @@ -431,7 +428,7 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, goto out; } - dprintk("high speed is %u\n", *high_speed); + pr_debug("high speed is %u\n", *high_speed); if (*low_speed == *high_speed) { ret = -ENODEV; @@ -445,7 +442,7 @@ unsigned int speedstep_get_freqs(enum speedstep_processor processor, if (transition_latency) { *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + tv2.tv_usec - tv1.tv_usec; - dprintk("transition latency is %u uSec\n", *transition_latency); + pr_debug("transition latency is %u uSec\n", *transition_latency); /* convert uSec to nSec and add 20% for safety reasons */ *transition_latency *= 1200; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 91bc25b67bc..c76ead3490b 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -55,9 +55,6 @@ static struct cpufreq_frequency_table speedstep_freqs[] = { * of DMA activity going on? */ #define SMI_TRIES 5 -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ - "speedstep-smi", msg) - /** * speedstep_smi_ownership */ @@ -70,7 +67,7 @@ static int speedstep_smi_ownership(void) command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); magic = virt_to_phys(magic_data); - dprintk("trying to obtain ownership with command %x at port %x\n", + pr_debug("trying to obtain ownership with command %x at port %x\n", command, smi_port); __asm__ __volatile__( @@ -85,7 +82,7 @@ static int speedstep_smi_ownership(void) : "memory" ); - dprintk("result is %x\n", result); + pr_debug("result is %x\n", result); return result; } @@ -106,13 +103,13 @@ static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) u32 function = GET_SPEEDSTEP_FREQS; if (!(ist_info.event & 0xFFFF)) { - dprintk("bug #1422 -- can't read freqs from BIOS\n"); + pr_debug("bug #1422 -- can't read freqs from BIOS\n"); return -ENODEV; } command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - dprintk("trying to determine frequencies with command %x at port %x\n", + pr_debug("trying to determine frequencies with command %x at port %x\n", command, smi_port); __asm__ __volatile__( @@ -129,7 +126,7 @@ static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) "d" (smi_port), "S" (0), "D" (0) ); - dprintk("result %x, low_freq %u, high_freq %u\n", + pr_debug("result %x, low_freq %u, high_freq %u\n", result, low_mhz, high_mhz); /* abort if results are obviously incorrect... 
*/ @@ -154,7 +151,7 @@ static int speedstep_get_state(void) command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - dprintk("trying to determine current setting with command %x " + pr_debug("trying to determine current setting with command %x " "at port %x\n", command, smi_port); __asm__ __volatile__( @@ -168,7 +165,7 @@ static int speedstep_get_state(void) "d" (smi_port), "S" (0), "D" (0) ); - dprintk("state is %x, result is %x\n", state, result); + pr_debug("state is %x, result is %x\n", state, result); return state & 1; } @@ -194,13 +191,13 @@ static void speedstep_set_state(unsigned int state) command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - dprintk("trying to set frequency to state %u " + pr_debug("trying to set frequency to state %u " "with command %x at port %x\n", state, command, smi_port); do { if (retry) { - dprintk("retry %u, previous result %u, waiting...\n", + pr_debug("retry %u, previous result %u, waiting...\n", retry, result); mdelay(retry * 50); } @@ -221,7 +218,7 @@ static void speedstep_set_state(unsigned int state) local_irq_restore(flags); if (new_state == state) - dprintk("change to %u MHz succeeded after %u tries " + pr_debug("change to %u MHz succeeded after %u tries " "with result %u\n", (speedstep_freqs[new_state].frequency / 1000), retry, result); @@ -292,7 +289,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) result = speedstep_smi_ownership(); if (result) { - dprintk("fails in acquiring ownership of a SMI interface.\n"); + pr_debug("fails in acquiring ownership of a SMI interface.\n"); return -EINVAL; } @@ -304,7 +301,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) if (result) { /* fall back to speedstep_lib.c dection mechanism: * try both states out */ - dprintk("could not detect low and high frequencies " + pr_debug("could not detect low and high frequencies " "by SMI call.\n"); result = speedstep_get_freqs(speedstep_processor, low, high, @@ -312,18 +309,18 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) &speedstep_set_state); if (result) { - dprintk("could not detect two different speeds" + pr_debug("could not detect two different speeds" " -- aborting.\n"); return result; } else - dprintk("workaround worked.\n"); + pr_debug("workaround worked.\n"); } /* get current speed setting */ state = speedstep_get_state(); speed = speedstep_freqs[state].frequency; - dprintk("currently at %s speed setting - %i MHz\n", + pr_debug("currently at %s speed setting - %i MHz\n", (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) ? 
"low" : "high", (speed / 1000)); @@ -360,7 +357,7 @@ static int speedstep_resume(struct cpufreq_policy *policy) int result = speedstep_smi_ownership(); if (result) - dprintk("fails in re-acquiring ownership of a SMI interface.\n"); + pr_debug("fails in re-acquiring ownership of a SMI interface.\n"); return result; } @@ -403,12 +400,12 @@ static int __init speedstep_init(void) } if (!speedstep_processor) { - dprintk("No supported Intel CPU detected.\n"); + pr_debug("No supported Intel CPU detected.\n"); return -ENODEV; } - dprintk("signature:0x%.8lx, command:0x%.8lx, " - "event:0x%.8lx, perf_level:0x%.8lx.\n", + pr_debug("signature:0x%.8ulx, command:0x%.8ulx, " + "event:0x%.8ulx, perf_level:0x%.8ulx.\n", ist_info.signature, ist_info.command, ist_info.event, ist_info.perf_level); diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 3a73a93596e..85b32376dad 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -49,10 +49,6 @@ ACPI_MODULE_NAME("processor_perflib"); static DEFINE_MUTEX(performance_mutex); -/* Use cpufreq debug layer for _PPC changes. */ -#define cpufreq_printk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \ - "cpufreq-core", msg) - /* * _PPC support is implemented as a CPUfreq policy notifier: * This means each time a CPUfreq driver registered also with @@ -145,7 +141,7 @@ static int acpi_processor_get_platform_limit(struct acpi_processor *pr) return -ENODEV; } - cpufreq_printk("CPU %d: _PPC is %d - frequency %s limited\n", pr->id, + pr_debug("CPU %d: _PPC is %d - frequency %s limited\n", pr->id, (int)ppc, ppc ? "" : "not"); pr->performance_platform_limit = (int)ppc; diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index ca8ee8093d6..b78baa547ef 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -18,19 +18,6 @@ if CPU_FREQ config CPU_FREQ_TABLE tristate -config CPU_FREQ_DEBUG - bool "Enable CPUfreq debugging" - help - Say Y here to enable CPUfreq subsystem (including drivers) - debugging. You will need to activate it via the kernel - command line by passing - cpufreq.debug= - - To get , add - 1 to activate CPUfreq core debugging, - 2 to activate CPUfreq drivers debugging, and - 4 to activate CPUfreq governor debugging - config CPU_FREQ_STAT tristate "CPU frequency translation statistics" select CPU_FREQ_TABLE diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 7c10f96c5ae..1e08af43ae7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -32,9 +32,6 @@ #include -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, \ - "cpufreq-core", msg) - /** * The "cpufreq driver" - the arch- or hardware-dependent low * level driver of CPUFreq support, and its spinlock. This lock @@ -180,93 +177,6 @@ void cpufreq_cpu_put(struct cpufreq_policy *data) EXPORT_SYMBOL_GPL(cpufreq_cpu_put); -/********************************************************************* - * UNIFIED DEBUG HELPERS * - *********************************************************************/ -#ifdef CONFIG_CPU_FREQ_DEBUG - -/* what part(s) of the CPUfreq subsystem are debugged? */ -static unsigned int debug; - -/* is the debug output ratelimit'ed using printk_ratelimit? User can - * set or modify this value. - */ -static unsigned int debug_ratelimit = 1; - -/* is the printk_ratelimit'ing enabled? 
It's enabled after a successful - * loading of a cpufreq driver, temporarily disabled when a new policy - * is set, and disabled upon cpufreq driver removal - */ -static unsigned int disable_ratelimit = 1; -static DEFINE_SPINLOCK(disable_ratelimit_lock); - -static void cpufreq_debug_enable_ratelimit(void) -{ - unsigned long flags; - - spin_lock_irqsave(&disable_ratelimit_lock, flags); - if (disable_ratelimit) - disable_ratelimit--; - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); -} - -static void cpufreq_debug_disable_ratelimit(void) -{ - unsigned long flags; - - spin_lock_irqsave(&disable_ratelimit_lock, flags); - disable_ratelimit++; - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); -} - -void cpufreq_debug_printk(unsigned int type, const char *prefix, - const char *fmt, ...) -{ - char s[256]; - va_list args; - unsigned int len; - unsigned long flags; - - WARN_ON(!prefix); - if (type & debug) { - spin_lock_irqsave(&disable_ratelimit_lock, flags); - if (!disable_ratelimit && debug_ratelimit - && !printk_ratelimit()) { - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); - return; - } - spin_unlock_irqrestore(&disable_ratelimit_lock, flags); - - len = snprintf(s, 256, KERN_DEBUG "%s: ", prefix); - - va_start(args, fmt); - len += vsnprintf(&s[len], (256 - len), fmt, args); - va_end(args); - - printk(s); - - WARN_ON(len < 5); - } -} -EXPORT_SYMBOL(cpufreq_debug_printk); - - -module_param(debug, uint, 0644); -MODULE_PARM_DESC(debug, "CPUfreq debugging: add 1 to debug core," - " 2 to debug drivers, and 4 to debug governors."); - -module_param(debug_ratelimit, uint, 0644); -MODULE_PARM_DESC(debug_ratelimit, "CPUfreq debugging:" - " set to 0 to disable ratelimiting."); - -#else /* !CONFIG_CPU_FREQ_DEBUG */ - -static inline void cpufreq_debug_enable_ratelimit(void) { return; } -static inline void cpufreq_debug_disable_ratelimit(void) { return; } - -#endif /* CONFIG_CPU_FREQ_DEBUG */ - - /********************************************************************* * EXTERNALLY AFFECTING FREQUENCY CHANGES * *********************************************************************/ @@ -291,7 +201,7 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) if (!l_p_j_ref_freq) { l_p_j_ref = loops_per_jiffy; l_p_j_ref_freq = ci->old; - dprintk("saving %lu as reference value for loops_per_jiffy; " + pr_debug("saving %lu as reference value for loops_per_jiffy; " "freq is %u kHz\n", l_p_j_ref, l_p_j_ref_freq); } if ((val == CPUFREQ_PRECHANGE && ci->old < ci->new) || @@ -299,7 +209,7 @@ static void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) (val == CPUFREQ_RESUMECHANGE || val == CPUFREQ_SUSPENDCHANGE)) { loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, ci->new); - dprintk("scaling loops_per_jiffy to %lu " + pr_debug("scaling loops_per_jiffy to %lu " "for frequency %u kHz\n", loops_per_jiffy, ci->new); } } @@ -326,7 +236,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) BUG_ON(irqs_disabled()); freqs->flags = cpufreq_driver->flags; - dprintk("notification %u of frequency transition to %u kHz\n", + pr_debug("notification %u of frequency transition to %u kHz\n", state, freqs->new); policy = per_cpu(cpufreq_cpu_data, freqs->cpu); @@ -340,7 +250,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) if (!(cpufreq_driver->flags & CPUFREQ_CONST_LOOPS)) { if ((policy) && (policy->cpu == freqs->cpu) && (policy->cur) && (policy->cur != freqs->old)) { - dprintk("Warning: CPU frequency is" + 
pr_debug("Warning: CPU frequency is" " %u, cpufreq assumed %u kHz.\n", freqs->old, policy->cur); freqs->old = policy->cur; @@ -353,7 +263,7 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) case CPUFREQ_POSTCHANGE: adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); - dprintk("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new, + pr_debug("FREQ: %lu - CPU: %lu", (unsigned long)freqs->new, (unsigned long)freqs->cpu); trace_power_frequency(POWER_PSTATE, freqs->new, freqs->cpu); trace_cpu_frequency(freqs->new, freqs->cpu); @@ -753,7 +663,7 @@ no_policy: static void cpufreq_sysfs_release(struct kobject *kobj) { struct cpufreq_policy *policy = to_policy(kobj); - dprintk("last reference is dropped\n"); + pr_debug("last reference is dropped\n"); complete(&policy->kobj_unregister); } @@ -788,7 +698,7 @@ static int cpufreq_add_dev_policy(unsigned int cpu, gov = __find_governor(per_cpu(cpufreq_cpu_governor, cpu)); if (gov) { policy->governor = gov; - dprintk("Restoring governor %s for cpu %d\n", + pr_debug("Restoring governor %s for cpu %d\n", policy->governor->name, cpu); } #endif @@ -824,7 +734,7 @@ static int cpufreq_add_dev_policy(unsigned int cpu, per_cpu(cpufreq_cpu_data, cpu) = managed_policy; spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - dprintk("CPU already managed, adding link\n"); + pr_debug("CPU already managed, adding link\n"); ret = sysfs_create_link(&sys_dev->kobj, &managed_policy->kobj, "cpufreq"); @@ -865,7 +775,7 @@ static int cpufreq_add_dev_symlink(unsigned int cpu, if (!cpu_online(j)) continue; - dprintk("CPU %u already managed, adding link\n", j); + pr_debug("CPU %u already managed, adding link\n", j); managed_policy = cpufreq_cpu_get(cpu); cpu_sys_dev = get_cpu_sysdev(j); ret = sysfs_create_link(&cpu_sys_dev->kobj, &policy->kobj, @@ -941,7 +851,7 @@ static int cpufreq_add_dev_interface(unsigned int cpu, policy->user_policy.governor = policy->governor; if (ret) { - dprintk("setting policy failed\n"); + pr_debug("setting policy failed\n"); if (cpufreq_driver->exit) cpufreq_driver->exit(policy); } @@ -977,8 +887,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) if (cpu_is_offline(cpu)) return 0; - cpufreq_debug_disable_ratelimit(); - dprintk("adding CPU %u\n", cpu); + pr_debug("adding CPU %u\n", cpu); #ifdef CONFIG_SMP /* check whether a different CPU already registered this @@ -986,7 +895,6 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) policy = cpufreq_cpu_get(cpu); if (unlikely(policy)) { cpufreq_cpu_put(policy); - cpufreq_debug_enable_ratelimit(); return 0; } #endif @@ -1037,7 +945,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) */ ret = cpufreq_driver->init(policy); if (ret) { - dprintk("initialization failed\n"); + pr_debug("initialization failed\n"); goto err_unlock_policy; } policy->user_policy.min = policy->min; @@ -1063,8 +971,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) kobject_uevent(&policy->kobj, KOBJ_ADD); module_put(cpufreq_driver->owner); - dprintk("initialization complete\n"); - cpufreq_debug_enable_ratelimit(); + pr_debug("initialization complete\n"); return 0; @@ -1088,7 +995,6 @@ err_free_policy: nomem_out: module_put(cpufreq_driver->owner); module_out: - cpufreq_debug_enable_ratelimit(); return ret; } @@ -1112,15 +1018,13 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) unsigned int j; #endif - cpufreq_debug_disable_ratelimit(); - dprintk("unregistering CPU %u\n", cpu); + pr_debug("unregistering CPU %u\n", cpu); spin_lock_irqsave(&cpufreq_driver_lock, flags); data = 
per_cpu(cpufreq_cpu_data, cpu); if (!data) { spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - cpufreq_debug_enable_ratelimit(); unlock_policy_rwsem_write(cpu); return -EINVAL; } @@ -1132,12 +1036,11 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) * only need to unlink, put and exit */ if (unlikely(cpu != data->cpu)) { - dprintk("removing link\n"); + pr_debug("removing link\n"); cpumask_clear_cpu(cpu, data->cpus); spin_unlock_irqrestore(&cpufreq_driver_lock, flags); kobj = &sys_dev->kobj; cpufreq_cpu_put(data); - cpufreq_debug_enable_ratelimit(); unlock_policy_rwsem_write(cpu); sysfs_remove_link(kobj, "cpufreq"); return 0; @@ -1170,7 +1073,7 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) for_each_cpu(j, data->cpus) { if (j == cpu) continue; - dprintk("removing link for cpu %u\n", j); + pr_debug("removing link for cpu %u\n", j); #ifdef CONFIG_HOTPLUG_CPU strncpy(per_cpu(cpufreq_cpu_governor, j), data->governor->name, CPUFREQ_NAME_LEN); @@ -1199,17 +1102,15 @@ static int __cpufreq_remove_dev(struct sys_device *sys_dev) * not referenced anymore by anybody before we proceed with * unloading. */ - dprintk("waiting for dropping of refcount\n"); + pr_debug("waiting for dropping of refcount\n"); wait_for_completion(cmp); - dprintk("wait complete\n"); + pr_debug("wait complete\n"); lock_policy_rwsem_write(cpu); if (cpufreq_driver->exit) cpufreq_driver->exit(data); unlock_policy_rwsem_write(cpu); - cpufreq_debug_enable_ratelimit(); - #ifdef CONFIG_HOTPLUG_CPU /* when the CPU which is the parent of the kobj is hotplugged * offline, check for siblings, and create cpufreq sysfs interface @@ -1255,7 +1156,7 @@ static void handle_update(struct work_struct *work) struct cpufreq_policy *policy = container_of(work, struct cpufreq_policy, update); unsigned int cpu = policy->cpu; - dprintk("handle_update for cpu %u called\n", cpu); + pr_debug("handle_update for cpu %u called\n", cpu); cpufreq_update_policy(cpu); } @@ -1273,7 +1174,7 @@ static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, { struct cpufreq_freqs freqs; - dprintk("Warning: CPU frequency out of sync: cpufreq and timing " + pr_debug("Warning: CPU frequency out of sync: cpufreq and timing " "core thinks of %u, is %u kHz.\n", old_freq, new_freq); freqs.cpu = cpu; @@ -1376,7 +1277,7 @@ static int cpufreq_bp_suspend(void) int cpu = smp_processor_id(); struct cpufreq_policy *cpu_policy; - dprintk("suspending cpu %u\n", cpu); + pr_debug("suspending cpu %u\n", cpu); /* If there's no policy for the boot CPU, we have nothing to do. */ cpu_policy = cpufreq_cpu_get(cpu); @@ -1414,7 +1315,7 @@ static void cpufreq_bp_resume(void) int cpu = smp_processor_id(); struct cpufreq_policy *cpu_policy; - dprintk("resuming cpu %u\n", cpu); + pr_debug("resuming cpu %u\n", cpu); /* If there's no policy for the boot CPU, we have nothing to do. 
*/ cpu_policy = cpufreq_cpu_get(cpu); @@ -1526,7 +1427,7 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy, { int retval = -EINVAL; - dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu, + pr_debug("target for CPU %u: %u kHz, relation %u\n", policy->cpu, target_freq, relation); if (cpu_online(policy->cpu) && cpufreq_driver->target) retval = cpufreq_driver->target(policy, target_freq, relation); @@ -1612,7 +1513,7 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, if (!try_module_get(policy->governor->owner)) return -EINVAL; - dprintk("__cpufreq_governor for CPU %u, event %u\n", + pr_debug("__cpufreq_governor for CPU %u, event %u\n", policy->cpu, event); ret = policy->governor->governor(policy, event); @@ -1713,8 +1614,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, { int ret = 0; - cpufreq_debug_disable_ratelimit(); - dprintk("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu, + pr_debug("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu, policy->min, policy->max); memcpy(&policy->cpuinfo, &data->cpuinfo, @@ -1751,19 +1651,19 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, data->min = policy->min; data->max = policy->max; - dprintk("new min and max freqs are %u - %u kHz\n", + pr_debug("new min and max freqs are %u - %u kHz\n", data->min, data->max); if (cpufreq_driver->setpolicy) { data->policy = policy->policy; - dprintk("setting range\n"); + pr_debug("setting range\n"); ret = cpufreq_driver->setpolicy(policy); } else { if (policy->governor != data->governor) { /* save old, working values */ struct cpufreq_governor *old_gov = data->governor; - dprintk("governor switch\n"); + pr_debug("governor switch\n"); /* end old governor */ if (data->governor) @@ -1773,7 +1673,7 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, data->governor = policy->governor; if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { /* new governor failed, so re-start old one */ - dprintk("starting governor %s failed\n", + pr_debug("starting governor %s failed\n", data->governor->name); if (old_gov) { data->governor = old_gov; @@ -1785,12 +1685,11 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, } /* might be a policy change, too, so fall through */ } - dprintk("governor: change or update limits\n"); + pr_debug("governor: change or update limits\n"); __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); } error_out: - cpufreq_debug_enable_ratelimit(); return ret; } @@ -1817,7 +1716,7 @@ int cpufreq_update_policy(unsigned int cpu) goto fail; } - dprintk("updating policy for CPU %u\n", cpu); + pr_debug("updating policy for CPU %u\n", cpu); memcpy(&policy, data, sizeof(struct cpufreq_policy)); policy.min = data->user_policy.min; policy.max = data->user_policy.max; @@ -1829,7 +1728,7 @@ int cpufreq_update_policy(unsigned int cpu) if (cpufreq_driver->get) { policy.cur = cpufreq_driver->get(cpu); if (!data->cur) { - dprintk("Driver did not initialize current freq"); + pr_debug("Driver did not initialize current freq"); data->cur = policy.cur; } else { if (data->cur != policy.cur) @@ -1905,7 +1804,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) ((!driver_data->setpolicy) && (!driver_data->target))) return -EINVAL; - dprintk("trying to register driver %s\n", driver_data->name); + pr_debug("trying to register driver %s\n", driver_data->name); if (driver_data->setpolicy) driver_data->flags |= CPUFREQ_CONST_LOOPS; @@ -1936,15 +1835,14 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data) /* 
if all ->init() calls failed, unregister */ if (ret) { - dprintk("no CPU initialized for driver %s\n", + pr_debug("no CPU initialized for driver %s\n", driver_data->name); goto err_sysdev_unreg; } } register_hotcpu_notifier(&cpufreq_cpu_notifier); - dprintk("driver %s up and running\n", driver_data->name); - cpufreq_debug_enable_ratelimit(); + pr_debug("driver %s up and running\n", driver_data->name); return 0; err_sysdev_unreg: @@ -1971,14 +1869,10 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver) { unsigned long flags; - cpufreq_debug_disable_ratelimit(); - - if (!cpufreq_driver || (driver != cpufreq_driver)) { - cpufreq_debug_enable_ratelimit(); + if (!cpufreq_driver || (driver != cpufreq_driver)) return -EINVAL; - } - dprintk("unregistering driver %s\n", driver->name); + pr_debug("unregistering driver %s\n", driver->name); sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver); unregister_hotcpu_notifier(&cpufreq_cpu_notifier); diff --git a/drivers/cpufreq/cpufreq_performance.c b/drivers/cpufreq/cpufreq_performance.c index 7e2e515087f..f13a8a9af6a 100644 --- a/drivers/cpufreq/cpufreq_performance.c +++ b/drivers/cpufreq/cpufreq_performance.c @@ -15,9 +15,6 @@ #include #include -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_GOVERNOR, "performance", msg) - static int cpufreq_governor_performance(struct cpufreq_policy *policy, unsigned int event) @@ -25,7 +22,7 @@ static int cpufreq_governor_performance(struct cpufreq_policy *policy, switch (event) { case CPUFREQ_GOV_START: case CPUFREQ_GOV_LIMITS: - dprintk("setting to %u kHz because of event %u\n", + pr_debug("setting to %u kHz because of event %u\n", policy->max, event); __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); diff --git a/drivers/cpufreq/cpufreq_powersave.c b/drivers/cpufreq/cpufreq_powersave.c index e6db5faf3eb..4c2eb512f2b 100644 --- a/drivers/cpufreq/cpufreq_powersave.c +++ b/drivers/cpufreq/cpufreq_powersave.c @@ -15,16 +15,13 @@ #include #include -#define dprintk(msg...) \ - cpufreq_debug_printk(CPUFREQ_DEBUG_GOVERNOR, "powersave", msg) - static int cpufreq_governor_powersave(struct cpufreq_policy *policy, unsigned int event) { switch (event) { case CPUFREQ_GOV_START: case CPUFREQ_GOV_LIMITS: - dprintk("setting to %u kHz because of event %u\n", + pr_debug("setting to %u kHz because of event %u\n", policy->min, event); __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); diff --git a/drivers/cpufreq/cpufreq_userspace.c b/drivers/cpufreq/cpufreq_userspace.c index 66d2d1d6c80..f231015904c 100644 --- a/drivers/cpufreq/cpufreq_userspace.c +++ b/drivers/cpufreq/cpufreq_userspace.c @@ -37,9 +37,6 @@ static DEFINE_PER_CPU(unsigned int, cpu_is_managed); static DEFINE_MUTEX(userspace_mutex); static int cpus_using_userspace_governor; -#define dprintk(msg...) 
\ - cpufreq_debug_printk(CPUFREQ_DEBUG_GOVERNOR, "userspace", msg) - /* keep track of frequency transitions */ static int userspace_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -50,7 +47,7 @@ userspace_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!per_cpu(cpu_is_managed, freq->cpu)) return 0; - dprintk("saving cpu_cur_freq of cpu %u to be %u kHz\n", + pr_debug("saving cpu_cur_freq of cpu %u to be %u kHz\n", freq->cpu, freq->new); per_cpu(cpu_cur_freq, freq->cpu) = freq->new; @@ -73,7 +70,7 @@ static int cpufreq_set(struct cpufreq_policy *policy, unsigned int freq) { int ret = -EINVAL; - dprintk("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); + pr_debug("cpufreq_set for cpu %u, freq %u kHz\n", policy->cpu, freq); mutex_lock(&userspace_mutex); if (!per_cpu(cpu_is_managed, policy->cpu)) @@ -134,7 +131,7 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, per_cpu(cpu_max_freq, cpu) = policy->max; per_cpu(cpu_cur_freq, cpu) = policy->cur; per_cpu(cpu_set_freq, cpu) = policy->cur; - dprintk("managing cpu %u started " + pr_debug("managing cpu %u started " "(%u - %u kHz, currently %u kHz)\n", cpu, per_cpu(cpu_min_freq, cpu), @@ -156,12 +153,12 @@ static int cpufreq_governor_userspace(struct cpufreq_policy *policy, per_cpu(cpu_min_freq, cpu) = 0; per_cpu(cpu_max_freq, cpu) = 0; per_cpu(cpu_set_freq, cpu) = 0; - dprintk("managing cpu %u stopped\n", cpu); + pr_debug("managing cpu %u stopped\n", cpu); mutex_unlock(&userspace_mutex); break; case CPUFREQ_GOV_LIMITS: mutex_lock(&userspace_mutex); - dprintk("limit event for cpu %u: %u - %u kHz, " + pr_debug("limit event for cpu %u: %u - %u kHz, " "currently %u kHz, last set to %u kHz\n", cpu, policy->min, policy->max, per_cpu(cpu_cur_freq, cpu), diff --git a/drivers/cpufreq/freq_table.c b/drivers/cpufreq/freq_table.c index 05432216e22..90431cb9280 100644 --- a/drivers/cpufreq/freq_table.c +++ b/drivers/cpufreq/freq_table.c @@ -14,9 +14,6 @@ #include #include -#define dprintk(msg...) 
\ - cpufreq_debug_printk(CPUFREQ_DEBUG_CORE, "freq-table", msg) - /********************************************************************* * FREQUENCY TABLE HELPERS * *********************************************************************/ @@ -31,11 +28,11 @@ int cpufreq_frequency_table_cpuinfo(struct cpufreq_policy *policy, for (i = 0; (table[i].frequency != CPUFREQ_TABLE_END); i++) { unsigned int freq = table[i].frequency; if (freq == CPUFREQ_ENTRY_INVALID) { - dprintk("table entry %u is invalid, skipping\n", i); + pr_debug("table entry %u is invalid, skipping\n", i); continue; } - dprintk("table entry %u: %u kHz, %u index\n", + pr_debug("table entry %u: %u kHz, %u index\n", i, freq, table[i].index); if (freq < min_freq) min_freq = freq; @@ -61,7 +58,7 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy *policy, unsigned int i; unsigned int count = 0; - dprintk("request for verification of policy (%u - %u kHz) for cpu %u\n", + pr_debug("request for verification of policy (%u - %u kHz) for cpu %u\n", policy->min, policy->max, policy->cpu); if (!cpu_online(policy->cpu)) @@ -86,7 +83,7 @@ int cpufreq_frequency_table_verify(struct cpufreq_policy *policy, cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, policy->cpuinfo.max_freq); - dprintk("verification lead to (%u - %u kHz) for cpu %u\n", + pr_debug("verification lead to (%u - %u kHz) for cpu %u\n", policy->min, policy->max, policy->cpu); return 0; @@ -110,7 +107,7 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy, }; unsigned int i; - dprintk("request for target %u kHz (relation: %u) for cpu %u\n", + pr_debug("request for target %u kHz (relation: %u) for cpu %u\n", target_freq, relation, policy->cpu); switch (relation) { @@ -167,7 +164,7 @@ int cpufreq_frequency_table_target(struct cpufreq_policy *policy, } else *index = optimal.index; - dprintk("target is %u (%u kHz, %u)\n", *index, table[*index].frequency, + pr_debug("target is %u (%u kHz, %u)\n", *index, table[*index].frequency, table[*index].index); return 0; @@ -216,14 +213,14 @@ EXPORT_SYMBOL_GPL(cpufreq_freq_attr_scaling_available_freqs); void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table, unsigned int cpu) { - dprintk("setting show_table for cpu %u to %p\n", cpu, table); + pr_debug("setting show_table for cpu %u to %p\n", cpu, table); per_cpu(cpufreq_show_table, cpu) = table; } EXPORT_SYMBOL_GPL(cpufreq_frequency_table_get_attr); void cpufreq_frequency_table_put_attr(unsigned int cpu) { - dprintk("clearing show_table for cpu %u\n", cpu); + pr_debug("clearing show_table for cpu %u\n", cpu); per_cpu(cpufreq_show_table, cpu) = NULL; } EXPORT_SYMBOL_GPL(cpufreq_frequency_table_put_attr); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 9343dd3de85..2845f6e6722 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -397,23 +397,4 @@ void cpufreq_frequency_table_get_attr(struct cpufreq_frequency_table *table, void cpufreq_frequency_table_put_attr(unsigned int cpu); -/********************************************************************* - * UNIFIED DEBUG HELPERS * - *********************************************************************/ - -#define CPUFREQ_DEBUG_CORE 1 -#define CPUFREQ_DEBUG_DRIVER 2 -#define CPUFREQ_DEBUG_GOVERNOR 4 - -#ifdef CONFIG_CPU_FREQ_DEBUG - -extern void cpufreq_debug_printk(unsigned int type, const char *prefix, - const char *fmt, ...); - -#else - -#define cpufreq_debug_printk(msg...) 
do { } while(0) - -#endif /* CONFIG_CPU_FREQ_DEBUG */ - #endif /* _LINUX_CPUFREQ_H */ -- cgit v1.2.3-70-g09d2 From e04d1b23f9706186187dcb0be1a752e48dcc540b Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Fri, 6 May 2011 07:14:02 +0000 Subject: perf events, x86: Add SandyBridge stalled-cycles-frontend/backend events Extend the Intel SandyBridge PMU driver with definitions for generic front-end and back-end stall events. ( As commit 3011203 "perf events, x86: Add Westmere stalled-cycles-frontend/backend events" says, these are only approximations. ) Signed-off-by: Lin Ming Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1304666042-17577-1-git-send-email-ming.m.lin@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e61539b07d2..7cf2ec59c81 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1474,6 +1474,12 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1; + pr_cont("SandyBridge events, "); break; -- cgit v1.2.3-70-g09d2 From 61516587513c84ac26e68e3ab008dc6e965d0378 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Fri, 6 May 2011 09:27:36 -0700 Subject: Correct occurrences of - Documentation/kvm/ to Documentation/virtual/kvm - Documentation/uml/ to Documentation/virtual/uml - Documentation/lguest/ to Documentation/virtual/lguest throughout the kernel source tree. Signed-off-by: Rob Landley Signed-off-by: Randy Dunlap --- Documentation/virtual/kvm/review-checklist.txt | 2 +- Documentation/virtual/lguest/lguest.txt | 3 ++- MAINTAINERS | 4 ++-- arch/x86/lguest/boot.c | 2 +- drivers/lguest/Kconfig | 6 ++++-- drivers/lguest/Makefile | 2 +- drivers/vhost/vhost.c | 2 +- 7 files changed, 12 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/review-checklist.txt b/Documentation/virtual/kvm/review-checklist.txt index 730475ae1b8..a850986ed68 100644 --- a/Documentation/virtual/kvm/review-checklist.txt +++ b/Documentation/virtual/kvm/review-checklist.txt @@ -7,7 +7,7 @@ Review checklist for kvm patches 2. Patches should be against kvm.git master branch. 3. If the patch introduces or modifies a new userspace API: - - the API must be documented in Documentation/kvm/api.txt + - the API must be documented in Documentation/virtual/kvm/api.txt - the API must be discoverable using KVM_CHECK_EXTENSION 4. New state must include support for save/restore. diff --git a/Documentation/virtual/lguest/lguest.txt b/Documentation/virtual/lguest/lguest.txt index dad99978a6a..bff0c554485 100644 --- a/Documentation/virtual/lguest/lguest.txt +++ b/Documentation/virtual/lguest/lguest.txt @@ -74,7 +74,8 @@ Running Lguest: - Run an lguest as root: - Documentation/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 --block=rootfile root=/dev/vda + Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \ + --block=rootfile root=/dev/vda Explanation: 64: the amount of memory to use, in MB. 
diff --git a/MAINTAINERS b/MAINTAINERS index 16a5c5f2c6a..aa9bbd1c0e2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3807,7 +3807,7 @@ M: Rusty Russell L: lguest@lists.ozlabs.org W: http://lguest.ozlabs.org/ S: Odd Fixes -F: Documentation/lguest/ +F: Documentation/virtual/lguest/ F: arch/x86/lguest/ F: drivers/lguest/ F: include/linux/lguest*.h @@ -6618,7 +6618,7 @@ L: user-mode-linux-devel@lists.sourceforge.net L: user-mode-linux-user@lists.sourceforge.net W: http://user-mode-linux.sourceforge.net S: Maintained -F: Documentation/uml/ +F: Documentation/virtual/uml/ F: arch/um/ F: fs/hostfs/ F: fs/hppfs/ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 1cd608973ce..395bf0114aa 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -7,7 +7,7 @@ * kernel and insert a module (lg.ko) which allows us to run other Linux * kernels the same way we'd run processes. We call the first kernel the Host, * and the others the Guests. The program which sets up and configures Guests - * (such as the example in Documentation/lguest/lguest.c) is called the + * (such as the example in Documentation/virtual/lguest/lguest.c) is called the * Launcher. * * Secondly, we only run specially modified Guests, not normal kernels: setting diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index 0aaa0597a62..34ae49dc557 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig @@ -5,8 +5,10 @@ config LGUEST ---help--- This is a very simple module which allows you to run multiple instances of the same Linux kernel, using the - "lguest" command found in the Documentation/lguest directory. + "lguest" command found in the Documentation/virtual/lguest + directory. + Note that "lguest" is pronounced to rhyme with "fell quest", - not "rustyvisor". See Documentation/lguest/lguest.txt. + not "rustyvisor". See Documentation/virtual/lguest/lguest.txt. If unsure, say N. If curious, say M. If masochistic, say Y. diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index 7d463c26124..8ac947c7e7c 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -18,7 +18,7 @@ Mastery: PREFIX=M Beer: @for f in Preparation Guest Drivers Launcher Host Switcher Mastery; do echo "{==- $$f -==}"; make -s $$f; done; echo "{==-==}" Preparation Preparation! Guest Drivers Launcher Host Switcher Mastery: - @sh ../../Documentation/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` + @sh ../../Documentation/virtual/lguest/extract $(PREFIX) `find ../../* -name '*.[chS]' -wholename '*lguest*'` Puppy: @clear @printf " __ \n (___()'\`;\n /, /\`\n \\\\\\\"--\\\\\\ \n" diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 2ab29124163..7aa4eea930f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -4,7 +4,7 @@ * Author: Michael S. Tsirkin * * Inspiration, some code, and most witty comments come from - * Documentation/lguest/lguest.c, by Rusty Russell + * Documentation/virtual/lguest/lguest.c, by Rusty Russell * * This work is licensed under the terms of the GNU GPL, version 2. * -- cgit v1.2.3-70-g09d2 From 8fab6af2156c0100f953fd61f4e0b2f82c9776dc Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 6 May 2011 15:00:09 -0700 Subject: x86: Fix mrst sparse complaints Fix these Sparse complaints: CHECK arch/x86/platform/mrst/mrst.c arch/x86/platform/mrst/mrst.c:197:13: warning: symbol 'mrst_time_init' was not declared. Should it be static? arch/x86/platform/mrst/mrst.c:219:16: warning: symbol 'mrst_arch_setup' was not declared. 
Should it be static? Acked-by: Alan Cox Cc: Roman Gezikov Cc: Joonas Viskari Cc: Andrew Morton Cc: Allen Kao Signed-off-by: Luis R. Rodriguez Link: http://lkml.kernel.org/r/1304719209-26913-1-git-send-email-lrodriguez@atheros.com Signed-off-by: Ingo Molnar --- arch/x86/platform/mrst/mrst.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 275dbc19e2c..7000e74b308 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void) return 0; } -void __init mrst_time_init(void) +static void __init mrst_time_init(void) { sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); switch (mrst_timer_options) { @@ -216,7 +216,7 @@ void __init mrst_time_init(void) apbt_time_init(); } -void __cpuinit mrst_arch_setup(void) +static void __cpuinit mrst_arch_setup(void) { if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; -- cgit v1.2.3-70-g09d2 From 2b5e8ef35bc89eee944c0627edacbb1fea5a1b84 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:42 -0400 Subject: x86, efi: Remove virtual-mode SetVirtualAddressMap call The spec says that SetVirtualAddressMap doesn't work once you're in virtual mode, so there's no point in having infrastructure for calling it from there. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-1-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 0fe27d7c625..f2e4fe9e92a 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -145,17 +145,6 @@ static void virt_efi_reset_system(int reset_type, data_size, data); } -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); -} - static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, @@ -572,7 +561,7 @@ void __init efi_enter_virtual_mode(void) efi.set_variable = virt_efi_set_variable; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + efi.set_virtual_address_map = NULL; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); -- cgit v1.2.3-70-g09d2 From 9cd2b07c197e3ff594fc04f5fb3d86efbeab6ad8 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:43 -0400 Subject: x86, efi: Consolidate EFI nx control The core EFI code and 64-bit EFI code currently have independent implementations of code for setting memory regions as executable or not. Let's consolidate them. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-2-git-send-email-mjg@redhat.com Signed-off-by: H. 
Peter Anvin --- arch/x86/include/asm/efi.h | 1 + arch/x86/platform/efi/efi.c | 21 ++++++++++++++++----- arch/x86/platform/efi/efi_64.c | 28 +++++----------------------- 3 files changed, 22 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8e4a16508d4..7093e4a6a0b 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; +extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern void efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f2e4fe9e92a..7daae27e975 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -457,11 +457,25 @@ void __init efi_init(void) #endif } +void __init efi_set_executable(efi_memory_desc_t *md, bool executable) +{ + u64 addr, npages; + + addr = md->virt_addr; + npages = md->num_pages; + + memrange_efi_to_native(&addr, &npages); + + if (executable) + set_memory_x(addr, npages); + else + set_memory_nx(addr, npages); +} + static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; void *p; - u64 addr, npages; /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -470,10 +484,7 @@ static void __init runtime_code_page_mkexec(void) if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); + efi_set_executable(md, true); } } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac0621a7ac3..94d6b394b65 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -41,22 +41,7 @@ static pgd_t save_pgd __initdata; static unsigned long efi_flags __initdata; -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) +static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; void *p; @@ -67,11 +52,8 @@ static void __init early_runtime_code_mapping_set_exec(int executable) /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); - } + if (md->type == EFI_RUNTIME_SERVICES_CODE) + efi_set_executable(md, executable); } } @@ -79,7 +61,7 @@ void __init efi_call_phys_prelog(void) { unsigned long vaddress; - early_runtime_code_mapping_set_exec(1); + early_code_mapping_set_exec(1); local_irq_save(efi_flags); vaddress = (unsigned long)__va(0x0UL); save_pgd = *pgd_offset_k(0x0UL); @@ -95,7 +77,7 @@ void __init efi_call_phys_epilog(void) set_pgd(pgd_offset_k(0x0UL), save_pgd); __flush_tlb_all(); local_irq_restore(efi_flags); - 
early_runtime_code_mapping_set_exec(0); + early_code_mapping_set_exec(0); } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, -- cgit v1.2.3-70-g09d2 From 202f9d0a41809e3424af5f61489b48b622824aed Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:44 -0400 Subject: x86, efi: Merge contiguous memory regions of the same type and attribute Some firmware implementations assume that physically contiguous regions will be contiguous in virtual address space. This assumption is, obviously, entirely unjustifiable. Said firmware implementations lack the good grace to handle their failings in a measured and reasonable manner, instead tending to shit all over address space and oopsing the kernel. In an ideal universe these firmware implementations would simultaneously catch fire and cease to be a problem, but since some of them are present in attractively thin and shiny metal devices vanity wins out and some poor developer spends an extended period of time surrounded by a growing array of empty bottles until the underlying reason becomes apparent. Said developer presents this patch, which simply merges adjacent regions if they happen to be contiguous and have the same EFI memory type and caching attributes. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-3-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 7daae27e975..a46a73ecc5f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -498,13 +498,41 @@ static void __init runtime_code_page_mkexec(void) */ void __init efi_enter_virtual_mode(void) { - efi_memory_desc_t *md; + efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; + + /* Merge contiguous regions of the same type and attribute */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + u64 prev_size; + md = p; + + if (!prev_md) { + prev_md = md; + continue; + } + + if (prev_md->type != md->type || + prev_md->attribute != md->attribute) { + prev_md = md; + continue; + } + + prev_size = prev_md->num_pages << EFI_PAGE_SHIFT; + + if (md->phys_addr == (prev_md->phys_addr + prev_size)) { + prev_md->num_pages += md->num_pages; + md->type = EFI_RESERVED_TYPE; + md->attribute = 0; + continue; + } + prev_md = md; + } + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) -- cgit v1.2.3-70-g09d2 From 7cb00b72876ea2451eb79d468da0e8fb9134aa8a Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:45 -0400 Subject: x86, efi: Pass a minimal map to SetVirtualAddressMap() Experimentation with various EFI implementations has shown that functions outside runtime services will still update their pointers if SetVirtualAddressMap() is called with memory descriptors outside the runtime area. This is obviously insane, and therefore is unsurprising. Evidence from instrumenting another EFI implementation suggests that it only passes the set of descriptors covering runtime regions, so let's avoid any problems by doing the same. Runtime descriptors are copied to a separate memory map, and only that map is passed back to the firmware. 
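A minimal standalone sketch of the filtering idea, in plain userspace C (an editor-added illustration, not code from the patch; the struct below is a simplified stand-in for efi_memory_desc_t, and the sample regions are made up):

/* Build a reduced memory map containing only runtime descriptors,
 * mirroring the krealloc()/memcpy() loop the patch adds. */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EFI_MEMORY_RUNTIME (1ULL << 63)	/* matches the real EFI attribute bit */

struct desc {	/* simplified stand-in for efi_memory_desc_t */
	uint64_t phys_addr;
	uint64_t num_pages;
	uint64_t attribute;
};

int main(void)
{
	struct desc map[] = {
		{ 0x00001000, 16, 0 },			/* boot-services only */
		{ 0x00090000,  8, EFI_MEMORY_RUNTIME },	/* runtime code */
		{ 0x000a0000,  4, EFI_MEMORY_RUNTIME },	/* runtime data */
	};
	struct desc *new_map = NULL, *tmp;
	size_t i, count = 0;

	/* Copy only descriptors that cover runtime regions. */
	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
		if (!(map[i].attribute & EFI_MEMORY_RUNTIME))
			continue;
		tmp = realloc(new_map, (count + 1) * sizeof(*new_map));
		if (!tmp) {
			free(new_map);
			return 1;
		}
		new_map = tmp;
		memcpy(&new_map[count], &map[i], sizeof(*new_map));
		count++;
	}

	/* Only this reduced map would be handed to SetVirtualAddressMap(). */
	for (i = 0; i < count; i++)
		printf("runtime region at 0x%" PRIx64 ", %" PRIu64 " pages\n",
		       new_map[i].phys_addr, new_map[i].num_pages);
	free(new_map);
	return 0;
}

Note that the kernel loop below copies memmap.desc_size bytes per record rather than sizeof(struct), because the firmware defines its own descriptor size, which may be larger than the structure the kernel declares.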
Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-4-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a46a73ecc5f..b30aa26a8df 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -502,7 +502,8 @@ void __init efi_enter_virtual_mode(void) efi_status_t status; unsigned long size; u64 end, systab, addr, npages, end_pfn; - void *p, *va; + void *p, *va, *new_memmap = NULL; + int count = 0; efi.systab = NULL; @@ -569,15 +570,21 @@ void __init efi_enter_virtual_mode(void) systab += md->virt_addr - md->phys_addr; efi.systab = (efi_system_table_t *) (unsigned long) systab; } + new_memmap = krealloc(new_memmap, + (count + 1) * memmap.desc_size, + GFP_KERNEL); + memcpy(new_memmap + (count * memmap.desc_size), md, + memmap.desc_size); + count++; } BUG_ON(!efi.systab); status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, + memmap.desc_size * count, memmap.desc_size, memmap.desc_version, - memmap.phys_map); + (efi_memory_desc_t *)__pa(new_memmap)); if (status != EFI_SUCCESS) { printk(KERN_ALERT "Unable to switch EFI into virtual mode " @@ -605,6 +612,7 @@ void __init efi_enter_virtual_mode(void) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; + kfree(new_memmap); } /* -- cgit v1.2.3-70-g09d2 From 935a638241b0658b9749edd060f972575f9d4a78 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Thu, 5 May 2011 15:19:46 -0400 Subject: x86, efi: Ensure that the entirety of a region is mapped It's possible for init_memory_mapping() to fail to map the entire region if it crosses a boundary, so ensure that we complete the mapping. Signed-off-by: Matthew Garrett Link: http://lkml.kernel.org/r/1304623186-18261-5-git-send-email-mjg@redhat.com Signed-off-by: H. Peter Anvin --- arch/x86/platform/efi/efi_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 94d6b394b65..2649426a790 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -89,8 +89,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, return ioremap(phys_addr, size); last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { + unsigned long top = last_map_pfn << PAGE_SHIFT; + efi_ioremap(top, size - (top - phys_addr), type); + } return (void __iomem *)__va(phys_addr); } -- cgit v1.2.3-70-g09d2 From e969687595c27e02e02be0c9363261826123ba77 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 5 Nov 2010 16:12:35 -0700 Subject: arch/x86/kernel/pci-iommu_table.c: Convert sprint_symbol to %pS Coalesce format as well.
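For reference, a minimal sketch of what this kind of conversion looks like, in a hypothetical module (names here are illustrative, not code from the patch): the %pS printk extension resolves a code address to its symbol directly, replacing the sprint_symbol() plus temporary KSYM_SYMBOL_LEN buffer pattern removed below.

#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/module.h>

static int __init pps_demo_init(void)
{
	char buf[KSYM_SYMBOL_LEN];

	/* Old style: resolve the address into a stack buffer, then print. */
	sprint_symbol(buf, (unsigned long)pps_demo_init);
	printk(KERN_INFO "old: %s\n", buf);

	/* New style: let the printk core resolve the pointer via %pS. */
	printk(KERN_INFO "new: %pS\n", pps_demo_init);

	return 0;
}
module_init(pps_demo_init);
MODULE_LICENSE("GPL");

Besides dropping two KSYM_SYMBOL_LEN stack buffers, this lets the format string be coalesced onto one line, which is what "coalesce format" refers to below.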
Signed-off-by: Joe Perches Signed-off-by: Joerg Roedel --- arch/x86/kernel/pci-iommu_table.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c index 55d745ec118..35ccf75696e 100644 --- a/arch/x86/kernel/pci-iommu_table.c +++ b/arch/x86/kernel/pci-iommu_table.c @@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start, struct iommu_table_entry *finish) { struct iommu_table_entry *p, *q, *x; - char sym_p[KSYM_SYMBOL_LEN]; - char sym_q[KSYM_SYMBOL_LEN]; /* Simple cyclic dependency checker. */ for (p = start; p < finish; p++) { q = find_dependents_of(start, finish, p); x = find_dependents_of(start, finish, q); if (p == x) { - sprint_symbol(sym_p, (unsigned long)p->detect); - sprint_symbol(sym_q, (unsigned long)q->detect); - - printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \ - " on %s and vice-versa. BREAKING IT.\n", - sym_p, sym_q); + printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n", + p->detect, q->detect); /* Heavy handed way..*/ x->depend = 0; } @@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start, for (p = start; p < finish; p++) { q = find_dependents_of(p, finish, p); if (q && q > p) { - sprint_symbol(sym_p, (unsigned long)p->detect); - sprint_symbol(sym_q, (unsigned long)q->detect); - - printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\ - "should be called before %s!\n", - sym_p, sym_q); + printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n", + p->detect, q->detect); } } } -- cgit v1.2.3-70-g09d2 From 72fe00f01f9a3240a1073be27aeaf4fc476cc662 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 10 May 2011 10:50:42 +0200 Subject: x86/amd-iommu: Use threaded interrupt handler Move the interrupt handling for the IOMMU into an interrupt thread, to reduce latencies and to prepare interrupt handling for PRI (Page Request Interface) handling.
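A minimal sketch of the handler split adopted here (hypothetical driver code, not from the patch; the function names and empty bodies are placeholders, and as in the patch the hard handler wakes the thread unconditionally, while the real thread function goes on to poll the IOMMU event log):

#include <linux/interrupt.h>
#include <linux/module.h>

static irqreturn_t demo_hardirq(int irq, void *data)
{
	/* Runs in hard interrupt context: do the bare minimum and
	 * hand the rest to the IRQ thread. */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *data)
{
	/* Runs in a dedicated kernel thread with interrupts enabled:
	 * may sleep, take mutexes, and do the heavy lifting. */
	return IRQ_HANDLED;
}

static int demo_setup_irq(unsigned int irq, void *dev)
{
	/* Same call shape as the request_threaded_irq() conversion below. */
	return request_threaded_irq(irq, demo_hardirq, demo_thread_fn,
				    0, "demo-threaded-irq", dev);
}

Keeping the hard handler trivial is what reduces interrupt-off latency; the expensive event-log processing moves into a schedulable thread.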
Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu_proto.h | 1 + arch/x86/kernel/amd_iommu.c | 7 ++++++- arch/x86/kernel/amd_iommu_init.c | 9 +++++---- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h index a4ae6c3875e..55d95eb789b 100644 --- a/arch/x86/include/asm/amd_iommu_proto.h +++ b/arch/x86/include/asm/amd_iommu_proto.h @@ -23,6 +23,7 @@ extern int amd_iommu_init_dma_ops(void); extern int amd_iommu_init_passthrough(void); +extern irqreturn_t amd_iommu_int_thread(int irq, void *data); extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_apply_erratum_63(u16 devid); extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index dc5dddafe5c..873e7e1ead7 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -366,7 +366,7 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } -irqreturn_t amd_iommu_int_handler(int irq, void *data) +irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu; @@ -376,6 +376,11 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) return IRQ_HANDLED; } +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + return IRQ_WAKE_THREAD; +} + /**************************************************************************** * * IOMMU command queuing functions diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 28b07813368..9179c21120a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1034,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu) if (pci_enable_msi(iommu->dev)) return 1; - r = request_irq(iommu->dev->irq, amd_iommu_int_handler, - IRQF_SAMPLE_RANDOM, - "AMD-Vi", - NULL); + r = request_threaded_irq(iommu->dev->irq, + amd_iommu_int_handler, + amd_iommu_int_thread, + 0, "AMD-Vi", + iommu->dev); if (r) { pci_disable_msi(iommu->dev); -- cgit v1.2.3-70-g09d2 From fffcda1183e93df84ad73ba7eb7782a5c354e2b3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 10 May 2011 17:22:06 +0200 Subject: x86, gart: Rename pci-gart_64.c to amd_gart_64.c This file only contains code relevant for the northbridge gart in AMD processors. This patch renames the file to represent this fact in the filename. Signed-off-by: Joerg Roedel --- Documentation/x86/x86_64/boot-options.txt | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/amd_gart_64.c | 898 ++++++++++++++++++++++++++++++ arch/x86/kernel/pci-gart_64.c | 898 ------------------------------ 4 files changed, 900 insertions(+), 900 deletions(-) create mode 100644 arch/x86/kernel/amd_gart_64.c delete mode 100644 arch/x86/kernel/pci-gart_64.c (limited to 'arch/x86') diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 092e596a130..c54b4f503e2 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -206,7 +206,7 @@ IOMMU (input/output memory management unit) (e.g. because you have < 3 GB memory). Kernel boot message: "PCI-DMA: Disabling IOMMU" - 2. : AMD GART based hardware IOMMU. + 2. : AMD GART based hardware IOMMU. Kernel boot message: "PCI-DMA: using GART IOMMU" 3. : Software IOMMU implementation. 
Used diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7338ef2218b..97ebf82e0b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -117,7 +117,7 @@ obj-$(CONFIG_OF) += devicetree.o ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_AUDIT) += audit_64.o - obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o + obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c new file mode 100644 index 00000000000..b117efd24f7 --- /dev/null +++ b/arch/x86/kernel/amd_gart_64.c @@ -0,0 +1,898 @@ +/* + * Dynamic DMA mapping support for AMD Hammer. + * + * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. + * This allows to use PCI devices that only support 32bit addresses on systems + * with more than 4GB. + * + * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification. + * + * Copyright 2002 Andi Kleen, SuSE Labs. + * Subject to the GNU General Public License v2 only. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned long iommu_bus_base; /* GART remapping area (physical) */ +static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_pages; /* .. and in pages */ + +static u32 *iommu_gatt_base; /* Remapping table */ + +static dma_addr_t bad_dma_addr; + +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ +static int iommu_fullflush = 1; + +/* Allocation bitmap for the remapping area: */ +static DEFINE_SPINLOCK(iommu_bitmap_lock); +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; + +static u32 gart_unmapped_entry; + +#define GPTE_VALID 1 +#define GPTE_COHERENT 2 +#define GPTE_ENCODE(x) \ + (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) +#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) + +#define EMERGENCY_PAGES 32 /* = 128KB */ + +#ifdef CONFIG_AGP +#define AGPEXTERN extern +#else +#define AGPEXTERN +#endif + +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + +/* backdoor interface to AGP driver */ +AGPEXTERN int agp_memory_reserved; +AGPEXTERN __u32 *agp_gatt_table; + +static unsigned long next_bit; /* protected by iommu_bitmap_lock */ +static bool need_flush; /* global flush state. 
set for each gart wrap */ + +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) +{ + unsigned long offset, flags; + unsigned long boundary_size; + unsigned long base_index; + + base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), + PAGE_SIZE) >> PAGE_SHIFT; + boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + size, base_index, boundary_size, align_mask); + if (offset == -1) { + need_flush = true; + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + size, base_index, boundary_size, + align_mask); + } + if (offset != -1) { + next_bit = offset+size; + if (next_bit >= iommu_pages) { + next_bit = 0; + need_flush = true; + } + } + if (iommu_fullflush) + need_flush = true; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + + return offset; +} + +static void free_iommu(unsigned long offset, int size) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + bitmap_clear(iommu_gart_bitmap, offset, size); + if (offset >= next_bit) + next_bit = offset + size; + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +/* + * Use global flush state to avoid races with multiple flushers. + */ +static void flush_gart(void) +{ + unsigned long flags; + + spin_lock_irqsave(&iommu_bitmap_lock, flags); + if (need_flush) { + amd_flush_garts(); + need_flush = false; + } + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); +} + +#ifdef CONFIG_IOMMU_LEAK +/* Debugging aid for drivers that don't free their IOMMU tables */ +static int leak_trace; +static int iommu_leak_pages = 20; + +static void dump_leak(void) +{ + static int dump; + + if (dump) + return; + dump = 1; + + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); +} +#endif + +static void iommu_full(struct device *dev, size_t size, int dir) +{ + /* + * Ran out of IOMMU space for this operation. This is very bad. + * Unfortunately the drivers cannot handle this operation properly. + * Return some non mapped prereserved space in the aperture and + * let the Northbridge deal with it. This will result in garbage + * in the IO operation. When the size exceeds the prereserved space + * memory corruption will occur or random memory will be DMAed + * out. Hopefully no network devices use single mappings that big. + */ + + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); + + if (size > PAGE_SIZE*EMERGENCY_PAGES) { + if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic("PCI-DMA: Memory would be corrupted\n"); + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } +#ifdef CONFIG_IOMMU_LEAK + dump_leak(); +#endif +} + +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return force_iommu || !dma_capable(dev, addr, size); +} + +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ + return !dma_capable(dev, addr, size); +} + +/* Map a single continuous physical area into the IOMMU. + * Caller needs to check if the iommu is needed and flush. 
+ */ +static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, + size_t size, int dir, unsigned long align_mask) +{ + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); + unsigned long iommu_page; + int i; + + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_addr; + + iommu_page = alloc_iommu(dev, npages, align_mask); + if (iommu_page == -1) { + if (!nonforced_iommu(dev, phys_mem, size)) + return phys_mem; + if (panic_on_overflow) + panic("dma_map_area overflow %lu bytes\n", size); + iommu_full(dev, size, dir); + return bad_dma_addr; + } + + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); + phys_mem += PAGE_SIZE; + } + return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); +} + +/* Map a single area into the IOMMU */ +static dma_addr_t gart_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long bus; + phys_addr_t paddr = page_to_phys(page) + offset; + + if (!dev) + dev = &x86_dma_fallback_dev; + + if (!need_iommu(dev, paddr, size)) + return paddr; + + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); + + return bus; +} + +/* + * Free a DMA mapping. + */ +static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unsigned long iommu_page; + int npages; + int i; + + if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || + dma_addr >= iommu_bus_base + iommu_size) + return; + + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); + for (i = 0; i < npages; i++) { + iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; + } + free_iommu(iommu_page, npages); +} + +/* + * Wrapper for pci_unmap_single working with scatterlists. + */ +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s; + int i; + + for_each_sg(sg, s, nents, i) { + if (!s->dma_length || !s->length) + break; + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + } +} + +/* Fallback for dma_map_sg in case of overflow */ +static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, + int nents, int dir) +{ + struct scatterlist *s; + int i; + +#ifdef CONFIG_IOMMU_DEBUG + pr_debug("dma_map_sg overflow\n"); +#endif + + for_each_sg(sg, s, nents, i) { + unsigned long addr = sg_phys(s); + + if (nonforced_iommu(dev, addr, s->length)) { + addr = dma_map_area(dev, addr, s->length, dir, 0); + if (addr == bad_dma_addr) { + if (i > 0) + gart_unmap_sg(dev, sg, i, dir, NULL); + nents = 0; + sg[0].dma_length = 0; + break; + } + } + s->dma_address = addr; + s->dma_length = s->length; + } + flush_gart(); + + return nents; +} + +/* Map multiple scatterlist entries continuous into the first. 
*/ +static int __dma_map_cont(struct device *dev, struct scatterlist *start, + int nelems, struct scatterlist *sout, + unsigned long pages) +{ + unsigned long iommu_start = alloc_iommu(dev, pages, 0); + unsigned long iommu_page = iommu_start; + struct scatterlist *s; + int i; + + if (iommu_start == -1) + return -1; + + for_each_sg(start, s, nelems, i) { + unsigned long pages, addr; + unsigned long phys_addr = s->dma_address; + + BUG_ON(s != start && s->offset); + if (s == start) { + sout->dma_address = iommu_bus_base; + sout->dma_address += iommu_page*PAGE_SIZE + s->offset; + sout->dma_length = s->length; + } else { + sout->dma_length += s->length; + } + + addr = phys_addr; + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + addr += PAGE_SIZE; + iommu_page++; + } + } + BUG_ON(iommu_page - iommu_start != pages); + + return 0; +} + +static inline int +dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, + struct scatterlist *sout, unsigned long pages, int need) +{ + if (!need) { + BUG_ON(nelems != 1); + sout->dma_address = start->dma_address; + sout->dma_length = start->length; + return 0; + } + return __dma_map_cont(dev, start, nelems, sout, pages); +} + +/* + * DMA map all entries in a scatterlist. + * Merge chunks that have page aligned sizes into a continuous mapping. + */ +static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction dir, struct dma_attrs *attrs) +{ + struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; + unsigned int seg_size; + unsigned int max_seg_size; + + if (nents == 0) + return 0; + + if (!dev) + dev = &x86_dma_fallback_dev; + + out = 0; + start = 0; + start_sg = sg; + sgmap = sg; + seg_size = 0; + max_seg_size = dma_get_max_seg_size(dev); + ps = NULL; /* shut up gcc */ + + for_each_sg(sg, s, nents, i) { + dma_addr_t addr = sg_phys(s); + + s->dma_address = addr; + BUG_ON(s->length == 0); + + nextneed = need_iommu(dev, addr, s->length); + + /* Handle the previous not yet processed entries */ + if (i > start) { + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. 
+ */ + if (!iommu_merge || !nextneed || !need || s->offset || + (s->length + seg_size > max_seg_size) || + (ps->offset + ps->length) % PAGE_SIZE) { + if (dma_map_cont(dev, start_sg, i - start, + sgmap, pages, need) < 0) + goto error; + out++; + + seg_size = 0; + sgmap = sg_next(sgmap); + pages = 0; + start = i; + start_sg = s; + } + } + + seg_size += s->length; + need = nextneed; + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); + ps = s; + } + if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) + goto error; + out++; + flush_gart(); + if (out < nents) { + sgmap = sg_next(sgmap); + sgmap->dma_length = 0; + } + return out; + +error: + flush_gart(); + gart_unmap_sg(dev, sg, out, dir, NULL); + + /* When it was forced or merged try again in a dumb way */ + if (force_iommu || iommu_merge) { + out = dma_map_sg_nonforce(dev, sg, nents, dir); + if (out > 0) + return out; + } + if (panic_on_overflow) + panic("dma_map_sg: overflow on %lu pages\n", pages); + + iommu_full(dev, pages << PAGE_SHIFT, dir); + for_each_sg(sg, s, nents, i) + s->dma_address = bad_dma_addr; + return 0; +} + +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag) +{ + dma_addr_t paddr; + unsigned long align_mask; + struct page *page; + + if (force_iommu && !(flag & GFP_DMA)) { + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + if (!page) + return NULL; + + align_mask = (1UL << get_order(size)) - 1; + paddr = dma_map_area(dev, page_to_phys(page), size, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); + if (paddr != bad_dma_addr) { + *dma_addr = paddr; + return page_address(page); + } + __free_pages(page, get_order(size)); + } else + return dma_generic_alloc_coherent(dev, size, dma_addr, flag); + + return NULL; +} + +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + free_pages((unsigned long)vaddr, get_order(size)); +} + +static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return (dma_addr == bad_dma_addr); +} + +static int no_agp; + +static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; + + if (iommu_size < 64*1024*1024) { + pr_warning( + "PCI-DMA: Warning: Small IOMMU %luMB." 
+ " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + + return iommu_size; +} + +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; + u64 aper_base; + + pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); + aper_order = (aper_order >> 1) & 7; + + aper_base = aper_base_32 & 0x7fff; + aper_base <<= 25; + + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_base = 0; + + *size = aper_size; + return aper_base; +} + +static void enable_gart_translations(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + enable_gart_translation(dev, __pa(agp_gatt_table)); + } + + /* Flush the GART-TLB to remove stale entries */ + amd_flush_garts(); +} + +/* + * If fix_up_north_bridges is set, the north bridges have to be fixed up on + * resume in the same way as they are handled in gart_iommu_hole_init(). + */ +static bool fix_up_north_bridges; +static u32 aperture_order; +static u32 aperture_alloc; + +void set_up_gart_resume(u32 aper_order, u32 aper_alloc) +{ + fix_up_north_bridges = true; + aperture_order = aper_order; + aperture_alloc = aper_alloc; +} + +static void gart_fixup_northbridges(void) +{ + int i; + + if (!fix_up_north_bridges) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + pr_info("PCI-DMA: Restoring GART aperture settings\n"); + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; + + /* + * Don't enable translations just yet. That is the next + * step. Restore the pre-suspend aperture settings. + */ + gart_set_size_and_enable(dev, aperture_order); + pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); + } +} + +static void gart_resume(void) +{ + pr_info("PCI-DMA: Resuming GART IOMMU\n"); + + gart_fixup_northbridges(); + + enable_gart_translations(); +} + +static struct syscore_ops gart_syscore_ops = { + .resume = gart_resume, + +}; + +/* + * Private Northbridge GATT initialization in case we cannot use the + * AGP driver for some reason. 
+ */ +static __init int init_amd_gatt(struct agp_kern_info *info) +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; + struct pci_dev *dev; + void *gatt; + int i; + + pr_info("PCI-DMA: Disabling AGP.\n"); + + aper_size = aper_base = info->aper_size = 0; + dev = NULL; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { + aper_size = new_aper_size; + aper_base = new_aper_base; + } + if (aper_size != new_aper_size || aper_base != new_aper_base) + goto nommu; + } + if (!aper_base) + goto nommu; + + info->aper_base = aper_base; + info->aper_size = aper_size >> 20; + + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); + if (!gatt) + panic("Cannot allocate GATT table"); + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) + panic("Could not set GART PTEs to uncacheable pages"); + + agp_gatt_table = gatt; + + register_syscore_ops(&gart_syscore_ops); + + flush_gart(); + + pr_info("PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); + + return 0; + + nommu: + /* Should not happen anymore */ + pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" + "falling back to iommu=soft.\n"); + return -1; +} + +static struct dma_map_ops gart_dma_ops = { + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, + .map_page = gart_map_page, + .unmap_page = gart_unmap_page, + .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, + .mapping_error = gart_mapping_error, +}; + +static void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + /* don't shutdown it if there is AGP installed */ + if (!no_agp) + return; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + u32 ctl; + + dev = node_to_amd_nb(i)->misc; + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + + ctl &= ~GARTEN; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + } +} + +int __init gart_iommu_init(void) +{ + struct agp_kern_info info; + unsigned long iommu_start; + unsigned long aper_base, aper_size; + unsigned long start_pfn, end_pfn; + unsigned long scratch; + long i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + +#ifndef CONFIG_AGP_AMD64 + no_agp = 1; +#else + /* Makefile puts PCI initialization via subsys_initcall first. 
*/ + /* Add other AMD AGP bridge drivers here */ + no_agp = no_agp || + (agp_amd64_init() < 0) || + (agp_copy_info(agp_bridge, &info) < 0); +#endif + + if (no_iommu || + (!force_iommu && max_pfn <= MAX_DMA32_PFN) || + !gart_iommu_aperture || + (no_agp && init_amd_gatt(&info) < 0)) { + if (max_pfn > MAX_DMA32_PFN) { + pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); + pr_warning("falling back to iommu=soft.\n"); + } + return 0; + } + + /* need to map that range */ + aper_size = info.aper_size << 20; + aper_base = info.aper_base; + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); + } + + pr_info("PCI-DMA: using GART IOMMU.\n"); + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); + +#ifdef CONFIG_IOMMU_LEAK + if (leak_trace) { + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) + pr_debug("PCI-DMA: Cannot trace all the entries\n"); + } +#endif + + /* + * Out of IOMMU space handling. + * Reserve some invalid pages at the beginning of the GART. + */ + bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + + pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", + iommu_size >> 20); + + agp_memory_reserved = iommu_size; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; + bad_dma_addr = iommu_bus_base; + iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); + + /* + * Unmap the IOMMU part of the GART. The alias of the page is + * always mapped with cache enabled and there is no full cache + * coherency across the GART remapping. The unmapping avoids + * automatic prefetches from the CPU allocating cache lines in + * there. All CPU accesses are done via the direct mapping to + * the backing memory. The GART address is only used by PCI + * devices. + */ + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); + /* + * Tricky. The GART table remaps the physical memory range, + * so the CPU wont notice potential aliases and if the memory + * is remapped to UC later on, we might surprise the PCI devices + * with a stray writeout of a cacheline. So play it sure and + * do an explicit, full-scale wbinvd() _after_ having marked all + * the pages as Not-Present: + */ + wbinvd(); + + /* + * Now all caches are flushed and we can safely enable + * GART hardware. Doing it early leaves the possibility + * of stale cache entries that can lead to GART PTE + * errors. + */ + enable_gart_translations(); + + /* + * Try to workaround a bug (thanks to BenH): + * Set unmapped entries to a scratch page instead of 0. + * Any prefetches that hit unmapped entries won't get an bus abort + * then. (P2P bridge may be prefetching on DMA reads).
+ */ + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) + panic("Cannot allocate iommu scratch page"); + gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + iommu_gatt_base[i] = gart_unmapped_entry; + + flush_gart(); + dma_ops = &gart_dma_ops; + x86_platform.iommu_shutdown = gart_iommu_shutdown; + swiotlb = 0; + + return 0; +} + +void __init gart_parse_options(char *p) +{ + int arg; + +#ifdef CONFIG_IOMMU_LEAK + if (!strncmp(p, "leak", 4)) { + leak_trace = 1; + p += 4; + if (*p == '=') + ++p; + if (isdigit(*p) && get_option(&p, &arg)) + iommu_leak_pages = arg; + } +#endif + if (isdigit(*p) && get_option(&p, &arg)) + iommu_size = arg; + if (!strncmp(p, "fullflush", 9)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; + if (!strncmp(p, "noagp", 5)) + no_agp = 1; + if (!strncmp(p, "noaperture", 10)) + fix_aperture = 0; + /* duplicated from pci-dma.c */ + if (!strncmp(p, "force", 5)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "allowed", 7)) + gart_iommu_aperture_allowed = 1; + if (!strncmp(p, "memaper", 7)) { + fallback_aper_force = 1; + p += 7; + if (*p == '=') { + ++p; + if (get_option(&p, &arg)) + fallback_aper_order = arg; + } + } +} +IOMMU_INIT_POST(gart_iommu_hole_init); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c deleted file mode 100644 index b117efd24f7..00000000000 --- a/arch/x86/kernel/pci-gart_64.c +++ /dev/null @@ -1,898 +0,0 @@ -/* - * Dynamic DMA mapping support for AMD Hammer. - * - * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. - * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. - * - * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification. - * - * Copyright 2002 Andi Kleen, SuSE Labs. - * Subject to the GNU General Public License v2 only. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ -static unsigned long iommu_pages; /* .. and in pages */ - -static u32 *iommu_gatt_base; /* Remapping table */ - -static dma_addr_t bad_dma_addr; - -/* - * If this is disabled the IOMMU will use an optimized flushing strategy - * of only flushing when an mapping is reused. With it true the GART is - * flushed for every mapping. Problem is that doing the lazy flush seems - * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). 
- */ -static int iommu_fullflush = 1; - -/* Allocation bitmap for the remapping area: */ -static DEFINE_SPINLOCK(iommu_bitmap_lock); -/* Guarded by iommu_bitmap_lock: */ -static unsigned long *iommu_gart_bitmap; - -static u32 gart_unmapped_entry; - -#define GPTE_VALID 1 -#define GPTE_COHERENT 2 -#define GPTE_ENCODE(x) \ - (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) -#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) - -#define EMERGENCY_PAGES 32 /* = 128KB */ - -#ifdef CONFIG_AGP -#define AGPEXTERN extern -#else -#define AGPEXTERN -#endif - -/* GART can only remap to physical addresses < 1TB */ -#define GART_MAX_PHYS_ADDR (1ULL << 40) - -/* backdoor interface to AGP driver */ -AGPEXTERN int agp_memory_reserved; -AGPEXTERN __u32 *agp_gatt_table; - -static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static bool need_flush; /* global flush state. set for each gart wrap */ - -static unsigned long alloc_iommu(struct device *dev, int size, - unsigned long align_mask) -{ - unsigned long offset, flags; - unsigned long boundary_size; - unsigned long base_index; - - base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), - PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1, - PAGE_SIZE) >> PAGE_SHIFT; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, - size, base_index, boundary_size, align_mask); - if (offset == -1) { - need_flush = true; - offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, - size, base_index, boundary_size, - align_mask); - } - if (offset != -1) { - next_bit = offset+size; - if (next_bit >= iommu_pages) { - next_bit = 0; - need_flush = true; - } - } - if (iommu_fullflush) - need_flush = true; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); - - return offset; -} - -static void free_iommu(unsigned long offset, int size) -{ - unsigned long flags; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - bitmap_clear(iommu_gart_bitmap, offset, size); - if (offset >= next_bit) - next_bit = offset + size; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -/* - * Use global flush state to avoid races with multiple flushers. - */ -static void flush_gart(void) -{ - unsigned long flags; - - spin_lock_irqsave(&iommu_bitmap_lock, flags); - if (need_flush) { - amd_flush_garts(); - need_flush = false; - } - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} - -#ifdef CONFIG_IOMMU_LEAK -/* Debugging aid for drivers that don't free their IOMMU tables */ -static int leak_trace; -static int iommu_leak_pages = 20; - -static void dump_leak(void) -{ - static int dump; - - if (dump) - return; - dump = 1; - - show_stack(NULL, NULL); - debug_dma_dump_mappings(NULL); -} -#endif - -static void iommu_full(struct device *dev, size_t size, int dir) -{ - /* - * Ran out of IOMMU space for this operation. This is very bad. - * Unfortunately the drivers cannot handle this operation properly. - * Return some non mapped prereserved space in the aperture and - * let the Northbridge deal with it. This will result in garbage - * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed - * out. Hopefully no network devices use single mappings that big. 
- */ - - dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); - - if (size > PAGE_SIZE*EMERGENCY_PAGES) { - if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR - "PCI-DMA: Random memory would be DMAed\n"); - } -#ifdef CONFIG_IOMMU_LEAK - dump_leak(); -#endif -} - -static inline int -need_iommu(struct device *dev, unsigned long addr, size_t size) -{ - return force_iommu || !dma_capable(dev, addr, size); -} - -static inline int -nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ - return !dma_capable(dev, addr, size); -} - -/* Map a single continuous physical area into the IOMMU. - * Caller needs to check if the iommu is needed and flush. - */ -static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir, unsigned long align_mask) -{ - unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page; - int i; - - if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) - return bad_dma_addr; - - iommu_page = alloc_iommu(dev, npages, align_mask); - if (iommu_page == -1) { - if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; - if (panic_on_overflow) - panic("dma_map_area overflow %lu bytes\n", size); - iommu_full(dev, size, dir); - return bad_dma_addr; - } - - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - phys_mem += PAGE_SIZE; - } - return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); -} - -/* Map a single area into the IOMMU */ -static dma_addr_t gart_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long bus; - phys_addr_t paddr = page_to_phys(page) + offset; - - if (!dev) - dev = &x86_dma_fallback_dev; - - if (!need_iommu(dev, paddr, size)) - return paddr; - - bus = dma_map_area(dev, paddr, size, dir, 0); - flush_gart(); - - return bus; -} - -/* - * Free a DMA mapping. - */ -static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) -{ - unsigned long iommu_page; - int npages; - int i; - - if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || - dma_addr >= iommu_bus_base + iommu_size) - return; - - iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); - for (i = 0; i < npages; i++) { - iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - } - free_iommu(iommu_page, npages); -} - -/* - * Wrapper for pci_unmap_single working with scatterlists. 
- */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nents, i) { - if (!s->dma_length || !s->length) - break; - gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); - } -} - -/* Fallback for dma_map_sg in case of overflow */ -static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, - int nents, int dir) -{ - struct scatterlist *s; - int i; - -#ifdef CONFIG_IOMMU_DEBUG - pr_debug("dma_map_sg overflow\n"); -#endif - - for_each_sg(sg, s, nents, i) { - unsigned long addr = sg_phys(s); - - if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir, 0); - if (addr == bad_dma_addr) { - if (i > 0) - gart_unmap_sg(dev, sg, i, dir, NULL); - nents = 0; - sg[0].dma_length = 0; - break; - } - } - s->dma_address = addr; - s->dma_length = s->length; - } - flush_gart(); - - return nents; -} - -/* Map multiple scatterlist entries continuous into the first. */ -static int __dma_map_cont(struct device *dev, struct scatterlist *start, - int nelems, struct scatterlist *sout, - unsigned long pages) -{ - unsigned long iommu_start = alloc_iommu(dev, pages, 0); - unsigned long iommu_page = iommu_start; - struct scatterlist *s; - int i; - - if (iommu_start == -1) - return -1; - - for_each_sg(start, s, nelems, i) { - unsigned long pages, addr; - unsigned long phys_addr = s->dma_address; - - BUG_ON(s != start && s->offset); - if (s == start) { - sout->dma_address = iommu_bus_base; - sout->dma_address += iommu_page*PAGE_SIZE + s->offset; - sout->dma_length = s->length; - } else { - sout->dma_length += s->length; - } - - addr = phys_addr; - pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - addr += PAGE_SIZE; - iommu_page++; - } - } - BUG_ON(iommu_page - iommu_start != pages); - - return 0; -} - -static inline int -dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages, int need) -{ - if (!need) { - BUG_ON(nelems != 1); - sout->dma_address = start->dma_address; - sout->dma_length = start->length; - return 0; - } - return __dma_map_cont(dev, start, nelems, sout, pages); -} - -/* - * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. - */ -static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) -{ - struct scatterlist *s, *ps, *start_sg, *sgmap; - int need = 0, nextneed, i, out, start; - unsigned long pages = 0; - unsigned int seg_size; - unsigned int max_seg_size; - - if (nents == 0) - return 0; - - if (!dev) - dev = &x86_dma_fallback_dev; - - out = 0; - start = 0; - start_sg = sg; - sgmap = sg; - seg_size = 0; - max_seg_size = dma_get_max_seg_size(dev); - ps = NULL; /* shut up gcc */ - - for_each_sg(sg, s, nents, i) { - dma_addr_t addr = sg_phys(s); - - s->dma_address = addr; - BUG_ON(s->length == 0); - - nextneed = need_iommu(dev, addr, s->length); - - /* Handle the previous not yet processed entries */ - if (i > start) { - /* - * Can only merge when the last chunk ends on a - * page boundary and the new one doesn't have an - * offset. 
- */ - if (!iommu_merge || !nextneed || !need || s->offset || - (s->length + seg_size > max_seg_size) || - (ps->offset + ps->length) % PAGE_SIZE) { - if (dma_map_cont(dev, start_sg, i - start, - sgmap, pages, need) < 0) - goto error; - out++; - - seg_size = 0; - sgmap = sg_next(sgmap); - pages = 0; - start = i; - start_sg = s; - } - } - - seg_size += s->length; - need = nextneed; - pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); - ps = s; - } - if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) - goto error; - out++; - flush_gart(); - if (out < nents) { - sgmap = sg_next(sgmap); - sgmap->dma_length = 0; - } - return out; - -error: - flush_gart(); - gart_unmap_sg(dev, sg, out, dir, NULL); - - /* When it was forced or merged try again in a dumb way */ - if (force_iommu || iommu_merge) { - out = dma_map_sg_nonforce(dev, sg, nents, dir); - if (out > 0) - return out; - } - if (panic_on_overflow) - panic("dma_map_sg: overflow on %lu pages\n", pages); - - iommu_full(dev, pages << PAGE_SHIFT, dir); - for_each_sg(sg, s, nents, i) - s->dma_address = bad_dma_addr; - return 0; -} - -/* allocate and map a coherent mapping */ -static void * -gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag) -{ - dma_addr_t paddr; - unsigned long align_mask; - struct page *page; - - if (force_iommu && !(flag & GFP_DMA)) { - flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - page = alloc_pages(flag | __GFP_ZERO, get_order(size)); - if (!page) - return NULL; - - align_mask = (1UL << get_order(size)) - 1; - paddr = dma_map_area(dev, page_to_phys(page), size, - DMA_BIDIRECTIONAL, align_mask); - - flush_gart(); - if (paddr != bad_dma_addr) { - *dma_addr = paddr; - return page_address(page); - } - __free_pages(page, get_order(size)); - } else - return dma_generic_alloc_coherent(dev, size, dma_addr, flag); - - return NULL; -} - -/* free a coherent mapping */ -static void -gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr) -{ - gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); - free_pages((unsigned long)vaddr, get_order(size)); -} - -static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) -{ - return (dma_addr == bad_dma_addr); -} - -static int no_agp; - -static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; - iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; - - if (iommu_size < 64*1024*1024) { - pr_warning( - "PCI-DMA: Warning: Small IOMMU %luMB." 
- " Consider increasing the AGP aperture in BIOS\n", - iommu_size >> 20); - } - - return iommu_size; -} - -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32, aper_order; - u64 aper_base; - - pci_read_config_dword(dev, AMD64_GARTAPERTUREBASE, &aper_base_32); - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &aper_order); - aper_order = (aper_order >> 1) & 7; - - aper_base = aper_base_32 & 0x7fff; - aper_base <<= 25; - - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) - aper_base = 0; - - *size = aper_size; - return aper_base; -} - -static void enable_gart_translations(void) -{ - int i; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return; - - for (i = 0; i < amd_nb_num(); i++) { - struct pci_dev *dev = node_to_amd_nb(i)->misc; - - enable_gart_translation(dev, __pa(agp_gatt_table)); - } - - /* Flush the GART-TLB to remove stale entries */ - amd_flush_garts(); -} - -/* - * If fix_up_north_bridges is set, the north bridges have to be fixed up on - * resume in the same way as they are handled in gart_iommu_hole_init(). - */ -static bool fix_up_north_bridges; -static u32 aperture_order; -static u32 aperture_alloc; - -void set_up_gart_resume(u32 aper_order, u32 aper_alloc) -{ - fix_up_north_bridges = true; - aperture_order = aper_order; - aperture_alloc = aper_alloc; -} - -static void gart_fixup_northbridges(void) -{ - int i; - - if (!fix_up_north_bridges) - return; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return; - - pr_info("PCI-DMA: Restoring GART aperture settings\n"); - - for (i = 0; i < amd_nb_num(); i++) { - struct pci_dev *dev = node_to_amd_nb(i)->misc; - - /* - * Don't enable translations just yet. That is the next - * step. Restore the pre-suspend aperture settings. - */ - gart_set_size_and_enable(dev, aperture_order); - pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); - } -} - -static void gart_resume(void) -{ - pr_info("PCI-DMA: Resuming GART IOMMU\n"); - - gart_fixup_northbridges(); - - enable_gart_translations(); -} - -static struct syscore_ops gart_syscore_ops = { - .resume = gart_resume, - -}; - -/* - * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. 
- */ -static __init int init_amd_gatt(struct agp_kern_info *info) -{ - unsigned aper_size, gatt_size, new_aper_size; - unsigned aper_base, new_aper_base; - struct pci_dev *dev; - void *gatt; - int i; - - pr_info("PCI-DMA: Disabling AGP.\n"); - - aper_size = aper_base = info->aper_size = 0; - dev = NULL; - for (i = 0; i < amd_nb_num(); i++) { - dev = node_to_amd_nb(i)->misc; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { - aper_size = new_aper_size; - aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) - goto nommu; - } - if (!aper_base) - goto nommu; - - info->aper_base = aper_base; - info->aper_size = aper_size >> 20; - - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(gatt_size)); - if (!gatt) - panic("Cannot allocate GATT table"); - if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) - panic("Could not set GART PTEs to uncacheable pages"); - - agp_gatt_table = gatt; - - register_syscore_ops(&gart_syscore_ops); - - flush_gart(); - - pr_info("PCI-DMA: aperture base @ %x size %u KB\n", - aper_base, aper_size>>10); - - return 0; - - nommu: - /* Should not happen anymore */ - pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n" - "falling back to iommu=soft.\n"); - return -1; -} - -static struct dma_map_ops gart_dma_ops = { - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, - .map_page = gart_map_page, - .unmap_page = gart_unmap_page, - .alloc_coherent = gart_alloc_coherent, - .free_coherent = gart_free_coherent, - .mapping_error = gart_mapping_error, -}; - -static void gart_iommu_shutdown(void) -{ - struct pci_dev *dev; - int i; - - /* don't shutdown it if there is AGP installed */ - if (!no_agp) - return; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return; - - for (i = 0; i < amd_nb_num(); i++) { - u32 ctl; - - dev = node_to_amd_nb(i)->misc; - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - - ctl &= ~GARTEN; - - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); - } -} - -int __init gart_iommu_init(void) -{ - struct agp_kern_info info; - unsigned long iommu_start; - unsigned long aper_base, aper_size; - unsigned long start_pfn, end_pfn; - unsigned long scratch; - long i; - - if (!amd_nb_has_feature(AMD_NB_GART)) - return 0; - -#ifndef CONFIG_AGP_AMD64 - no_agp = 1; -#else - /* Makefile puts PCI initialization via subsys_initcall first. 
*/ - /* Add other AMD AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || - (agp_copy_info(agp_bridge, &info) < 0); -#endif - - if (no_iommu || - (!force_iommu && max_pfn <= MAX_DMA32_PFN) || - !gart_iommu_aperture || - (no_agp && init_amd_gatt(&info) < 0)) { - if (max_pfn > MAX_DMA32_PFN) { - pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); - pr_warning("falling back to iommu=soft.\n"); - } - return 0; - } - - /* need to map that range */ - aper_size = info.aper_size << 20; - aper_base = info.aper_base; - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); - init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); - } - - pr_info("PCI-DMA: using GART IOMMU.\n"); - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); - -#ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - int ret; - - ret = dma_debug_resize_entries(iommu_pages); - if (ret) - pr_debug("PCI-DMA: Cannot trace all the entries\n"); - } -#endif - - /* - * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - - pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size >> 20); - - agp_memory_reserved = iommu_size; - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; - bad_dma_addr = iommu_bus_base; - iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - - /* - * Unmap the IOMMU part of the GART. The alias of the page is - * always mapped with cache enabled and there is no full cache - * coherency across the GART remapping. The unmapping avoids - * automatic prefetches from the CPU allocating cache lines in - * there. All CPU accesses are done via the direct mapping to - * the backing memory. The GART address is only used by PCI - * devices. - */ - set_memory_np((unsigned long)__va(iommu_bus_base), - iommu_size >> PAGE_SHIFT); - /* - * Tricky. The GART table remaps the physical memory range, - * so the CPU wont notice potential aliases and if the memory - * is remapped to UC later on, we might surprise the PCI devices - * with a stray writeout of a cacheline. So play it sure and - * do an explicit, full-scale wbinvd() _after_ having marked all - * the pages as Not-Present: - */ - wbinvd(); - - /* - * Now all caches are flushed and we can safely enable - * GART hardware. Doing it early leaves the possibility - * of stale cache entries that can lead to GART PTE - * errors. - */ - enable_gart_translations(); - - /* - * Try to workaround a bug (thanks to BenH): - * Set unmapped entries to a scratch page instead of 0. - * Any prefetches that hit unmapped entries won't get an bus abort - * then. (P2P bridge may be prefetching on DMA reads).
- */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) - panic("Cannot allocate iommu scratch page"); - gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) - iommu_gatt_base[i] = gart_unmapped_entry; - - flush_gart(); - dma_ops = &gart_dma_ops; - x86_platform.iommu_shutdown = gart_iommu_shutdown; - swiotlb = 0; - - return 0; -} - -void __init gart_parse_options(char *p) -{ - int arg; - -#ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p, "leak", 4)) { - leak_trace = 1; - p += 4; - if (*p == '=') - ++p; - if (isdigit(*p) && get_option(&p, &arg)) - iommu_leak_pages = arg; - } -#endif - if (isdigit(*p) && get_option(&p, &arg)) - iommu_size = arg; - if (!strncmp(p, "fullflush", 9)) - iommu_fullflush = 1; - if (!strncmp(p, "nofullflush", 11)) - iommu_fullflush = 0; - if (!strncmp(p, "noagp", 5)) - no_agp = 1; - if (!strncmp(p, "noaperture", 10)) - fix_aperture = 0; - /* duplicated from pci-dma.c */ - if (!strncmp(p, "force", 5)) - gart_iommu_aperture_allowed = 1; - if (!strncmp(p, "allowed", 7)) - gart_iommu_aperture_allowed = 1; - if (!strncmp(p, "memaper", 7)) { - fallback_aper_force = 1; - p += 7; - if (*p == '=') { - ++p; - if (get_option(&p, &arg)) - fallback_aper_order = arg; - } - } -} -IOMMU_INIT_POST(gart_iommu_hole_init); -- cgit v1.2.3-70-g09d2 From 2e711c04dbbf7a7732a3f7073b1fc285d12b369d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 26 Apr 2011 19:15:07 +0200 Subject: PM: Remove sysdev suspend, resume and shutdown operations Since suspend, resume and shutdown operations in struct sysdev_class and struct sysdev_driver are not used any more, remove them. Also drop sysdev_suspend(), sysdev_resume() and sysdev_shutdown() used for executing those operations and modify all of their users accordingly. This reduces kernel code size quite a bit and reduces its complexity. Signed-off-by: Rafael J. 
Wysocki Acked-by: Greg Kroah-Hartman --- arch/sh/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/x86/kernel/apm_32.c | 4 - drivers/base/Kconfig | 7 -- drivers/base/base.h | 2 - drivers/base/sys.c | 202 +---------------------------------------------- drivers/xen/manage.c | 8 +- include/linux/device.h | 7 -- include/linux/pm.h | 8 -- include/linux/sysdev.h | 11 --- kernel/kexec.c | 9 +-- kernel/power/hibernate.c | 18 +---- kernel/power/suspend.c | 8 +- kernel/sys.c | 3 - 14 files changed, 7 insertions(+), 282 deletions(-) (limited to 'arch/x86') diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 4b89da248d1..bc439de48cd 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -24,7 +24,6 @@ config SUPERH select RTC_LIB select GENERIC_ATOMIC64 select GENERIC_IRQ_SHOW - select ARCH_NO_SYSDEV_OPS help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..140e254fe54 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -71,7 +71,6 @@ config X86 select GENERIC_IRQ_SHOW select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP - select ARCH_NO_SYSDEV_OPS config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index adee12e0da1..3bfa0223596 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1238,7 +1238,6 @@ static int suspend(int vetoable) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); syscore_suspend(); local_irq_enable(); @@ -1258,7 +1257,6 @@ static int suspend(int vetoable) err = (err == APM_SUCCESS) ? 0 : -EIO; syscore_resume(); - sysdev_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); @@ -1282,7 +1280,6 @@ static void standby(void) dpm_suspend_noirq(PMSG_SUSPEND); local_irq_disable(); - sysdev_suspend(PMSG_SUSPEND); syscore_suspend(); local_irq_enable(); @@ -1292,7 +1289,6 @@ static void standby(void) local_irq_disable(); syscore_resume(); - sysdev_resume(); local_irq_enable(); dpm_resume_noirq(PMSG_RESUME); diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index e9e5238f310..d57e8d0fb82 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -168,11 +168,4 @@ config SYS_HYPERVISOR bool default n -config ARCH_NO_SYSDEV_OPS - bool - ---help--- - To be selected by architectures that don't use sysdev class or - sysdev driver power management (suspend/resume) and shutdown - operations. - endmenu diff --git a/drivers/base/base.h b/drivers/base/base.h index 19f49e41ce5..a34dca0ad04 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -111,8 +111,6 @@ static inline int driver_match_device(struct device_driver *drv, return drv->bus->match ? drv->bus->match(dev, drv) : 1; } -extern void sysdev_shutdown(void); - extern char *make_class_name(const char *name, struct kobject *kobj); extern int devres_release_all(struct device *dev); diff --git a/drivers/base/sys.c b/drivers/base/sys.c index acde9b5ee13..9dff77bfe1e 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -328,203 +328,8 @@ void sysdev_unregister(struct sys_device *sysdev) kobject_put(&sysdev->kobj); } - -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS -/** - * sysdev_shutdown - Shut down all system devices. - * - * Loop over each class of system devices, and the devices in each - * of those classes. For each device, we call the shutdown method for - * each driver registered for the device - the auxiliaries, - * and the class driver. 
- * - * Note: The list is iterated in reverse order, so that we shut down - * child devices before we shut down their parents. The list ordering - * is guaranteed by virtue of the fact that child devices are registered - * after their parents. - */ -void sysdev_shutdown(void) -{ - struct sysdev_class *cls; - - pr_debug("Shutting Down System Devices\n"); - - mutex_lock(&sysdev_drivers_lock); - list_for_each_entry_reverse(cls, &system_kset->list, kset.kobj.entry) { - struct sys_device *sysdev; - - pr_debug("Shutting down type '%s':\n", - kobject_name(&cls->kset.kobj)); - - list_for_each_entry(sysdev, &cls->kset.list, kobj.entry) { - struct sysdev_driver *drv; - pr_debug(" %s\n", kobject_name(&sysdev->kobj)); - - /* Call auxiliary drivers first */ - list_for_each_entry(drv, &cls->drivers, entry) { - if (drv->shutdown) - drv->shutdown(sysdev); - } - - /* Now call the generic one */ - if (cls->shutdown) - cls->shutdown(sysdev); - } - } - mutex_unlock(&sysdev_drivers_lock); -} - -static void __sysdev_resume(struct sys_device *dev) -{ - struct sysdev_class *cls = dev->cls; - struct sysdev_driver *drv; - - /* First, call the class-specific one */ - if (cls->resume) - cls->resume(dev); - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", cls->resume); - - /* Call auxiliary drivers next. */ - list_for_each_entry(drv, &cls->drivers, entry) { - if (drv->resume) - drv->resume(dev); - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", drv->resume); - } -} - -/** - * sysdev_suspend - Suspend all system devices. - * @state: Power state to enter. - * - * We perform an almost identical operation as sysdev_shutdown() - * above, though calling ->suspend() instead. Interrupts are disabled - * when this called. Devices are responsible for both saving state and - * quiescing or powering down the device. - * - * This is only called by the device PM core, so we let them handle - * all synchronization. 
- */ -int sysdev_suspend(pm_message_t state) -{ - struct sysdev_class *cls; - struct sys_device *sysdev, *err_dev; - struct sysdev_driver *drv, *err_drv; - int ret; - - pr_debug("Checking wake-up interrupts\n"); - - /* Return error code if there are any wake-up interrupts pending */ - ret = check_wakeup_irqs(); - if (ret) - return ret; - - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled while suspending system devices\n"); - - pr_debug("Suspending System Devices\n"); - - list_for_each_entry_reverse(cls, &system_kset->list, kset.kobj.entry) { - pr_debug("Suspending type '%s':\n", - kobject_name(&cls->kset.kobj)); - - list_for_each_entry(sysdev, &cls->kset.list, kobj.entry) { - pr_debug(" %s\n", kobject_name(&sysdev->kobj)); - - /* Call auxiliary drivers first */ - list_for_each_entry(drv, &cls->drivers, entry) { - if (drv->suspend) { - ret = drv->suspend(sysdev, state); - if (ret) - goto aux_driver; - } - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", - drv->suspend); - } - - /* Now call the generic one */ - if (cls->suspend) { - ret = cls->suspend(sysdev, state); - if (ret) - goto cls_driver; - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled after %pF\n", - cls->suspend); - } - } - } - return 0; - /* resume current sysdev */ -cls_driver: - drv = NULL; - printk(KERN_ERR "Class suspend failed for %s: %d\n", - kobject_name(&sysdev->kobj), ret); - -aux_driver: - if (drv) - printk(KERN_ERR "Class driver suspend failed for %s: %d\n", - kobject_name(&sysdev->kobj), ret); - list_for_each_entry(err_drv, &cls->drivers, entry) { - if (err_drv == drv) - break; - if (err_drv->resume) - err_drv->resume(sysdev); - } - - /* resume other sysdevs in current class */ - list_for_each_entry(err_dev, &cls->kset.list, kobj.entry) { - if (err_dev == sysdev) - break; - pr_debug(" %s\n", kobject_name(&err_dev->kobj)); - __sysdev_resume(err_dev); - } - - /* resume other classes */ - list_for_each_entry_continue(cls, &system_kset->list, kset.kobj.entry) { - list_for_each_entry(err_dev, &cls->kset.list, kobj.entry) { - pr_debug(" %s\n", kobject_name(&err_dev->kobj)); - __sysdev_resume(err_dev); - } - } - return ret; -} -EXPORT_SYMBOL_GPL(sysdev_suspend); - -/** - * sysdev_resume - Bring system devices back to life. - * - * Similar to sysdev_suspend(), but we iterate the list forwards - * to guarantee that parent devices are resumed before their children. - * - * Note: Interrupts are disabled when called. 
- */ -int sysdev_resume(void) -{ - struct sysdev_class *cls; - - WARN_ONCE(!irqs_disabled(), - "Interrupts enabled while resuming system devices\n"); - - pr_debug("Resuming System Devices\n"); - - list_for_each_entry(cls, &system_kset->list, kset.kobj.entry) { - struct sys_device *sysdev; - - pr_debug("Resuming type '%s':\n", - kobject_name(&cls->kset.kobj)); - - list_for_each_entry(sysdev, &cls->kset.list, kobj.entry) { - pr_debug(" %s\n", kobject_name(&sysdev->kobj)); - - __sysdev_resume(sysdev); - } - } - return 0; -} -EXPORT_SYMBOL_GPL(sysdev_resume); -#endif /* CONFIG_ARCH_NO_SYSDEV_OPS */ +EXPORT_SYMBOL_GPL(sysdev_register); +EXPORT_SYMBOL_GPL(sysdev_unregister); int __init system_bus_init(void) { @@ -534,9 +339,6 @@ int __init system_bus_init(void) return 0; } -EXPORT_SYMBOL_GPL(sysdev_register); -EXPORT_SYMBOL_GPL(sysdev_unregister); - #define to_ext_attr(x) container_of(x, struct sysdev_ext_attribute, attr) ssize_t sysdev_store_ulong(struct sys_device *sysdev, diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index a2eee574784..0b5366b5be2 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -70,12 +70,7 @@ static int xen_suspend(void *data) BUG_ON(!irqs_disabled()); - err = sysdev_suspend(PMSG_FREEZE); - if (!err) { - err = syscore_suspend(); - if (err) - sysdev_resume(); - } + err = syscore_suspend(); if (err) { printk(KERN_ERR "xen_suspend: system core suspend failed: %d\n", err); @@ -102,7 +97,6 @@ static int xen_suspend(void *data) } syscore_resume(); - sysdev_resume(); return 0; } diff --git a/include/linux/device.h b/include/linux/device.h index ab8dfc09570..ea9db9b0f24 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -633,13 +633,6 @@ static inline int devtmpfs_mount(const char *mountpoint) { return 0; } /* drivers/base/power/shutdown.c */ extern void device_shutdown(void); -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS -/* drivers/base/sys.c */ -extern void sysdev_shutdown(void); -#else -static inline void sysdev_shutdown(void) { } -#endif - /* debugging and troubleshooting/diagnostic helpers. 
*/ extern const char *dev_driver_string(const struct device *dev); diff --git a/include/linux/pm.h b/include/linux/pm.h index 512e09177e5..3c053e2beb8 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -529,14 +529,6 @@ struct dev_power_domain { */ #ifdef CONFIG_PM_SLEEP -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS -extern int sysdev_suspend(pm_message_t state); -extern int sysdev_resume(void); -#else -static inline int sysdev_suspend(pm_message_t state) { return 0; } -static inline int sysdev_resume(void) { return 0; } -#endif - extern void device_pm_lock(void); extern void dpm_resume_noirq(pm_message_t state); extern void dpm_resume_end(pm_message_t state); diff --git a/include/linux/sysdev.h b/include/linux/sysdev.h index dfb078db8eb..d35e783a598 100644 --- a/include/linux/sysdev.h +++ b/include/linux/sysdev.h @@ -34,12 +34,6 @@ struct sysdev_class { struct list_head drivers; struct sysdev_class_attribute **attrs; struct kset kset; -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS - /* Default operations for these types of devices */ - int (*shutdown)(struct sys_device *); - int (*suspend)(struct sys_device *, pm_message_t state); - int (*resume)(struct sys_device *); -#endif }; struct sysdev_class_attribute { @@ -77,11 +71,6 @@ struct sysdev_driver { struct list_head entry; int (*add)(struct sys_device *); int (*remove)(struct sys_device *); -#ifndef CONFIG_ARCH_NO_SYSDEV_OPS - int (*shutdown)(struct sys_device *); - int (*suspend)(struct sys_device *, pm_message_t state); - int (*resume)(struct sys_device *); -#endif }; diff --git a/kernel/kexec.c b/kernel/kexec.c index 87b77de03dd..8d814cbc810 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1531,13 +1531,7 @@ int kernel_kexec(void) if (error) goto Enable_cpus; local_irq_disable(); - /* Suspend system devices */ - error = sysdev_suspend(PMSG_FREEZE); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (error) goto Enable_irqs; } else @@ -1553,7 +1547,6 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { syscore_resume(); - sysdev_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 50aae660174..554d3b049f3 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -272,12 +272,7 @@ static int create_image(int platform_mode) local_irq_disable(); - error = sysdev_suspend(PMSG_FREEZE); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (error) { printk(KERN_ERR "PM: Some system devices failed to power down, " "aborting hibernation\n"); @@ -302,7 +297,6 @@ static int create_image(int platform_mode) Power_up: syscore_resume(); - sysdev_resume(); /* NOTE: dpm_resume_noirq() is just a resume() for devices * that suspended with irqs off ... no overall powerup. 
*/ @@ -409,12 +403,7 @@ static int resume_target_kernel(bool platform_mode) local_irq_disable(); - error = sysdev_suspend(PMSG_QUIESCE); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (error) goto Enable_irqs; @@ -442,7 +431,6 @@ static int resume_target_kernel(bool platform_mode) touch_softlockup_watchdog(); syscore_resume(); - sysdev_resume(); Enable_irqs: local_irq_enable(); @@ -528,7 +516,6 @@ int hibernation_platform_enter(void) goto Platform_finish; local_irq_disable(); - sysdev_suspend(PMSG_HIBERNATE); syscore_suspend(); if (pm_wakeup_pending()) { error = -EAGAIN; @@ -541,7 +528,6 @@ int hibernation_platform_enter(void) Power_up: syscore_resume(); - sysdev_resume(); local_irq_enable(); enable_nonboot_cpus(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8935369d503..732d77a957e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -163,19 +163,13 @@ static int suspend_enter(suspend_state_t state) arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); - error = sysdev_suspend(PMSG_SUSPEND); - if (!error) { - error = syscore_suspend(); - if (error) - sysdev_resume(); - } + error = syscore_suspend(); if (!error) { if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) { error = suspend_ops->enter(state); events_check_enabled = false; } syscore_resume(); - sysdev_resume(); } arch_suspend_enable_irqs(); diff --git a/kernel/sys.c b/kernel/sys.c index af468edf096..f0c10385f30 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -315,7 +315,6 @@ void kernel_restart_prepare(char *cmd) blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; device_shutdown(); - sysdev_shutdown(); syscore_shutdown(); } @@ -354,7 +353,6 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); - sysdev_shutdown(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -374,7 +372,6 @@ void kernel_power_off(void) if (pm_power_off_prepare) pm_power_off_prepare(); disable_nonboot_cpus(); - sysdev_shutdown(); syscore_shutdown(); printk(KERN_EMERG "Power down.\n"); kmsg_dump(KMSG_DUMP_POWEROFF); -- cgit v1.2.3-70-g09d2 From 449a66fd1fa75d36dca917704827c40c8f416bca Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Thu, 12 May 2011 15:11:12 +0200 Subject: x86: Remove warning and warning_symbol from struct stacktrace_ops Both warning and warning_symbol are nowhere used. Let's get rid of them. Signed-off-by: Richard Weinberger Cc: Oleg Nesterov Cc: Andrew Morton Cc: Huang Ying Cc: Soeren Sandmann Pedersen Cc: Namhyung Kim Cc: x86 Cc: H. 
Peter Anvin Cc: Thomas Gleixner Cc: Robert Richter Cc: Paul Mundt Link: http://lkml.kernel.org/r/1305205872-10321-2-git-send-email-richard@nod.at Signed-off-by: Frederic Weisbecker --- arch/x86/include/asm/stacktrace.h | 3 --- arch/x86/kernel/cpu/perf_event.c | 13 ------------- arch/x86/kernel/dumpstack.c | 16 ---------------- arch/x86/kernel/stacktrace.c | 13 ------------- arch/x86/oprofile/backtrace.c | 13 ------------- 5 files changed, 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index d7e89c83645..70bbe39043a 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo, /* Generic stack tracer with callbacks */ struct stacktrace_ops { - void (*warning)(void *data, char *msg); - /* msg must contain %s for the symbol */ - void (*warning_symbol)(void *data, char *msg, unsigned long symbol); void (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0de6b2b31f6..3a0338b4b17 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1773,17 +1773,6 @@ static struct pmu pmu = { * callchain support */ -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { return 0; @@ -1797,8 +1786,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack_bp, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e2a3f0606da..f478ff6877e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo, } EXPORT_SYMBOL_GPL(print_context_stack_bp); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - static int print_trace_stack(void *data, char *name) { printk("%s <%s> ", (char *)data, name); @@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, .stack = print_trace_stack, .address = print_trace_address, .walk_stack = print_context_stack, diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 6515733a289..55d9bc03f69 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -9,15 +9,6 @@ #include #include -static void save_stack_warning(void *data, char *msg) -{ -} - -static void -save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -} - static int save_stack_stack(void *data, char *name) { return 0; @@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable) } static const struct stacktrace_ops save_stack_ops = { - .warning = 
save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address, .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, .stack = save_stack_stack, .address = save_stack_address_nosched, .walk_stack = print_context_stack, diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2d49d4e19a3..a5b64ab4cd6 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -16,17 +16,6 @@ #include #include -static void backtrace_warning_symbol(void *data, char *msg, - unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - static int backtrace_stack(void *data, char *name) { /* Yes, we want all stacks */ @@ -42,8 +31,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, .walk_stack = print_context_stack, -- cgit v1.2.3-70-g09d2 From 7899891c7d161752f29abcc9bc0a9c6c3a3af26c Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Thu, 12 May 2011 10:56:08 +0800 Subject: xen mmu: fix a race window causing leave_mm BUG() There's a race window in xen_drop_mm_ref, where the remote cpu may exit the dirty bitmap between the check on this cpu and the point where the remote cpu handles the drop request. So in drop_other_mm_ref we need to check whether the TLB state is still lazy before calling into leave_mm. This bug is rarely observed in earlier kernels, but is exaggerated by commit 831d52bc153971b70e64eccfbed2b232394f22f8 ("x86, mm: avoid possible bogus tlb entries by clearing prev mm_cpumask after switching mm"), which clears the bitmap after changing the TLB state. The call trace is as below: --------------------------------- kernel BUG at arch/x86/mm/tlb.c:61!
invalid opcode: 0000 [#1] SMP last sysfs file: /sys/devices/system/xen_memory/xen_memory0/info/current_kb CPU 1 Modules linked in: 8021q garp xen_netback xen_blkback blktap blkback_pagemap nbd bridge stp llc autofs4 ipmi_devintf ipmi_si ipmi_msghandler lockd sunrpc bonding ipv6 xenfs dm_multipath video output sbs sbshc parport_pc lp parport ses enclosure snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device serio_raw bnx2 snd_pcm_oss snd_mixer_oss snd_pcm snd_timer iTCO_wdt snd soundcore snd_page_alloc i2c_i801 iTCO_vendor_support i2c_core pcspkr pata_acpi ata_generic ata_piix shpchp mptsas mptscsih mptbase [last unloaded: freq_table] Pid: 25581, comm: khelper Not tainted 2.6.32.36fixxen #1 Tecal RH2285 RIP: e030:[] [] leave_mm+0x15/0x46 RSP: e02b:ffff88002805be48 EFLAGS: 00010046 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88015f8e2da0 RDX: ffff88002805be78 RSI: 0000000000000000 RDI: 0000000000000001 RBP: ffff88002805be48 R08: ffff88009d662000 R09: dead000000200200 R10: dead000000100100 R11: ffffffff814472b2 R12: ffff88009bfc1880 R13: ffff880028063020 R14: 00000000000004f6 R15: 0000000000000000 FS: 00007f62362d66e0(0000) GS:ffff880028058000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000003aabc11909 CR3: 000000009b8ca000 CR4: 0000000000002660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process khelper (pid: 25581, threadinfo ffff88007691e000, task ffff88009b92db40) Stack: ffff88002805be68 ffffffff8100e4ae 0000000000000001 ffff88009d733b88 <0> ffff88002805be98 ffffffff81087224 ffff88002805be78 ffff88002805be78 <0> ffff88015f808360 00000000000004f6 ffff88002805bea8 ffffffff81010108 Call Trace: [] drop_other_mm_ref+0x2a/0x53 [] generic_smp_call_function_single_interrupt+0xd8/0xfc [] xen_call_function_single_interrupt+0x13/0x28 [] handle_IRQ_event+0x66/0x120 [] handle_percpu_irq+0x41/0x6e [] __xen_evtchn_do_upcall+0x1ab/0x27d [] xen_evtchn_do_upcall+0x33/0x46 [] xen_do_hypervisor_callback+0x1e/0x30 [] ? _spin_unlock_irqrestore+0x15/0x17 [] ? xen_restore_fl_direct_end+0x0/0x1 [] ? flush_old_exec+0x3ac/0x500 [] ? load_elf_binary+0x0/0x17ef [] ? load_elf_binary+0x0/0x17ef [] ? load_elf_binary+0x398/0x17ef [] ? need_resched+0x23/0x2d [] ? process_measurement+0xc0/0xd7 [] ? load_elf_binary+0x0/0x17ef [] ? search_binary_handler+0xc8/0x255 [] ? do_execve+0x1c3/0x29e [] ? sys_execve+0x43/0x5d [] ? __call_usermodehelper+0x0/0x6f [] ? kernel_execve+0x68/0xd0 [] ? __call_usermodehelper+0x0/0x6f [] ? xen_restore_fl_direct_end+0x0/0x1 [] ? ____call_usermodehelper+0x113/0x11e [] ? child_rip+0xa/0x20 [] ? __call_usermodehelper+0x0/0x6f [] ? int_ret_from_sys_call+0x7/0x1b [] ? retint_restore_args+0x5/0x6 [] ?
child_rip+0x0/0x20 Code: 41 5e 41 5f c9 c3 55 48 89 e5 0f 1f 44 00 00 e8 17 ff ff ff c9 c3 55 48 89 e5 0f 1f 44 00 00 65 8b 04 25 c8 55 01 00 ff c8 75 04 <0f> 0b eb fe 65 48 8b 34 25 c0 55 01 00 48 81 c6 b8 02 00 00 e8 RIP [] leave_mm+0x15/0x46 RSP ---[ end trace ce9cee6832a9c503 ]--- Tested-by: Maoxiaoyun Signed-off-by: Kevin Tian [v1: Fleshed out the git description a bit] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e92b61ad57..4fd7387222b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1140,7 +1140,7 @@ static void drop_other_mm_ref(void *info) active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure -- cgit v1.2.3-70-g09d2 From 15bfc094517db2ddf38ca7ed47f3a1c0ad24f7c4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 12 Apr 2011 07:57:15 -0400 Subject: xen/setup: Ignore E820_UNUSABLE when setting 1-1 mappings. When we parse the raw E820, the Xen hypervisor can set "E820_RAM" to "E820_UNUSABLE" if the mem=X argument is used. As such we should _not_ consider the E820_UNUSABLE as a 1-1 identity mapping, but instead use the same case as for E820_RAM. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 90bac0aac3a..fba4a6cc3a2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list, if (last > end) continue; - if (entry->type == E820_RAM) { + if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) { if (start > start_pci) identity += set_phys_range_identity( PFN_UP(start_pci), PFN_DOWN(start)); -- cgit v1.2.3-70-g09d2 From 0f16d0dfcdb5aab97d9e368f008b070b5b3ec6d3 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 11 May 2011 22:34:38 +0200 Subject: xen/setup: Fix for incorrect xen_extra_mem_start initialization under 32-bit git commit 24bdb0b62cc82120924762ae6bc85afc8c3f2b26 (xen: do not create the extra e820 region at an addr lower than 4G) does not take into account that under CONFIG_X86_32, find_low_pfn_range() is called instead of e820_end_of_low_ram_pfn() (both calls are from arch/x86/kernel/setup.c). find_low_pfn_range() behaves correctly and does not require a change to the xen_extra_mem_start initialization. Additionally, if xen_extra_mem_start is initialized in the same way as under CONFIG_X86_64, then memory hotplug support for the Xen balloon driver (under development) is broken.
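As a sketch, the resulting initialization looks like this (mem_end and xen_extra_mem_start are the names used in the hunk below):

#ifdef CONFIG_X86_32
	/* find_low_pfn_range() already handles lowmem correctly,
	 * so no clamping to 4G is wanted here */
	xen_extra_mem_start = mem_end;
#else
	xen_extra_mem_start = max((1ULL << 32), mem_end);
#endif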
Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fba4a6cc3a2..ca6297bd4e3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,7 +227,11 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; +#ifdef CONFIG_X86_32 + xen_extra_mem_start = mem_end; +#else xen_extra_mem_start = max((1ULL << 32), mem_end); +#endif for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; -- cgit v1.2.3-70-g09d2 From 8c5950881c3b5e6e350e4b0438a8ccc513d90df9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 1 Apr 2011 15:18:48 -0400 Subject: xen/p2m: Create entries in the P2M_MFN trees to track 1-1 mappings .. when applicable. We need to track in the p2m_mfn and p2m_mfn_p the MFNs and pointers, respectively, for the P2M entries that are allocated for the identity mappings. Without this, a PV domain with an E820 that triggers the 1-1 mapping to kick in won't be able to be restored as the P2M won't have the identity mappings. Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/p2m.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 141eb0de8b0..a01e6532b46 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn) /* Boundary cross-over for the edges: */ if (idx) { unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; p2m_init(p2m); p2m_top[topidx][mididx] = p2m; + /* For save/restore we need to MFN of the P2M saved */ + + mid_mfn_p = p2m_top_mfn_p[topidx]; + WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), + "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", + topidx, mididx); + mid_mfn_p[mididx] = virt_to_mfn(p2m); + } return idx != 0; } @@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s, pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_mid_missing) { - unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + unsigned long *mid_mfn_p; + unsigned long **mid; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + if (mid == p2m_mid_missing) { + mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid); p2m_top[topidx] = mid; + + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + } + /* And the save/restore P2M tables.. */ + if (mid_mfn_p == p2m_mid_missing_mfn) { + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + /* Note: we don't set mid_mfn_p[midix] here, + * look in __early_alloc_p2m */ } } -- cgit v1.2.3-70-g09d2 From 251511a18dcfe1868e3edfdc490777dcfaa1f851 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:16:07 +0200 Subject: arch/x86/xen/irq: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h.
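The rule of thumb being applied, as an illustrative sketch (the variable names here are made up; only the section placement is the point):

/* writable init-only data lives in .init.data */
static int boot_counter __initdata;

/* const init-only data must be marked __initconst so it lands in
 * .init.rodata; a const object annotated __initdata can trigger a
 * section attribute conflict with some toolchains */
static const int boot_table[4] __initconst = { 1, 2, 3, 4 };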
Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 6a6fe893964..8bbb465b6f0 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -113,7 +113,7 @@ static void xen_halt(void) xen_safe_halt(); } -static const struct pv_irq_ops xen_irq_ops __initdata = { +static const struct pv_irq_ops xen_irq_ops __initconst = { .save_fl = PV_CALLEE_SAVE(xen_save_fl), .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), -- cgit v1.2.3-70-g09d2 From ad3062a0f438a5f436dae267f795c0a9686f11d2 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:15:18 +0200 Subject: arch/x86/xen/enlighten: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e3c6a06cf72..dd7b88f2ec7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -235,7 +235,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *dx &= maskedx; } -static __init void xen_init_cpuid_mask(void) +static void __init xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; unsigned int xsave_mask; @@ -400,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr) /* * load_gdt for early boot, when the gdt is only mapped once */ -static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) +static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) { unsigned long va = dtr->address; unsigned int size = dtr->size + 1; @@ -662,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, * Version of write_gdt_entry for use at early boot-time needed to * update an entry as simply as possible.
*/ -static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, +static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, const void *desc, int type) { switch (type) { @@ -933,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static const struct pv_info xen_info __initdata = { +static const struct pv_info xen_info __initconst = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, .name = "Xen", }; -static const struct pv_init_ops xen_init_ops __initdata = { +static const struct pv_init_ops xen_init_ops __initconst = { .patch = xen_patch, }; -static const struct pv_cpu_ops xen_cpu_ops __initdata = { +static const struct pv_cpu_ops xen_cpu_ops __initconst = { .cpuid = xen_cpuid, .set_debugreg = xen_set_debugreg, @@ -1004,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .end_context_switch = xen_end_context_switch, }; -static const struct pv_apic_ops xen_apic_ops __initdata = { +static const struct pv_apic_ops xen_apic_ops __initconst = { #ifdef CONFIG_X86_LOCAL_APIC .startup_ipi_hook = paravirt_nop, #endif @@ -1055,7 +1055,7 @@ int xen_panic_handler_init(void) return 0; } -static const struct machine_ops __initdata xen_machine_ops = { +static const struct machine_ops xen_machine_ops __initconst = { .restart = xen_restart, .halt = xen_machine_halt, .power_off = xen_machine_halt, @@ -1332,7 +1332,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { +static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { .notifier_call = xen_hvm_cpu_notify, }; @@ -1381,7 +1381,7 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { +const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { .name = "Xen HVM", .detect = xen_hvm_platform, .init_platform = xen_hvm_guest_init, -- cgit v1.2.3-70-g09d2 From ae15a3b4d1374b733016ce4b4148b2ba42bbeb0f Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:17:21 +0200 Subject: arch/x86/xen/setup: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 90bac0aac3a..d3663df2f96 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -50,7 +50,7 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size; */ #define EXTRA_MEM_RATIO (10) -static __init void xen_add_extra_mem(unsigned long pages) +static void __init xen_add_extra_mem(unsigned long pages) { unsigned long pfn; @@ -336,7 +336,7 @@ static void __init fiddle_vdso(void) #endif } -static __cpuinit int register_callback(unsigned type, const void *func) +static int __cpuinit register_callback(unsigned type, const void *func) { struct callback_register callback = { .type = type, -- cgit v1.2.3-70-g09d2 From 82a3242e11d9e63c8195be46c954efaefee35e22 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 May 2011 16:01:02 -0700 Subject: sysfs: remove "last sysfs file:" line from the oops messages On some arches (x86, sh, arm, unicore, powerpc) the oops message would print out the last sysfs file accessed.
This was very useful in finding a number of sysfs and driver core bugs in the 2.5 and early 2.6 development days, but it has been a number of years since this file has actually helped in debugging anything that couldn't also be trivially determined from the stack traceback. So it's time to delete the line. This is good as we need all the space we can get for oops messages at times on consoles. Acked-by: Phil Carmody Acked-by: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- arch/arm/kernel/traps.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/sh/kernel/traps_32.c | 1 - arch/unicore32/kernel/traps.c | 1 - arch/x86/kernel/dumpstack.c | 1 - fs/sysfs/file.c | 12 ------------ include/linux/sysfs.h | 5 ----- 7 files changed, 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 3b54ad19d48..d52eec268b4 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -234,7 +234,6 @@ static int __die(const char *str, int err, struct thread_info *thread, struct pt printk(KERN_EMERG "Internal error: %s: %x [#%d]" S_PREEMPT S_SMP "\n", str, err, ++die_counter); - sysfs_printk_last_file(); /* trap and error numbers are mostly meaningless on ARM */ ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, SIGSEGV); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5ddb801bc15..d782cd71c07 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -143,7 +143,6 @@ int die(const char *str, struct pt_regs *regs, long err) #endif printk("%s\n", ppc_md.name ? ppc_md.name : ""); - sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 3484c2f65ab..b51a17104b5 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -87,7 +87,6 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(1); printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); - sysfs_printk_last_file(); print_modules(); show_regs(regs); diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c index 254e36fa951..b9a26465e72 100644 --- a/arch/unicore32/kernel/traps.c +++ b/arch/unicore32/kernel/traps.c @@ -192,7 +192,6 @@ static int __die(const char *str, int err, struct thread_info *thread, printk(KERN_EMERG "Internal error: %s: %x [#%d]\n", str, err, ++die_counter); - sysfs_printk_last_file(); /* trap and error numbers are mostly meaningless on UniCore */ ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, \ diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e2a3f0606da..f72e7193acc 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -279,7 +279,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index da3fefe91a8..1ad8c93c1b8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -24,13 +24,6 @@ #include "sysfs.h" -/* used in crash dumps to help with debugging */ -static char last_sysfs_file[PATH_MAX]; -void sysfs_printk_last_file(void) -{ - printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); -} - /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent 
with one or more open @@ -337,11 +330,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; const struct sysfs_ops *ops; int error = -EACCES; - char *p; - - p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (!IS_ERR(p)) - memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active(attr_sd)) diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 30b881555fa..c3acda60eee 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -176,7 +176,6 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, const unsigned char *name); struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd); void sysfs_put(struct sysfs_dirent *sd); -void sysfs_printk_last_file(void); /* Called to clear a ns tag when it is no longer valid */ void sysfs_exit_ns(enum kobj_ns_type type, const void *tag); @@ -348,10 +347,6 @@ static inline int __must_check sysfs_init(void) return 0; } -static inline void sysfs_printk_last_file(void) -{ -} - #endif /* CONFIG_SYSFS */ #endif /* _SYSFS_H_ */ -- cgit v1.2.3-70-g09d2 From 82491451dd25a3abe8496ddbd04ddb3f77d285c2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 8 May 2011 18:55:19 +0100 Subject: clocksource: convert x86 to generic i8253 clocksource Convert x86 i8253 clocksource code to use generic i8253 clocksource. Acked-by: John Stultz Acked-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Russell King --- arch/x86/Kconfig | 1 + arch/x86/include/asm/i8253.h | 2 ++ arch/x86/kernel/i8253.c | 79 +------------------------------------------- 3 files changed, 4 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..01115d54c5b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -8,6 +8,7 @@ config 64BIT config X86_32 def_bool !64BIT + select CLKSRC_I8253 config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h index fc1f579fb96..65aaa91d585 100644 --- a/arch/x86/include/asm/i8253.h +++ b/arch/x86/include/asm/i8253.h @@ -6,6 +6,8 @@ #define PIT_CH0 0x40 #define PIT_CH2 0x42 +#define PIT_LATCH LATCH + extern raw_spinlock_t i8253_lock; extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 2dfd3159744..b904dfbf6db 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -117,81 +117,6 @@ void __init setup_pit_timer(void) } #ifndef CONFIG_X86_64 -/* - * Since the PIT overflows every tick, its not very useful - * to just read by itself. So use jiffies to emulate a free - * running counter: - */ -static cycle_t pit_read(struct clocksource *cs) -{ - static int old_count; - static u32 old_jifs; - unsigned long flags; - int count; - u32 jifs; - - raw_spin_lock_irqsave(&i8253_lock, flags); - /* - * Although our caller may have the read side of xtime_lock, - * this is now a seqlock, and we are cheating in this routine - * by having side effects on state that we cannot undo if - * there is a collision on the seqlock and our caller has to - * retry. (Namely, old_jifs and old_count.) So we must treat - * jiffies as volatile despite the lock. 
We read jiffies - * before latching the timer count to guarantee that although - * the jiffies value might be older than the count (that is, - * the counter may underflow between the last point where - * jiffies was incremented and the point where we latch the - * count), it cannot be newer. - */ - jifs = jiffies; - outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_pit(PIT_CH0); /* read the latched count */ - count |= inb_pit(PIT_CH0) << 8; - - /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { - outb_pit(0x34, PIT_MODE); - outb_pit(LATCH & 0xff, PIT_CH0); - outb_pit(LATCH >> 8, PIT_CH0); - count = LATCH - 1; - } - - /* - * It's possible for count to appear to go the wrong way for a - * couple of reasons: - * - * 1. The timer counter underflows, but we haven't handled the - * resulting interrupt and incremented jiffies yet. - * 2. Hardware problem with the timer, not giving us continuous time, - * the counter does small "jumps" upwards on some Pentium systems, - * (see c't 95/10 page 335 for Neptun bug.) - * - * Previous attempts to handle these cases intelligently were - * buggy, so we just do the simple thing now. - */ - if (count > old_count && jifs == old_jifs) - count = old_count; - - old_count = count; - old_jifs = jifs; - - raw_spin_unlock_irqrestore(&i8253_lock, flags); - - count = (LATCH - 1) - count; - - return (cycle_t)(jifs * LATCH) + count; -} - -static struct clocksource pit_cs = { - .name = "pit", - .rating = 110, - .read = pit_read, - .mask = CLOCKSOURCE_MASK(32), - .mult = 0, - .shift = 20, -}; - static int __init init_pit_clocksource(void) { /* @@ -205,9 +130,7 @@ static int __init init_pit_clocksource(void) pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); - - return clocksource_register(&pit_cs); + return clocksource_i8253_init(); } arch_initcall(init_pit_clocksource); -- cgit v1.2.3-70-g09d2 From 7c1bfd685bcdc822ab1d7411ea05c82bd2a7b260 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 16 May 2011 13:47:30 -0400 Subject: xen/pci: Fix compiler error when CONFIG_XEN_PRIVILEGED_GUEST is not set. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we have CONFIG_XEN and the other parameters to build a Linux kernel that is non-privileged, the xen_[find|register|unregister]_device_domain_owner functions should not be compiled. They should use the nops defined in arch/x86/include/asm/xen/pci.h instead.
This fixes: arch/x86/pci/xen.c:496: error: redefinition of ‘xen_find_device_domain_owner’ arch/x86/include/asm/xen/pci.h:25: note: previous definition of ‘xen_find_device_domain_owner’ was here arch/x86/pci/xen.c:510: error: redefinition of ‘xen_register_device_domain_owner’ arch/x86/include/asm/xen/pci.h:29: note: previous definition of ‘xen_register_device_domain_owner’ was here arch/x86/pci/xen.c:532: error: redefinition of ‘xen_unregister_device_domain_owner’ arch/x86/include/asm/xen/pci.h:34: note: previous definition of ‘xen_unregister_device_domain_owner’ was here Signed-off-by: Konrad Rzeszutek Wilk Reported-by: Randy Dunlap --- arch/x86/pci/xen.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 393981feb12..8214724ce54 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -473,6 +473,7 @@ void __init xen_setup_pirqs(void) } #endif +#ifdef CONFIG_XEN_DOM0 struct xen_device_domain_owner { domid_t domain; struct pci_dev *dev; @@ -545,3 +546,4 @@ int xen_unregister_device_domain_owner(struct pci_dev *dev) return 0; } EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner); +#endif -- cgit v1.2.3-70-g09d2 From 50e7534427283afd997d58481778c07bea79eb63 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 16 May 2011 15:39:46 +0200 Subject: x86, AMD, cacheinfo: Fix fallout caused by max3 conversion 732eacc0542d0aa48797f675888b85d6065af837 converted code around the kernel using nested max() macros to use the new max3 macro but forgot to remove the old line in intel_cacheinfo.c. Fix it. Cc: Hagen Paul Pfeifer Cc: Frank Arnold Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1305553188-21061-2-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1ce1af2899d..31590a001e2 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -- cgit v1.2.3-70-g09d2 From 42be450565b0fc4607fae3e3a7da038d367a23ed Mon Sep 17 00:00:00 2001 From: Frank Arnold Date: Mon, 16 May 2011 15:39:47 +0200 Subject: x86, AMD, cacheinfo: Fix L3 cache index disable checks We provide two slots to disable cache indices, and have a check to prevent both slots from being used for the same index. If the user disables the same index on different subcaches, both slots will hold the same index, e.g. $ echo 2047 > /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0 $ cat /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0 2047 $ echo 1050623 > /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_1 $ cat /sys/devices/system/cpu/cpu0/cache/index3/cache_disable_1 2047 due to the fact that the check was looking only at index bits [11:0] and was ignoring writes to bits outside that range. The more correct fix is to simply check whether the index is within the bounds of [0..l3->indices]. While at it, clean up comments and drop now-unused local macros.
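The validation after this patch thus reduces to the following checks (a sketch mirroring the hunk below):

	/* check if @slot is already used or the index is already disabled */
	if (amd_get_l3_disable_slot(l3, slot) >= 0)
		return -EINVAL;

	/* a plain bounds check replaces the old bit masking */
	if (index > l3->indices)
		return -EINVAL;

	/* the other slot must not already disable the same index */
	if (index == amd_get_l3_disable_slot(l3, !slot))
		return -EINVAL;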
Signed-off-by: Frank Arnold Link: http://lkml.kernel.org/r/1305553188-21061-3-git-send-email-bp@amd64.org Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 31590a001e2..c105c533ed9 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -453,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, { int ret = 0; -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - /* - * check whether this slot is already used or - * the index is already disabled - */ + /* check if @slot is already used or the index is already disabled */ ret = amd_get_l3_disable_slot(l3, slot); if (ret >= 0) return -EINVAL; - /* - * check whether the other slot has disabled the - * same index already - */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index > l3->indices) return -EINVAL; - /* do not allow writes outside of allowed bits */ - if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((index & SUBCACHE_INDEX) > l3->indices)) + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(l3, !slot)) return -EINVAL; amd_l3_disable_index(l3, cpu, slot, index); -- cgit v1.2.3-70-g09d2 From 2895cd2ab81dfb7bc22637bc110857db44a30b4a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Apr 2011 16:43:29 -0400 Subject: ftrace/x86: Do not trace .discard.text section The section called .discard.text has tracing attached to it and is currently ignored by ftrace. But it does include a call to the mcount stub. Adding a notrace to the code keeps gcc from adding the useless mcount caller to it. Link: http://lkml.kernel.org/r/20110421023739.243651696@goodmis.org Signed-off-by: Steven Rostedt --- arch/x86/include/asm/setup.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index db8aa19a08a..647d8a06ce4 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align); * executable.) */ #define RESERVE_BRK(name,sz) \ - static void __section(.discard.text) __used \ + static void __section(.discard.text) __used notrace \ __brk_reservation_fn_##name##__(void) { \ asm volatile ( \ ".pushsection .brk_reservation,\"aw\",@nobits;" \ -- cgit v1.2.3-70-g09d2 From 521ccb5c4aece609311bfa7157910a8f0c942af5 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 10 May 2011 10:10:41 +0200 Subject: ftrace/x86: mcount offset calculation Do the mcount offset adjustment in the recordmcount.pl/recordmcount.[ch] at compile time and not in ftrace_call_adjust at run time. 
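For reference, on x86 a call to mcount encodes as "e8 <4-byte offset>", and the recorded mcount relocation points at the offset, one byte past the start of the instruction. A sketch of the before/after behaviour (the _old/_new suffixes are illustrative only):

/* before: step back from the rel32 to the e8 opcode at run time */
static inline unsigned long ftrace_call_adjust_old(unsigned long addr)
{
	return addr - 1;
}

/* after: recordmcount applies mcount_adjust = -1 at build time, so
 * the recorded address already points at the call instruction */
static inline unsigned long ftrace_call_adjust_new(unsigned long addr)
{
	return addr;
}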
Signed-off-by: Martin Schwidefsky Signed-off-by: Steven Rostedt --- arch/x86/include/asm/ftrace.h | 7 +++---- scripts/recordmcount.c | 2 ++ scripts/recordmcount.pl | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index db24c2278be..268c783ab1c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -38,11 +38,10 @@ extern void mcount(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* - * call mcount is "e8 <4 byte offset>" - * The addr points to the 4 byte offset and the caller of this - * function wants the pointer to e8. Simply subtract one. + * addr is the address of the mcount call instruction. + * recordmcount does the necessary offset calculation. */ - return addr - 1; + return addr; } #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 0e18975824f..7648a5d1115 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -335,6 +335,7 @@ do_file(char const *const fname) reltype = R_386_32; make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_32; + mcount_adjust_32 = -1; break; case EM_ARM: reltype = R_ARM_ABS32; altmcount = "__gnu_mcount_nc"; @@ -350,6 +351,7 @@ do_file(char const *const fname) make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_64; reltype = R_X86_64_64; + mcount_adjust_64 = -1; break; } /* end switch */ diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index a871cd41405..414e7f5e42e 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -223,6 +223,7 @@ if ($arch eq "x86_64") { $mcount_regex = "^\\s*([0-9a-fA-F]+):.*\\smcount([+-]0x[0-9a-zA-Z]+)?\$"; $type = ".quad"; $alignment = 8; + $mcount_adjust = -1; # force flags for this arch $ld .= " -m elf_x86_64"; @@ -232,6 +233,7 @@ if ($arch eq "x86_64") { } elsif ($arch eq "i386") { $alignment = 4; + $mcount_adjust = -1; # force flags for this arch $ld .= " -m elf_i386"; -- cgit v1.2.3-70-g09d2 From 865be7a81071a77014c83cd01536c989eed362b4 Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Mon, 16 May 2011 21:38:08 +0200 Subject: x86, cpu: Fix detection of Celeron Covington stepping A1 and B0 Steppings A1 and B0 of Celeron Covington are currently misdetected as Pentium II (Dixon). Fix it by removing the stepping check. [ hpa: this fixes this specific bug... the CPUID documentation specifies that the L2 cache size can disambiguate additional CPUs; this patch does not fix that. ] Signed-off-by: Ondrej Zary Link: http://lkml.kernel.org/r/201105162138.15416.linux@rainbow-software.org Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/intel.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index df86bc8c859..32e86aa5274 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -400,12 +400,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 5: - if (c->x86_mask == 0) { - if (l2 == 0) - p = "Celeron (Covington)"; - else if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - } + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; break; case 6: -- cgit v1.2.3-70-g09d2 From dc382fd5bcca7098a984705ed6ac880f539d068e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 16 May 2011 13:54:10 -0700 Subject: x86, mm: Allow ZONE_DMA to be configurable ZONE_DMA is unnecessary for a large number of machines that do not require less than 32-bit DMA addressing, e.g. ISA legacy DMA or PCI cards with a restricted DMA address mask. This patch allows users to disable ZONE_DMA for x86 if they know they will not be using such devices with their kernel. This prevents the VM from unnecessarily reserving a ratio of memory (defaulting to 1/256th of system capacity) with lowmem_reserve_ratio for such allocations when it will never be used. Signed-off-by: David Rientjes Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1105161353560.4353@chino.kir.corp.google.com Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 9 ++++++++- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 648fca42ae6..0eb801a75de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -112,7 +112,14 @@ config MMU def_bool y config ZONE_DMA - def_bool y + bool "DMA memory allocation support" if EXPERT + default y + help + DMA memory allocation support allows devices with less than 32-bit + addressing to allocate within the first 16MB of address space. + Disable if no such devices will be used. + + If unsure, say Y. config SBUS bool diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2cde0a34bed..29f7c6d9817 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -678,8 +678,10 @@ static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; +#endif max_zone_pfns[ZONE_NORMAL] = max_low_pfn; #ifdef CONFIG_HIGHMEM max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0404bb3a077..d865c4aeec5 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -616,7 +616,9 @@ void __init paging_init(void) unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); +#ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; +#endif max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; -- cgit v1.2.3-70-g09d2 From 2494b030ba9334c7dd7df9b9f7abe4eacc950ec5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 12:33:26 -0700 Subject: x86, cpufeature: Fix cpuid leaf 7 feature detection CPUID leaf 7, subleaf 0 returns the maximum subleaf in EAX, not the number of subleaves. Since so far only subleaf 0 is defined (and only the EBX bitfield) we do not need to qualify the test. 
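A minimal user-space sketch of the same leaf 7 query (assuming GCC-style inline asm on x86-64; not part of the patch) shows why no qualification is needed:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t eax = 7, ebx, ecx = 0, edx;    /* leaf 7, explicit subleaf 0 */

        asm volatile("cpuid"
                     : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));

        /* EAX reports the maximum subleaf, not a count of subleaves */
        printf("CPUID.7.0: EBX=0x%08x (max subleaf %u)\n", ebx, eax);
        return 0;
}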
Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305660806-17519-1-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin Cc: 2.6.36..39 --- arch/x86/kernel/cpu/common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e2ced0074a4..173f3a3fa1a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -565,8 +565,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); - if (eax > 0) - c->x86_capability[9] = ebx; + c->x86_capability[9] = ebx; } /* AMD-defined flags: level 0x80000001 */ -- cgit v1.2.3-70-g09d2 From c3b0795c98c08351567464150db66d11e05d7611 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 10 May 2011 21:09:38 +0200 Subject: PM / ACPI: Remove acpi_sleep=s4_nonvs acpi_sleep=s4_nonvs is superseded by acpi_sleep=nonvs, so remove it. Signed-off-by: WANG Cong Acked-by: Pavel Machek Acked-by: Len Brown Signed-off-by: Rafael J. Wysocki --- Documentation/feature-removal-schedule.txt | 8 -------- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/acpi/sleep.c | 5 ----- 3 files changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 492e81df296..f6a24e8aa11 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -460,14 +460,6 @@ Who: Thomas Gleixner ---------------------------- -What: The acpi_sleep=s4_nonvs command line option -When: 2.6.37 -Files: arch/x86/kernel/acpi/sleep.c -Why: superseded by acpi_sleep=nonvs -Who: Rafael J. Wysocki - ----------------------------- - What: PCI DMA unmap state API When: August 2012 Why: PCI DMA unmap state API (include/linux/pci-dma.h) was replaced diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a927819..259037b873b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -245,7 +245,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. acpi_sleep= [HW,ACPI] Sleep options Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, - old_ordering, s4_nonvs, sci_force_enable } + old_ordering, nonvs, sci_force_enable } See Documentation/power/video.txt for information on s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index ff93bc1b09c..18a857ba7a2 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -112,11 +112,6 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) { - pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " - "please use acpi_sleep=nonvs instead"); - acpi_nvs_nosave(); - } #endif if (strncmp(str, "nonvs", 5) == 0) acpi_nvs_nosave(); -- cgit v1.2.3-70-g09d2 From 724a92ee45c04cb9d82884a856b03b1e594d9de1 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:10 -0700 Subject: x86, cpufeature: Add CPU feature bit for enhanced REP MOVSB/STOSB Intel processors are adding enhancements to REP MOVSB/STOSB and the use of REP MOVSB/STOSB for optimal memcpy/memset or similar functions is recommended. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/ STOSB). 
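To make the recommendation concrete, here is a hedged user-space sketch of the kind of copy ERMS speeds up (illustrative only; the kernel variants appear in the later patches of this series):

#include <stddef.h>

/* With ERMS, a single byte-granular REP MOVSB is the suggested memcpy. */
static void *movsb_memcpy(void *dst, const void *src, size_t n)
{
        void *ret = dst;

        asm volatile("rep movsb"
                     : "+D" (dst), "+S" (src), "+c" (n)
                     : : "memory");
        return ret;
}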
Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-2-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 91f3e087cf2..7f2f7b12329 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -195,6 +195,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3-70-g09d2 From 161ec53c702ce9df2f439804dfb9331807066daa Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:11 -0700 Subject: x86, mem, intel: Initialize Enhanced REP MOVSB/STOSB If the kernel intends to use enhanced REP MOVSB/STOSB, it must ensure IA32_MISC_ENABLE.Fast_String_Enable (bit 0) is set and CPUID.(EAX=07H, ECX=0H): EBX[bit 9] also reports 1. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-3-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index df86bc8c859..fc73a34ba8c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -29,10 +29,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { + u64 misc_enable; + /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { @@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * (model 2) with the same problem. */ if (c->x86 == 15) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { @@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) } } #endif + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } } #ifdef CONFIG_X86_32 -- cgit v1.2.3-70-g09d2 From 509731336313b3799cf03071d72c64fa6383895e Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:12 -0700 Subject: x86, alternative, doc: Add comment for applying alternatives order Some string operation functions may be patched twice, e.g. on enhanced REP MOVSB/STOSB processors, memcpy is patched first by the fast string alternative function and then by the enhanced REP MOVSB/STOSB alternative function. Add a comment documenting the order in which alternatives are applied, to warn people who might want to change that order for any reason. [ Documentation-only patch ] Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-4-git-send-email-fenghua.yu@intel.com Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/alternative.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 4a234677e21..f4fe15ddcf9 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -210,6 +210,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite a previous scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ for (a = start; a < end; a++) { u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); -- cgit v1.2.3-70-g09d2 From 9072d11da15a71e086eab3b5085184f2c1d06913 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:13 -0700 Subject: x86, alternative: Add altinstruction_entry macro Add altinstruction_entry macro to generate .altinstructions section entries from assembly code. This should be less failure-prone than open-coding. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-5-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative-asm.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index a63a68be1cc..94d420b360d 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -15,4 +15,13 @@ .endm #endif +.macro altinstruction_entry orig alt feature orig_len alt_len + .align 8 + .quad \orig + .quad \alt + .word \feature + .byte \orig_len + .byte \alt_len +.endm + #endif /* __ASSEMBLY__ */ -- cgit v1.2.3-70-g09d2 From e365c9df2f2f001450decf9512412d2d5bd1cdef Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:14 -0700 Subject: x86, mem: clear_page_64.S: Support clear_page() with enhanced REP MOVSB/STOSB Intel processors are adding enhancements to REP MOVSB/STOSB and the use of REP MOVSB/STOSB for optimal memcpy/memset or similar functions is recommended. Enhancement availability is indicated by CPUID.7.0.EBX[9] (Enhanced REP MOVSB/ STOSB). Support clear_page() with rep stosb for processor supporting enhanced REP MOVSB /STOSB. On processors supporting enhanced REP MOVSB/STOSB, the alternative clear_page_c_e function using enhanced REP STOSB overrides the original function and the fast string function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-6-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/clear_page_64.S | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index aa4326bfb24..f2145cfa12a 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,5 +1,6 @@ #include #include +#include /* * Zero a page. 
@@ -14,6 +15,15 @@ ENTRY(clear_page_c) CFI_ENDPROC ENDPROC(clear_page_c) +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) + ENTRY(clear_page) CFI_STARTPROC xorl %eax,%eax @@ -38,21 +48,26 @@ ENTRY(clear_page) .Lclear_page_end: ENDPROC(clear_page) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs support enhanced REP MOVSB/STOSB instructions. + * It is recommended to use this when possible. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original function. + * + */ #include .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: +2: .byte 0xeb /* jmp */ + .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ +3: .previous .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .word X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b + altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ + .Lclear_page_end-clear_page, 2b-1b + altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ + .Lclear_page_end-clear_page,3b-2b .previous -- cgit v1.2.3-70-g09d2 From 4307bec9344aed83f8107c3eb4285bd9d218fc10 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:15 -0700 Subject: x86, mem: copy_user_64.S: Support copy_to/from_user by enhanced REP MOVSB/STOSB Support copy_to_user/copy_from_user() by enhanced REP MOVSB/STOSB. On processors supporting enhanced REP MOVSB/STOSB, the alternative copy_user_enhanced_fast_string function using enhanced rep movsb overrides the original function and the fast string function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-7-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/copy_user_64.S | 65 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 55 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 99e48261519..d17a1170bbf 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -15,23 +15,30 @@ #include #include #include +#include - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. 
+ */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .word \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm @@ -73,7 +80,9 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx jae bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_to_user) @@ -86,7 +95,9 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx jae bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(_copy_from_user) @@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string) .previous CFI_ENDPROC ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + andl %edx,%edx + jz 2f + movl %edx,%ecx +1: rep + movsb +2: xorl %eax,%eax + ret + + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + .section __ex_table,"a" + .align 8 + .quad 1b,12b + .previous + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) -- cgit v1.2.3-70-g09d2 From 101068c1f4a947ffa08f2782c78e40097300754d Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:16 -0700 Subject: x86, mem: memcpy_64.S: Optimize memcpy by enhanced REP MOVSB/STOSB Support memcpy() with enhanced rep movsb. On processors supporting enhanced rep movsb, the alternative memcpy() function using enhanced rep movsb overrides the original function and the fast string function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-8-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/memcpy_64.S | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 75ef61e35e3..daab21dae2d 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -4,6 +4,7 @@ #include #include +#include /* * memcpy - Copy a memory block. @@ -37,6 +38,23 @@ .Lmemcpy_e: .previous +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. 
+ * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c_e: + movq %rdi, %rax + + movl %edx, %ecx + rep movsb + ret +.Lmemcpy_e_e: + .previous + ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC @@ -171,21 +189,22 @@ ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad .Lmemcpy_c - .word X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REP_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte .Lmemcpy_e - .Lmemcpy_c - .byte .Lmemcpy_e - .Lmemcpy_c + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous -- cgit v1.2.3-70-g09d2 From 057e05c1d6440117875f455e59da8691e08f65d5 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:17 -0700 Subject: x86, mem: memmove_64.S: Optimize memmove by enhanced REP MOVSB/STOSB Support memmove() by enhanced rep movsb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memmove() function using enhanced rep movsb overrides the original function. The patch doesn't change the backward memmove case to use enhanced rep movsb. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-9-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/memmove_64.S | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 0ecb8433e5a..d0ec9c2936d 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -8,6 +8,7 @@ #define _STRING_C #include #include +#include #undef memmove @@ -24,6 +25,7 @@ */ ENTRY(memmove) CFI_STARTPROC + /* Handle more 32bytes in loop */ mov %rdi, %rax cmp $0x20, %rdx @@ -31,8 +33,13 @@ ENTRY(memmove) /* Decide forward/backward copy mode */ cmp %rdi, %rsi - jb 2f + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f +.Lmemmove_begin_forward: /* * movsq instruction have many startup latency * so we handle small size by general register. */ @@ -78,6 +85,8 @@ ENTRY(memmove) rep movsq movq %r11, (%r10) jmp 13f +.Lmemmove_end_forward: + /* * Handle data backward by movsq. */ @@ -194,4 +203,22 @@ ENTRY(memmove) 13: retq CFI_ENDPROC + + .section .altinstr_replacement,"ax" +.Lmemmove_begin_forward_efs: + /* Forward moving data.
*/ + movq %rdx, %rcx + rep movsb + retq +.Lmemmove_end_forward_efs: + .previous + + .section .altinstructions,"a" + .align 8 + .quad .Lmemmove_begin_forward + .quad .Lmemmove_begin_forward_efs + .word X86_FEATURE_ERMS + .byte .Lmemmove_end_forward-.Lmemmove_begin_forward + .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .previous ENDPROC(memmove) -- cgit v1.2.3-70-g09d2 From 2f19e06ac30771c7cb96fd61d8aeacfa74dac21c Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 15:29:18 -0700 Subject: x86, mem: memset_64.S: Optimize memset by enhanced REP MOVSB/STOSB Support memset() with enhanced rep stosb. On processors supporting enhanced REP MOVSB/STOSB, the alternative memset_c_e function using enhanced rep stosb overrides the fast string alternative memset_c and the original function. Signed-off-by: Fenghua Yu Link: http://lkml.kernel.org/r/1305671358-14478-10-git-send-email-fenghua.yu@intel.com Signed-off-by: H. Peter Anvin --- arch/x86/lib/memset_64.S | 54 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 09d34426965..79bd454b78a 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -2,9 +2,13 @@ #include #include +#include +#include /* - * ISO C memset - set a memory block to a byte value. + * ISO C memset - set a memory block to a byte value. This function uses fast + * string to get better performance than the original function. The code is + * simpler and shorter than the original function as well. * * rdi destination * rsi value (char) @@ -31,6 +35,28 @@ .Lmemset_e: .previous +/* + * ISO C memset - set a memory block to a byte value. This function uses + * enhanced rep stosb to override the fast string function. + * The code is simpler and shorter than the fast string function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ .section .altinstr_replacement, "ax", @progbits +.Lmemset_c_e: + movq %rdi,%r9 + movb %sil,%al + movl %edx,%ecx + rep stosb + movq %r9,%rax + ret +.Lmemset_e_e: + .previous + ENTRY(memset) ENTRY(__memset) CFI_STARTPROC @@ -112,16 +138,20 @@ ENTRY(__memset) ENDPROC(memset) ENDPROC(__memset) - /* Some CPUs run faster using the string instructions. It is also a lot simpler. Use this when possible */ -#include - + /* Some CPUs support enhanced REP MOVSB/STOSB feature. * It is recommended to use this when possible. * + * If enhanced REP MOVSB/STOSB feature is not available, use fast string + * instructions. + * + * Otherwise, use original memset function. + * + * In .altinstructions section, ERMS feature is placed after REP_GOOD + * feature to implement the right patch order. + */ .section .altinstructions,"a" - .align 8 - .quad memset - .quad .Lmemset_c - .word X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte .Lmemset_e - .Lmemset_c + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e .previous -- cgit v1.2.3-70-g09d2 From d0281a257f370b09c410e466571858b4e12869c9 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 18:44:26 -0700 Subject: x86, cpufeature: Add cpufeature flag for SMEP Add support for newly documented SMEP (Supervisor Mode Execution Protection) CPU feature flag.
SMEP prevents the CPU in kernel mode from jumping to an executable page that has the user flag set in the PTE. This prevents the kernel from executing user-space code accidentally or maliciously, so it prevents, for example, kernel exploits from jumping to specially prepared user-mode shell code. [ hpa: added better description by Ingo Molnar ] Signed-off-by: Fenghua Yu LKML-Reference: <1305683069-25394-2-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7f2f7b12329..8808cdb96c3 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -195,6 +195,7 @@ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ #define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) -- cgit v1.2.3-70-g09d2 From dc23c0bccf5eea171c87b3db285d032b9a5f06c4 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 17 May 2011 18:44:27 -0700 Subject: x86, cpu: Add SMEP CPU feature in CR4 Add support for newly documented SMEP (Supervisor Mode Execution Protection) CPU feature in CR4. Signed-off-by: Fenghua Yu LKML-Reference: <1305683069-25394-3-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/processor-flags.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index a898a2b6e10..59ab4dffa37 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -60,6 +60,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ +#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ /* * x86-64 Task Priority Register, CR8 -- cgit v1.2.3-70-g09d2 From de5397ad5b9ad22e2401c4dacdf1bb3b19c05679 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Wed, 11 May 2011 16:51:05 -0700 Subject: x86, cpu: Enable/disable Supervisor Mode Execution Protection Enable/disable newly documented SMEP (Supervisor Mode Execution Protection) CPU feature in the kernel. CR4.SMEP (bit 20) is 0 at power-on. If the feature is supported by the CPU (X86_FEATURE_SMEP), enable SMEP by setting CR4.SMEP. The new kernel option nosmep disables the feature even if the feature is supported by the CPU. [ hpa: moved the call to setup_smep() until after the vendor-specific initialization; that ensures that CPUID features are unmasked. We will still run it before we have userspace (never mind uncontrolled userspace). ] Signed-off-by: Fenghua Yu LKML-Reference: <1305157865-31727-1-git-send-email-fenghua.yu@intel.com> Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 4 ++++ arch/x86/kernel/cpu/common.c | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a927819..56fb8c16e20 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1664,6 +1664,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings + nosmep [X86] + Disable SMEP (Supervisor Mode Execution Protection) + even if it is supported by processor. + noexec32 [X86-64] This affects only 32-bit executables. noexec32=on: enable non-executable mappings (default) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 173f3a3fa1a..cbc70a27430 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif +static int disable_smep __initdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __init void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -667,6 +686,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) c->cpu_index = 0; #endif filter_cpuid_features(c, false); + + setup_smep(c); } void __init early_cpu_init(void) @@ -752,6 +773,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) #endif } + setup_smep(c); + get_model_name(c); /* Default name */ detect_nopl(c); -- cgit v1.2.3-70-g09d2 From 26afb7c661080ae3f1f13ddf7f0c58c4f931c22b Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 12 May 2011 16:30:30 +0200 Subject: x86, 64-bit: Fix copy_[to/from]_user() checks for the userspace address limit As reported in BZ #30352: https://bugzilla.kernel.org/show_bug.cgi?id=30352 there's a kernel bug related to reading the last allowed page on x86_64. The _copy_to_user() and _copy_from_user() functions use the following check for address limit: if (buf + size >= limit) fail(); while it should be more permissive: if (buf + size > limit) fail(); That's because the size represents the number of bytes being read/written from/to the buf address, a range that includes the buf address itself. So the copy function will actually never touch the limit address even if "buf + size == limit". The following program fails to use the last page as buffer due to the wrong limit check: #include #include #include #define PAGE_SIZE (4096) #define LAST_PAGE ((void*)(0x7fffffffe000)) int main() { int fds[2], err; void * ptr = mmap(LAST_PAGE, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, -1, 0); assert(ptr == LAST_PAGE); err = socketpair(AF_LOCAL, SOCK_STREAM, 0, fds); assert(err == 0); err = send(fds[0], ptr, PAGE_SIZE, 0); perror("send"); assert(err == PAGE_SIZE); err = recv(fds[1], ptr, PAGE_SIZE, MSG_WAITALL); perror("recv"); assert(err == PAGE_SIZE); return 0; } The other place checking the addr limit is the access_ok() function, which is working properly. There's just a misleading comment for the __range_not_ok() macro - which this patch fixes as well. The last page of the user-space address range is a guard page, and Brian Gerst observed that the guard page itself is needed due to an erratum on K8 cpus (#121 Sequential Execution Across Non-Canonical Boundary Causes Processor Hang). However, the test code is using the last valid page before the guard page. The bug is that the last byte before the guard page can't be read because of the off-by-one error. The guard page is left in place.
This bug would normally not show up because the last page is part of the process stack and never accessed via syscalls. Signed-off-by: Jiri Olsa Acked-by: Brian Gerst Acked-by: Linus Torvalds Cc: Link: http://lkml.kernel.org/r/1305210630-7136-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 2 +- arch/x86/lib/copy_user_64.S | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index abd3e0ea762..99f0ad753f3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -42,7 +42,7 @@ * Returns 0 if the range is valid, nonzero otherwise. * * This is equivalent to the following test: - * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) + * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64) * * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d17a1170bbf..024840266ba 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -79,7 +79,7 @@ ENTRY(_copy_to_user) addq %rdx,%rcx jc bad_to_user cmpq TI_addr_limit(%rax),%rcx - jae bad_to_user + ja bad_to_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ copy_user_generic_unrolled,copy_user_generic_string, \ copy_user_enhanced_fast_string @@ -94,7 +94,7 @@ ENTRY(_copy_from_user) addq %rdx,%rcx jc bad_from_user cmpq TI_addr_limit(%rax),%rcx - jae bad_from_user + ja bad_from_user ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ copy_user_generic_unrolled,copy_user_generic_string, \ copy_user_enhanced_fast_string -- cgit v1.2.3-70-g09d2 From 61ee9a4ba05f0a4163d43a33dee7a0651e080b98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:42 +0000 Subject: x86: Convert PIT to clockevents_config_and_register() Let the core do the work. Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.545615675%40linutronix.de%3E --- arch/x86/kernel/i8253.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 577e90cadae..fb66dc9e36c 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = init_pit_timer, .set_next_event = pit_next_event, - .shift = 32, .irq = 0, }; @@ -108,11 +107,8 @@ void __init setup_pit_timer(void) * IO_APIC has been initialized. */ pit_ce.cpumask = cpumask_of(smp_processor_id()); - pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift); - pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce); - pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce); - clockevents_register_device(&pit_ce); + clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF); global_clock_event = &pit_ce; } -- cgit v1.2.3-70-g09d2 From ab0e08f15d23628dd8d50bf6ce1a935a8840c7dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 May 2011 21:33:43 +0000 Subject: x86: hpet: Cleanup the clockevents init and register code No need to recalculate the frequency and the conversion factors over and over. Calculate the frequency once and use the new config/register interface and let the core code do the math. 
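Schematically, the conversion in both timer patches has the same shape; a sketch based on the PIT hunk above (the arguments are the device, its frequency in Hz, and the minimum/maximum programmable delta in clock ticks):

/* before: open-coded conversion factors and delta math */
pit_ce.mult         = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
clockevents_register_device(&pit_ce);

/* after: one call; the clockevents core derives mult/shift and deltas */
clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);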
Signed-off-by: Thomas Gleixner Cc: John Stultz Reviewed-by: Ingo Molnar Link: http://lkml.kernel.org/r/%3C20110518210136.646482357%40linutronix.de%3E --- arch/x86/kernel/hpet.c | 72 +++++++++++--------------------------------------- 1 file changed, 16 insertions(+), 56 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index bfe8f729e08..6781765b3a0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -217,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { } /* * Common hpet info */ -static unsigned long hpet_period; +static unsigned long hpet_freq; static void hpet_legacy_set_mode(enum clock_event_mode mode, struct clock_event_device *evt); @@ -232,7 +232,6 @@ static struct clock_event_device hpet_clockevent = { .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, .set_mode = hpet_legacy_set_mode, .set_next_event = hpet_legacy_next_event, - .shift = 32, .irq = 0, .rating = 50, }; @@ -289,29 +288,13 @@ static void hpet_legacy_clockevent_register(void) /* Start HPET legacy interrupts */ hpet_enable_legacy_int(); - /* - * The mult factor is defined as (include/linux/clockchips.h) - * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h) - * hpet_period is in units of femtoseconds (per cycle), so - * mult/2^shift = cyc/ns = 10^6/hpet_period - * mult = (10^6 * 2^shift)/hpet_period - * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period - */ - hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC, - hpet_period, hpet_clockevent.shift); - /* Calculate the min / max delta */ - hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &hpet_clockevent); - /* Setup minimum reprogramming delta. */ - hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, - &hpet_clockevent); - /* * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. */ hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); - clockevents_register_device(&hpet_clockevent); + clockevents_config_and_register(&hpet_clockevent, hpet_freq, + HPET_MIN_PROG_DELTA, 0x7FFFFFFF); global_clock_event = &hpet_clockevent; printk(KERN_DEBUG "hpet clockevent registered\n"); } @@ -549,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev) static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) { struct clock_event_device *evt = &hdev->evt; - uint64_t hpet_freq; WARN_ON(cpu != smp_processor_id()); if (!(hdev->flags & HPET_DEV_VALID)) @@ -571,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) evt->set_mode = hpet_msi_set_mode; evt->set_next_event = hpet_msi_next_event; - evt->shift = 32; - - /* - * The period is a femto seconds value. We need to calculate the - * scaled math multiplication factor for nanosecond to hpet tick - * conversion. - */ - hpet_freq = FSEC_PER_SEC; - do_div(hpet_freq, hpet_period); - evt->mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, evt->shift); - /* Calculate the max delta */ - evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); - /* 5 usec minimum reprogramming delta. 
*/ - evt->min_delta_ns = 5000; - evt->cpumask = cpumask_of(hdev->cpu); - clockevents_register_device(evt); + + clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA, + 0x7FFFFFFF); } #ifdef CONFIG_HPET @@ -792,7 +760,6 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; - u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -819,24 +786,7 @@ static int hpet_clocksource_register(void) return -ENODEV; } - /* - * The definition of mult is (include/linux/clocksource.h) - * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc - * so we first need to convert hpet_period to ns/cyc units: - * mult/2^shift = ns/cyc = hpet_period/10^6 - * mult = (hpet_period * 2^shift)/10^6 - * mult = (hpet_period << shift)/FSEC_PER_NSEC - */ - - /* Need to convert hpet_period (fsec/cyc) to cyc/sec: - * - * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) - * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period - */ - hpet_freq = FSEC_PER_SEC; - do_div(hpet_freq, hpet_period); clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); - return 0; } @@ -845,7 +795,9 @@ static int hpet_clocksource_register(void) */ int __init hpet_enable(void) { + unsigned long hpet_period; unsigned int id; + u64 freq; int i; if (!is_hpet_capable()) @@ -883,6 +835,14 @@ int __init hpet_enable(void) if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; + /* + * The period is a femto seconds value. Convert it to a + * frequency. + */ + freq = FSEC_PER_SEC; + do_div(freq, hpet_period); + hpet_freq = freq; + /* * Read the HPET ID register to retrieve the IRQ routing * information and the number of channels -- cgit v1.2.3-70-g09d2 From b87ba87ca26e226b2277a2d5613ed596f408e96d Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Fri, 6 May 2011 14:43:36 +0800 Subject: x86: Skip migrating IRQF_PER_CPU irqs in fixup_irqs() IRQF_PER_CPU means that the irq cannot be moved away from a given cpu. So it must not be migrated when the cpu goes offline. [ tglx: massaged changelog ] Signed-off-by: Fengzhe Zhang Signed-off-by: Kevin Tian Cc: Ian Campbell Cc: Jan Beulich Cc: "xen-devel@lists.xensource.com" Link: http://lkml.kernel.org/r/%3C625BA99ED14B2D499DC4E29D8138F1505C8ED7F7E2%40shsmsx502.ccr.corp.intel.com%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 1cb0b9fc78d..544efe2741b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -249,7 +249,7 @@ void fixup_irqs(void) data = irq_desc_get_irq_data(desc); affinity = data->affinity; - if (!irq_has_action(irq) || + if (!irq_has_action(irq) || irqd_is_per_cpu(data) || cpumask_subset(affinity, cpu_online_mask)) { raw_spin_unlock(&desc->lock); continue; -- cgit v1.2.3-70-g09d2 From 983bbf1af0664b78689612b247acb514300f62c7 Mon Sep 17 00:00:00 2001 From: "Tian, Kevin" Date: Fri, 6 May 2011 14:43:56 +0800 Subject: x86: Don't unmask disabled irqs when migrating them It doesn't make sense to unconditionally unmask a disabled irq when migrating it from offlined cpu to another. If the irq triggers then it will be disabled in the interrupt handler anyway. So we can just avoid unmasking it. 
[ tglx: Made masking unconditional again and fixed the changelog ] Signed-off-by: Fengzhe Zhang Signed-off-by: Kevin Tian Cc: Ian Campbell Cc: Jan Beulich Cc: "xen-devel@lists.xensource.com" Link: http://lkml.kernel.org/r/%3C625BA99ED14B2D499DC4E29D8138F1505C8ED7F7E3%40shsmsx502.ccr.corp.intel.com%3E Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 544efe2741b..6c0802eb2f7 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -276,7 +276,8 @@ void fixup_irqs(void) else if (!(warned++)) set_affinity = 0; - if (!irqd_can_move_in_process_context(data) && chip->irq_unmask) + if (!irqd_can_move_in_process_context(data) && + !irqd_irq_disabled(data) && chip->irq_unmask) chip->irq_unmask(data); raw_spin_unlock(&desc->lock); -- cgit v1.2.3-70-g09d2 From 3f508953dd2ebcbcd32d28d0b6aad4d76980e722 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Thu, 12 May 2011 17:19:53 -0400 Subject: arch/x86/xen/mmu: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h. Signed-off-by: Daniel Kiper [v1: Rebased on top of latest linus's to include fixes in mmu.c] Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/mmu.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0684f3c74d5..b5f776f60b1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1054,7 +1054,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, +static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, enum pt_level level) { SetPagePinned(page); @@ -1271,7 +1271,7 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static __init void xen_pagetable_setup_start(pgd_t *base) +static void __init xen_pagetable_setup_start(pgd_t *base) { } @@ -1291,7 +1291,7 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) static void xen_post_allocator_init(void); -static __init void xen_pagetable_setup_done(pgd_t *base) +static void __init xen_pagetable_setup_done(pgd_t *base) { xen_setup_shared_info(); xen_post_allocator_init(); @@ -1488,7 +1488,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) } #ifdef CONFIG_X86_32 -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) @@ -1498,7 +1498,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) return pte; } #else /* CONFIG_X86_64 */ -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -1519,7 +1519,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) /* Init-time set_pte while constructing initial pagetables, which doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { pte = mask_rw_pte(ptep, pte); @@ -1537,7 +1537,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* Early in boot, while setting up the initial pagetable,
assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -1547,7 +1547,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) } /* Used for pmd and pud */ -static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -1557,13 +1557,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static __init void xen_release_pte_init(unsigned long pfn) +static void __init xen_release_pte_init(unsigned long pfn) { pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static __init void xen_release_pmd_init(unsigned long pfn) +static void __init xen_release_pmd_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -1689,7 +1689,7 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; unsigned ident_pte; @@ -1772,7 +1772,7 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; @@ -1843,7 +1843,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); -static __init void xen_write_cr3_init(unsigned long cr3) +static void __init xen_write_cr3_init(unsigned long cr3) { unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); @@ -1880,7 +1880,7 @@ static __init void xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } -__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, +pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; @@ -1986,7 +1986,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } -__init void xen_ident_map_ISA(void) +void __init xen_ident_map_ISA(void) { unsigned long pa; @@ -2009,7 +2009,7 @@ __init void xen_ident_map_ISA(void) xen_flush_tlb(); } -static __init void xen_post_allocator_init(void) +static void __init xen_post_allocator_init(void) { #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); @@ -2046,7 +2046,7 @@ static void xen_leave_lazy_mmu(void) preempt_enable(); } -static const struct pv_mmu_ops xen_mmu_ops __initdata = { +static const struct pv_mmu_ops xen_mmu_ops __initconst = { .read_cr2 = xen_read_cr2, .write_cr2 = xen_write_cr2, -- cgit v1.2.3-70-g09d2 From ad7ba09e658f8e890797ed168dfb3f84be934b09 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:19:15 +0200 Subject: arch/x86/xen/xen-ops: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h.
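The convention all of these cleanups apply is the same; a hedged sketch (example_setup and example_time_ops are made-up names; xen_clocksource_read is taken from the xen/time patch that follows):

/* the section annotation goes after the return type, before the name... */
static void __init example_setup(void)
{
        /* boot-time only; this text is discarded after init */
}
/* ...not "static __init void example_setup(void)" */

/* const init-time data is __initconst; __initdata is for writable data */
static const struct pv_time_ops example_time_ops __initconst = {
        .sched_clock = xen_clocksource_read,
};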
Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3112f55638c..97dfdc8757b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -74,7 +74,7 @@ static inline void xen_hvm_smp_init(void) {} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init xen_init_spinlocks(void); -__cpuinit void xen_init_lock_cpu(int cpu); +void __cpuinit xen_init_lock_cpu(int cpu); void xen_uninit_lock_cpu(int cpu); #else static inline void xen_init_spinlocks(void) -- cgit v1.2.3-70-g09d2 From fb6ce5dea4bb704bfcd9fda7e6b1354da66f4d2f Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:18:45 +0200 Subject: arch/x86/xen/time: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h. Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/time.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 2e2d370a47b..bd4ffd7d958 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -439,11 +439,11 @@ void xen_timer_resume(void) } } -static const struct pv_time_ops xen_time_ops __initdata = { +static const struct pv_time_ops xen_time_ops __initconst = { .sched_clock = xen_clocksource_read, }; -static __init void xen_time_init(void) +static void __init xen_time_init(void) { int cpu = smp_processor_id(); struct timespec tp; @@ -468,7 +468,7 @@ static void __init xen_time_init(void) xen_setup_cpu_clockevents(); } -__init void xen_init_time_ops(void) +void __init xen_init_time_ops(void) { pv_time_ops = xen_time_ops; @@ -490,7 +490,7 @@ static void xen_hvm_setup_cpu_clockevents(void) xen_setup_cpu_clockevents(); } -__init void xen_hvm_init_time_ops(void) +void __init xen_hvm_init_time_ops(void) { /* vector callback is needed otherwise we cannot receive interrupts * on cpu > 0 and at this point we don't know how many cpus are -- cgit v1.2.3-70-g09d2 From b53cedebd74918237176520f9157deb7ae066b71 Mon Sep 17 00:00:00 2001 From: Daniel Kiper Date: Wed, 4 May 2011 20:18:05 +0200 Subject: arch/x86/xen/smp: Cleanup code/data sections definitions Cleanup code/data sections definitions according to include/linux/init.h.
Signed-off-by: Daniel Kiper Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed9..194a3edef5c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -57,7 +57,7 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup(void) +static void __cpuinit cpu_bringup(void) { int cpu = smp_processor_id(); @@ -85,7 +85,7 @@ static __cpuinit void cpu_bringup(void) wmb(); /* make sure everything is out */ } -static __cpuinit void cpu_bringup_and_idle(void) +static void __cpuinit cpu_bringup_and_idle(void) { cpu_bringup(); cpu_idle(); @@ -242,7 +242,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } } -static __cpuinit int +static int __cpuinit cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; @@ -486,7 +486,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static const struct smp_ops xen_smp_ops __initdata = { +static const struct smp_ops xen_smp_ops __initconst = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, .smp_cpus_done = xen_smp_cpus_done, -- cgit v1.2.3-70-g09d2 From bb0a56ecc4ba2a3db1b6ea6949c309886e3447d3 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 19 May 2011 18:51:07 -0400 Subject: [CPUFREQ] Move x86 drivers to drivers/cpufreq/ Signed-off-by: Dave Jones --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/Makefile | 1 - arch/x86/kernel/cpu/cpufreq/Kconfig | 266 ---- arch/x86/kernel/cpu/cpufreq/Makefile | 21 - arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 773 ----------- arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 444 ------ arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 367 ----- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 309 ----- arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 514 ------- arch/x86/kernel/cpu/cpufreq/longhaul.c | 1024 -------------- arch/x86/kernel/cpu/cpufreq/longhaul.h | 353 ----- arch/x86/kernel/cpu/cpufreq/longrun.c | 324 ----- arch/x86/kernel/cpu/cpufreq/mperf.c | 51 - arch/x86/kernel/cpu/cpufreq/mperf.h | 9 - arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 329 ----- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621 --------- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 261 ---- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 747 ---------- arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 43 - arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1607 ---------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 222 --- arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 192 --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 633 --------- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 448 ------ arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 478 ------- arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 49 - arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 464 ------- drivers/cpufreq/Kconfig | 10 +- drivers/cpufreq/Kconfig.x86 | 255 ++++ drivers/cpufreq/Makefile | 26 + drivers/cpufreq/acpi-cpufreq.c | 773 +++++++++++ drivers/cpufreq/cpufreq-nforce2.c | 444 ++++++ drivers/cpufreq/e_powersaver.c | 367 +++++ drivers/cpufreq/elanfreq.c | 309 +++++ drivers/cpufreq/gx-suspmod.c | 514 +++++++ drivers/cpufreq/longhaul.c | 1024 ++++++++++++++ drivers/cpufreq/longhaul.h | 353 +++++ drivers/cpufreq/longrun.c | 324 +++++ drivers/cpufreq/mperf.c | 51 + drivers/cpufreq/mperf.h | 9 + drivers/cpufreq/p4-clockmod.c | 
329 +++++ drivers/cpufreq/pcc-cpufreq.c | 621 +++++++++ drivers/cpufreq/powernow-k6.c | 261 ++++ drivers/cpufreq/powernow-k7.c | 747 ++++++++++ drivers/cpufreq/powernow-k7.h | 43 + drivers/cpufreq/powernow-k8.c | 1607 ++++++++++++++++++++++ drivers/cpufreq/powernow-k8.h | 222 +++ drivers/cpufreq/sc520_freq.c | 192 +++ drivers/cpufreq/speedstep-centrino.c | 633 +++++++++ drivers/cpufreq/speedstep-ich.c | 448 ++++++ drivers/cpufreq/speedstep-lib.c | 478 +++++++ drivers/cpufreq/speedstep-lib.h | 49 + drivers/cpufreq/speedstep-smi.c | 464 +++++++ 53 files changed, 10553 insertions(+), 10552 deletions(-) delete mode 100644 arch/x86/kernel/cpu/cpufreq/Kconfig delete mode 100644 arch/x86/kernel/cpu/cpufreq/Makefile delete mode 100644 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/e_powersaver.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/elanfreq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/gx-suspmod.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/longhaul.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/longhaul.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/longrun.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/mperf.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/p4-clockmod.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k6.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k7.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k7.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k8.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/powernow-k8.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/sc520_freq.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-ich.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-lib.c delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-lib.h delete mode 100644 arch/x86/kernel/cpu/cpufreq/speedstep-smi.c create mode 100644 drivers/cpufreq/Kconfig.x86 create mode 100644 drivers/cpufreq/acpi-cpufreq.c create mode 100644 drivers/cpufreq/cpufreq-nforce2.c create mode 100644 drivers/cpufreq/e_powersaver.c create mode 100644 drivers/cpufreq/elanfreq.c create mode 100644 drivers/cpufreq/gx-suspmod.c create mode 100644 drivers/cpufreq/longhaul.c create mode 100644 drivers/cpufreq/longhaul.h create mode 100644 drivers/cpufreq/longrun.c create mode 100644 drivers/cpufreq/mperf.c create mode 100644 drivers/cpufreq/mperf.h create mode 100644 drivers/cpufreq/p4-clockmod.c create mode 100644 drivers/cpufreq/pcc-cpufreq.c create mode 100644 drivers/cpufreq/powernow-k6.c create mode 100644 drivers/cpufreq/powernow-k7.c create mode 100644 drivers/cpufreq/powernow-k7.h create mode 100644 drivers/cpufreq/powernow-k8.c create mode 100644 drivers/cpufreq/powernow-k8.h create mode 100644 drivers/cpufreq/sc520_freq.c create mode 100644 drivers/cpufreq/speedstep-centrino.c create mode 100644 drivers/cpufreq/speedstep-ich.c create mode 100644 drivers/cpufreq/speedstep-lib.c create mode 100644 drivers/cpufreq/speedstep-lib.h create mode 100644 drivers/cpufreq/speedstep-smi.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc6c53a95bf..e7f94a52a5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1848,7 +1848,7 @@ config APM_ALLOW_INTS endif # APM -source "arch/x86/kernel/cpu/cpufreq/Kconfig" +source 
"drivers/cpufreq/Kconfig" source "drivers/cpuidle/Kconfig" diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3f0ebe429a0..6042981d030 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig deleted file mode 100644 index 870e6cc6ad2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ /dev/null @@ -1,266 +0,0 @@ -# -# CPU Frequency scaling -# - -menu "CPU Frequency scaling" - -source "drivers/cpufreq/Kconfig" - -if CPU_FREQ - -comment "CPUFreq processor drivers" - -config X86_PCC_CPUFREQ - tristate "Processor Clocking Control interface driver" - depends on ACPI && ACPI_PROCESSOR - help - This driver adds support for the PCC interface. - - For details, take a look at: - . - - To compile this driver as a module, choose M here: the - module will be called pcc-cpufreq. - - If in doubt, say N. - -config X86_ACPI_CPUFREQ - tristate "ACPI Processor P-States driver" - select CPU_FREQ_TABLE - depends on ACPI_PROCESSOR - help - This driver adds a CPUFreq driver which utilizes the ACPI - Processor Performance States. - This driver also supports Intel Enhanced Speedstep. - - To compile this driver as a module, choose M here: the - module will be called acpi-cpufreq. - - For details, take a look at . - - If in doubt, say N. - -config ELAN_CPUFREQ - tristate "AMD Elan SC400 and SC410" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC400 and SC410 - processors. - - You need to specify the processor maximum speed as boot - parameter: elanfreq=maxspeed (in kHz) or as module - parameter "max_freq". - - For details, take a look at . - - If in doubt, say N. - -config SC520_CPUFREQ - tristate "AMD Elan SC520" - select CPU_FREQ_TABLE - depends on X86_ELAN - ---help--- - This adds the CPUFreq driver for AMD Elan SC520 processor. - - For details, take a look at . - - If in doubt, say N. - - -config X86_POWERNOW_K6 - tristate "AMD Mobile K6-2/K6-3 PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K6-2+ and mobile - AMD K6-3+ processors. - - For details, take a look at . - - If in doubt, say N. - -config X86_POWERNOW_K7 - tristate "AMD Mobile Athlon/Duron PowerNow!" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for mobile AMD K7 mobile processors. - - For details, take a look at . - - If in doubt, say N. - -config X86_POWERNOW_K7_ACPI - bool - depends on X86_POWERNOW_K7 && ACPI_PROCESSOR - depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) - depends on X86_32 - default y - -config X86_POWERNOW_K8 - tristate "AMD Opteron/Athlon64 PowerNow!" - select CPU_FREQ_TABLE - depends on ACPI && ACPI_PROCESSOR - help - This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. - - To compile this driver as a module, choose M here: the - module will be called powernow-k8. - - For details, take a look at . - -config X86_GX_SUSPMOD - tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" - depends on X86_32 && PCI - help - This add the CPUFreq driver for NatSemi Geode processors which - support suspend modulation. - - For details, take a look at . - - If in doubt, say N. 
- -config X86_SPEEDSTEP_CENTRINO - tristate "Intel Enhanced SpeedStep (deprecated)" - select CPU_FREQ_TABLE - select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 - depends on X86_32 || (X86_64 && ACPI_PROCESSOR) - help - This is deprecated and this functionality is now merged into - acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of - speedstep_centrino. - This adds the CPUFreq driver for Enhanced SpeedStep enabled - mobile CPUs. This means Intel Pentium M (Centrino) CPUs - or 64bit enabled Intel Xeons. - - To compile this driver as a module, choose M here: the - module will be called speedstep-centrino. - - For details, take a look at . - - If in doubt, say N. - -config X86_SPEEDSTEP_CENTRINO_TABLE - bool "Built-in tables for Banias CPUs" - depends on X86_32 && X86_SPEEDSTEP_CENTRINO - default y - help - Use built-in tables for Banias CPUs if ACPI encoding - is not available. - - If in doubt, say N. - -config X86_SPEEDSTEP_ICH - tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" - select CPU_FREQ_TABLE - depends on X86_32 - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all - mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, - ICH3 or ICH4 southbridge. - - For details, take a look at . - - If in doubt, say N. - -config X86_SPEEDSTEP_SMI - tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for certain mobile Intel Pentium III - (Coppermine), all mobile Intel Pentium III-M (Tualatin) - on systems which have an Intel 440BX/ZX/MX southbridge. - - For details, take a look at . - - If in doubt, say N. - -config X86_P4_CLOCKMOD - tristate "Intel Pentium 4 clock modulation" - select CPU_FREQ_TABLE - help - This adds the CPUFreq driver for Intel Pentium 4 / XEON - processors. When enabled it will lower CPU temperature by skipping - clocks. - - This driver should be only used in exceptional - circumstances when very low power is needed because it causes severe - slowdowns and noticeable latencies. Normally Speedstep should be used - instead. - - To compile this driver as a module, choose M here: the - module will be called p4-clockmod. - - For details, take a look at . - - Unless you are absolutely sure say N. - -config X86_CPUFREQ_NFORCE2 - tristate "nVidia nForce2 FSB changing" - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for FSB changing on nVidia nForce2 - platforms. - - For details, take a look at . - - If in doubt, say N. - -config X86_LONGRUN - tristate "Transmeta LongRun" - depends on X86_32 - help - This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors - which support LongRun. - - For details, take a look at . - - If in doubt, say N. - -config X86_LONGHAUL - tristate "VIA Cyrix III Longhaul" - select CPU_FREQ_TABLE - depends on X86_32 && ACPI_PROCESSOR - help - This adds the CPUFreq driver for VIA Samuel/CyrixIII, - VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T - processors. - - For details, take a look at . - - If in doubt, say N. - -config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" - select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL - help - This adds the CPUFreq driver for VIA C7 processors. However, this driver - does not have any safeguards to prevent operating the CPU out of spec - and is thus considered dangerous. 
Please use the regular ACPI cpufreq - driver, enabled by CONFIG_X86_ACPI_CPUFREQ. - - If in doubt, say N. - -comment "shared options" - -config X86_SPEEDSTEP_LIB - tristate - default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) - -config X86_SPEEDSTEP_RELAXED_CAP_CHECK - bool "Relaxed speedstep capability checks" - depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) - help - Don't perform all checks for a speedstep capable system which would - normally be done. Some ancient or strange systems, though speedstep - capable, don't always indicate that they are speedstep capable. This - option lets the probing code bypass some of those checks if the - parameter "relaxed_check=1" is passed to the module. - -endif # CPU_FREQ - -endmenu diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile deleted file mode 100644 index bd54bf67e6f..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# Link order matters. K8 is preferred to ACPI because of firmware bugs in early -# K8 systems. ACPI is preferred to all other hardware-specific drivers. -# speedstep-* is preferred over p4-clockmod. - -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o -obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o -obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o -obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o -obj-$(CONFIG_X86_LONGHAUL) += longhaul.o -obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o -obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o -obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o -obj-$(CONFIG_X86_LONGRUN) += longrun.o -obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o -obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o -obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o -obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o -obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o -obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o -obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c deleted file mode 100644 index 4e04e127438..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ /dev/null @@ -1,773 +0,0 @@ -/* - * acpi-cpufreq.c - ACPI Processor P-States Driver - * - * Copyright (C) 2001, 2002 Andy Grover - * Copyright (C) 2001, 2002 Paul Diefenbaugh - * Copyright (C) 2002 - 2004 Dominik Brodowski - * Copyright (C) 2006 Denis Sadykov - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or (at - * your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. 
- * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include "mperf.h" - -MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); -MODULE_DESCRIPTION("ACPI Processor P-States Driver"); -MODULE_LICENSE("GPL"); - -enum { - UNDEFINED_CAPABLE = 0, - SYSTEM_INTEL_MSR_CAPABLE, - SYSTEM_IO_CAPABLE, -}; - -#define INTEL_MSR_RANGE (0xffff) - -struct acpi_cpufreq_data { - struct acpi_processor_performance *acpi_data; - struct cpufreq_frequency_table *freq_table; - unsigned int resume; - unsigned int cpu_feature; -}; - -static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); - -/* acpi_perf_data is a pointer to percpu data. */ -static struct acpi_processor_performance __percpu *acpi_perf_data; - -static struct cpufreq_driver acpi_cpufreq_driver; - -static unsigned int acpi_pstate_strict; - -static int check_est_cpu(unsigned int cpuid) -{ - struct cpuinfo_x86 *cpu = &cpu_data(cpuid); - - return cpu_has(cpu, X86_FEATURE_EST); -} - -static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) -{ - struct acpi_processor_performance *perf; - int i; - - perf = data->acpi_data; - - for (i = 0; i < perf->state_count; i++) { - if (value == perf->states[i].status) - return data->freq_table[i].frequency; - } - return 0; -} - -static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) -{ - int i; - struct acpi_processor_performance *perf; - - msr &= INTEL_MSR_RANGE; - perf = data->acpi_data; - - for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == perf->states[data->freq_table[i].index].status) - return data->freq_table[i].frequency; - } - return data->freq_table[0].frequency; -} - -static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) -{ - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - return extract_msr(val, data); - case SYSTEM_IO_CAPABLE: - return extract_io(val, data); - default: - return 0; - } -} - -struct msr_addr { - u32 reg; -}; - -struct io_addr { - u16 port; - u8 bit_width; -}; - -struct drv_cmd { - unsigned int type; - const struct cpumask *mask; - union { - struct msr_addr msr; - struct io_addr io; - } addr; - u32 val; -}; - -/* Called via smp_call_function_single(), on the target CPU */ -static void do_drv_read(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 h; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, cmd->val, h); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_read_port((acpi_io_address)cmd->addr.io.port, - &cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -/* Called via smp_call_function_many(), on the target CPUs */ -static void do_drv_write(void *_cmd) -{ - struct drv_cmd *cmd = _cmd; - u32 lo, hi; - - switch (cmd->type) { - case SYSTEM_INTEL_MSR_CAPABLE: - rdmsr(cmd->addr.msr.reg, lo, hi); - lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); - wrmsr(cmd->addr.msr.reg, lo, hi); - break; - case SYSTEM_IO_CAPABLE: - acpi_os_write_port((acpi_io_address)cmd->addr.io.port, - cmd->val, - (u32)cmd->addr.io.bit_width); - break; - default: - break; - } -} - -static void drv_read(struct drv_cmd *cmd) -{ - int err; - cmd->val = 0; - - err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); - WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? 
*/ -} - -static void drv_write(struct drv_cmd *cmd) -{ - int this_cpu; - - this_cpu = get_cpu(); - if (cpumask_test_cpu(this_cpu, cmd->mask)) - do_drv_write(cmd); - smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); - put_cpu(); -} - -static u32 get_cur_val(const struct cpumask *mask) -{ - struct acpi_processor_performance *perf; - struct drv_cmd cmd; - - if (unlikely(cpumask_empty(mask))) - return 0; - - switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - break; - default: - return 0; - } - - cmd.mask = mask; - drv_read(&cmd); - - pr_debug("get_cur_val = %u\n", cmd.val); - - return cmd.val; -} - -static unsigned int get_cur_freq_on_cpu(unsigned int cpu) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); - unsigned int freq; - unsigned int cached_freq; - - pr_debug("get_cur_freq_on_cpu (%d)\n", cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return 0; - } - - cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); - if (freq != cached_freq) { - /* - * The dreaded BIOS frequency change behind our back. - * Force set the frequency on next target call. - */ - data->resume = 1; - } - - pr_debug("cur freq = %u\n", freq); - - return freq; -} - -static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, - struct acpi_cpufreq_data *data) -{ - unsigned int cur_freq; - unsigned int i; - - for (i = 0; i < 100; i++) { - cur_freq = extract_freq(get_cur_val(mask), data); - if (cur_freq == freq) - return 1; - udelay(10); - } - return 0; -} - -static int acpi_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - struct acpi_processor_performance *perf; - struct cpufreq_freqs freqs; - struct drv_cmd cmd; - unsigned int next_state = 0; /* Index into freq_table */ - unsigned int next_perf_state = 0; /* Index into perf table */ - unsigned int i; - int result = 0; - - pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); - - if (unlikely(data == NULL || - data->acpi_data == NULL || data->freq_table == NULL)) { - return -ENODEV; - } - - perf = data->acpi_data; - result = cpufreq_frequency_table_target(policy, - data->freq_table, - target_freq, - relation, &next_state); - if (unlikely(result)) { - result = -ENODEV; - goto out; - } - - next_perf_state = data->freq_table[next_state].index; - if (perf->state == next_perf_state) { - if (unlikely(data->resume)) { - pr_debug("Called after resume, resetting to P%d\n", - next_perf_state); - data->resume = 0; - } else { - pr_debug("Already at target state (P%d)\n", - next_perf_state); - goto out; - } - } - - switch (data->cpu_feature) { - case SYSTEM_INTEL_MSR_CAPABLE: - cmd.type = SYSTEM_INTEL_MSR_CAPABLE; - cmd.addr.msr.reg = MSR_IA32_PERF_CTL; - cmd.val = (u32) perf->states[next_perf_state].control; - break; - case SYSTEM_IO_CAPABLE: - cmd.type = SYSTEM_IO_CAPABLE; - cmd.addr.io.port = perf->control_register.address; - cmd.addr.io.bit_width = perf->control_register.bit_width; - cmd.val = (u32) 
perf->states[next_perf_state].control; - break; - default: - result = -ENODEV; - goto out; - } - - /* cpufreq holds the hotplug lock, so we are safe from here on */ - if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) - cmd.mask = policy->cpus; - else - cmd.mask = cpumask_of(policy->cpu); - - freqs.old = perf->states[perf->state].core_frequency * 1000; - freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - drv_write(&cmd); - - if (acpi_pstate_strict) { - if (!check_freqs(cmd.mask, freqs.new, data)) { - pr_debug("acpi_cpufreq_target failed (%d)\n", - policy->cpu); - result = -EAGAIN; - goto out; - } - } - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - perf->state = next_perf_state; - -out: - return result; -} - -static int acpi_cpufreq_verify(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - pr_debug("acpi_cpufreq_verify\n"); - - return cpufreq_frequency_table_verify(policy, data->freq_table); -} - -static unsigned long -acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) -{ - struct acpi_processor_performance *perf = data->acpi_data; - - if (cpu_khz) { - /* search the closest match to cpu_khz */ - unsigned int i; - unsigned long freq; - unsigned long freqn = perf->states[0].core_frequency * 1000; - - for (i = 0; i < (perf->state_count-1); i++) { - freq = freqn; - freqn = perf->states[i+1].core_frequency * 1000; - if ((2 * cpu_khz) > (freqn + freq)) { - perf->state = i; - return freq; - } - } - perf->state = perf->state_count-1; - return freqn; - } else { - /* assume CPU is at P0... */ - perf->state = 0; - return perf->states[0].core_frequency * 1000; - } -} - -static void free_acpi_perf_data(void) -{ - unsigned int i; - - /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ - for_each_possible_cpu(i) - free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) - ->shared_cpu_map); - free_percpu(acpi_perf_data); -} - -/* - * acpi_cpufreq_early_init - initialize ACPI P-States library - * - * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) - * in order to determine correct frequency and voltage pairings. We can - * do _PDC and _PSD and find out the processor dependency for the - * actual init that will happen later... - */ -static int __init acpi_cpufreq_early_init(void) -{ - unsigned int i; - pr_debug("acpi_cpufreq_early_init\n"); - - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) { - pr_debug("Memory allocation error for acpi_perf_data.\n"); - return -ENOMEM; - } - for_each_possible_cpu(i) { - if (!zalloc_cpumask_var_node( - &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, - GFP_KERNEL, cpu_to_node(i))) { - - /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ - free_acpi_perf_data(); - return -ENOMEM; - } - } - - /* Do initialization in ACPI core */ - acpi_processor_preregister_performance(acpi_perf_data); - return 0; -} - -#ifdef CONFIG_SMP -/* - * Some BIOSes do SW_ANY coordination internally, either set it up in hw - * or do it in BIOS firmware and won't inform about it to OS. If not - * detected, this has a side effect of making CPU run at a different speed - * than OS intended it to run at. Detect it and handle it cleanly. 
- */ -static int bios_with_sw_any_bug; - -static int sw_any_bug_found(const struct dmi_system_id *d) -{ - bios_with_sw_any_bug = 1; - return 0; -} - -static const struct dmi_system_id sw_any_bug_dmi_table[] = { - { - .callback = sw_any_bug_found, - .ident = "Supermicro Server X6DLP", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), - DMI_MATCH(DMI_BIOS_VERSION, "080010"), - DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), - }, - }, - { } -}; - -static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) -{ - /* Intel Xeon Processor 7100 Series Specification Update - * http://www.intel.com/Assets/PDF/specupdate/314554.pdf - * AL30: A Machine Check Exception (MCE) Occurring during an - * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up. */ - if (c->x86_vendor == X86_VENDOR_INTEL) { - if ((c->x86 == 15) && - (c->x86_model == 6) && - (c->x86_mask == 8)) { - printk(KERN_INFO "acpi-cpufreq: Intel(R) " - "Xeon(R) 7100 Errata AL30, processors may " - "lock up on frequency changes: disabling " - "acpi-cpufreq.\n"); - return -ENODEV; - } - } - return 0; -} -#endif - -static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - unsigned int valid_states = 0; - unsigned int cpu = policy->cpu; - struct acpi_cpufreq_data *data; - unsigned int result = 0; - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - struct acpi_processor_performance *perf; -#ifdef CONFIG_SMP - static int blacklisted; -#endif - - pr_debug("acpi_cpufreq_cpu_init\n"); - -#ifdef CONFIG_SMP - if (blacklisted) - return blacklisted; - blacklisted = acpi_cpufreq_blacklist(c); - if (blacklisted) - return blacklisted; -#endif - - data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); - per_cpu(acfreq_data, cpu) = data; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) - acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; - - result = acpi_processor_register_performance(data->acpi_data, cpu); - if (result) - goto err_free; - - perf = data->acpi_data; - policy->shared_type = perf->shared_type; - - /* - * Will let policy->cpus know about dependency only when software - * coordination is required. 
- */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || - policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { - cpumask_copy(policy->cpus, perf->shared_cpu_map); - } - cpumask_copy(policy->related_cpus, perf->shared_cpu_map); - -#ifdef CONFIG_SMP - dmi_check_system(sw_any_bug_dmi_table); - if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { - policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; - cpumask_copy(policy->cpus, cpu_core_mask(cpu)); - } -#endif - - /* capability check */ - if (perf->state_count <= 1) { - pr_debug("No P-States\n"); - result = -ENODEV; - goto err_unreg; - } - - if (perf->control_register.space_id != perf->status_register.space_id) { - result = -ENODEV; - goto err_unreg; - } - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - pr_debug("SYSTEM IO addr space\n"); - data->cpu_feature = SYSTEM_IO_CAPABLE; - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - pr_debug("HARDWARE addr space\n"); - if (!check_est_cpu(cpu)) { - result = -ENODEV; - goto err_unreg; - } - data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; - break; - default: - pr_debug("Unknown addr space %d\n", - (u32) (perf->control_register.space_id)); - result = -ENODEV; - goto err_unreg; - } - - data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * - (perf->state_count+1), GFP_KERNEL); - if (!data->freq_table) { - result = -ENOMEM; - goto err_unreg; - } - - /* detect transition latency */ - policy->cpuinfo.transition_latency = 0; - for (i = 0; i < perf->state_count; i++) { - if ((perf->states[i].transition_latency * 1000) > - policy->cpuinfo.transition_latency) - policy->cpuinfo.transition_latency = - perf->states[i].transition_latency * 1000; - } - - /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ - if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && - policy->cpuinfo.transition_latency > 20 * 1000) { - policy->cpuinfo.transition_latency = 20 * 1000; - printk_once(KERN_INFO - "P-state transition latency capped at 20 uS\n"); - } - - /* table init */ - for (i = 0; i < perf->state_count; i++) { - if (i > 0 && perf->states[i].core_frequency >= - data->freq_table[valid_states-1].frequency / 1000) - continue; - - data->freq_table[valid_states].index = i; - data->freq_table[valid_states].frequency = - perf->states[i].core_frequency * 1000; - valid_states++; - } - data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; - perf->state = 0; - - result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); - if (result) - goto err_freqfree; - - if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) - printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); - - switch (perf->control_register.space_id) { - case ACPI_ADR_SPACE_SYSTEM_IO: - /* Current speed is unknown and not detectable by IO port */ - policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); - break; - case ACPI_ADR_SPACE_FIXED_HARDWARE: - acpi_cpufreq_driver.get = get_cur_freq_on_cpu; - policy->cur = get_cur_freq_on_cpu(cpu); - break; - default: - break; - } - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; - - pr_debug("CPU%u - ACPI performance management activated.\n", cpu); - for (i = 0; i < perf->state_count; i++) - pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", - (i == perf->state ? 
'*' : ' '), i, - (u32) perf->states[i].core_frequency, - (u32) perf->states[i].power, - (u32) perf->states[i].transition_latency); - - cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); - - /* - * the first call to ->target() should result in us actually - * writing something to the appropriate registers. - */ - data->resume = 1; - - return result; - -err_freqfree: - kfree(data->freq_table); -err_unreg: - acpi_processor_unregister_performance(perf, cpu); -err_free: - kfree(data); - per_cpu(acfreq_data, cpu) = NULL; - - return result; -} - -static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - pr_debug("acpi_cpufreq_cpu_exit\n"); - - if (data) { - cpufreq_frequency_table_put_attr(policy->cpu); - per_cpu(acfreq_data, policy->cpu) = NULL; - acpi_processor_unregister_performance(data->acpi_data, - policy->cpu); - kfree(data->freq_table); - kfree(data); - } - - return 0; -} - -static int acpi_cpufreq_resume(struct cpufreq_policy *policy) -{ - struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); - - pr_debug("acpi_cpufreq_resume\n"); - - data->resume = 1; - - return 0; -} - -static struct freq_attr *acpi_cpufreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver acpi_cpufreq_driver = { - .verify = acpi_cpufreq_verify, - .target = acpi_cpufreq_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = acpi_cpufreq_cpu_init, - .exit = acpi_cpufreq_cpu_exit, - .resume = acpi_cpufreq_resume, - .name = "acpi-cpufreq", - .owner = THIS_MODULE, - .attr = acpi_cpufreq_attr, -}; - -static int __init acpi_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - pr_debug("acpi_cpufreq_init\n"); - - ret = acpi_cpufreq_early_init(); - if (ret) - return ret; - - ret = cpufreq_register_driver(&acpi_cpufreq_driver); - if (ret) - free_acpi_perf_data(); - - return ret; -} - -static void __exit acpi_cpufreq_exit(void) -{ - pr_debug("acpi_cpufreq_exit\n"); - - cpufreq_unregister_driver(&acpi_cpufreq_driver); - - free_percpu(acpi_perf_data); -} - -module_param(acpi_pstate_strict, uint, 0644); -MODULE_PARM_DESC(acpi_pstate_strict, - "value 0 or non-zero. non-zero -> strict ACPI checks are " - "performed during frequency changes."); - -late_initcall(acpi_cpufreq_init); -module_exit(acpi_cpufreq_exit); - -MODULE_ALIAS("acpi"); diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c deleted file mode 100644 index 7bac808804f..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * (C) 2004-2006 Sebastian Witt - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information - * - * BIG FAT DISCLAIMER: Work in progress code. 
Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include - -#define NFORCE2_XTAL 25 -#define NFORCE2_BOOTFSB 0x48 -#define NFORCE2_PLLENABLE 0xa8 -#define NFORCE2_PLLREG 0xa4 -#define NFORCE2_PLLADR 0xa0 -#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) - -#define NFORCE2_MIN_FSB 50 -#define NFORCE2_SAFE_DISTANCE 50 - -/* Delay in ms between FSB changes */ -/* #define NFORCE2_DELAY 10 */ - -/* - * nforce2_chipset: - * FSB is changed using the chipset - */ -static struct pci_dev *nforce2_dev; - -/* fid: - * multiplier * 10 - */ -static int fid; - -/* min_fsb, max_fsb: - * minimum and maximum FSB (= FSB at boot time) - */ -static int min_fsb; -static int max_fsb; - -MODULE_AUTHOR("Sebastian Witt "); -MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); -MODULE_LICENSE("GPL"); - -module_param(fid, int, 0444); -module_param(min_fsb, int, 0444); - -MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); -MODULE_PARM_DESC(min_fsb, - "Minimum FSB to use, if not defined: current FSB - 50"); - -#define PFX "cpufreq-nforce2: " - -/** - * nforce2_calc_fsb - calculate FSB - * @pll: PLL value - * - * Calculates FSB from PLL value - */ -static int nforce2_calc_fsb(int pll) -{ - unsigned char mul, div; - - mul = (pll >> 8) & 0xff; - div = pll & 0xff; - - if (div > 0) - return NFORCE2_XTAL * mul / div; - - return 0; -} - -/** - * nforce2_calc_pll - calculate PLL value - * @fsb: FSB - * - * Calculate PLL value for given FSB - */ -static int nforce2_calc_pll(unsigned int fsb) -{ - unsigned char xmul, xdiv; - unsigned char mul = 0, div = 0; - int tried = 0; - - /* Try to calculate multiplier and divider up to 4 times */ - while (((mul == 0) || (div == 0)) && (tried <= 3)) { - for (xdiv = 2; xdiv <= 0x80; xdiv++) - for (xmul = 1; xmul <= 0xfe; xmul++) - if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == - fsb + tried) { - mul = xmul; - div = xdiv; - } - tried++; - } - - if ((mul == 0) || (div == 0)) - return -1; - - return NFORCE2_PLL(mul, div); -} - -/** - * nforce2_write_pll - write PLL value to chipset - * @pll: PLL value - * - * Writes new FSB PLL value to chipset - */ -static void nforce2_write_pll(int pll) -{ - int temp; - - /* Set the pll addr. 
to 0x00 */ - pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); - - /* Now write the value in all 64 registers */ - for (temp = 0; temp <= 0x3f; temp++) - pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); - - return; -} - -/** - * nforce2_fsb_read - Read FSB - * - * Read FSB from chipset - * If bootfsb != 0, return FSB at boot-time - */ -static unsigned int nforce2_fsb_read(int bootfsb) -{ - struct pci_dev *nforce2_sub5; - u32 fsb, temp = 0; - - /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ - nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, - PCI_ANY_ID, PCI_ANY_ID, NULL); - if (!nforce2_sub5) - return 0; - - pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); - fsb /= 1000000; - - /* Check if PLL register is already set */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - - if (bootfsb || !temp) - return fsb; - - /* Use PLL register FSB value */ - pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); - fsb = nforce2_calc_fsb(temp); - - return fsb; -} - -/** - * nforce2_set_fsb - set new FSB - * @fsb: New FSB - * - * Sets new FSB - */ -static int nforce2_set_fsb(unsigned int fsb) -{ - u32 temp = 0; - unsigned int tfsb; - int diff; - int pll = 0; - - if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { - printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); - return -EINVAL; - } - - tfsb = nforce2_fsb_read(0); - if (!tfsb) { - printk(KERN_ERR PFX "Error while reading the FSB\n"); - return -EINVAL; - } - - /* First write? Then set actual value */ - pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); - if (!temp) { - pll = nforce2_calc_pll(tfsb); - - if (pll < 0) - return -EINVAL; - - nforce2_write_pll(pll); - } - - /* Enable write access */ - temp = 0x01; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); - - diff = tfsb - fsb; - - if (!diff) - return 0; - - while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { - if (diff < 0) - tfsb++; - else - tfsb--; - - /* Calculate the PLL reg. value */ - pll = nforce2_calc_pll(tfsb); - if (pll == -1) - return -EINVAL; - - nforce2_write_pll(pll); -#ifdef NFORCE2_DELAY - mdelay(NFORCE2_DELAY); -#endif - } - - temp = 0x40; - pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); - - return 0; -} - -/** - * nforce2_get - get the CPU frequency - * @cpu: CPU number - * - * Returns the CPU frequency - */ -static unsigned int nforce2_get(unsigned int cpu) -{ - if (cpu) - return 0; - return nforce2_fsb_read(0) * fid * 100; -} - -/** - * nforce2_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. 
- */ -static int nforce2_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ -/* unsigned long flags; */ - struct cpufreq_freqs freqs; - unsigned int target_fsb; - - if ((target_freq > policy->max) || (target_freq < policy->min)) - return -EINVAL; - - target_fsb = target_freq / (fid * 100); - - freqs.old = nforce2_get(policy->cpu); - freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ - - if (freqs.old == freqs.new) - return 0; - - pr_debug("Old CPU frequency %d kHz, new %d kHz\n", - freqs.old, freqs.new); - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Disable IRQs */ - /* local_irq_save(flags); */ - - if (nforce2_set_fsb(target_fsb) < 0) - printk(KERN_ERR PFX "Changing FSB to %d failed\n", - target_fsb); - else - pr_debug("Changed FSB successfully to %d\n", - target_fsb); - - /* Enable IRQs */ - /* local_irq_restore(flags); */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - -/** - * nforce2_verify - verifies a new CPUFreq policy - * @policy: new policy - */ -static int nforce2_verify(struct cpufreq_policy *policy) -{ - unsigned int fsb_pol_max; - - fsb_pol_max = policy->max / (fid * 100); - - if (policy->min < (fsb_pol_max * fid * 100)) - policy->max = (fsb_pol_max + 1) * fid * 100; - - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static int nforce2_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int fsb; - unsigned int rfid; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* Get current FSB */ - fsb = nforce2_fsb_read(0); - - if (!fsb) - return -EIO; - - /* FIX: Get FID from CPU */ - if (!fid) { - if (!cpu_khz) { - printk(KERN_WARNING PFX - "cpu_khz not set, can't calculate multiplier!\n"); - return -ENODEV; - } - - fid = cpu_khz / (fsb * 100); - rfid = fid % 5; - - if (rfid) { - if (rfid > 2) - fid += 5 - rfid; - else - fid -= rfid; - } - } - - printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, - fid / 10, fid % 10); - - /* Set maximum FSB to FSB at boot time */ - max_fsb = nforce2_fsb_read(1); - - if (!max_fsb) - return -EIO; - - if (!min_fsb) - min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; - - if (min_fsb < NFORCE2_MIN_FSB) - min_fsb = NFORCE2_MIN_FSB; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = min_fsb * fid * 100; - policy->cpuinfo.max_freq = max_fsb * fid * 100; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = nforce2_get(policy->cpu); - policy->min = policy->cpuinfo.min_freq; - policy->max = policy->cpuinfo.max_freq; - - return 0; -} - -static int nforce2_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver nforce2_driver = { - .name = "nforce2", - .verify = nforce2_verify, - .target = nforce2_target, - .get = nforce2_get, - .init = nforce2_cpu_init, - .exit = nforce2_cpu_exit, - .owner = THIS_MODULE, -}; - -/** - * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic - * - * Detects nForce2 A2 and C1 stepping - * - */ -static int nforce2_detect_chipset(void) -{ - nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, - PCI_DEVICE_ID_NVIDIA_NFORCE2, - PCI_ANY_ID, PCI_ANY_ID, NULL); - - if (nforce2_dev == NULL) - return -ENODEV; - - printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", - nforce2_dev->revision); - printk(KERN_INFO PFX - "FSB changing is maybe unstable and can lead to " - "crashes and data loss.\n"); - - return 0; -} 
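The FSB arithmetic in nforce2_calc_fsb() and nforce2_calc_pll() above is self-contained and can be sanity-checked outside the kernel. Below is a minimal userspace sketch of the same decode, assuming the driver's 25 MHz reference crystal; the multiplier/divider pair is an illustrative choice, not a value the driver is known to program.

#include <stdio.h>

#define NFORCE2_XTAL            25      /* MHz, as in the driver */
#define NFORCE2_PLL(mul, div)   (0x100000 | ((mul) << 8) | (div))

/* Same decode as nforce2_calc_fsb(): multiplier in bits 8..15,
 * divider in bits 0..7 of the PLL word. */
static int calc_fsb(int pll)
{
        unsigned char mul = (pll >> 8) & 0xff;
        unsigned char div = pll & 0xff;

        return div ? NFORCE2_XTAL * mul / div : 0;
}

int main(void)
{
        int pll = NFORCE2_PLL(32, 8);   /* 25 MHz * 32 / 8 = 100 MHz FSB */

        printf("PLL word 0x%06x -> FSB %d MHz\n", pll, calc_fsb(pll));
        return 0;
}

nforce2_calc_pll() runs this decode in reverse: it brute-forces (mul, div) pairs until nforce2_calc_fsb() reproduces the requested FSB, retrying at fsb+1 through fsb+3 when no exact match exists.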
- -/** - * nforce2_init - initializes the nForce2 CPUFreq driver - * - * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initiatization, and zero on - * success. - */ -static int __init nforce2_init(void) -{ - /* TODO: do we need to detect the processor? */ - - /* detect chipset */ - if (nforce2_detect_chipset()) { - printk(KERN_INFO PFX "No nForce2 chipset.\n"); - return -ENODEV; - } - - return cpufreq_register_driver(&nforce2_driver); -} - -/** - * nforce2_exit - unregisters cpufreq module - * - * Unregisters nForce2 FSB change support. - */ -static void __exit nforce2_exit(void) -{ - cpufreq_unregister_driver(&nforce2_driver); -} - -module_init(nforce2_init); -module_exit(nforce2_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c deleted file mode 100644 index 35a257dd4bb..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Based on documentation provided by Dave Jones. Thanks! - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define EPS_BRAND_C7M 0 -#define EPS_BRAND_C7 1 -#define EPS_BRAND_EDEN 2 -#define EPS_BRAND_C3 3 -#define EPS_BRAND_C7D 4 - -struct eps_cpu_data { - u32 fsb; - struct cpufreq_frequency_table freq_table[]; -}; - -static struct eps_cpu_data *eps_cpu[NR_CPUS]; - - -static unsigned int eps_get(unsigned int cpu) -{ - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (cpu) - return 0; - centaur = eps_cpu[cpu]; - if (centaur == NULL) - return 0; - - /* Return current frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - return centaur->fsb * ((lo >> 8) & 0xff); -} - -static int eps_set_state(struct eps_cpu_data *centaur, - unsigned int cpu, - u32 dest_state) -{ - struct cpufreq_freqs freqs; - u32 lo, hi; - int err = 0; - int i; - - freqs.old = eps_get(cpu); - freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Wait while CPU is busy */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i = 0; - while (lo & ((1 << 16) | (1 << 17))) { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } - /* Set new multiplier and voltage */ - wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); - /* Wait until transition end */ - i = 0; - do { - udelay(16); - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - i++; - if (unlikely(i > 64)) { - err = -ENODEV; - goto postchange; - } - } while (lo & ((1 << 16) | (1 << 17))); - - /* Return current frequency */ -postchange: - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - freqs.new = centaur->fsb * ((lo >> 8) & 0xff); - -#ifdef DEBUG - { - u8 current_multiplier, current_voltage; - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", - current_multiplier); - } -#endif - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - return err; -} - -static int eps_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct eps_cpu_data *centaur; - unsigned int newstate = 0; - 
unsigned int cpu = policy->cpu; - unsigned int dest_state; - int ret; - - if (unlikely(eps_cpu[cpu] == NULL)) - return -ENODEV; - centaur = eps_cpu[cpu]; - - if (unlikely(cpufreq_frequency_table_target(policy, - &eps_cpu[cpu]->freq_table[0], - target_freq, - relation, - &newstate))) { - return -EINVAL; - } - - /* Make frequency transition */ - dest_state = centaur->freq_table[newstate].index & 0xffff; - ret = eps_set_state(centaur, cpu, dest_state); - if (ret) - printk(KERN_ERR "eps: Timeout!\n"); - return ret; -} - -static int eps_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - &eps_cpu[policy->cpu]->freq_table[0]); -} - -static int eps_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i; - u32 lo, hi; - u64 val; - u8 current_multiplier, current_voltage; - u8 max_multiplier, max_voltage; - u8 min_multiplier, min_voltage; - u8 brand = 0; - u32 fsb; - struct eps_cpu_data *centaur; - struct cpuinfo_x86 *c = &cpu_data(0); - struct cpufreq_frequency_table *f_table; - int k, step, voltage; - int ret; - int states; - - if (policy->cpu != 0) - return -ENODEV; - - /* Check brand */ - printk(KERN_INFO "eps: Detected VIA "); - - switch (c->x86_model) { - case 10: - rdmsr(0x1153, lo, hi); - brand = (((lo >> 2) ^ lo) >> 18) & 3; - printk(KERN_CONT "Model A "); - break; - case 13: - rdmsr(0x1154, lo, hi); - brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; - printk(KERN_CONT "Model D "); - break; - } - - switch (brand) { - case EPS_BRAND_C7M: - printk(KERN_CONT "C7-M\n"); - break; - case EPS_BRAND_C7: - printk(KERN_CONT "C7\n"); - break; - case EPS_BRAND_EDEN: - printk(KERN_CONT "Eden\n"); - break; - case EPS_BRAND_C7D: - printk(KERN_CONT "C7-D\n"); - break; - case EPS_BRAND_C3: - printk(KERN_CONT "C3\n"); - return -ENODEV; - break; - } - /* Enable Enhanced PowerSaver */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - wrmsrl(MSR_IA32_MISC_ENABLE, val); - /* Can be locked at 0 */ - rdmsrl(MSR_IA32_MISC_ENABLE, val); - if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); - return -ENODEV; - } - } - - /* Print voltage and multiplier */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - current_voltage = lo & 0xff; - printk(KERN_INFO "eps: Current voltage = %dmV\n", - current_voltage * 16 + 700); - current_multiplier = (lo >> 8) & 0xff; - printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); - - /* Print limits */ - max_voltage = hi & 0xff; - printk(KERN_INFO "eps: Highest voltage = %dmV\n", - max_voltage * 16 + 700); - max_multiplier = (hi >> 8) & 0xff; - printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); - min_voltage = (hi >> 16) & 0xff; - printk(KERN_INFO "eps: Lowest voltage = %dmV\n", - min_voltage * 16 + 700); - min_multiplier = (hi >> 24) & 0xff; - printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); - - /* Sanity checks */ - if (current_multiplier == 0 || max_multiplier == 0 - || min_multiplier == 0) - return -EINVAL; - if (current_multiplier > max_multiplier - || max_multiplier <= min_multiplier) - return -EINVAL; - if (current_voltage > 0x1f || max_voltage > 0x1f) - return -EINVAL; - if (max_voltage < min_voltage) - return -EINVAL; - - /* Calc FSB speed */ - fsb = cpu_khz / current_multiplier; - /* Calc number of p-states supported */ - if (brand == EPS_BRAND_C7M) - states = max_multiplier - min_multiplier + 1; - else - states = 2; - - /* Allocate 
private data and frequency table for current cpu */ - centaur = kzalloc(sizeof(struct eps_cpu_data) - + (states + 1) * sizeof(struct cpufreq_frequency_table), - GFP_KERNEL); - if (!centaur) - return -ENOMEM; - eps_cpu[0] = centaur; - - /* Copy basic values */ - centaur->fsb = fsb; - - /* Fill frequency and MSR value table */ - f_table = &centaur->freq_table[0]; - if (brand != EPS_BRAND_C7M) { - f_table[0].frequency = fsb * min_multiplier; - f_table[0].index = (min_multiplier << 8) | min_voltage; - f_table[1].frequency = fsb * max_multiplier; - f_table[1].index = (max_multiplier << 8) | max_voltage; - f_table[2].frequency = CPUFREQ_TABLE_END; - } else { - k = 0; - step = ((max_voltage - min_voltage) * 256) - / (max_multiplier - min_multiplier); - for (i = min_multiplier; i <= max_multiplier; i++) { - voltage = (k * step) / 256 + min_voltage; - f_table[k].frequency = fsb * i; - f_table[k].index = (i << 8) | voltage; - k++; - } - f_table[k].frequency = CPUFREQ_TABLE_END; - } - - policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ - policy->cur = fsb * current_multiplier; - - ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]); - if (ret) { - kfree(centaur); - return ret; - } - - cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu); - return 0; -} - -static int eps_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - struct eps_cpu_data *centaur; - u32 lo, hi; - - if (eps_cpu[cpu] == NULL) - return -ENODEV; - centaur = eps_cpu[cpu]; - - /* Get max frequency */ - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - /* Set max frequency */ - eps_set_state(centaur, cpu, hi & 0xffff); - /* Bye */ - cpufreq_frequency_table_put_attr(policy->cpu); - kfree(eps_cpu[cpu]); - eps_cpu[cpu] = NULL; - return 0; -} - -static struct freq_attr *eps_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver eps_driver = { - .verify = eps_verify, - .target = eps_target, - .init = eps_cpu_init, - .exit = eps_cpu_exit, - .get = eps_get, - .name = "e_powersaver", - .owner = THIS_MODULE, - .attr = eps_attr, -}; - -static int __init eps_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* This driver will work only on Centaur C7 processors with - * Enhanced SpeedStep/PowerSaver registers */ - if (c->x86_vendor != X86_VENDOR_CENTAUR - || c->x86 != 6 || c->x86_model < 10) - return -ENODEV; - if (!cpu_has(c, X86_FEATURE_EST)) - return -ENODEV; - - if (cpufreq_register_driver(&eps_driver)) - return -EINVAL; - return 0; -} - -static void __exit eps_exit(void) -{ - cpufreq_unregister_driver(&eps_driver); -} - -MODULE_AUTHOR("Rafal Bilski "); -MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); -MODULE_LICENSE("GPL"); - -module_init(eps_init); -module_exit(eps_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c deleted file mode 100644 index c587db472a7..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ /dev/null @@ -1,309 +0,0 @@ -/* - * elanfreq: cpufreq driver for the AMD ELAN family - * - * (c) Copyright 2002 Robert Schwebel - * - * Parts of this code are (c) Sven Geggus - * - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version.
- * - * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel - * - */ - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ -#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ - -/* Module parameter */ -static int max_freq; - -struct s_elan_multiplier { - int clock; /* frequency in kHz */ - int val40h; /* PMU Force Mode register */ - int val80h; /* CPU Clock Speed Register */ -}; - -/* - * It is important that the frequencies - * are listed in ascending order here! - */ -static struct s_elan_multiplier elan_multiplier[] = { - {1000, 0x02, 0x18}, - {2000, 0x02, 0x10}, - {4000, 0x02, 0x08}, - {8000, 0x00, 0x00}, - {16000, 0x00, 0x02}, - {33000, 0x00, 0x04}, - {66000, 0x01, 0x04}, - {99000, 0x01, 0x05} -}; - -static struct cpufreq_frequency_table elanfreq_table[] = { - {0, 1000}, - {1, 2000}, - {2, 4000}, - {3, 8000}, - {4, 16000}, - {5, 33000}, - {6, 66000}, - {7, 99000}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * elanfreq_get_cpu_frequency: determine current cpu speed - * - * Finds out at which frequency the CPU of the Elan SOC runs - * at the moment. Frequencies from 1 to 33 MHz are generated - * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" - * and have the rest of the chip running with 33 MHz. - */ - -static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg; /* Clock Speed Register */ - - local_irq_disable(); - outb_p(0x80, REG_CSCIR); - clockspeed_reg = inb_p(REG_CSCDR); - local_irq_enable(); - - if ((clockspeed_reg & 0xE0) == 0xE0) - return 0; - - /* Are we in CPU clock multiplied mode (66/99 MHz)? */ - if ((clockspeed_reg & 0xE0) == 0xC0) { - if ((clockspeed_reg & 0x01) == 0) - return 66000; - else - return 99000; - } - - /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0) == 0xA0) - return 33000; - - return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; -} - - -/** - * elanfreq_set_cpu_frequency: Change the CPU core frequency - * @cpu: cpu number - * @freq: frequency in kHz - * - * This function takes a frequency value and changes the CPU frequency - * according to this. Note that the frequency has to be checked by - * elanfreq_validatespeed() for correctness! - * - * There is no return value. - */ - -static void elanfreq_set_cpu_state(unsigned int state) -{ - struct cpufreq_freqs freqs; - - freqs.old = elanfreq_get_cpu_frequency(0); - freqs.new = elan_multiplier[state].clock; - freqs.cpu = 0; /* elanfreq.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", - elan_multiplier[state].clock); - - - /* - * Access to the Elan's internal registers is indexed via - * 0x22: Chip Setup & Control Register Index Register (CSCI) - * 0x23: Chip Setup & Control Register Data Register (CSCD) - * - */ - - /* - * 0x40 is the Power Management Unit's Force Mode Register. 
- * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) - */ - - local_irq_disable(); - outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00, REG_CSCDR); - local_irq_enable(); /* wait till internal pipelines and */ - udelay(1000); /* buffers have cleaned up */ - - local_irq_disable(); - - /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80, REG_CSCIR); - outb_p(elan_multiplier[state].val80h, REG_CSCDR); - - /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40, REG_CSCIR); - outb_p(elan_multiplier[state].val40h, REG_CSCDR); - udelay(10000); - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - - -/** - * elanfreq_validatespeed: test if frequency range is valid - * @policy: the policy to validate - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int elanfreq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); -} - -static int elanfreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - elanfreq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int elanfreq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int i; - int result; - - /* capability check */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) - return -ENODEV; - - /* max freq */ - if (!max_freq) - max_freq = elanfreq_get_cpu_frequency(0); - - /* table init */ - for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if (elanfreq_table[i].frequency > max_freq) - elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = elanfreq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); - return 0; -} - - -static int elanfreq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -#ifndef MODULE -/** - * elanfreq_setup - elanfreq command line parameter parsing - * - * elanfreq command line parameter. Use: - * elanfreq=66000 - * to set the maximum CPU frequency to 66 MHz. Note that in - * case you do not give this boot parameter, the maximum - * frequency will fall back to _current_ CPU frequency which - * might be lower. If you build this as a module, use the - * max_freq module parameter instead. - */ -static int __init elanfreq_setup(char *str) -{ - max_freq = simple_strtoul(str, &str, 0); - printk(KERN_WARNING "You're using the deprecated elanfreq command line option. 
Use elanfreq.max_freq instead, please!\n"); - return 1; -} -__setup("elanfreq=", elanfreq_setup); -#endif - - -static struct freq_attr *elanfreq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver elanfreq_driver = { - .get = elanfreq_get_cpu_frequency, - .verify = elanfreq_verify, - .target = elanfreq_target, - .init = elanfreq_cpu_init, - .exit = elanfreq_cpu_exit, - .name = "elanfreq", - .owner = THIS_MODULE, - .attr = elanfreq_attr, -}; - - -static int __init elanfreq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - /* Test if we have the right hardware */ - if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model != 10)) { - printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; - } - return cpufreq_register_driver(&elanfreq_driver); -} - - -static void __exit elanfreq_exit(void) -{ - cpufreq_unregister_driver(&elanfreq_driver); -} - - -module_param(max_freq, int, 0444); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Robert Schwebel , " - "Sven Geggus "); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); - -module_init(elanfreq_init); -module_exit(elanfreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c deleted file mode 100644 index ffe1f2c92ed..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Cyrix MediaGX and NatSemi Geode Suspend Modulation - * (C) 2002 Zwane Mwaikambo - * (C) 2002 Hiroshi Miura - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. - * - * Theoretical note: - * - * (see Geode(tm) CS5530 manual (rev.4.1) page.56) - * - * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 - * are based on Suspend Modulation. - * - * Suspend Modulation works by asserting and de-asserting the SUSP# pin - * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# - * the CPU enters an idle state. GX1 stops its core clock when SUSP# is - * asserted then power consumption is reduced. - * - * Suspend Modulation's OFF/ON duration are configurable - * with 'Suspend Modulation OFF Count Register' - * and 'Suspend Modulation ON Count Register'. - * These registers are 8bit counters that represent the number of - * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) - * to the processor. - * - * These counters define a ratio which is the effective frequency - * of operation of the system. - * - * OFF Count - * F_eff = Fgx * ---------------------- - * OFF Count + ON Count - * - * 0 <= On Count, Off Count <= 255 - * - * From these limits, we can get register values - * - * off_duration + on_duration <= MAX_DURATION - * on_duration = off_duration * (stock_freq - freq) / freq - * - * off_duration = (freq * DURATION) / stock_freq - * on_duration = DURATION - off_duration - * - * - *--------------------------------------------------------------------------- - * - * ChangeLog: - * Dec. 12, 2003 Hiroshi Miura - * - fix on/off register mistake - * - fix cpu_khz calc when it stops cpu modulation. - * - * Dec. 
11, 2002 Hiroshi Miura - * - rewrite for Cyrix MediaGX Cx5510/5520 and - * NatSemi Geode Cs5530(A). - * - * Jul. ??, 2002 Zwane Mwaikambo - * - cs5530_mod patch for 2.4.19-rc1. - * - *--------------------------------------------------------------------------- - * - * Todo - * Test on machines with 5510, 5530, 5530A - */ - -/************************************************************************ - * Suspend Modulation - Definitions * - ************************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* PCI config registers, all at F0 */ -#define PCI_PMER1 0x80 /* power management enable register 1 */ -#define PCI_PMER2 0x81 /* power management enable register 2 */ -#define PCI_PMER3 0x82 /* power management enable register 3 */ -#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ -#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ -#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ -#define PCI_MODON 0x95 /* suspend modulation ON counter register */ -#define PCI_SUSCFG 0x96 /* suspend configuration register */ - -/* PMER1 bits */ -#define GPM (1<<0) /* global power management */ -#define GIT (1<<1) /* globally enable PM device idle timers */ -#define GTR (1<<2) /* globally enable IO traps */ -#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ -#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ - -/* SUSCFG bits */ -#define SUSMOD (1<<0) /* enable/disable suspend modulation */ -/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ -#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ - /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ -#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ -/* the below is supported only with cs5530A */ -#define PWRSVE_ISA (1<<3) /* stop ISA clock */ -#define PWRSVE (1<<4) /* active idle */ - -struct gxfreq_params { - u8 on_duration; - u8 off_duration; - u8 pci_suscfg; - u8 pci_pmer1; - u8 pci_pmer2; - struct pci_dev *cs55x0; -}; - -static struct gxfreq_params *gx_params; -static int stock_freq; - -/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ -static int pci_busclk; -module_param(pci_busclk, int, 0444); - -/* maximum duration for which the cpu may be suspended - * (32us * MAX_DURATION). If no parameter is given, this defaults - * to 255. - * Note that this leads to a maximum of 8 ms(!) where the CPU clock - * is suspended -- processing power is just 0.39% of what it used to be, - * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ -static int max_duration = 255; -module_param(max_duration, int, 0444); - -/* For the default policy, we want at least some processing power - * - let's say 5%. 
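With a 200 MHz processor, for example, that works out to a minimum of - * 200000 / 20 = 10000 kHz, i.e. 5% of full speed (POLICY_MIN_DIV below is 20).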
(min = maxfreq / POLICY_MIN_DIV) - */ -#define POLICY_MIN_DIV 20 - - -/** - * we can detect a core multiplier from dir0_lsb - * from GX1 datasheet p.56, - * MULT[3:0]: - * 0000 = SYSCLK multiplied by 4 (test only) - * 0001 = SYSCLK multiplied by 10 - * 0010 = SYSCLK multiplied by 4 - * 0011 = SYSCLK multiplied by 6 - * 0100 = SYSCLK multiplied by 9 - * 0101 = SYSCLK multiplied by 5 - * 0110 = SYSCLK multiplied by 7 - * 0111 = SYSCLK multiplied by 8 - * of 33.3MHz - **/ -static int gx_freq_mult[16] = { - 4, 10, 4, 6, 9, 5, 7, 8, - 0, 0, 0, 0, 0, 0, 0, 0 -}; - - -/**************************************************************** - * Low Level chipset interface * - ****************************************************************/ -static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, - { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, - { 0, }, -}; - -static void gx_write_byte(int reg, int value) -{ - pci_write_config_byte(gx_params->cs55x0, reg, value); -} - -/** - * gx_detect_chipset: - * - **/ -static __init struct pci_dev *gx_detect_chipset(void) -{ - struct pci_dev *gx_pci = NULL; - - /* check if CPU is a MediaGX or a Geode. */ - if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && - (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { - pr_debug("error: no MediaGX/Geode processor found!\n"); - return NULL; - } - - /* detect which companion chip is used */ - for_each_pci_dev(gx_pci) { - if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) - return gx_pci; - } - - pr_debug("error: no supported chipset found!\n"); - return NULL; -} - -/** - * gx_get_cpuspeed: - * - * Finds out at which effective frequency the Cyrix MediaGX/NatSemi - * Geode CPU runs. - */ -static unsigned int gx_get_cpuspeed(unsigned int cpu) -{ - if ((gx_params->pci_suscfg & SUSMOD) == 0) - return stock_freq; - - return (stock_freq * gx_params->off_duration) - / (gx_params->on_duration + gx_params->off_duration); -} - -/** - * gx_validate_speed: - * round the requested speed to the closest attainable value - * - **/ - -static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, - u8 *off_duration) -{ - unsigned int i; - u8 tmp_on, tmp_off; - int old_tmp_freq = stock_freq; - int tmp_freq; - - *off_duration = 1; - *on_duration = 0; - - for (i = max_duration; i > 0; i--) { - tmp_off = ((khz * i) / stock_freq) & 0xff; - tmp_on = i - tmp_off; - tmp_freq = (stock_freq * tmp_off) / i; - /* if this relation is closer to khz, use this. If it's equal, - * prefer it, too - lower latency */ - if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { - *on_duration = tmp_on; - *off_duration = tmp_off; - old_tmp_freq = tmp_freq; - } - } - - return old_tmp_freq; -} - - -/** - * gx_set_cpuspeed: - * set cpu speed in khz. 
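The requested value is first rounded by gx_validate_speed(); the result - * is then realized by programming the computed ON/OFF counts into PCI_MODON - * and PCI_MODOFF and setting SUSMOD in PCI_SUSCFG (SUSMOD is cleared again - * when the stock frequency is requested).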
- **/ - -static void gx_set_cpuspeed(unsigned int khz) -{ - u8 suscfg, pmer1; - unsigned int new_khz; - unsigned long flags; - struct cpufreq_freqs freqs; - - freqs.cpu = 0; - freqs.old = gx_get_cpuspeed(0); - - new_khz = gx_validate_speed(khz, &gx_params->on_duration, - &gx_params->off_duration); - - freqs.new = new_khz; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - local_irq_save(flags); - - - - if (new_khz != stock_freq) { - /* if new khz == 100% of CPU speed, it is a special case */ - switch (gx_params->cs55x0->device) { - case PCI_DEVICE_ID_CYRIX_5530_LEGACY: - pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; - /* FIXME: need to test other values -- Zwane,Miura */ - /* typical 2 to 4ms */ - gx_write_byte(PCI_IRQTC, 4); - /* typical 50 to 100ms */ - gx_write_byte(PCI_VIDTC, 100); - gx_write_byte(PCI_PMER1, pmer1); - - if (gx_params->cs55x0->revision < 0x10) { - /* CS5530(rev 1.2, 1.3) */ - suscfg = gx_params->pci_suscfg|SUSMOD; - } else { - /* CS5530A,B.. */ - suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; - } - break; - case PCI_DEVICE_ID_CYRIX_5520: - case PCI_DEVICE_ID_CYRIX_5510: - suscfg = gx_params->pci_suscfg | SUSMOD; - break; - default: - local_irq_restore(flags); - pr_debug("fatal: trying to set an unknown chipset.\n"); - return; - } - } else { - suscfg = gx_params->pci_suscfg & ~(SUSMOD); - gx_params->off_duration = 0; - gx_params->on_duration = 0; - pr_debug("suspend modulation disabled: cpu runs 100%% speed.\n"); - } - - gx_write_byte(PCI_MODOFF, gx_params->off_duration); - gx_write_byte(PCI_MODON, gx_params->on_duration); - - gx_write_byte(PCI_SUSCFG, suscfg); - pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); - - local_irq_restore(flags); - - gx_params->pci_suscfg = suscfg; - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", - gx_params->on_duration * 32, gx_params->off_duration * 32); - pr_debug("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); -} - -/**************************************************************** - * High level functions * - ****************************************************************/ - -/* - * cpufreq_gx_verify: test if frequency range is valid - * - * This function checks if a given frequency range in kHz is valid - * for the hardware supported by the driver. - */ - -static int cpufreq_gx_verify(struct cpufreq_policy *policy) -{ - unsigned int tmp_freq = 0; - u8 tmp1, tmp2; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - /* it needs to be assured that at least one supported frequency is - * within policy->min and policy->max. If it is not, policy->max - * needs to be increased until one frequency is supported. - * policy->min may not be decreased, though. This way we guarantee a - * specific processing capacity. 
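For illustration: with stock_freq = 200000 kHz and max_duration = 255, - * adjacent attainable frequencies lie roughly stock_freq / max_duration - * (about 784 kHz) apart, which is exactly the step size used below.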
- */ - tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); - if (tmp_freq < policy->min) - tmp_freq += stock_freq / max_duration; - policy->min = tmp_freq; - if (policy->min > policy->max) - policy->max = tmp_freq; - tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); - if (tmp_freq > policy->max) - tmp_freq -= stock_freq / max_duration; - policy->max = tmp_freq; - if (policy->max < policy->min) - policy->max = policy->min; - cpufreq_verify_within_limits(policy, (stock_freq / max_duration), - stock_freq); - - return 0; -} - -/* - * cpufreq_gx_target: - * - */ -static int cpufreq_gx_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - u8 tmp1, tmp2; - unsigned int tmp_freq; - - if (!stock_freq || !policy) - return -EINVAL; - - policy->cpu = 0; - - tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); - while (tmp_freq < policy->min) { - tmp_freq += stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - while (tmp_freq > policy->max) { - tmp_freq -= stock_freq / max_duration; - tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); - } - - gx_set_cpuspeed(tmp_freq); - - return 0; -} - -static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int maxfreq, curfreq; - - if (!policy || policy->cpu != 0) - return -ENODEV; - - /* determine maximum frequency */ - if (pci_busclk) - maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - else if (cpu_khz) - maxfreq = cpu_khz; - else - maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; - - stock_freq = maxfreq; - curfreq = gx_get_cpuspeed(0); - - pr_debug("cpu max frequency is %d.\n", maxfreq); - pr_debug("cpu current frequency is %dkHz.\n", curfreq); - - /* setup basic struct for cpufreq API */ - policy->cpu = 0; - - if (max_duration < POLICY_MIN_DIV) - policy->min = maxfreq / max_duration; - else - policy->min = maxfreq / POLICY_MIN_DIV; - policy->max = maxfreq; - policy->cur = curfreq; - policy->cpuinfo.min_freq = maxfreq / max_duration; - policy->cpuinfo.max_freq = maxfreq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - - return 0; -} - -/* - * cpufreq_gx_init: - * MediaGX/Geode GX initialize cpufreq driver - */ -static struct cpufreq_driver gx_suspmod_driver = { - .get = gx_get_cpuspeed, - .verify = cpufreq_gx_verify, - .target = cpufreq_gx_target, - .init = cpufreq_gx_cpu_init, - .name = "gx-suspmod", - .owner = THIS_MODULE, -}; - -static int __init cpufreq_gx_init(void) -{ - int ret; - struct gxfreq_params *params; - struct pci_dev *gx_pci; - - /* Test if we have the right hardware */ - gx_pci = gx_detect_chipset(); - if (gx_pci == NULL) - return -ENODEV; - - /* check whether module parameters are sane */ - if (max_duration > 0xff) - max_duration = 0xff; - - pr_debug("geode suspend modulation available.\n"); - - params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); - if (params == NULL) - return -ENOMEM; - - params->cs55x0 = gx_pci; - gx_params = params; - - /* keep cs55x0 configurations */ - pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); - pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); - pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); - pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); - pci_read_config_byte(params->cs55x0, PCI_MODOFF, - &(params->off_duration)); - - ret = cpufreq_register_driver(&gx_suspmod_driver); - if (ret) { - kfree(params); - return ret; /* register error! 
*/ - } - - return 0; -} - -static void __exit cpufreq_gx_exit(void) -{ - cpufreq_unregister_driver(&gx_suspmod_driver); - pci_dev_put(gx_params->cs55x0); - kfree(gx_params); -} - -MODULE_AUTHOR("Hiroshi Miura "); -MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); -MODULE_LICENSE("GPL"); - -module_init(cpufreq_gx_init); -module_exit(cpufreq_gx_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c deleted file mode 100644 index f47d26e2a13..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ /dev/null @@ -1,1024 +0,0 @@ -/* - * (C) 2001-2004 Dave Jones. - * (C) 2002 Padraig Brady. - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by VIA. - * - * VIA have currently 3 different versions of Longhaul. - * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. - * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. - * Version 2 of longhaul is backward compatible with v1, but adds - * LONGHAUL MSR for purpose of both frequency and voltage scaling. - * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). - * Version 3 of longhaul got renamed to Powersaver and redesigned - * to use only the POWERSAVER MSR at 0x110a. - * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. - * It's pretty much the same feature wise to longhaul v2, though - * there is provision for scaling FSB too, but this doesn't work - * too well in practice so we don't even try to use this. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "longhaul.h" - -#define PFX "longhaul: " - -#define TYPE_LONGHAUL_V1 1 -#define TYPE_LONGHAUL_V2 2 -#define TYPE_POWERSAVER 3 - -#define CPU_SAMUEL 1 -#define CPU_SAMUEL2 2 -#define CPU_EZRA 3 -#define CPU_EZRA_T 4 -#define CPU_NEHEMIAH 5 -#define CPU_NEHEMIAH_C 6 - -/* Flags */ -#define USE_ACPI_C3 (1 << 1) -#define USE_NORTHBRIDGE (1 << 2) - -static int cpu_model; -static unsigned int numscales = 16; -static unsigned int fsb; - -static const struct mV_pos *vrm_mV_table; -static const unsigned char *mV_vrm_table; - -static unsigned int highest_speed, lowest_speed; /* kHz */ -static unsigned int minmult, maxmult; -static int can_scale_voltage; -static struct acpi_processor *pr; -static struct acpi_processor_cx *cx; -static u32 acpi_regs_addr; -static u8 longhaul_flags; -static unsigned int longhaul_index; - -/* Module parameters */ -static int scale_voltage; -static int disable_acpi_c3; -static int revid_errata; - - -/* Clock ratios multiplied by 10 */ -static int mults[32]; -static int eblcr[32]; -static int longhaul_version; -static struct cpufreq_frequency_table *longhaul_table; - -static char speedbuffer[8]; - -static char *print_speed(int speed) -{ - if (speed < 1000) { - snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); - return speedbuffer; - } - - if (speed%1000 == 0) - snprintf(speedbuffer, sizeof(speedbuffer), - "%dGHz", speed/1000); - else - snprintf(speedbuffer, sizeof(speedbuffer), - "%d.%dGHz", speed/1000, (speed%1000)/100); - - return speedbuffer; -} - - -static unsigned int calc_speed(int mult) -{ - int khz; - khz = (mult/10)*fsb; - if (mult%10) - khz += fsb/2; - khz *= 1000; - return khz; -} - - -static int longhaul_get_cpu_mult(void) -{ - unsigned long invalue = 0, lo, hi; - - rdmsr(MSR_IA32_EBL_CR_POWERON, lo, 
hi); - invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22; - if (longhaul_version == TYPE_LONGHAUL_V2 || - longhaul_version == TYPE_POWERSAVER) { - if (lo & (1<<27)) - invalue += 16; - } - return eblcr[invalue]; -} - -/* For processor with BCR2 MSR */ - -static void do_longhaul1(unsigned int mults_index) -{ - union msr_bcr2 bcr2; - - rdmsrl(MSR_VIA_BCR2, bcr2.val); - /* Enable software clock multiplier */ - bcr2.bits.ESOFTBF = 1; - bcr2.bits.CLOCKMUL = mults_index & 0xff; - - /* Sync to timer tick */ - safe_halt(); - /* Change frequency on next halt or sleep */ - wrmsrl(MSR_VIA_BCR2, bcr2.val); - /* Invoke transition */ - ACPI_FLUSH_CPU_CACHE(); - halt(); - - /* Disable software clock multiplier */ - local_irq_disable(); - rdmsrl(MSR_VIA_BCR2, bcr2.val); - bcr2.bits.ESOFTBF = 0; - wrmsrl(MSR_VIA_BCR2, bcr2.val); -} - -/* For processor with Longhaul MSR */ - -static void do_powersaver(int cx_address, unsigned int mults_index, - unsigned int dir) -{ - union msr_longhaul longhaul; - u32 t; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Setup new frequency */ - if (!revid_errata) - longhaul.bits.RevisionKey = longhaul.bits.RevisionID; - else - longhaul.bits.RevisionKey = 0; - longhaul.bits.SoftBusRatio = mults_index & 0xf; - longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; - /* Setup new voltage */ - if (can_scale_voltage) - longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; - /* Sync to timer tick */ - safe_halt(); - /* Raise voltage if necessary */ - if (can_scale_voltage && dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } - - /* Change frequency on next halt or sleep */ - longhaul.bits.EnableSoftBusRatio = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - /* Disable bus ratio bit */ - longhaul.bits.EnableSoftBusRatio = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - - /* Reduce voltage if necessary */ - if (can_scale_voltage && !dir) { - longhaul.bits.EnableSoftVID = 1; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - /* Change voltage */ - if (!cx_address) { - ACPI_FLUSH_CPU_CACHE(); - halt(); - } else { - ACPI_FLUSH_CPU_CACHE(); - /* Invoke C3 */ - inb(cx_address); - /* Dummy op - must do something useless after P_LVL3 - * read */ - t = inl(acpi_gbl_FADT.xpm_timer_block.address); - } - longhaul.bits.EnableSoftVID = 0; - wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); - } -} - -/** - * longhaul_set_cpu_frequency() - * @mults_index : bitpattern of the new multiplier. - * - * Sets a new clock ratio. 
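The switch happens with local interrupts disabled and the PIC lines - * masked (only the timer IRQ is left unmasked); depending on the Longhaul - * version the new ratio is programmed through the BCR2 MSR (v1) or the - * Longhaul/Powersaver MSR (v2/v3), optionally entering ACPI C3 so the - * transition is triggered while the CPU is effectively quiescent.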
- */ - -static void longhaul_setstate(unsigned int table_index) -{ - unsigned int mults_index; - int speed, mult; - struct cpufreq_freqs freqs; - unsigned long flags; - unsigned int pic1_mask, pic2_mask; - u16 bm_status = 0; - u32 bm_timeout = 1000; - unsigned int dir = 0; - - mults_index = longhaul_table[table_index].index; - /* Safety precautions */ - mult = mults[mults_index & 0x1f]; - if (mult == -1) - return; - speed = calc_speed(mult); - if ((speed > highest_speed) || (speed < lowest_speed)) - return; - /* Voltage transition before frequency transition? */ - if (can_scale_voltage && longhaul_index < table_index) - dir = 1; - - freqs.old = calc_speed(longhaul_get_cpu_mult()); - freqs.new = speed; - freqs.cpu = 0; /* longhaul.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", - fsb, mult/10, mult%10, print_speed(speed/1000)); -retry_loop: - preempt_disable(); - local_irq_save(flags); - - pic2_mask = inb(0xA1); - pic1_mask = inb(0x21); /* works on C3. save mask. */ - outb(0xFF, 0xA1); /* Overkill */ - outb(0xFE, 0x21); /* TMR0 only */ - - /* Wait while PCI bus is busy. */ - if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE - || ((pr != NULL) && pr->flags.bm_control))) { - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - while (bm_status && bm_timeout) { - outw(1 << 4, acpi_regs_addr); - bm_timeout--; - bm_status = inw(acpi_regs_addr); - bm_status &= 1 << 4; - } - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Disable AGP and PCI arbiters */ - outb(3, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Disable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); - } - switch (longhaul_version) { - - /* - * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) - * Software controlled multipliers only. - */ - case TYPE_LONGHAUL_V1: - do_longhaul1(mults_index); - break; - - /* - * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] - * - * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) - * Nehemiah can do FSB scaling too, but this has never been proven - * to work in practice. - */ - case TYPE_LONGHAUL_V2: - case TYPE_POWERSAVER: - if (longhaul_flags & USE_ACPI_C3) { - /* Don't allow wakeup */ - acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); - do_powersaver(cx->address, mults_index, dir); - } else { - do_powersaver(0, mults_index, dir); - } - break; - } - - if (longhaul_flags & USE_NORTHBRIDGE) { - /* Enable arbiters */ - outb(0, 0x22); - } else if ((pr != NULL) && pr->flags.bm_control) { - /* Enable bus master arbitration */ - acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); - } - outb(pic2_mask, 0xA1); /* restore mask */ - outb(pic1_mask, 0x21); - - local_irq_restore(flags); - preempt_enable(); - - freqs.new = calc_speed(longhaul_get_cpu_mult()); - /* Check if requested frequency is set. */ - if (unlikely(freqs.new != speed)) { - printk(KERN_INFO PFX "Failed to set requested frequency!\n"); - /* Revision ID = 1 but processor is expecting revision key - * equal to 0. Jumpers at the bottom of processor will change - * multiplier and FSB, but will not change bits in Longhaul - * MSR nor enable voltage scaling. */ - if (!revid_errata) { - printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " - "option.\n"); - revid_errata = 1; - msleep(200); - goto retry_loop; - } - /* Why ACPI C3 sometimes doesn't work is a mystery for me. - * But it does happen. 
Processor is entering ACPI C3 state, - * but it doesn't change frequency. I tried poking various - * bits in northbridge registers, but without success. */ - if (longhaul_flags & USE_ACPI_C3) { - printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); - longhaul_flags &= ~USE_ACPI_C3; - if (revid_errata) { - printk(KERN_INFO PFX "Disabling \"Ignore " - "Revision ID\" option.\n"); - revid_errata = 0; - } - msleep(200); - goto retry_loop; - } - /* This shouldn't happen. Longhaul ver. 2 was reported not - * working on processors without voltage scaling, but with - * RevID = 1. RevID errata will make things right. Just - * to be 100% sure. */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); - longhaul_version = TYPE_LONGHAUL_V1; - msleep(200); - goto retry_loop; - } - } - /* Report true CPU frequency */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - if (!bm_timeout) - printk(KERN_INFO PFX "Warning: Timeout while waiting for " - "idle PCI bus.\n"); -} - -/* - * Centaur decided to make life a little more tricky. - * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. - * Samuel2 and above have to try and guess what the FSB is. - * We do this by assuming we booted at maximum multiplier, and interpolate - * between that value multiplied by possible FSBs and cpu_mhz which - * was calculated at boot time. Really ugly, but no other way to do this. - */ - -#define ROUNDING 0xf - -static int guess_fsb(int mult) -{ - int speed = cpu_khz / 1000; - int i; - int speeds[] = { 666, 1000, 1333, 2000 }; - int f_max, f_min; - - for (i = 0; i < 4; i++) { - f_max = ((speeds[i] * mult) + 50) / 100; - f_max += (ROUNDING / 2); - f_min = f_max - ROUNDING; - if ((speed <= f_max) && (speed >= f_min)) - return speeds[i] / 10; - } - return 0; -} - - -static int __cpuinit longhaul_get_ranges(void) -{ - unsigned int i, j, k = 0; - unsigned int ratio; - int mult; - - /* Get current frequency */ - mult = longhaul_get_cpu_mult(); - if (mult == -1) { - printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); - return -EINVAL; - } - fsb = guess_fsb(mult); - if (fsb == 0) { - printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); - return -EINVAL; - } - /* Get max multiplier - as we always did. - * Longhaul MSR is useful only when voltage scaling is enabled. - * C3 is booting at max anyway. */ - maxmult = mult; - /* Get min multiplier */ - switch (cpu_model) { - case CPU_NEHEMIAH: - minmult = 50; - break; - case CPU_NEHEMIAH_C: - minmult = 40; - break; - default: - minmult = 30; - break; - } - - pr_debug("MinMult:%d.%dx MaxMult:%d.%dx\n", - minmult/10, minmult%10, maxmult/10, maxmult%10); - - highest_speed = calc_speed(maxmult); - lowest_speed = calc_speed(minmult); - pr_debug("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, - print_speed(lowest_speed/1000), - print_speed(highest_speed/1000)); - - if (lowest_speed == highest_speed) { - printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); - return -EINVAL; - } - if (lowest_speed > highest_speed) { - printk(KERN_INFO PFX "nonsense! 
lowest (%d > %d) !\n", - lowest_speed, highest_speed); - return -EINVAL; - } - - longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), - GFP_KERNEL); - if (!longhaul_table) - return -ENOMEM; - - for (j = 0; j < numscales; j++) { - ratio = mults[j]; - if (ratio == -1) - continue; - if (ratio > maxmult || ratio < minmult) - continue; - longhaul_table[k].frequency = calc_speed(ratio); - longhaul_table[k].index = j; - k++; - } - if (k <= 1) { - kfree(longhaul_table); - return -ENODEV; - } - /* Sort */ - for (j = 0; j < k - 1; j++) { - unsigned int min_f, min_i; - min_f = longhaul_table[j].frequency; - min_i = j; - for (i = j + 1; i < k; i++) { - if (longhaul_table[i].frequency < min_f) { - min_f = longhaul_table[i].frequency; - min_i = i; - } - } - if (min_i != j) { - swap(longhaul_table[j].frequency, - longhaul_table[min_i].frequency); - swap(longhaul_table[j].index, - longhaul_table[min_i].index); - } - } - - longhaul_table[k].frequency = CPUFREQ_TABLE_END; - - /* Find index we are running on */ - for (j = 0; j < k; j++) { - if (mults[longhaul_table[j].index & 0x1f] == mult) { - longhaul_index = j; - break; - } - } - return 0; -} - - -static void __cpuinit longhaul_setup_voltagescaling(void) -{ - union msr_longhaul longhaul; - struct mV_pos minvid, maxvid, vid; - unsigned int j, speed, pos, kHz_step, numvscales; - int min_vid_speed; - - rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); - if (!(longhaul.bits.RevisionID & 1)) { - printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); - return; - } - - if (!longhaul.bits.VRMRev) { - printk(KERN_INFO PFX "VRM 8.5\n"); - vrm_mV_table = &vrm85_mV[0]; - mV_vrm_table = &mV_vrm85[0]; - } else { - printk(KERN_INFO PFX "Mobile VRM\n"); - if (cpu_model < CPU_NEHEMIAH) - return; - vrm_mV_table = &mobilevrm_mV[0]; - mV_vrm_table = &mV_mobilevrm[0]; - } - - minvid = vrm_mV_table[longhaul.bits.MinimumVID]; - maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; - - if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { - printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " - "Voltage scaling disabled.\n", - minvid.mV/1000, minvid.mV%1000, - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - if (minvid.mV == maxvid.mV) { - printk(KERN_INFO PFX "Claims to support voltage scaling but " - "min & max are both %d.%03d. 
" - "Voltage scaling disabled\n", - maxvid.mV/1000, maxvid.mV%1000); - return; - } - - /* How many voltage steps*/ - numvscales = maxvid.pos - minvid.pos + 1; - printk(KERN_INFO PFX - "Max VID=%d.%03d " - "Min VID=%d.%03d, " - "%d possible voltage scales\n", - maxvid.mV/1000, maxvid.mV%1000, - minvid.mV/1000, minvid.mV%1000, - numvscales); - - /* Calculate max frequency at min voltage */ - j = longhaul.bits.MinMHzBR; - if (longhaul.bits.MinMHzBR4) - j += 16; - min_vid_speed = eblcr[j]; - if (min_vid_speed == -1) - return; - switch (longhaul.bits.MinMHzFSB) { - case 0: - min_vid_speed *= 13333; - break; - case 1: - min_vid_speed *= 10000; - break; - case 3: - min_vid_speed *= 6666; - break; - default: - return; - break; - } - if (min_vid_speed >= highest_speed) - return; - /* Calculate kHz for one voltage step */ - kHz_step = (highest_speed - min_vid_speed) / numvscales; - - j = 0; - while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { - speed = longhaul_table[j].frequency; - if (speed > min_vid_speed) - pos = (speed - min_vid_speed) / kHz_step + minvid.pos; - else - pos = minvid.pos; - longhaul_table[j].index |= mV_vrm_table[pos] << 8; - vid = vrm_mV_table[mV_vrm_table[pos]]; - printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", - speed, j, vid.mV); - j++; - } - - can_scale_voltage = 1; - printk(KERN_INFO PFX "Voltage scaling enabled.\n"); -} - - -static int longhaul_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, longhaul_table); -} - - -static int longhaul_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int table_index = 0; - unsigned int i; - unsigned int dir = 0; - u8 vid, current_vid; - - if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, - relation, &table_index)) - return -EINVAL; - - /* Don't set same frequency again */ - if (longhaul_index == table_index) - return 0; - - if (!can_scale_voltage) - longhaul_setstate(table_index); - else { - /* On test system voltage transitions exceeding single - * step up or down were turning motherboard off. Both - * "ondemand" and "userspace" are unsafe. C7 is doing - * this in hardware, C3 is old and we need to do this - * in software. 
*/ - i = longhaul_index; - current_vid = (longhaul_table[longhaul_index].index >> 8); - current_vid &= 0x1f; - if (table_index > longhaul_index) - dir = 1; - while (i != table_index) { - vid = (longhaul_table[i].index >> 8) & 0x1f; - if (vid != current_vid) { - longhaul_setstate(i); - current_vid = vid; - msleep(200); - } - if (dir) - i++; - else - i--; - } - longhaul_setstate(table_index); - } - longhaul_index = table_index; - return 0; -} - - -static unsigned int longhaul_get(unsigned int cpu) -{ - if (cpu) - return 0; - return calc_speed(longhaul_get_cpu_mult()); -} - -static acpi_status longhaul_walk_callback(acpi_handle obj_handle, - u32 nesting_level, - void *context, void **return_value) -{ - struct acpi_device *d; - - if (acpi_bus_get_device(obj_handle, &d)) - return 0; - - *return_value = acpi_driver_data(d); - return 1; -} - -/* VIA don't support PM2 reg, but have something similar */ -static int enable_arbiter_disable(void) -{ - struct pci_dev *dev; - int status = 1; - int reg; - u8 pci_cmd; - - /* Find PLE133 host bridge */ - reg = 0x78; - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, - NULL); - /* Find PM133/VT8605 host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8605_0, NULL); - /* Find CLE266 host bridge */ - if (dev == NULL) { - reg = 0x76; - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_862X_0, NULL); - /* Find CN400 V-Link host bridge */ - if (dev == NULL) - dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); - } - if (dev != NULL) { - /* Enable access to port 0x22 */ - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - pci_cmd |= 1<<7; - pci_write_config_byte(dev, reg, pci_cmd); - pci_read_config_byte(dev, reg, &pci_cmd); - if (!(pci_cmd & 1<<7)) { - printk(KERN_ERR PFX - "Can't enable access to port 0x22.\n"); - status = 0; - } - } - pci_dev_put(dev); - return status; - } - return 0; -} - -static int longhaul_setup_southbridge(void) -{ - struct pci_dev *dev; - u8 pci_cmd; - - /* Find VT8235 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); - if (dev == NULL) - /* Find VT8237 southbridge */ - dev = pci_get_device(PCI_VENDOR_ID_VIA, - PCI_DEVICE_ID_VIA_8237, NULL); - if (dev != NULL) { - /* Set transition time to max */ - pci_read_config_byte(dev, 0xec, &pci_cmd); - pci_cmd &= ~(1 << 2); - pci_write_config_byte(dev, 0xec, pci_cmd); - pci_read_config_byte(dev, 0xe4, &pci_cmd); - pci_cmd &= ~(1 << 7); - pci_write_config_byte(dev, 0xe4, pci_cmd); - pci_read_config_byte(dev, 0xe5, &pci_cmd); - pci_cmd |= 1 << 7; - pci_write_config_byte(dev, 0xe5, pci_cmd); - /* Get address of ACPI registers block*/ - pci_read_config_byte(dev, 0x81, &pci_cmd); - if (pci_cmd & 1 << 7) { - pci_read_config_dword(dev, 0x88, &acpi_regs_addr); - acpi_regs_addr &= 0xff00; - printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", - acpi_regs_addr); - } - - pci_dev_put(dev); - return 1; - } - return 0; -} - -static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - char *cpuname = NULL; - int ret; - u32 lo, hi; - - /* Check what we have on this motherboard */ - switch (c->x86_model) { - case 6: - cpu_model = CPU_SAMUEL; - cpuname = "C3 'Samuel' [C5A]"; - longhaul_version = TYPE_LONGHAUL_V1; - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); - break; - - case 7: - switch (c->x86_mask) { - case 0: - longhaul_version = TYPE_LONGHAUL_V1; - cpu_model = CPU_SAMUEL2; - 
cpuname = "C3 'Samuel 2' [C5B]"; - /* Note, this is not a typo, early Samuel2's had - * Samuel1 ratios. */ - memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); - memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); - break; - case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V2; - if (c->x86_mask < 8) { - cpu_model = CPU_SAMUEL2; - cpuname = "C3 'Samuel 2' [C5B]"; - } else { - cpu_model = CPU_EZRA; - cpuname = "C3 'Ezra' [C5C]"; - } - memcpy(mults, ezra_mults, sizeof(ezra_mults)); - memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); - break; - } - break; - - case 8: - cpu_model = CPU_EZRA_T; - cpuname = "C3 'Ezra-T' [C5M]"; - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); - memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); - break; - - case 9: - longhaul_version = TYPE_POWERSAVER; - numscales = 32; - memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); - memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); - switch (c->x86_mask) { - case 0 ... 1: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah A' [C5XLOE]"; - break; - case 2 ... 4: - cpu_model = CPU_NEHEMIAH; - cpuname = "C3 'Nehemiah B' [C5XLOH]"; - break; - case 5 ... 15: - cpu_model = CPU_NEHEMIAH_C; - cpuname = "C3 'Nehemiah C' [C5P]"; - break; - } - break; - - default: - cpuname = "Unknown"; - break; - } - /* Check Longhaul ver. 2 */ - if (longhaul_version == TYPE_LONGHAUL_V2) { - rdmsr(MSR_VIA_LONGHAUL, lo, hi); - if (lo == 0 && hi == 0) - /* Looks like MSR isn't present */ - longhaul_version = TYPE_LONGHAUL_V1; - } - - printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); - switch (longhaul_version) { - case TYPE_LONGHAUL_V1: - case TYPE_LONGHAUL_V2: - printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); - break; - case TYPE_POWERSAVER: - printk(KERN_CONT "Powersaver supported.\n"); - break; - }; - - /* Doesn't hurt */ - longhaul_setup_southbridge(); - - /* Find ACPI data for processor */ - acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, - ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, - NULL, (void *)&pr); - - /* Check ACPI support for C3 state */ - if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { - cx = &pr->power.states[ACPI_STATE_C3]; - if (cx->address > 0 && cx->latency <= 1000) - longhaul_flags |= USE_ACPI_C3; - } - /* Disable if it isn't working */ - if (disable_acpi_c3) - longhaul_flags &= ~USE_ACPI_C3; - /* Check if northbridge is friendly */ - if (enable_arbiter_disable()) - longhaul_flags |= USE_NORTHBRIDGE; - - /* Check ACPI support for bus master arbiter disable */ - if (!(longhaul_flags & USE_ACPI_C3 - || longhaul_flags & USE_NORTHBRIDGE) - && ((pr == NULL) || !(pr->flags.bm_control))) { - printk(KERN_ERR PFX - "No ACPI support. 
Unsupported northbridge.\n"); - return -ENODEV; - } - - if (longhaul_flags & USE_NORTHBRIDGE) - printk(KERN_INFO PFX "Using northbridge support.\n"); - if (longhaul_flags & USE_ACPI_C3) - printk(KERN_INFO PFX "Using ACPI support.\n"); - - ret = longhaul_get_ranges(); - if (ret != 0) - return ret; - - if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) - longhaul_setup_voltagescaling(); - - policy->cpuinfo.transition_latency = 200000; /* nsec */ - policy->cur = calc_speed(longhaul_get_cpu_mult()); - - ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); - if (ret) - return ret; - - cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); - - return 0; -} - -static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *longhaul_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver longhaul_driver = { - .verify = longhaul_verify, - .target = longhaul_target, - .get = longhaul_get, - .init = longhaul_cpu_init, - .exit = __devexit_p(longhaul_cpu_exit), - .name = "longhaul", - .owner = THIS_MODULE, - .attr = longhaul_attr, -}; - - -static int __init longhaul_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) - return -ENODEV; - -#ifdef CONFIG_SMP - if (num_online_cpus() > 1) { - printk(KERN_ERR PFX "More than 1 CPU detected, " - "longhaul disabled.\n"); - return -ENODEV; - } -#endif -#ifdef CONFIG_X86_IO_APIC - if (cpu_has_apic) { - printk(KERN_ERR PFX "APIC detected. Longhaul is currently " - "broken in this configuration.\n"); - return -ENODEV; - } -#endif - switch (c->x86_model) { - case 6 ... 9: - return cpufreq_register_driver(&longhaul_driver); - case 10: - printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); - default: - ; - } - - return -ENODEV; -} - - -static void __exit longhaul_exit(void) -{ - int i; - - for (i = 0; i < numscales; i++) { - if (mults[i] == maxmult) { - longhaul_setstate(i); - break; - } - } - - cpufreq_unregister_driver(&longhaul_driver); - kfree(longhaul_table); -} - -/* Even if BIOS is exporting ACPI C3 state, and it is used - * with success when CPU is idle, this state doesn't - * trigger frequency transition in some cases. */ -module_param(disable_acpi_c3, int, 0644); -MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); -/* Change CPU voltage with frequency. Very useful to save - * power, but most VIA C3 processors aren't supporting it. */ -module_param(scale_voltage, int, 0644); -MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); -/* Force revision key to 0 for processors which doesn't - * support voltage scaling, but are introducing itself as - * such. */ -module_param(revid_errata, int, 0644); -MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); - -MODULE_AUTHOR("Dave Jones "); -MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); -MODULE_LICENSE("GPL"); - -late_initcall(longhaul_init); -module_exit(longhaul_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h deleted file mode 100644 index cbf48fbca88..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * longhaul.h - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. 
- * - * VIA-specific information - */ - -union msr_bcr2 { - struct { - unsigned Reseved:19, // 18:0 - ESOFTBF:1, // 19 - Reserved2:3, // 22:20 - CLOCKMUL:4, // 26:23 - Reserved3:5; // 31:27 - } bits; - unsigned long val; -}; - -union msr_longhaul { - struct { - unsigned RevisionID:4, // 3:0 - RevisionKey:4, // 7:4 - EnableSoftBusRatio:1, // 8 - EnableSoftVID:1, // 9 - EnableSoftBSEL:1, // 10 - Reserved:3, // 11:13 - SoftBusRatio4:1, // 14 - VRMRev:1, // 15 - SoftBusRatio:4, // 19:16 - SoftVID:5, // 24:20 - Reserved2:3, // 27:25 - SoftBSEL:2, // 29:28 - Reserved3:2, // 31:30 - MaxMHzBR:4, // 35:32 - MaximumVID:5, // 40:36 - MaxMHzFSB:2, // 42:41 - MaxMHzBR4:1, // 43 - Reserved4:4, // 47:44 - MinMHzBR:4, // 51:48 - MinimumVID:5, // 56:52 - MinMHzFSB:2, // 58:57 - MinMHzBR4:1, // 59 - Reserved5:4; // 63:60 - } bits; - unsigned long long val; -}; - -/* - * Clock ratio tables. Div/Mod by 10 to get ratio. - * The eblcr values specify the ratio read from the CPU. - * The mults values specify what to write to the CPU. - */ - -/* - * VIA C3 Samuel 1 & Samuel 2 (stepping 0) - */ -static const int __cpuinitdata samuel1_mults[16] = { - -1, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - -1, /* 0100 -> RESERVED */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - -1, /* 1111 -> RESERVED */ -}; - -static const int __cpuinitdata samuel1_eblcr[16] = { - 50, /* 0000 -> RESERVED */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - -1, /* 0011 -> RESERVED */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - -1, /* 0111 -> RESERVED */ - -1, /* 1000 -> RESERVED */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - -1, /* 1100 -> RESERVED */ - 75, /* 1101 -> 7.5x */ - -1, /* 1110 -> RESERVED */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Samuel2 Stepping 1->15 - */ -static const int __cpuinitdata samuel2_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 110, /* 0111 -> 11.0x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 130, /* 1110 -> 13.0x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 Ezra - */ -static const int __cpuinitdata ezra_mults[16] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata ezra_eblcr[16] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ -}; - -/* - * VIA C3 (Ezra-T) [C5M]. 
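Ezra-T and later use 32-entry tables; bit 4 of the table index selects - * the upper half and is written to SoftBusRatio4 in the Longhaul MSR.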
- */ -static const int __cpuinitdata ezrat_mults[32] = { - 100, /* 0000 -> 10.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - - -1, /* 0000 -> RESERVED (10.0x) */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (9.0x)*/ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> RESERVED (12.0x) */ -}; - -static const int __cpuinitdata ezrat_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 30, /* 0001 -> 3.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - 35, /* 0101 -> 3.5x */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - - -1, /* 0000 -> RESERVED (9.0x) */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - -1, /* 0011 -> RESERVED (10.0x)*/ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - -1, /* 1100 -> RESERVED (12.0x) */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145, /* 1111 -> 14.5x */ -}; - -/* - * VIA C3 Nehemiah */ - -static const int __cpuinitdata nehemiah_mults[32] = { - 100, /* 0000 -> 10.0x */ - -1, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 90, /* 0011 -> 9.0x */ - 95, /* 0100 -> 9.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 55, /* 0111 -> 5.5x */ - 60, /* 1000 -> 6.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 50, /* 1011 -> 5.0x */ - 65, /* 1100 -> 6.5x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 120, /* 1111 -> 12.0x */ - -1, /* 0000 -> 10.0x */ - 110, /* 0001 -> 11.0x */ - -1, /* 0010 -> 12.0x */ - -1, /* 0011 -> 9.0x */ - 105, /* 0100 -> 10.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 135, /* 0111 -> 13.5x */ - 140, /* 1000 -> 14.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 1010 -> 16.0x */ - 130, /* 1011 -> 13.0x */ - 145, /* 1100 -> 14.5x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - -1, /* 1111 -> 12.0x */ -}; - -static const int __cpuinitdata nehemiah_eblcr[32] = { - 50, /* 0000 -> 5.0x */ - 160, /* 0001 -> 16.0x */ - 40, /* 0010 -> 4.0x */ - 100, /* 0011 -> 10.0x */ - 55, /* 0100 -> 5.5x */ - -1, /* 0101 -> RESERVED */ - 45, /* 0110 -> 4.5x */ - 95, /* 0111 -> 9.5x */ - 90, /* 1000 -> 9.0x */ - 70, /* 1001 -> 7.0x */ - 80, /* 1010 -> 8.0x */ - 60, /* 1011 -> 6.0x */ - 120, /* 1100 -> 12.0x */ - 75, /* 1101 -> 7.5x */ - 85, /* 1110 -> 8.5x */ - 65, /* 1111 -> 6.5x */ - 90, /* 0000 -> 9.0x */ - 110, /* 0001 -> 11.0x */ - 120, /* 0010 -> 12.0x */ - 100, /* 0011 -> 10.0x */ - 135, /* 0100 -> 13.5x */ - 115, /* 0101 -> 11.5x */ - 125, /* 0110 -> 12.5x */ - 105, /* 0111 -> 10.5x */ - 130, /* 1000 -> 13.0x */ - 150, /* 1001 -> 15.0x */ - 160, /* 
1010 -> 16.0x */ - 140, /* 1011 -> 14.0x */ - 120, /* 1100 -> 12.0x */ - 155, /* 1101 -> 15.5x */ - -1, /* 1110 -> RESERVED (13.0x) */ - 145 /* 1111 -> 14.5x */ -}; - -/* - * Voltage scales. Div/Mod by 1000 to get actual voltage. - * Which scale to use depends on the VRM type in use. - */ - -struct mV_pos { - unsigned short mV; - unsigned short pos; -}; - -static const struct mV_pos __cpuinitdata vrm85_mV[32] = { - {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, - {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, - {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, - {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, - {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, - {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, - {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, - {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} -}; - -static const unsigned char __cpuinitdata mV_vrm85[32] = { - 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, - 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, - 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, - 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 -}; - -static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { - {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, - {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, - {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, - {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, - {975, 15}, {950, 14}, {925, 13}, {900, 12}, - {875, 11}, {850, 10}, {825, 9}, {800, 8}, - {775, 7}, {750, 6}, {725, 5}, {700, 4}, - {675, 3}, {650, 2}, {625, 1}, {600, 0} -}; - -static const unsigned char __cpuinitdata mV_mobilevrm[32] = { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, - 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 -}; - diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c deleted file mode 100644 index 34ea359b370..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ /dev/null @@ -1,324 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include - -#include -#include - -static struct cpufreq_driver longrun_driver; - -/** - * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz - * values into per cent values. 
In TMTA microcode, the following is valid: - * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - */ -static unsigned int longrun_low_freq, longrun_high_freq; - - -/** - * longrun_get_policy - get the current LongRun policy - * @policy: struct cpufreq_policy where current policy is written into - * - * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS - * and MSR_TMTA_LONGRUN_CTRL - */ -static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - pr_debug("longrun flags are %x - %x\n", msr_lo, msr_hi); - if (msr_lo & 0x01) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; - - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - pr_debug("longrun ctrl is %x - %x\n", msr_lo, msr_hi); - msr_lo &= 0x0000007F; - msr_hi &= 0x0000007F; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - policy->min = policy->max = longrun_high_freq; - } else { - policy->min = longrun_low_freq + msr_lo * - ((longrun_high_freq - longrun_low_freq) / 100); - policy->max = longrun_low_freq + msr_hi * - ((longrun_high_freq - longrun_low_freq) / 100); - } - policy->cpu = 0; -} - - -/** - * longrun_set_policy - sets a new CPUFreq policy - * @policy: new policy - * - * Sets a new CPUFreq policy on LongRun-capable processors. This function - * has to be called with cpufreq_driver locked. - */ -static int longrun_set_policy(struct cpufreq_policy *policy) -{ - u32 msr_lo, msr_hi; - u32 pctg_lo, pctg_hi; - - if (!policy) - return -EINVAL; - - if (longrun_high_freq <= longrun_low_freq) { - /* Assume degenerate Longrun table */ - pctg_lo = pctg_hi = 100; - } else { - pctg_lo = (policy->min - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - pctg_hi = (policy->max - longrun_low_freq) / - ((longrun_high_freq - longrun_low_freq) / 100); - } - - if (pctg_hi > 100) - pctg_hi = 100; - if (pctg_lo > pctg_hi) - pctg_lo = pctg_hi; - - /* performance or economy mode */ - rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - msr_lo &= 0xFFFFFFFE; - switch (policy->policy) { - case CPUFREQ_POLICY_PERFORMANCE: - msr_lo |= 0x00000001; - break; - case CPUFREQ_POLICY_POWERSAVE: - break; - } - wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); - - /* lower and upper boundary */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_lo |= pctg_lo; - msr_hi |= pctg_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - return 0; -} - - -/** - * longrun_verify_poliy - verifies a new CPUFreq policy - * @policy: the policy to verify - * - * Validates a new CPUFreq policy. This function has to be called with - * cpufreq_driver locked. 
- */ -static int longrun_verify_policy(struct cpufreq_policy *policy) -{ - if (!policy) - return -EINVAL; - - policy->cpu = 0; - cpufreq_verify_within_limits(policy, - policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - - if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && - (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) - return -EINVAL; - - return 0; -} - -static unsigned int longrun_get(unsigned int cpu) -{ - u32 eax, ebx, ecx, edx; - - if (cpu) - return 0; - - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - pr_debug("cpuid eax is %u\n", eax); - - return eax * 1000; -} - -/** - * longrun_determine_freqs - determines the lowest and highest possible core frequency - * @low_freq: an int to put the lowest frequency into - * @high_freq: an int to put the highest frequency into - * - * Determines the lowest and highest possible core frequencies on this CPU. - * This is necessary to calculate the performance percentage according to - * TMTA rules: - * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) - */ -static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) -{ - u32 msr_lo, msr_hi; - u32 save_lo, save_hi; - u32 eax, ebx, ecx, edx; - u32 try_hi; - struct cpuinfo_x86 *c = &cpu_data(0); - - if (!low_freq || !high_freq) - return -EINVAL; - - if (cpu_has(c, X86_FEATURE_LRTI)) { - /* if the LongRun Table Interface is present, the - * detection is a bit easier: - * For minimum frequency, read out the maximum - * level (msr_hi), write that into "currently - * selected level", and read out the frequency. - * For maximum frequency, read out level zero. - */ - /* minimum */ - rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); - wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *low_freq = msr_lo * 1000; /* to kHz */ - - /* maximum */ - wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); - rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); - *high_freq = msr_lo * 1000; /* to kHz */ - - pr_debug("longrun table interface told %u - %u kHz\n", - *low_freq, *high_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - return 0; - } - - /* set the upper border to the value determined during TSC init */ - *high_freq = (cpu_khz / 1000); - *high_freq = *high_freq * 1000; - pr_debug("high frequency is %u kHz\n", *high_freq); - - /* get current borders */ - rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - save_lo = msr_lo & 0x0000007F; - save_hi = msr_hi & 0x0000007F; - - /* if current perf_pctg is larger than 90%, we need to decrease the - * upper limit to make the calculation more accurate. - */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - /* try decreasing in 10% steps, some processors react only - * on some barrier values */ - for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { - /* set to 0 to try_hi perf_pctg */ - msr_lo &= 0xFFFFFF80; - msr_hi &= 0xFFFFFF80; - msr_hi |= try_hi; - wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); - - /* read out current core MHz and current perf_pctg */ - cpuid(0x80860007, &eax, &ebx, &ecx, &edx); - - /* restore values */ - wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); - } - pr_debug("percentage is %u %%, freq is %u MHz\n", ecx, eax); - - /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) - * equals - * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) - * - * high_freq * perf_pctg is stored temporarily into "ebx". 
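Worked example with made-up numbers: cpu_khz = 600000 (high_freq = - * 600 MHz), eax = 450 MHz and ecx = 50%. Then ebx = 600 * 50 / 100 = 300 - * and low_freq = (450 - 300) * 100 / (100 - 50) = 300 MHz, which is - * consistent with perf_pctg = (450 - 300) / (600 - 300) = 50%.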
- */ - ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ - - if ((ecx > 95) || (ecx == 0) || (eax < ebx)) - return -EIO; - - edx = ((eax - ebx) * 100) / (100 - ecx); - *low_freq = edx * 1000; /* back to kHz */ - - pr_debug("low frequency is %u kHz\n", *low_freq); - - if (*low_freq > *high_freq) - *low_freq = *high_freq; - - return 0; -} - - -static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) -{ - int result = 0; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - /* detect low and high frequency */ - result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); - if (result) - return result; - - /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = longrun_low_freq; - policy->cpuinfo.max_freq = longrun_high_freq; - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - longrun_get_policy(policy); - - return 0; -} - - -static struct cpufreq_driver longrun_driver = { - .flags = CPUFREQ_CONST_LOOPS, - .verify = longrun_verify_policy, - .setpolicy = longrun_set_policy, - .get = longrun_get, - .init = longrun_cpu_init, - .name = "longrun", - .owner = THIS_MODULE, -}; - - -/** - * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver - * - * Initializes the LongRun support. - */ -static int __init longrun_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_TRANSMETA || - !cpu_has(c, X86_FEATURE_LONGRUN)) - return -ENODEV; - - return cpufreq_register_driver(&longrun_driver); -} - - -/** - * longrun_exit - unregisters LongRun support - */ -static void __exit longrun_exit(void) -{ - cpufreq_unregister_driver(&longrun_driver); -} - - -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " - "Efficeon processors."); -MODULE_LICENSE("GPL"); - -module_init(longrun_init); -module_exit(longrun_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c deleted file mode 100644 index 911e193018a..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.c +++ /dev/null @@ -1,51 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "mperf.h" - -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. 
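For illustration: if APERF advanced half as many counts as MPERF since - * the previous call, the function below reports max_freq / 2, whatever - * the absolute counter values were.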
- */ -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} -EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h deleted file mode 100644 index 5dbf2950dc2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/mperf.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * (c) 2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c deleted file mode 100644 index 6be3e0760c2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ /dev/null @@ -1,329 +0,0 @@ -/* - * Pentium 4/Xeon CPU on demand clock modulation/speed scaling - * (C) 2002 - 2003 Dominik Brodowski - * (C) 2002 Zwane Mwaikambo - * (C) 2002 Arjan van de Ven - * (C) 2002 Tora T. Engstad - * All Rights Reserved - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The author(s) of this software shall not be held liable for damages - * of any nature resulting due to the use of this software. This - * software is provided AS-IS with no warranties. 
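The APERF/MPERF ratio used by the mperf helpers above can also be observed from user space. A rough sketch, assuming root, a loaded msr driver (/dev/cpu/0/msr) and the architectural MSR numbers 0xe7/0xe8; the one-second sampling window is arbitrary:

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define MSR_IA32_MPERF	0xe7	/* ticks at the max advertised frequency */
#define MSR_IA32_APERF	0xe8	/* ticks at the actual frequency */

static uint64_t rdmsr_fd(int fd, uint32_t reg)
{
	uint64_t val = 0;

	pread(fd, &val, sizeof(val), reg);	/* file offset selects the MSR */
	return val;
}

int main(void)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	uint64_t m0, a0, m1, a1;

	if (fd < 0)
		return 1;
	m0 = rdmsr_fd(fd, MSR_IA32_MPERF);
	a0 = rdmsr_fd(fd, MSR_IA32_APERF);
	sleep(1);
	m1 = rdmsr_fd(fd, MSR_IA32_MPERF);
	a1 = rdmsr_fd(fd, MSR_IA32_APERF);
	close(fd);

	/* only the delta ratio means anything, per the comment above */
	printf("ran at %.1f%% of max while in C0\n",
	       100.0 * (a1 - a0) / (m1 - m0));
	return 0;
}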
- * - * Date Errata Description - * 20020525 N44, O17 12.5% or 25% DC causes lockup - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "speedstep-lib.h" - -#define PFX "p4-clockmod: " - -/* - * Duty Cycle (3bits), note DC_DISABLE is not specified in - * intel docs i just use it to mean disable - */ -enum { - DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, - DC_64PT, DC_75PT, DC_88PT, DC_DISABLE -}; - -#define DC_ENTRIES 8 - - -static int has_N44_O17_errata[NR_CPUS]; -static unsigned int stock_freq; -static struct cpufreq_driver p4clockmod_driver; -static unsigned int cpufreq_p4_get(unsigned int cpu); - -static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) -{ - u32 l, h; - - if (!cpu_online(cpu) || - (newstate > DC_DISABLE) || (newstate == DC_RESV)) - return -EINVAL; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); - - if (l & 0x01) - pr_debug("CPU#%d currently thermal throttled\n", cpu); - - if (has_N44_O17_errata[cpu] && - (newstate == DC_25PT || newstate == DC_DFLT)) - newstate = DC_38PT; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - if (newstate == DC_DISABLE) { - pr_debug("CPU#%d disabling modulation\n", cpu); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); - } else { - pr_debug("CPU#%d setting duty cycle to %d%%\n", - cpu, ((125 * newstate) / 10)); - /* bits 63 - 5 : reserved - * bit 4 : enable/disable - * bits 3-1 : duty cycle - * bit 0 : reserved - */ - l = (l & ~14); - l = l | (1<<4) | ((newstate & 0x7)<<1); - wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); - } - - return 0; -} - - -static struct cpufreq_frequency_table p4clockmod_table[] = { - {DC_RESV, CPUFREQ_ENTRY_INVALID}, - {DC_DFLT, 0}, - {DC_25PT, 0}, - {DC_38PT, 0}, - {DC_50PT, 0}, - {DC_64PT, 0}, - {DC_75PT, 0}, - {DC_88PT, 0}, - {DC_DISABLE, 0}, - {DC_RESV, CPUFREQ_TABLE_END}, -}; - - -static int cpufreq_p4_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = DC_RESV; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = cpufreq_p4_get(policy->cpu); - freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; - - if (freqs.new == freqs.old) - return 0; - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - /* run on each logical CPU, - * see section 13.15.3 of IA32 Intel Architecture Software - * Developer's Manual, Volume 3 - */ - for_each_cpu(i, policy->cpus) - cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); - - /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -static int cpufreq_p4_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); -} - - -static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) -{ - if (c->x86 == 0x06) { - if (cpu_has(c, X86_FEATURE_EST)) - printk_once(KERN_WARNING PFX "Warning: EST-capable " - "CPU detected. The acpi-cpufreq module offers " - "voltage scaling in addition to frequency " - "scaling. 
You should use that instead of " - "p4-clockmod, if possible.\n"); - switch (c->x86_model) { - case 0x0E: /* Core */ - case 0x0F: /* Core Duo */ - case 0x16: /* Celeron Core */ - case 0x1C: /* Atom */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); - case 0x0D: /* Pentium M (Dothan) */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - /* fall through */ - case 0x09: /* Pentium M (Banias) */ - return speedstep_get_frequency(SPEEDSTEP_CPU_PM); - } - } - - if (c->x86 != 0xF) - return 0; - - /* on P-4s, the TSC runs with constant frequency independent whether - * throttling is active or not. */ - p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { - printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " - "The speedstep-ich or acpi cpufreq modules offer " - "voltage scaling in addition of frequency scaling. " - "You should use either one instead of p4-clockmod, " - "if possible.\n"); - return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); - } - - return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); -} - - - -static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(policy->cpu); - int cpuid = 0; - unsigned int i; - -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - - /* Errata workaround */ - cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; - switch (cpuid) { - case 0x0f07: - case 0x0f0a: - case 0x0f11: - case 0x0f12: - has_N44_O17_errata[policy->cpu] = 1; - pr_debug("has errata -- disabling low frequencies\n"); - } - - if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && - c->x86_model < 2) { - /* switch to maximum frequency and measure result */ - cpufreq_p4_setdc(policy->cpu, DC_DISABLE); - recalibrate_cpu_khz(); - } - /* get max frequency */ - stock_freq = cpufreq_p4_get_frequency(c); - if (!stock_freq) - return -EINVAL; - - /* table init */ - for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { - if ((i < 2) && (has_N44_O17_errata[policy->cpu])) - p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; - else - p4clockmod_table[i].frequency = (stock_freq * i)/8; - } - cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); - - /* cpuinfo and default policy values */ - - /* the transition latency is set to be 1 higher than the maximum - * transition latency of the ondemand governor */ - policy->cpuinfo.transition_latency = 10000001; - policy->cur = stock_freq; - - return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); -} - - -static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int cpufreq_p4_get(unsigned int cpu) -{ - u32 l, h; - - rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); - - if (l & 0x10) { - l = l >> 1; - l &= 0x7; - } else - l = DC_DISABLE; - - if (l != DC_DISABLE) - return stock_freq * l / 8; - - return stock_freq; -} - -static struct freq_attr *p4clockmod_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver p4clockmod_driver = { - .verify = cpufreq_p4_verify, - .target = cpufreq_p4_target, - .init = cpufreq_p4_cpu_init, - .exit = cpufreq_p4_cpu_exit, - .get = cpufreq_p4_get, - .name = "p4-clockmod", - .owner = THIS_MODULE, - .attr = p4clockmod_attr, -}; - - -static int __init cpufreq_p4_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int ret; - - /* - * THERM_CONTROL is 
architectural for IA32 now, so - * we can rely on the capability checks - */ - if (c->x86_vendor != X86_VENDOR_INTEL) - return -ENODEV; - - if (!test_cpu_cap(c, X86_FEATURE_ACPI) || - !test_cpu_cap(c, X86_FEATURE_ACC)) - return -ENODEV; - - ret = cpufreq_register_driver(&p4clockmod_driver); - if (!ret) - printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " - "Modulation available\n"); - - return ret; -} - - -static void __exit cpufreq_p4_exit(void) -{ - cpufreq_unregister_driver(&p4clockmod_driver); -} - - -MODULE_AUTHOR("Zwane Mwaikambo "); -MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); -MODULE_LICENSE("GPL"); - -late_initcall(cpufreq_p4_init); -module_exit(cpufreq_p4_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c deleted file mode 100644 index 7b0603eb012..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ /dev/null @@ -1,621 +0,0 @@ -/* - * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface - * - * Copyright (C) 2009 Red Hat, Matthew Garrett - * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. - * Nagananda Chumbalkar - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON - * INFRINGEMENT. See the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 675 Mass Ave, Cambridge, MA 02139, USA. 
- * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include - -#define PCC_VERSION "1.10.00" -#define POLL_LOOPS 300 - -#define CMD_COMPLETE 0x1 -#define CMD_GET_FREQ 0x0 -#define CMD_SET_FREQ 0x1 - -#define BUF_SZ 4 - -struct pcc_register_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -} __attribute__ ((packed)); - -struct pcc_memory_resource { - u8 descriptor; - u16 length; - u8 space_id; - u8 resource_usage; - u8 type_specific; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; -} __attribute__ ((packed)); - -static struct cpufreq_driver pcc_cpufreq_driver; - -struct pcc_header { - u32 signature; - u16 length; - u8 major; - u8 minor; - u32 features; - u16 command; - u16 status; - u32 latency; - u32 minimum_time; - u32 maximum_time; - u32 nominal; - u32 throttled_frequency; - u32 minimum_frequency; -}; - -static void __iomem *pcch_virt_addr; -static struct pcc_header __iomem *pcch_hdr; - -static DEFINE_SPINLOCK(pcc_lock); - -static struct acpi_generic_address doorbell; - -static u64 doorbell_preserve; -static u64 doorbell_write; - -static u8 OSC_UUID[16] = {0x9F, 0x2C, 0x9B, 0x63, 0x91, 0x70, 0x1f, 0x49, - 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; - -struct pcc_cpu { - u32 input_offset; - u32 output_offset; -}; - -static struct pcc_cpu __percpu *pcc_cpu_info; - -static int pcc_cpufreq_verify(struct cpufreq_policy *policy) -{ - cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, - policy->cpuinfo.max_freq); - return 0; -} - -static inline void pcc_cmd(void) -{ - u64 doorbell_value; - int i; - - acpi_read(&doorbell_value, &doorbell); - acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, - &doorbell); - - for (i = 0; i < POLL_LOOPS; i++) { - if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) - break; - } -} - -static inline void pcc_clear_mapping(void) -{ - if (pcch_virt_addr) - iounmap(pcch_virt_addr); - pcch_virt_addr = NULL; -} - -static unsigned int pcc_get_freq(unsigned int cpu) -{ - struct pcc_cpu *pcc_cpu_data; - unsigned int curr_freq; - unsigned int freq_limit; - u16 status; - u32 input_buffer; - u32 output_buffer; - - spin_lock(&pcc_lock); - - pr_debug("get: get_freq for CPU %d\n", cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - input_buffer = 0x1; - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_GET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - output_buffer = - ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - pr_debug("get: FAILED: for CPU %d, status is %d\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) - / 100) * 1000); - - pr_debug("get: SUCCESS: (virtual) output_offset for cpu %d is " - "0x%p, contains a value of: 0x%x. 
Speed is: %d MHz\n", - cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), - output_buffer, curr_freq); - - freq_limit = (output_buffer >> 8) & 0xff; - if (freq_limit != 0xff) { - pr_debug("get: frequency for cpu %d is being temporarily" - " capped at %d\n", cpu, curr_freq); - } - - spin_unlock(&pcc_lock); - return curr_freq; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return 0; -} - -static int pcc_cpufreq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - struct pcc_cpu *pcc_cpu_data; - struct cpufreq_freqs freqs; - u16 status; - u32 input_buffer; - int cpu; - - spin_lock(&pcc_lock); - cpu = policy->cpu; - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - pr_debug("target: CPU %d should go to target freq: %d " - "(virtual) input_offset is 0x%p\n", - cpu, target_freq, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - - freqs.new = target_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - input_buffer = 0x1 | (((target_freq * 100) - / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); - iowrite32(input_buffer, - (pcch_virt_addr + pcc_cpu_data->input_offset)); - iowrite16(CMD_SET_FREQ, &pcch_hdr->command); - - pcc_cmd(); - - /* Clear the input buffer - we are done with the current command */ - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - - status = ioread16(&pcch_hdr->status); - if (status != CMD_COMPLETE) { - pr_debug("target: FAILED for cpu %d, with status: 0x%x\n", - cpu, status); - goto cmd_incomplete; - } - iowrite16(0, &pcch_hdr->status); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); - spin_unlock(&pcc_lock); - - return 0; - -cmd_incomplete: - iowrite16(0, &pcch_hdr->status); - spin_unlock(&pcc_lock); - return -EINVAL; -} - -static int pcc_get_offset(int cpu) -{ - acpi_status status; - struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object *pccp, *offset; - struct pcc_cpu *pcc_cpu_data; - struct acpi_processor *pr; - int ret = 0; - - pr = per_cpu(processors, cpu); - pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); - - status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - pccp = buffer.pointer; - if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - }; - - offset = &(pccp->package.elements[0]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->input_offset = offset->integer.value; - - offset = &(pccp->package.elements[1]); - if (!offset || offset->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto out_free; - } - - pcc_cpu_data->output_offset = offset->integer.value; - - memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); - memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); - - pr_debug("pcc_get_offset: for CPU %d: pcc_cpu_data " - "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", - cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); -out_free: - kfree(buffer.pointer); - return ret; -} - -static int __init pcc_cpufreq_do_osc(acpi_handle *handle) -{ - acpi_status status; - struct acpi_object_list input; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - union acpi_object in_params[4]; - union acpi_object *out_obj; - u32 capabilities[2]; - u32 errors; - u32 supported; - int ret = 0; - - input.count = 4; - input.pointer = in_params; - 
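/*
 * The shared-memory handshake that pcc_get_freq() and pcc_cpufreq_target()
 * above perform boils down to one sequence. An illustrative sketch, reusing
 * the deleted driver's own symbols (pcch_virt_addr, pcch_hdr, pcc_cmd(),
 * BUF_SZ) on the assumption they are in scope; this is not code from the
 * patch:
 */
static u32 pcc_mailbox_roundtrip(struct pcc_cpu *pcpu, u16 cmd, u32 input)
{
	u32 output;

	/* 1. fill this CPU's input buffer in the shared region */
	iowrite32(input, pcch_virt_addr + pcpu->input_offset);
	/* 2. latch the command and ring the doorbell (polls for completion) */
	iowrite16(cmd, &pcch_hdr->command);
	pcc_cmd();
	/* 3. collect the result and clear the input buffer for the next user */
	output = ioread32(pcch_virt_addr + pcpu->output_offset);
	memset_io(pcch_virt_addr + pcpu->input_offset, 0, BUF_SZ);
	if (ioread16(&pcch_hdr->status) != CMD_COMPLETE)
		output = 0;
	iowrite16(0, &pcch_hdr->status);
	return output;
}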
in_params[0].type = ACPI_TYPE_BUFFER; - in_params[0].buffer.length = 16; - in_params[0].buffer.pointer = OSC_UUID; - in_params[1].type = ACPI_TYPE_INTEGER; - in_params[1].integer.value = 1; - in_params[2].type = ACPI_TYPE_INTEGER; - in_params[2].integer.value = 2; - in_params[3].type = ACPI_TYPE_BUFFER; - in_params[3].buffer.length = 8; - in_params[3].buffer.pointer = (u8 *)&capabilities; - - capabilities[0] = OSC_QUERY_ENABLE; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - - kfree(output.pointer); - capabilities[0] = 0x0; - capabilities[1] = 0x1; - - status = acpi_evaluate_object(*handle, "_OSC", &input, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - if (!output.length) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } - - supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } - -out_free: - kfree(output.pointer); - return ret; -} - -static int __init pcc_cpufreq_probe(void) -{ - acpi_status status; - struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; - struct pcc_memory_resource *mem_resource; - struct pcc_register_resource *reg_resource; - union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle, pcch_handle; - int ret = 0; - - status = acpi_get_handle(NULL, "\\_SB", &handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "PCCH", &pcch_handle); - if (ACPI_FAILURE(status)) - return -ENODEV; - - status = acpi_get_handle(handle, "_OSC", &osc_handle); - if (ACPI_SUCCESS(status)) { - ret = pcc_cpufreq_do_osc(&osc_handle); - if (ret) - pr_debug("probe: _OSC evaluation did not succeed\n"); - /* Firmware's use of _OSC is optional */ - ret = 0; - } - - status = acpi_evaluate_object(handle, "PCCH", NULL, &output); - if (ACPI_FAILURE(status)) - return -ENODEV; - - out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_PACKAGE) { - ret = -ENODEV; - goto out_free; - } - - member = &out_obj->package.elements[0]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } - - mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; - - pr_debug("probe: mem_resource descriptor: 0x%x," - " length: %d, space_id: %d, resource_usage: %d," - " type_specific: %d, granularity: 0x%llx," - " minimum: 0x%llx, maximum: 0x%llx," - " translation_offset: 0x%llx, address_length: 0x%llx\n", - mem_resource->descriptor, mem_resource->length, - mem_resource->space_id, mem_resource->resource_usage, - mem_resource->type_specific, mem_resource->granularity, - mem_resource->minimum, mem_resource->maximum, - mem_resource->translation_offset, - mem_resource->address_length); - - if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { - ret = -ENODEV; - goto out_free; - } - - pcch_virt_addr = ioremap_nocache(mem_resource->minimum, - mem_resource->address_length); - if (pcch_virt_addr == 
NULL) { - pr_debug("probe: could not map shared mem region\n"); - goto out_free; - } - pcch_hdr = pcch_virt_addr; - - pr_debug("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); - pr_debug("probe: PCCH header is at physical address: 0x%llx," - " signature: 0x%x, length: %d bytes, major: %d, minor: %d," - " supported features: 0x%x, command field: 0x%x," - " status field: 0x%x, nominal latency: %d us\n", - mem_resource->minimum, ioread32(&pcch_hdr->signature), - ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), - ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), - ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), - ioread32(&pcch_hdr->latency)); - - pr_debug("probe: min time between commands: %d us," - " max time between commands: %d us," - " nominal CPU frequency: %d MHz," - " minimum CPU frequency: %d MHz," - " minimum CPU frequency without throttling: %d MHz\n", - ioread32(&pcch_hdr->minimum_time), - ioread32(&pcch_hdr->maximum_time), - ioread32(&pcch_hdr->nominal), - ioread32(&pcch_hdr->throttled_frequency), - ioread32(&pcch_hdr->minimum_frequency)); - - member = &out_obj->package.elements[1]; - if (member->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto pcch_free; - } - - reg_resource = (struct pcc_register_resource *)member->buffer.pointer; - - doorbell.space_id = reg_resource->space_id; - doorbell.bit_width = reg_resource->bit_width; - doorbell.bit_offset = reg_resource->bit_offset; - doorbell.access_width = 64; - doorbell.address = reg_resource->address; - - pr_debug("probe: doorbell: space_id is %d, bit_width is %d, " - "bit_offset is %d, access_width is %d, address is 0x%llx\n", - doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, - doorbell.access_width, reg_resource->address); - - member = &out_obj->package.elements[2]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_preserve = member->integer.value; - - member = &out_obj->package.elements[3]; - if (member->type != ACPI_TYPE_INTEGER) { - ret = -ENODEV; - goto pcch_free; - } - - doorbell_write = member->integer.value; - - pr_debug("probe: doorbell_preserve: 0x%llx," - " doorbell_write: 0x%llx\n", - doorbell_preserve, doorbell_write); - - pcc_cpu_info = alloc_percpu(struct pcc_cpu); - if (!pcc_cpu_info) { - ret = -ENOMEM; - goto pcch_free; - } - - printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" - " limits: %d MHz, %d MHz\n", PCC_VERSION, - ioread32(&pcch_hdr->minimum_frequency), - ioread32(&pcch_hdr->nominal)); - kfree(output.pointer); - return ret; -pcch_free: - pcc_clear_mapping(); -out_free: - kfree(output.pointer); - return ret; -} - -static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - unsigned int result = 0; - - if (!pcch_virt_addr) { - result = -1; - goto out; - } - - result = pcc_get_offset(cpu); - if (result) { - pr_debug("init: PCCP evaluation failed\n"); - goto out; - } - - policy->max = policy->cpuinfo.max_freq = - ioread32(&pcch_hdr->nominal) * 1000; - policy->min = policy->cpuinfo.min_freq = - ioread32(&pcch_hdr->minimum_frequency) * 1000; - policy->cur = pcc_get_freq(cpu); - - if (!policy->cur) { - pr_debug("init: Unable to get current CPU frequency\n"); - result = -EINVAL; - goto out; - } - - pr_debug("init: policy->max is %d, policy->min is %d\n", - policy->max, policy->min); -out: - return result; -} - -static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) -{ - return 0; -} - -static struct cpufreq_driver pcc_cpufreq_driver = { - .flags = 
CPUFREQ_CONST_LOOPS, - .get = pcc_get_freq, - .verify = pcc_cpufreq_verify, - .target = pcc_cpufreq_target, - .init = pcc_cpufreq_cpu_init, - .exit = pcc_cpufreq_cpu_exit, - .name = "pcc-cpufreq", - .owner = THIS_MODULE, -}; - -static int __init pcc_cpufreq_init(void) -{ - int ret; - - if (acpi_disabled) - return 0; - - ret = pcc_cpufreq_probe(); - if (ret) { - pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n"); - return ret; - } - - ret = cpufreq_register_driver(&pcc_cpufreq_driver); - - return ret; -} - -static void __exit pcc_cpufreq_exit(void) -{ - cpufreq_unregister_driver(&pcc_cpufreq_driver); - - pcc_clear_mapping(); - - free_percpu(pcc_cpu_info); -} - -MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); -MODULE_VERSION(PCC_VERSION); -MODULE_DESCRIPTION("Processor Clocking Control interface driver"); -MODULE_LICENSE("GPL"); - -late_initcall(pcc_cpufreq_init); -module_exit(pcc_cpufreq_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c deleted file mode 100644 index b3379d6a5c5..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) - * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, - * Dominik Brodowski. - * - * Licensed under the terms of the GNU GPL License version 2. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ - -#define PFX "powernow-k6: " -static unsigned int busfreq; /* FSB, in 10 kHz */ -static unsigned int max_multiplier; - - -/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ -static struct cpufreq_frequency_table clock_ratio[] = { - {45, /* 000 -> 4.5x */ 0}, - {50, /* 001 -> 5.0x */ 0}, - {40, /* 010 -> 4.0x */ 0}, - {55, /* 011 -> 5.5x */ 0}, - {20, /* 100 -> 2.0x */ 0}, - {30, /* 101 -> 3.0x */ 0}, - {60, /* 110 -> 6.0x */ 0}, - {35, /* 111 -> 3.5x */ 0}, - {0, CPUFREQ_TABLE_END} -}; - - -/** - * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier - * - * Returns the current setting of the frequency multiplier. Core clock - * speed is frequency of the Front-Side Bus multiplied with this value. - */ -static int powernow_k6_get_cpu_multiplier(void) -{ - u64 invalue = 0; - u32 msrval; - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - return clock_ratio[(invalue >> 5)&7].index; -} - - -/** - * powernow_k6_set_state - set the PowerNow! multiplier - * @best_i: clock_ratio[best_i] is the target multiplier - * - * Tries to change the PowerNow! 
multiplier - */ -static void powernow_k6_set_state(unsigned int best_i) -{ - unsigned long outvalue = 0, invalue = 0; - unsigned long msrval; - struct cpufreq_freqs freqs; - - if (clock_ratio[best_i].index > max_multiplier) { - printk(KERN_ERR PFX "invalid target frequency\n"); - return; - } - - freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); - freqs.new = busfreq * clock_ratio[best_i].index; - freqs.cpu = 0; /* powernow-k6.c is UP only driver */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* we now need to transform best_i to the BVC format, see AMD#23446 */ - - outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); - - msrval = POWERNOW_IOPORT + 0x1; - wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue = inl(POWERNOW_IOPORT + 0x8); - invalue = invalue & 0xf; - outvalue = outvalue | invalue; - outl(outvalue , (POWERNOW_IOPORT + 0x8)); - msrval = POWERNOW_IOPORT + 0x0; - wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return; -} - - -/** - * powernow_k6_verify - verifies a new CPUfreq policy - * @policy: new policy - * - * Policy must be within lowest and highest possible CPU Frequency, - * and at least one possible state must be within min and max. - */ -static int powernow_k6_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); -} - - -/** - * powernow_k6_setpolicy - sets a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * sets a new CPUFreq policy - */ -static int powernow_k6_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, &clock_ratio[0], - target_freq, relation, &newstate)) - return -EINVAL; - - powernow_k6_set_state(newstate); - - return 0; -} - - -static int powernow_k6_cpu_init(struct cpufreq_policy *policy) -{ - unsigned int i, f; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - /* get frequencies */ - max_multiplier = powernow_k6_get_cpu_multiplier(); - busfreq = cpu_khz / max_multiplier; - - /* table init */ - for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { - f = clock_ratio[i].index; - if (f > max_multiplier) - clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; - else - clock_ratio[i].frequency = busfreq * f; - } - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 200000; - policy->cur = busfreq * max_multiplier; - - result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); - if (result) - return result; - - cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); - - return 0; -} - - -static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int i; - for (i = 0; i < 8; i++) { - if (i == max_multiplier) - powernow_k6_set_state(i); - } - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int powernow_k6_get(unsigned int cpu) -{ - unsigned int ret; - ret = (busfreq * powernow_k6_get_cpu_multiplier()); - return ret; -} - -static struct freq_attr *powernow_k6_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_k6_driver = { - .verify = powernow_k6_verify, - .target = powernow_k6_target, - .init = powernow_k6_cpu_init, - .exit = powernow_k6_cpu_exit, - .get = powernow_k6_get, 
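/*
 * For reference, the table arithmetic above is easy to check in user space.
 * A standalone sketch with made-up sample values (a hypothetical 450 MHz
 * K6-2+ at 4.5x, so busfreq lands in the 10 kHz units noted above):
 */
#include <stdio.h>

/* 3-bit code -> multiplier x10, as in the removed clock_ratio[] table */
static const int ratio_x10[8] = { 45, 50, 40, 55, 20, 30, 60, 35 };

int main(void)
{
	unsigned int cpu_khz = 450000;			/* assumed sample */
	unsigned int max_mult = 45;			/* 4.5 x 10 */
	unsigned int busfreq = cpu_khz / max_mult;	/* 10000 -> 100 MHz FSB */
	int i;

	for (i = 0; i < 8; i++)
		printf("code %d: %u kHz%s\n", i, busfreq * ratio_x10[i],
		       ratio_x10[i] > max_mult ? " (invalid here)" : "");
	return 0;
}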
- .name = "powernow-k6", - .owner = THIS_MODULE, - .attr = powernow_k6_attr, -}; - - -/** - * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver - * - * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported - * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero - * on success. - */ -static int __init powernow_k6_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || - ((c->x86_model != 12) && (c->x86_model != 13))) - return -ENODEV; - - if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { - printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); - return -EIO; - } - - if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region(POWERNOW_IOPORT, 16); - return -EINVAL; - } - - return 0; -} - - -/** - * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support - * - * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. - */ -static void __exit powernow_k6_exit(void) -{ - cpufreq_unregister_driver(&powernow_k6_driver); - release_region(POWERNOW_IOPORT, 16); -} - - -MODULE_AUTHOR("Arjan van de Ven, Dave Jones , " - "Dominik Brodowski "); -MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE("GPL"); - -module_init(powernow_k6_init); -module_exit(powernow_k6_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c deleted file mode 100644 index d71d9f37235..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ /dev/null @@ -1,747 +0,0 @@ -/* - * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. - * (C) 2003-2004 Dave Jones - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Errata 5: - * CPU may fail to execute a FID/VID change in presence of interrupt. - * - We cli/sti on stepping A0 CPUs around the FID/VID transition. - * Errata 15: - * CPU with half frequency multipliers may hang upon wakeup from disconnect. - * - We disable half multipliers if ACPI is used on A0 stepping CPUs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* Needed for recalibrate_cpu_khz() */ -#include -#include - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -#include -#include -#endif - -#include "powernow-k7.h" - -#define PFX "powernow: " - - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags; - u16 settlingtime; - u8 reserved1; - u8 numpst; -}; - -struct pst_s { - u32 cpuid; - u8 fsbspeed; - u8 maxfid; - u8 startvid; - u8 numpstates; -}; - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -union powernow_acpi_control_t { - struct { - unsigned long fid:5, - vid:5, - sgtc:20, - res1:2; - } bits; - unsigned long val; -}; -#endif - -/* divide by 1000 to get VCore voltage in V. */ -static const int mobile_vid_table[32] = { - 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, - 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, - 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, - 1075, 1050, 1025, 1000, 975, 950, 925, 0, -}; - -/* divide by 10 to get FID. */ -static const int fid_codes[32] = { - 110, 115, 120, 125, 50, 55, 60, 65, - 70, 75, 80, 85, 90, 95, 100, 105, - 30, 190, 40, 200, 130, 135, 140, 210, - 150, 225, 160, 165, 170, 180, -1, -1, -}; - -/* This parameter is used in order to force ACPI instead of legacy method for - * configuration purpose. 
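A PST entry's fid/vid byte pair decodes through the two tables above; a small standalone sketch, where the 100 MHz FSB and the fid/vid pair are invented samples rather than values from the patch:

#include <stdio.h>

static const int vid_mv[32] = {
	2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
	1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
	1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
	1075, 1050, 1025, 1000, 975, 950, 925, 0,
};
static const int fid_x10[32] = {
	110, 115, 120, 125, 50, 55, 60, 65,
	70, 75, 80, 85, 90, 95, 100, 105,
	30, 190, 40, 200, 130, 135, 140, 210,
	150, 225, 160, 165, 170, 180, -1, -1,
};

int main(void)
{
	unsigned int fsb = 100000;	/* assumed FSB, in kHz */
	unsigned int fid = 4, vid = 18;	/* sample PST entry bytes */

	printf("FID 0x%x, VID 0x%x -> %u kHz at %d.%03d V\n", fid, vid,
	       fsb * fid_x10[fid] / 10, vid_mv[vid] / 1000, vid_mv[vid] % 1000);
	return 0;
}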
- */ - -static int acpi_force; - -static struct cpufreq_frequency_table *powernow_table; - -static unsigned int can_scale_bus; -static unsigned int can_scale_vid; -static unsigned int minimum_speed = -1; -static unsigned int maximum_speed; -static unsigned int number_scales; -static unsigned int fsb; -static unsigned int latency; -static char have_a0; - -static int check_fsb(unsigned int fsbspeed) -{ - int delta; - unsigned int f = fsb / 1000; - - delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; - return delta < 5; -} - -static int check_powernow(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - unsigned int maxei, eax, ebx, ecx, edx; - - if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { -#ifdef MODULE - printk(KERN_INFO PFX "This module only works with " - "AMD K7 CPUs\n"); -#endif - return 0; - } - - /* Get maximum capabilities */ - maxei = cpuid_eax(0x80000000); - if (maxei < 0x80000007) { /* Any powernow info ? */ -#ifdef MODULE - printk(KERN_INFO PFX "No powernow capabilities detected\n"); -#endif - return 0; - } - - if ((c->x86_model == 6) && (c->x86_mask == 0)) { - printk(KERN_INFO PFX "K7 660[A0] core detected, " - "enabling errata workarounds\n"); - have_a0 = 1; - } - - cpuid(0x80000007, &eax, &ebx, &ecx, &edx); - - /* Check we can actually do something before we say anything.*/ - if (!(edx & (1 << 1 | 1 << 2))) - return 0; - - printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); - - if (edx & 1 << 1) { - printk("frequency"); - can_scale_bus = 1; - } - - if ((edx & (1 << 1 | 1 << 2)) == 0x6) - printk(" and "); - - if (edx & 1 << 2) { - printk("voltage"); - can_scale_vid = 1; - } - - printk(".\n"); - return 1; -} - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI -static void invalidate_entry(unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} -#endif - -static int get_ranges(unsigned char *pst) -{ - unsigned int j; - unsigned int speed; - u8 fid, vid; - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) - return -ENOMEM; - - for (j = 0 ; j < number_scales; j++) { - fid = *pst++; - - powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; - powernow_table[j].index = fid; /* lower 8 bits */ - - speed = powernow_table[j].frequency; - - if ((fid_codes[fid] % 10) == 5) { -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (have_a0 == 1) - invalidate_entry(j); -#endif - } - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - - vid = *pst++; - powernow_table[j].index |= (vid << 8); /* upper 8 bits */ - - pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed/1000, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - } - powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; - powernow_table[number_scales].index = 0; - - return 0; -} - - -static void change_FID(int fid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.FID != fid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.FID = fid; - fidvidctl.bits.VIDC = 0; - fidvidctl.bits.FIDC = 1; - wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_VID(int vid) -{ - union msr_fidvidctl fidvidctl; - - rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - if (fidvidctl.bits.VID != vid) { - fidvidctl.bits.SGTC = latency; - fidvidctl.bits.VID = vid; - fidvidctl.bits.FIDC = 0; - fidvidctl.bits.VIDC = 1; - 
wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); - } -} - - -static void change_speed(unsigned int index) -{ - u8 fid, vid; - struct cpufreq_freqs freqs; - union msr_fidvidstatus fidvidstatus; - int cfid; - - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in powernow_decode_bios, - * vid are the upper 8 bits. - */ - - fid = powernow_table[index].index & 0xFF; - vid = (powernow_table[index].index & 0xFF00) >> 8; - - freqs.cpu = 0; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - freqs.old = fsb * fid_codes[cfid] / 10; - - freqs.new = powernow_table[index].frequency; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - /* Now do the magic poking into the MSRs. */ - - if (have_a0 == 1) /* A0 errata 5 */ - local_irq_disable(); - - if (freqs.old > freqs.new) { - /* Going down, so change FID first */ - change_FID(fid); - change_VID(vid); - } else { - /* Going up, so change VID first */ - change_VID(vid); - change_FID(fid); - } - - - if (have_a0 == 1) - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -} - - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - -static struct acpi_processor_performance *acpi_processor_perf; - -static int powernow_acpi_init(void) -{ - int i; - int retval = 0; - union powernow_acpi_control_t pc; - - if (acpi_processor_perf != NULL && powernow_table != NULL) { - retval = -EINVAL; - goto err0; - } - - acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), - GFP_KERNEL); - if (!acpi_processor_perf) { - retval = -ENOMEM; - goto err0; - } - - if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, - GFP_KERNEL)) { - retval = -ENOMEM; - goto err05; - } - - if (acpi_processor_register_performance(acpi_processor_perf, 0)) { - retval = -EIO; - goto err1; - } - - if (acpi_processor_perf->control_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - if (acpi_processor_perf->status_register.space_id != - ACPI_ADR_SPACE_FIXED_HARDWARE) { - retval = -ENODEV; - goto err2; - } - - number_scales = acpi_processor_perf->state_count; - - if (number_scales < 2) { - retval = -ENODEV; - goto err2; - } - - powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * - (number_scales + 1)), GFP_KERNEL); - if (!powernow_table) { - retval = -ENOMEM; - goto err2; - } - - pc.val = (unsigned long) acpi_processor_perf->states[0].control; - for (i = 0; i < number_scales; i++) { - u8 fid, vid; - struct acpi_processor_px *state = - &acpi_processor_perf->states[i]; - unsigned int speed, speed_mhz; - - pc.val = (unsigned long) state->control; - pr_debug("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", - i, - (u32) state->core_frequency, - (u32) state->power, - (u32) state->transition_latency, - (u32) state->control, - pc.bits.sgtc); - - vid = pc.bits.vid; - fid = pc.bits.fid; - - powernow_table[i].frequency = fsb * fid_codes[fid] / 10; - powernow_table[i].index = fid; /* lower 8 bits */ - powernow_table[i].index |= (vid << 8); /* upper 8 bits */ - - speed = powernow_table[i].frequency; - speed_mhz = speed / 1000; - - /* processor_perflib will multiply the MHz value by 1000 to - * get a KHz value (e.g. 1266000). However, powernow-k7 works - * with true KHz values (e.g. 1266768). To ensure that all - * powernow frequencies are available, we must ensure that - * ACPI doesn't restrict them, so we round up the MHz value - * to ensure that perflib's computed KHz value is greater than - * or equal to powernow's KHz value. 
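In numbers, using the 1266768 kHz example from the comment itself:

#include <stdio.h>

int main(void)
{
	unsigned int speed = 1266768;		/* true powernow speed, kHz */
	unsigned int speed_mhz = speed / 1000;	/* 1266: perflib would cap at 1266000 */

	if (speed % 1000 > 0)
		speed_mhz++;	/* report 1267, so 1267000 >= 1266768 */
	printf("%u kHz -> %u MHz\n", speed, speed_mhz);
	return 0;
}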
- */ - if (speed % 1000 > 0) - speed_mhz++; - - if ((fid_codes[fid] % 10) == 5) { - if (have_a0 == 1) - invalidate_entry(i); - } - - pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " - "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, - fid_codes[fid] % 10, speed_mhz, vid, - mobile_vid_table[vid]/1000, - mobile_vid_table[vid]%1000); - - if (state->core_frequency != speed_mhz) { - state->core_frequency = speed_mhz; - pr_debug(" Corrected ACPI frequency to %d\n", - speed_mhz); - } - - if (latency < pc.bits.sgtc) - latency = pc.bits.sgtc; - - if (speed < minimum_speed) - minimum_speed = speed; - if (speed > maximum_speed) - maximum_speed = speed; - } - - powernow_table[i].frequency = CPUFREQ_TABLE_END; - powernow_table[i].index = 0; - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - return 0; - -err2: - acpi_processor_unregister_performance(acpi_processor_perf, 0); -err1: - free_cpumask_var(acpi_processor_perf->shared_cpu_map); -err05: - kfree(acpi_processor_perf); -err0: - printk(KERN_WARNING PFX "ACPI perflib can not be used on " - "this platform\n"); - acpi_processor_perf = NULL; - return retval; -} -#else -static int powernow_acpi_init(void) -{ - printk(KERN_INFO PFX "no support for ACPI processor found." - " Please recompile your kernel with ACPI processor\n"); - return -EINVAL; -} -#endif - -static void print_pst_entry(struct pst_s *pst, unsigned int j) -{ - pr_debug("PST:%d (@%p)\n", j, pst); - pr_debug(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", - pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); -} - -static int powernow_decode_bios(int maxfid, int startvid) -{ - struct psb_s *psb; - struct pst_s *pst; - unsigned int i, j; - unsigned char *p; - unsigned int etuple; - unsigned int ret; - - etuple = cpuid_eax(0x80000001); - - for (i = 0xC0000; i < 0xffff0 ; i += 16) { - - p = phys_to_virt(i); - - if (memcmp(p, "AMDK7PNOW!", 10) == 0) { - pr_debug("Found PSB header at %p\n", p); - psb = (struct psb_s *) p; - pr_debug("Table version: 0x%x\n", psb->tableversion); - if (psb->tableversion != 0x12) { - printk(KERN_INFO PFX "Sorry, only v1.2 tables" - " supported right now\n"); - return -ENODEV; - } - - pr_debug("Flags: 0x%x\n", psb->flags); - if ((psb->flags & 1) == 0) - pr_debug("Mobile voltage regulator\n"); - else - pr_debug("Desktop voltage regulator\n"); - - latency = psb->settlingtime; - if (latency < 100) { - printk(KERN_INFO PFX "BIOS set settling time " - "to %d microseconds. " - "Should be at least 100. " - "Correcting.\n", latency); - latency = 100; - } - pr_debug("Settling Time: %d microseconds.\n", - psb->settlingtime); - pr_debug("Has %d PST tables. 
(Only dumping ones " - "relevant to this CPU).\n", - psb->numpst); - - p += sizeof(struct psb_s); - - pst = (struct pst_s *) p; - - for (j = 0; j < psb->numpst; j++) { - pst = (struct pst_s *) p; - number_scales = pst->numpstates; - - if ((etuple == pst->cpuid) && - check_fsb(pst->fsbspeed) && - (maxfid == pst->maxfid) && - (startvid == pst->startvid)) { - print_pst_entry(pst, j); - p = (char *)pst + sizeof(struct pst_s); - ret = get_ranges(p); - return ret; - } else { - unsigned int k; - p = (char *)pst + sizeof(struct pst_s); - for (k = 0; k < number_scales; k++) - p += 2; - } - } - printk(KERN_INFO PFX "No PST tables match this cpuid " - "(0x%x)\n", etuple); - printk(KERN_INFO PFX "This is indicative of a broken " - "BIOS.\n"); - - return -EINVAL; - } - p++; - } - - return -ENODEV; -} - - -static int powernow_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate; - - if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, - relation, &newstate)) - return -EINVAL; - - change_speed(newstate); - - return 0; -} - - -static int powernow_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, powernow_table); -} - -/* - * We use the fact that the bus frequency is somehow - * a multiple of 100000/3 khz, then we compute sgtc according - * to this multiple. - * That way, we match more how AMD thinks all of that work. - * We will then get the same kind of behaviour already tested under - * the "well-known" other OS. - */ -static int __cpuinit fixup_sgtc(void) -{ - unsigned int sgtc; - unsigned int m; - - m = fsb / 3333; - if ((m % 10) >= 5) - m += 5; - - m /= 10; - - sgtc = 100 * m * latency; - sgtc = sgtc / 3; - if (sgtc > 0xfffff) { - printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); - sgtc = 0xfffff; - } - return sgtc; -} - -static unsigned int powernow_get(unsigned int cpu) -{ - union msr_fidvidstatus fidvidstatus; - unsigned int cfid; - - if (cpu) - return 0; - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - cfid = fidvidstatus.bits.CFID; - - return fsb * fid_codes[cfid] / 10; -} - - -static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) -{ - printk(KERN_WARNING PFX - "%s laptop with broken PST tables in BIOS detected.\n", - d->ident); - printk(KERN_WARNING PFX - "You need to downgrade to 3A21 (09/09/2002), or try a newer " - "BIOS than 3A71 (01/20/2003)\n"); - printk(KERN_WARNING PFX - "cpufreq scaling has been disabled as a result of this.\n"); - return 0; -} - -/* - * Some Athlon laptops have really fucked PST tables. - * A BIOS update is all that can save them. - * Mention this, and disable cpufreq. - */ -static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { - { - .callback = acer_cpufreq_pst, - .ident = "Acer Aspire", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), - DMI_MATCH(DMI_BIOS_VERSION, "3A71"), - }, - }, - { } -}; - -static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) -{ - union msr_fidvidstatus fidvidstatus; - int result; - - if (policy->cpu != 0) - return -ENODEV; - - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); - - recalibrate_cpu_khz(); - - fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; - if (!fsb) { - printk(KERN_WARNING PFX "can not determine bus frequency\n"); - return -EINVAL; - } - pr_debug("FSB: %3dMHz\n", fsb/1000); - - if (dmi_check_system(powernow_dmi_table) || acpi_force) { - printk(KERN_INFO PFX "PSB/PST known to be broken. 
" - "Trying ACPI instead\n"); - result = powernow_acpi_init(); - } else { - result = powernow_decode_bios(fidvidstatus.bits.MFID, - fidvidstatus.bits.SVID); - if (result) { - printk(KERN_INFO PFX "Trying ACPI perflib\n"); - maximum_speed = 0; - minimum_speed = -1; - latency = 0; - result = powernow_acpi_init(); - if (result) { - printk(KERN_INFO PFX - "ACPI and legacy methods failed\n"); - } - } else { - /* SGTC use the bus clock as timer */ - latency = fixup_sgtc(); - printk(KERN_INFO PFX "SGTC: %d\n", latency); - } - } - - if (result) - return result; - - printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", - minimum_speed/1000, maximum_speed/1000); - - policy->cpuinfo.transition_latency = - cpufreq_scale(2000000UL, fsb, latency); - - policy->cur = powernow_get(0); - - cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); - - return cpufreq_frequency_table_cpuinfo(policy, powernow_table); -} - -static int powernow_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - if (acpi_processor_perf) { - acpi_processor_unregister_performance(acpi_processor_perf, 0); - free_cpumask_var(acpi_processor_perf->shared_cpu_map); - kfree(acpi_processor_perf); - } -#endif - - kfree(powernow_table); - return 0; -} - -static struct freq_attr *powernow_table_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver powernow_driver = { - .verify = powernow_verify, - .target = powernow_target, - .get = powernow_get, -#ifdef CONFIG_X86_POWERNOW_K7_ACPI - .bios_limit = acpi_processor_get_bios_limit, -#endif - .init = powernow_cpu_init, - .exit = powernow_cpu_exit, - .name = "powernow-k7", - .owner = THIS_MODULE, - .attr = powernow_table_attr, -}; - -static int __init powernow_init(void) -{ - if (check_powernow() == 0) - return -ENODEV; - return cpufreq_register_driver(&powernow_driver); -} - - -static void __exit powernow_exit(void) -{ - cpufreq_unregister_driver(&powernow_driver); -} - -module_param(acpi_force, int, 0444); -MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); - -MODULE_AUTHOR("Dave Jones "); -MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); -MODULE_LICENSE("GPL"); - -late_initcall(powernow_init); -module_exit(powernow_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h deleted file mode 100644 index 35fb4eaf6e1..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * (C) 2003 Dave Jones. - * - * Licensed under the terms of the GNU GPL License version 2. 
- * - * AMD-specific information - * - */ - -union msr_fidvidctl { - struct { - unsigned FID:5, // 4:0 - reserved1:3, // 7:5 - VID:5, // 12:8 - reserved2:3, // 15:13 - FIDC:1, // 16 - VIDC:1, // 17 - reserved3:2, // 19:18 - FIDCHGRATIO:1, // 20 - reserved4:11, // 31-21 - SGTC:20, // 32:51 - reserved5:12; // 63:52 - } bits; - unsigned long long val; -}; - -union msr_fidvidstatus { - struct { - unsigned CFID:5, // 4:0 - reserved1:3, // 7:5 - SFID:5, // 12:8 - reserved2:3, // 15:13 - MFID:5, // 20:16 - reserved3:11, // 31:21 - CVID:5, // 36:32 - reserved4:3, // 39:37 - SVID:5, // 44:40 - reserved5:3, // 47:45 - MVID:5, // 52:48 - reserved6:11; // 63:53 - } bits; - unsigned long long val; -}; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c deleted file mode 100644 index 83479b6fb9a..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ /dev/null @@ -1,1607 +0,0 @@ -/* - * (c) 2003-2010 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - * - * Support : mark.langsdorf@amd.com - * - * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs - * (C) 2004 Dominik Brodowski - * (C) 2004 Pavel Machek - * Licensed under the terms of the GNU GPL License version 2. - * Based upon datasheets & sample CPUs kindly provided by AMD. - * - * Valuable input gratefully received from Dave Jones, Pavel Machek, - * Dominik Brodowski, Jacob Shin, and others. - * Originally developed by Paul Devriendt. - * Processor information obtained from Chapter 9 (Power and Thermal Management) - * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD - * Opteron Processors" available for download from www.amd.com - * - * Tables for specific CPUs can be inferred from - * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for current / set_cpus_allowed() */ -#include -#include - -#include - -#include -#include -#include - -#define PFX "powernow-k8: " -#define VERSION "version 2.20.00" -#include "powernow-k8.h" -#include "mperf.h" - -/* serialize freq changes */ -static DEFINE_MUTEX(fidvid_mutex); - -static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); - -static int cpu_family = CPU_OPTERON; - -/* core performance boost */ -static bool cpb_capable, cpb_enabled; -static struct msr __percpu *msrs; - -static struct cpufreq_driver cpufreq_amd64_driver; - -#ifndef CONFIG_SMP -static inline const struct cpumask *cpu_core_mask(int cpu) -{ - return cpumask_of(0); -} -#endif - -/* Return a frequency in MHz, given an input fid */ -static u32 find_freq_from_fid(u32 fid) -{ - return 800 + (fid * 100); -} - -/* Return a frequency in KHz, given an input fid */ -static u32 find_khz_freq_from_fid(u32 fid) -{ - return 1000 * find_freq_from_fid(fid); -} - -static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, - u32 pstate) -{ - return data[pstate].frequency; -} - -/* Return the vco fid for an input fid - * - * Each "low" fid has corresponding "high" fid, and you can get to "low" fids - * only from corresponding high fids. This returns "high" fid corresponding to - * "low" one. 
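Concretely (the 800 + fid * 100 mapping is from find_freq_from_fid() above; treating 8 as the HI_FID_TABLE_BOTTOM boundary is an assumption made for this sketch):

#include <stdio.h>

int main(void)
{
	unsigned int fid;

	for (fid = 0; fid < 12; fid++) {
		/* "low" fids map onto the VCO fid 8 + 2 * fid */
		unsigned int vco = fid < 8 ? 8 + 2 * fid : fid;

		printf("fid %2u: %u MHz, vco fid %u\n",
		       fid, 800 + fid * 100, vco);
	}
	return 0;
}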
- */ -static u32 convert_fid_to_vco_fid(u32 fid) -{ - if (fid < HI_FID_TABLE_BOTTOM) - return 8 + (2 * fid); - else - return fid; -} - -/* - * Return 1 if the pending bit is set. Unless we just instructed the processor - * to transition to a new state, seeing this bit set is really bad news. - */ -static int pending_bit_stuck(void) -{ - u32 lo, hi; - - if (cpu_family == CPU_HW_PSTATE) - return 0; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; -} - -/* - * Update the global current fid / vid values from the status msr. - * Returns 1 on error. - */ -static int query_current_values_with_pending_wait(struct powernow_k8_data *data) -{ - u32 lo, hi; - u32 i = 0; - - if (cpu_family == CPU_HW_PSTATE) { - rdmsr(MSR_PSTATE_STATUS, lo, hi); - i = lo & HW_PSTATE_MASK; - data->currpstate = i; - - /* - * a workaround for family 11h erratum 311 might cause - * an "out-of-range Pstate if the core is in Pstate-0 - */ - if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) - data->currpstate = HW_PSTATE_0; - - return 0; - } - do { - if (i++ > 10000) { - pr_debug("detected change pending stuck\n"); - return 1; - } - rdmsr(MSR_FIDVID_STATUS, lo, hi); - } while (lo & MSR_S_LO_CHANGE_PENDING); - - data->currvid = hi & MSR_S_HI_CURRENT_VID; - data->currfid = lo & MSR_S_LO_CURRENT_FID; - - return 0; -} - -/* the isochronous relief time */ -static void count_off_irt(struct powernow_k8_data *data) -{ - udelay((1 << data->irt) * 10); - return; -} - -/* the voltage stabilization time */ -static void count_off_vst(struct powernow_k8_data *data) -{ - udelay(data->vstable * VST_UNITS_20US); - return; -} - -/* need to init the control msr to a safe value (for each cpu) */ -static void fidvid_msr_init(void) -{ - u32 lo, hi; - u8 fid, vid; - - rdmsr(MSR_FIDVID_STATUS, lo, hi); - vid = hi & MSR_S_HI_CURRENT_VID; - fid = lo & MSR_S_LO_CURRENT_FID; - lo = fid | (vid << MSR_C_LO_VID_SHIFT); - hi = MSR_C_HI_STP_GNT_BENIGN; - pr_debug("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); - wrmsr(MSR_FIDVID_CTL, lo, hi); -} - -/* write the new fid value along with the other control fields to the msr */ -static int write_new_fid(struct powernow_k8_data *data, u32 fid) -{ - u32 lo; - u32 savevid = data->currvid; - u32 i = 0; - - if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on fid write\n"); - return 1; - } - - lo = fid; - lo |= (data->currvid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - pr_debug("writing fid 0x%x, lo 0x%x, hi 0x%x\n", - fid, lo, data->plllock * PLL_LOCK_CONVERSION); - - do { - wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); - if (i++ > 100) { - printk(KERN_ERR PFX - "Hardware error - pending bit very stuck - " - "no further pstate changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - count_off_irt(data); - - if (savevid != data->currvid) { - printk(KERN_ERR PFX - "vid change on fid trans, old 0x%x, new 0x%x\n", - savevid, data->currvid); - return 1; - } - - if (fid != data->currfid) { - printk(KERN_ERR PFX - "fid trans failed, fid 0x%x, curr 0x%x\n", fid, - data->currfid); - return 1; - } - - return 0; -} - -/* Write a new vid to the hardware */ -static int write_new_vid(struct powernow_k8_data *data, u32 vid) -{ - u32 lo; - u32 savefid = data->currfid; - int i = 0; - - if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { - printk(KERN_ERR PFX "internal error - overflow on vid write\n"); - return 1; - } 
- - lo = data->currfid; - lo |= (vid << MSR_C_LO_VID_SHIFT); - lo |= MSR_C_LO_INIT_FID_VID; - - pr_debug("writing vid 0x%x, lo 0x%x, hi 0x%x\n", - vid, lo, STOP_GRANT_5NS); - - do { - wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); - if (i++ > 100) { - printk(KERN_ERR PFX "internal error - pending bit " - "very stuck - no further pstate " - "changes possible\n"); - return 1; - } - } while (query_current_values_with_pending_wait(data)); - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "fid changed on vid trans, old " - "0x%x new 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (vid != data->currvid) { - printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " - "curr 0x%x\n", - vid, data->currvid); - return 1; - } - - return 0; -} - -/* - * Reduce the vid by the max of step or reqvid. - * Decreasing vid codes represent increasing voltages: - * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. - */ -static int decrease_vid_code_by_step(struct powernow_k8_data *data, - u32 reqvid, u32 step) -{ - if ((data->currvid - reqvid) > step) - reqvid = data->currvid - step; - - if (write_new_vid(data, reqvid)) - return 1; - - count_off_vst(data); - - return 0; -} - -/* Change hardware pstate by single MSR write */ -static int transition_pstate(struct powernow_k8_data *data, u32 pstate) -{ - wrmsr(MSR_PSTATE_CTRL, pstate, 0); - data->currpstate = pstate; - return 0; -} - -/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ -static int transition_fid_vid(struct powernow_k8_data *data, - u32 reqfid, u32 reqvid) -{ - if (core_voltage_pre_transition(data, reqvid, reqfid)) - return 1; - - if (core_frequency_transition(data, reqfid)) - return 1; - - if (core_voltage_post_transition(data, reqvid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((reqfid != data->currfid) || (reqvid != data->currvid)) { - printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " - "curr 0x%x 0x%x\n", - smp_processor_id(), - reqfid, reqvid, data->currfid, data->currvid); - return 1; - } - - pr_debug("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", - smp_processor_id(), data->currfid, data->currvid); - - return 0; -} - -/* Phase 1 - core voltage transition ... 
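decrease_vid_code_by_step() above never moves the vid more than `step` codes per MSR write, and since vid codes count downward as voltage rises, "decreasing" the code actually raises the core voltage. A standalone sketch of the clamping arithmetic (next_vid_code is a hypothetical name):

/*
 * Mirror of the clamp in decrease_vid_code_by_step() above; callers
 * guarantee currvid > reqvid, so the unsigned subtraction is safe.
 * Lower vid codes mean higher voltage.
 */
static unsigned next_vid_code(unsigned currvid, unsigned reqvid, unsigned step)
{
	if ((currvid - reqvid) > step)
		reqvid = currvid - step;
	return reqvid;
}

For currvid 0x12, reqvid 0x0c and step 2, successive calls return 0x10, 0x0e, then 0x0c, matching the while loop in phase 1.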
setup voltage */ -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 reqfid) -{ - u32 rvosteps = data->rvo; - u32 savefid = data->currfid; - u32 maxvid, lo, rvomult = 1; - - pr_debug("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " - "reqvid 0x%x, rvo 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqvid, data->rvo); - - if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) - rvomult = 2; - rvosteps *= rvomult; - rdmsr(MSR_FIDVID_STATUS, lo, maxvid); - maxvid = 0x1f & (maxvid >> 16); - pr_debug("ph1 maxvid=0x%x\n", maxvid); - if (reqvid < maxvid) /* lower numbers are higher voltages */ - reqvid = maxvid; - - while (data->currvid > reqvid) { - pr_debug("ph1: curr 0x%x, req vid 0x%x\n", - data->currvid, reqvid); - if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) - return 1; - } - - while ((rvosteps > 0) && - ((rvomult * data->rvo + data->currvid) > reqvid)) { - if (data->currvid == maxvid) { - rvosteps = 0; - } else { - pr_debug("ph1: changing vid for rvo, req 0x%x\n", - data->currvid - 1); - if (decrease_vid_code_by_step(data, data->currvid-1, 1)) - return 1; - rvosteps--; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - pr_debug("ph1 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 2 - core frequency transition */ -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) -{ - u32 vcoreqfid, vcocurrfid, vcofiddiff; - u32 fid_interval, savevid = data->currvid; - - if (data->currfid == reqfid) { - printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", - data->currfid); - return 0; - } - - pr_debug("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " - "reqfid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid, reqfid); - - vcoreqfid = convert_fid_to_vco_fid(reqfid); - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - - if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) - vcofiddiff = 0; - - while (vcofiddiff > 2) { - (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); - - if (reqfid > data->currfid) { - if (data->currfid > LO_FID_TABLE_TOP) { - if (write_new_fid(data, - data->currfid + fid_interval)) - return 1; - } else { - if (write_new_fid - (data, - 2 + convert_fid_to_vco_fid(data->currfid))) - return 1; - } - } else { - if (write_new_fid(data, data->currfid - fid_interval)) - return 1; - } - - vcocurrfid = convert_fid_to_vco_fid(data->currfid); - vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid - : vcoreqfid - vcocurrfid; - } - - if (write_new_fid(data, reqfid)) - return 1; - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (data->currfid != reqfid) { - printk(KERN_ERR PFX - "ph2: mismatch, failed fid transition, " - "curr 0x%x, req 0x%x\n", - data->currfid, reqfid); - return 1; - } - - if (savevid != data->currvid) { - printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", - savevid, data->currvid); - return 1; - } - - pr_debug("ph2 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -/* Phase 3 - core voltage transition flow ... jump to the final vid. 
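The phase-2 loop in core_frequency_transition() above protects the PLL by never letting the VCO-relative fid move more than two codes (200 MHz, per the MIN_FREQ_RESOLUTION comment in the header below) in a single write; odd fids step by 1, even fids by 2. A standalone sketch of the distance term the loop re-evaluates each iteration (names are hypothetical):

#define HI_FID_TABLE_BOTTOM 8	/* from powernow-k8.h */

static unsigned to_vco_fid(unsigned fid)
{
	return (fid < HI_FID_TABLE_BOTTOM) ? 8 + (2 * fid) : fid;
}

/* The loop above keeps stepping while this distance exceeds 2. */
static unsigned vco_fid_distance(unsigned currfid, unsigned reqfid)
{
	unsigned a = to_vco_fid(currfid);
	unsigned b = to_vco_fid(reqfid);

	return (a > b) ? a - b : b - a;
}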
*/ -static int core_voltage_post_transition(struct powernow_k8_data *data, - u32 reqvid) -{ - u32 savefid = data->currfid; - u32 savereqvid = reqvid; - - pr_debug("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", - smp_processor_id(), - data->currfid, data->currvid); - - if (reqvid != data->currvid) { - if (write_new_vid(data, reqvid)) - return 1; - - if (savefid != data->currfid) { - printk(KERN_ERR PFX - "ph3: bad fid change, save 0x%x, curr 0x%x\n", - savefid, data->currfid); - return 1; - } - - if (data->currvid != reqvid) { - printk(KERN_ERR PFX - "ph3: failed vid transition\n, " - "req 0x%x, curr 0x%x", - reqvid, data->currvid); - return 1; - } - } - - if (query_current_values_with_pending_wait(data)) - return 1; - - if (savereqvid != data->currvid) { - pr_debug("ph3 failed, currvid 0x%x\n", data->currvid); - return 1; - } - - if (savefid != data->currfid) { - pr_debug("ph3 failed, currfid changed 0x%x\n", - data->currfid); - return 1; - } - - pr_debug("ph3 complete, currfid 0x%x, currvid 0x%x\n", - data->currfid, data->currvid); - - return 0; -} - -static void check_supported_cpu(void *_rc) -{ - u32 eax, ebx, ecx, edx; - int *rc = _rc; - - *rc = -ENODEV; - - if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) - return; - - eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && - ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) - return; - - if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { - if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { - printk(KERN_INFO PFX - "Processor cpuid %x not supported\n", eax); - return; - } - - eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); - if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { - printk(KERN_INFO PFX - "No frequency change capabilities detected\n"); - return; - } - - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & P_STATE_TRANSITION_CAPABLE) - != P_STATE_TRANSITION_CAPABLE) { - printk(KERN_INFO PFX - "Power state transitions not supported\n"); - return; - } - } else { /* must be a HW Pstate capable processor */ - cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); - if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) - cpu_family = CPU_HW_PSTATE; - else - return; - } - - *rc = 0; -} - -static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, - u8 maxvid) -{ - unsigned int j; - u8 lastfid = 0xff; - - for (j = 0; j < data->numps; j++) { - if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", - j, pst[j].vid); - return -EINVAL; - } - if (pst[j].vid < data->rvo) { - /* vid + rvo >= 0 */ - printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].vid < maxvid + data->rvo) { - /* vid + rvo >= maxvid */ - printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (pst[j].fid > MAX_FID) { - printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" - " %d\n", j); - return -ENODEV; - } - if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { - /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR FW_BUG PFX "two low fids - %d : " - "0x%x\n", j, pst[j].fid); - return -EINVAL; - } - if (pst[j].fid < lastfid) - lastfid = pst[j].fid; - } - if (lastfid & 1) { - printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); - return -EINVAL; - } - if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO FW_BUG PFX - "first fid not from lo freq table\n"); - - return 0; -} - -static void invalidate_entry(struct 
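The first gate in check_supported_cpu() above keys on the extended-family bits of CPUID function 1: plain K8 (extended family 0) and family 0x10 or later pass, everything else is rejected before the finer XMOD checks. A standalone sketch of that gate, reusing the CPUID_* constants from powernow-k8.h further down in this patch:

#include <stdbool.h>

/* From powernow-k8.h (below): extended family bits of CPUID fn 1 EAX. */
#define CPUID_XFAM	0x0ff00000
#define CPUID_XFAM_K8	0
#define CPUID_XFAM_10H	0x00100000

/* Mirrors the family gate at the top of check_supported_cpu() above. */
static bool family_supported(unsigned eax)
{
	unsigned xfam = eax & CPUID_XFAM;

	return xfam == CPUID_XFAM_K8 || xfam >= CPUID_XFAM_10H;
}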
cpufreq_frequency_table *powernow_table, - unsigned int entry) -{ - powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; -} - -static void print_basics(struct powernow_k8_data *data) -{ - int j; - for (j = 0; j < data->numps; j++) { - if (data->powernow_table[j].frequency != - CPUFREQ_ENTRY_INVALID) { - if (cpu_family == CPU_HW_PSTATE) { - printk(KERN_INFO PFX - " %d : pstate %d (%d MHz)\n", j, - data->powernow_table[j].index, - data->powernow_table[j].frequency/1000); - } else { - printk(KERN_INFO PFX - "fid 0x%x (%d MHz), vid 0x%x\n", - data->powernow_table[j].index & 0xff, - data->powernow_table[j].frequency/1000, - data->powernow_table[j].index >> 8); - } - } - } - if (data->batps) - printk(KERN_INFO PFX "Only %d pstates on battery\n", - data->batps); -} - -static u32 freq_from_fid_did(u32 fid, u32 did) -{ - u32 mhz = 0; - - if (boot_cpu_data.x86 == 0x10) - mhz = (100 * (fid + 0x10)) >> did; - else if (boot_cpu_data.x86 == 0x11) - mhz = (100 * (fid + 8)) >> did; - else - BUG(); - - return mhz * 1000; -} - -static int fill_powernow_table(struct powernow_k8_data *data, - struct pst_s *pst, u8 maxvid) -{ - struct cpufreq_frequency_table *powernow_table; - unsigned int j; - - if (data->batps) { - /* use ACPI support to get full speed on mains power */ - printk(KERN_WARNING PFX - "Only %d pstates usable (use ACPI driver for full " - "range\n", data->batps); - data->numps = data->batps; - } - - for (j = 1; j < data->numps; j++) { - if (pst[j-1].fid >= pst[j].fid) { - printk(KERN_ERR PFX "PST out of sequence\n"); - return -EINVAL; - } - } - - if (data->numps < 2) { - printk(KERN_ERR PFX "no p states to transition\n"); - return -ENODEV; - } - - if (check_pst_table(data, pst, maxvid)) - return -EINVAL; - - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->numps + 1)), GFP_KERNEL); - if (!powernow_table) { - printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); - return -ENOMEM; - } - - for (j = 0; j < data->numps; j++) { - int freq; - powernow_table[j].index = pst[j].fid; /* lower 8 bits */ - powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ - freq = find_khz_freq_from_fid(pst[j].fid); - powernow_table[j].frequency = freq; - } - powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; - powernow_table[data->numps].index = 0; - - if (query_current_values_with_pending_wait(data)) { - kfree(powernow_table); - return -EIO; - } - - pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); - data->powernow_table = powernow_table; - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - for (j = 0; j < data->numps; j++) - if ((pst[j].fid == data->currfid) && - (pst[j].vid == data->currvid)) - return 0; - - pr_debug("currfid/vid do not match PST, ignoring\n"); - return 0; -} - -/* Find and validate the PSB/PST table in BIOS. */ -static int find_psb_table(struct powernow_k8_data *data) -{ - struct psb_s *psb; - unsigned int i; - u32 mvs; - u8 maxvid; - u32 cpst = 0; - u32 thiscpuid; - - for (i = 0xc0000; i < 0xffff0; i += 0x10) { - /* Scan BIOS looking for the signature. */ - /* It can not be at ffff0 - it is too big. 
*/ - - psb = phys_to_virt(i); - if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) - continue; - - pr_debug("found PSB header at 0x%p\n", psb); - - pr_debug("table vers: 0x%x\n", psb->tableversion); - if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); - return -ENODEV; - } - - pr_debug("flags: 0x%x\n", psb->flags1); - if (psb->flags1) { - printk(KERN_ERR FW_BUG PFX "unknown flags\n"); - return -ENODEV; - } - - data->vstable = psb->vstable; - pr_debug("voltage stabilization time: %d(*20us)\n", - data->vstable); - - pr_debug("flags2: 0x%x\n", psb->flags2); - data->rvo = psb->flags2 & 3; - data->irt = ((psb->flags2) >> 2) & 3; - mvs = ((psb->flags2) >> 4) & 3; - data->vidmvs = 1 << mvs; - data->batps = ((psb->flags2) >> 6) & 3; - - pr_debug("ramp voltage offset: %d\n", data->rvo); - pr_debug("isochronous relief time: %d\n", data->irt); - pr_debug("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); - - pr_debug("numpst: 0x%x\n", psb->num_tables); - cpst = psb->num_tables; - if ((psb->cpuid == 0x00000fc0) || - (psb->cpuid == 0x00000fe0)) { - thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); - if ((thiscpuid == 0x00000fc0) || - (thiscpuid == 0x00000fe0)) - cpst = 1; - } - if (cpst != 1) { - printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); - return -ENODEV; - } - - data->plllock = psb->plllocktime; - pr_debug("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); - pr_debug("maxfid: 0x%x\n", psb->maxfid); - pr_debug("maxvid: 0x%x\n", psb->maxvid); - maxvid = psb->maxvid; - - data->numps = psb->numps; - pr_debug("numpstates: 0x%x\n", data->numps); - return fill_powernow_table(data, - (struct pst_s *)(psb+1), maxvid); - } - /* - * If you see this message, complain to BIOS manufacturer. If - * he tells you "we do not support Linux" or some similar - * nonsense, remember that Windows 2000 uses the same legacy - * mechanism that the old Linux PSB driver uses. Tell them it - * is broken with Windows 2000. 
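find_psb_table() above unpacks four two-bit fields from the PSB flags2 byte: ramp voltage offset, isochronous relief time, the maximum-voltage-step code, and the battery p-state count. A standalone sketch of the same unpacking (decode_psb_flags2 is a hypothetical name):

#include <stdio.h>

/* Unpack psb->flags2 exactly as find_psb_table() above does. */
static void decode_psb_flags2(unsigned char flags2)
{
	unsigned rvo   = flags2 & 3;		/* ramp voltage offset */
	unsigned irt   = (flags2 >> 2) & 3;	/* isochronous relief time */
	unsigned mvs   = (flags2 >> 4) & 3;	/* max voltage step code */
	unsigned batps = (flags2 >> 6) & 3;	/* p-states usable on battery */

	printf("rvo %u, irt %u, vidmvs %u, batps %u\n",
	       rvo, irt, 1u << mvs, batps);
}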
- * - * The reference to the AMD documentation is chapter 9 in the - * BIOS and Kernel Developer's Guide, which is available on - * www.amd.com - */ - printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); - printk(KERN_ERR PFX "Make sure that your BIOS is up to date" - " and Cool'N'Quiet support is enabled in BIOS setup\n"); - return -ENODEV; -} - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, - unsigned int index) -{ - u64 control; - - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) - return; - - control = data->acpi_data.states[index].control; - data->irt = (control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (control >> VST_SHIFT) & VST_MASK; -} - -static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) -{ - struct cpufreq_frequency_table *powernow_table; - int ret_val = -ENODEV; - u64 control, status; - - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { - pr_debug("register performance failed: bad ACPI data\n"); - return -EIO; - } - - /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { - pr_debug("No ACPI P-States\n"); - goto err_out; - } - - control = data->acpi_data.control_register.space_id; - status = data->acpi_data.status_register.space_id; - - if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { - pr_debug("Invalid control/status registers (%llx - %llx)\n", - control, status); - goto err_out; - } - - /* fill in data->powernow_table */ - powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); - if (!powernow_table) { - pr_debug("powernow_table memory alloc failure\n"); - goto err_out; - } - - /* fill in data */ - data->numps = data->acpi_data.state_count; - powernow_k8_acpi_pst_values(data, 0); - - if (cpu_family == CPU_HW_PSTATE) - ret_val = fill_powernow_table_pstate(data, powernow_table); - else - ret_val = fill_powernow_table_fidvid(data, powernow_table); - if (ret_val) - goto err_out_mem; - - powernow_table[data->acpi_data.state_count].frequency = - CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; - data->powernow_table = powernow_table; - - if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) - print_basics(data); - - /* notify BIOS that we exist */ - acpi_processor_notify_smm(THIS_MODULE); - - if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { - printk(KERN_ERR PFX - "unable to alloc powernow_k8_data cpumask\n"); - ret_val = -ENOMEM; - goto err_out_mem; - } - - return 0; - -err_out_mem: - kfree(powernow_table); - -err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); - - /* data->acpi_data.state_count informs us at ->exit() - * whether ACPI was used */ - data->acpi_data.state_count = 0; - - return ret_val; -} - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); - data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 index; - - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; - if (index > data->max_hw_pstate) { 
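powernow_k8_acpi_pst_values() above slices the 64-bit ACPI _PSS "control" value into the per-state timing and voltage parameters using the SHIFT/MASK pairs from powernow-k8.h (reproduced below in this patch). A standalone sketch of the same extraction:

#include <stdint.h>

/* SHIFT/MASK pairs from powernow-k8.h (see below). */
#define IRT_SHIFT	30
#define RVO_SHIFT	28
#define EXT_TYPE_SHIFT	27
#define PLL_L_SHIFT	20
#define MVS_SHIFT	18
#define VST_SHIFT	11
#define IRT_MASK	3
#define RVO_MASK	3
#define EXT_TYPE_MASK	1
#define PLL_L_MASK	0x7f
#define MVS_MASK	3
#define VST_MASK	0x7f

struct pst_params {
	unsigned irt, rvo, exttype, plllock, vidmvs, vstable;
};

/* Same field extraction as powernow_k8_acpi_pst_values() above. */
static struct pst_params decode_pss_control(uint64_t control)
{
	struct pst_params p = {
		.irt     = (control >> IRT_SHIFT) & IRT_MASK,
		.rvo     = (control >> RVO_SHIFT) & RVO_MASK,
		.exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK,
		.plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK,
		.vidmvs  = 1u << ((control >> MVS_SHIFT) & MVS_MASK),
		.vstable = (control >> VST_SHIFT) & VST_MASK,
	};
	return p;
}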
- printk(KERN_ERR PFX "invalid pstate %d - " - "bad value %d.\n", i, index); - printk(KERN_ERR PFX "Please report to BIOS " - "manufacturer\n"); - invalidate_entry(powernow_table, i); - continue; - } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); - if (!(hi & HW_PSTATE_VALID_MASK)) { - pr_debug("invalid pstate %d, ignoring\n", index); - invalidate_entry(powernow_table, i); - continue; - } - - powernow_table[i].index = index; - - /* Frequency may be rounded for these */ - if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) - || boot_cpu_data.x86 == 0x11) { - powernow_table[i].frequency = - freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); - } else - powernow_table[i].frequency = - data->acpi_data.states[i].core_frequency * 1000; - } - return 0; -} - -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, - struct cpufreq_frequency_table *powernow_table) -{ - int i; - - for (i = 0; i < data->acpi_data.state_count; i++) { - u32 fid; - u32 vid; - u32 freq, index; - u64 status, control; - - if (data->exttype) { - status = data->acpi_data.states[i].status; - fid = status & EXT_FID_MASK; - vid = (status >> VID_SHIFT) & EXT_VID_MASK; - } else { - control = data->acpi_data.states[i].control; - fid = control & FID_MASK; - vid = (control >> VID_SHIFT) & VID_MASK; - } - - pr_debug(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); - - index = fid | (vid<<8); - powernow_table[i].index = index; - - freq = find_khz_freq_from_fid(fid); - powernow_table[i].frequency = freq; - - /* verify frequency is OK */ - if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { - pr_debug("invalid freq %u kHz, ignoring\n", freq); - invalidate_entry(powernow_table, i); - continue; - } - - /* verify voltage is OK - - * BIOSs are using "off" to indicate invalid */ - if (vid == VID_OFF) { - pr_debug("invalid vid %u, ignoring\n", vid); - invalidate_entry(powernow_table, i); - continue; - } - - if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { - printk(KERN_INFO PFX "invalid freq entries " - "%u kHz vs. %u kHz\n", freq, - (unsigned int) - (data->acpi_data.states[i].core_frequency - * 1000)); - invalidate_entry(powernow_table, i); - continue; - } - } - return 0; -} - -static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) -{ - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, - data->cpu); - free_cpumask_var(data->acpi_data.shared_cpu_map); -} - -static int get_transition_latency(struct powernow_k8_data *data) -{ - int max_latency = 0; - int i; - for (i = 0; i < data->acpi_data.state_count; i++) { - int cur_latency = data->acpi_data.states[i].transition_latency - + data->acpi_data.states[i].bus_master_latency; - if (cur_latency > max_latency) - max_latency = cur_latency; - } - if (max_latency == 0) { - /* - * Fam 11h and later may return 0 as transition latency. This - * is intended and means "very fast". While cpufreq core and - * governors currently can handle that gracefully, better set it - * to 1 to avoid problems in the future. 
- */ - if (boot_cpu_data.x86 < 0x11) - printk(KERN_ERR FW_WARN PFX "Invalid zero transition " - "latency\n"); - max_latency = 1; - } - /* value in usecs, needs to be in nanoseconds */ - return 1000 * max_latency; -} - -/* Take a frequency, and issue the fid/vid transition command */ -static int transition_frequency_fidvid(struct powernow_k8_data *data, - unsigned int index) -{ - u32 fid = 0; - u32 vid = 0; - int res, i; - struct cpufreq_freqs freqs; - - pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* fid/vid correctness check for k8 */ - /* fid are the lower 8 bits of the index we stored into - * the cpufreq frequency table in find_psb_table, vid - * are the upper 8 bits. - */ - fid = data->powernow_table[index].index & 0xFF; - vid = (data->powernow_table[index].index & 0xFF00) >> 8; - - pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); - - if (query_current_values_with_pending_wait(data)) - return 1; - - if ((data->currvid == vid) && (data->currfid == fid)) { - pr_debug("target matches current values (fid 0x%x, vid 0x%x)\n", - fid, vid); - return 0; - } - - pr_debug("cpu %d, changing to fid 0x%x, vid 0x%x\n", - smp_processor_id(), fid, vid); - freqs.old = find_khz_freq_from_fid(data->currfid); - freqs.new = find_khz_freq_from_fid(fid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_fid_vid(data, fid, vid); - freqs.new = find_khz_freq_from_fid(data->currfid); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Take a frequency, and issue the hardware pstate transition command */ -static int transition_frequency_pstate(struct powernow_k8_data *data, - unsigned int index) -{ - u32 pstate = 0; - int res, i; - struct cpufreq_freqs freqs; - - pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); - - /* get MSR index for hardware pstate transition */ - pstate = index & HW_PSTATE_MASK; - if (pstate > data->max_hw_pstate) - return 0; - freqs.old = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - res = transition_pstate(data, pstate); - freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - return res; -} - -/* Driver entry point to switch to the target frequency */ -static int powernowk8_target(struct cpufreq_policy *pol, - unsigned targfreq, unsigned relation) -{ - cpumask_var_t oldmask; - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - u32 checkfid; - u32 checkvid; - unsigned int newstate; - int ret = -EIO; - - if (!data) - return -EINVAL; - - checkfid = data->currfid; - checkvid = data->currvid; - - /* only run on specific CPU from here on. 
*/ - /* This is poor form: use a workqueue or smp_call_function_single */ - if (!alloc_cpumask_var(&oldmask, GFP_KERNEL)) - return -ENOMEM; - - cpumask_copy(oldmask, tsk_cpus_allowed(current)); - set_cpus_allowed_ptr(current, cpumask_of(pol->cpu)); - - if (smp_processor_id() != pol->cpu) { - printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); - goto err_out; - } - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing targ, change pending bit set\n"); - goto err_out; - } - - pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n", - pol->cpu, targfreq, pol->min, pol->max, relation); - - if (query_current_values_with_pending_wait(data)) - goto err_out; - - if (cpu_family != CPU_HW_PSTATE) { - pr_debug("targ: curr fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - if ((checkvid != data->currvid) || - (checkfid != data->currfid)) { - printk(KERN_INFO PFX - "error - out of sync, fix 0x%x 0x%x, " - "vid 0x%x 0x%x\n", - checkfid, data->currfid, - checkvid, data->currvid); - } - } - - if (cpufreq_frequency_table_target(pol, data->powernow_table, - targfreq, relation, &newstate)) - goto err_out; - - mutex_lock(&fidvid_mutex); - - powernow_k8_acpi_pst_values(data, newstate); - - if (cpu_family == CPU_HW_PSTATE) - ret = transition_frequency_pstate(data, newstate); - else - ret = transition_frequency_fidvid(data, newstate); - if (ret) { - printk(KERN_ERR PFX "transition frequency failed\n"); - ret = 1; - mutex_unlock(&fidvid_mutex); - goto err_out; - } - mutex_unlock(&fidvid_mutex); - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - newstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - ret = 0; - -err_out: - set_cpus_allowed_ptr(current, oldmask); - free_cpumask_var(oldmask); - return ret; -} - -/* Driver entry point to verify the policy and range of frequencies */ -static int powernowk8_verify(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - return cpufreq_frequency_table_verify(pol, data->powernow_table); -} - -struct init_on_cpu { - struct powernow_k8_data *data; - int rc; -}; - -static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu) -{ - struct init_on_cpu *init_on_cpu = _init_on_cpu; - - if (pending_bit_stuck()) { - printk(KERN_ERR PFX "failing init, change pending bit set\n"); - init_on_cpu->rc = -ENODEV; - return; - } - - if (query_current_values_with_pending_wait(init_on_cpu->data)) { - init_on_cpu->rc = -ENODEV; - return; - } - - if (cpu_family == CPU_OPTERON) - fidvid_msr_init(); - - init_on_cpu->rc = 0; -} - -/* per CPU init entry point to the driver */ -static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) -{ - static const char ACPI_PSS_BIOS_BUG_MSG[] = - KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n" - FW_BUG PFX "Try again with latest BIOS.\n"; - struct powernow_k8_data *data; - struct init_on_cpu init_on_cpu; - int rc; - struct cpuinfo_x86 *c = &cpu_data(pol->cpu); - - if (!cpu_online(pol->cpu)) - return -ENODEV; - - smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1); - if (rc) - return -ENODEV; - - data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL); - if (!data) { - printk(KERN_ERR PFX "unable to alloc powernow_k8_data"); - return -ENOMEM; - } - - data->cpu = pol->cpu; - data->currpstate = HW_PSTATE_INVALID; - - if (powernow_k8_cpu_init_acpi(data)) { - /* - * Use the PSB BIOS structure. 
This is only available on - * an UP version, and is deprecated by AMD. - */ - if (num_online_cpus() != 1) { - printk_once(ACPI_PSS_BIOS_BUG_MSG); - goto err_out; - } - if (pol->cpu != 0) { - printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " - "CPU other than CPU0. Complain to your BIOS " - "vendor.\n"); - goto err_out; - } - rc = find_psb_table(data); - if (rc) - goto err_out; - - /* Take a crude guess here. - * That guess was in microseconds, so multiply with 1000 */ - pol->cpuinfo.transition_latency = ( - ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + - ((1 << data->irt) * 30)) * 1000; - } else /* ACPI _PSS objects available */ - pol->cpuinfo.transition_latency = get_transition_latency(data); - - /* only run on specific CPU from here on */ - init_on_cpu.data = data; - smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, - &init_on_cpu, 1); - rc = init_on_cpu.rc; - if (rc != 0) - goto err_out_exit_acpi; - - if (cpu_family == CPU_HW_PSTATE) - cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); - else - cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); - data->available_cores = pol->cpus; - - if (cpu_family == CPU_HW_PSTATE) - pol->cur = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - pol->cur = find_khz_freq_from_fid(data->currfid); - pr_debug("policy current frequency %d kHz\n", pol->cur); - - /* min/max the cpu is capable of */ - if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); - powernow_k8_cpu_exit_acpi(data); - kfree(data->powernow_table); - kfree(data); - return -EINVAL; - } - - /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; - - cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); - - if (cpu_family == CPU_HW_PSTATE) - pr_debug("cpu_init done, current pstate 0x%x\n", - data->currpstate); - else - pr_debug("cpu_init done, current fid 0x%x, vid 0x%x\n", - data->currfid, data->currvid); - - per_cpu(powernow_data, pol->cpu) = data; - - return 0; - -err_out_exit_acpi: - powernow_k8_cpu_exit_acpi(data); - -err_out: - kfree(data); - return -ENODEV; -} - -static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); - - if (!data) - return -EINVAL; - - powernow_k8_cpu_exit_acpi(data); - - cpufreq_frequency_table_put_attr(pol->cpu); - - kfree(data->powernow_table); - kfree(data); - per_cpu(powernow_data, pol->cpu) = NULL; - - return 0; -} - -static void query_values_on_cpu(void *_err) -{ - int *err = _err; - struct powernow_k8_data *data = __this_cpu_read(powernow_data); - - *err = query_current_values_with_pending_wait(data); -} - -static unsigned int powernowk8_get(unsigned int cpu) -{ - struct powernow_k8_data *data = per_cpu(powernow_data, cpu); - unsigned int khz = 0; - int err; - - if (!data) - return 0; - - smp_call_function_single(cpu, query_values_on_cpu, &err, true); - if (err) - goto out; - - if (cpu_family == CPU_HW_PSTATE) - khz = find_khz_freq_from_pstate(data->powernow_table, - data->currpstate); - else - khz = find_khz_freq_from_fid(data->currfid); - - -out: - return khz; -} - -static void _cpb_toggle_msrs(bool t) -{ - int cpu; - - get_online_cpus(); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (t) - reg->l &= ~BIT(25); - else - reg->l |= BIT(25); - } - 
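	/*
	 * MSR_K7_HWCR bit 25 is the boost-disable bit on these parts: set
	 * means core performance boost is off.  The loop above only edited
	 * the cached per-cpu copies; the write below pushes the new value
	 * to every online core so the whole package agrees on boost state.
	 */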
wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - put_online_cpus(); -} - -/* - * Switch on/off core performance boosting. - * - * 0=disable - * 1=enable. - */ -static void cpb_toggle(bool t) -{ - if (!cpb_capable) - return; - - if (t && !cpb_enabled) { - cpb_enabled = true; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting enabled.\n"); - } else if (!t && cpb_enabled) { - cpb_enabled = false; - _cpb_toggle_msrs(t); - printk(KERN_INFO PFX "Core Boosting disabled.\n"); - } -} - -static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, - size_t count) -{ - int ret = -EINVAL; - unsigned long val = 0; - - ret = strict_strtoul(buf, 10, &val); - if (!ret && (val == 0 || val == 1) && cpb_capable) - cpb_toggle(val); - else - return -EINVAL; - - return count; -} - -static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) -{ - return sprintf(buf, "%u\n", cpb_enabled); -} - -#define define_one_rw(_name) \ -static struct freq_attr _name = \ -__ATTR(_name, 0644, show_##_name, store_##_name) - -define_one_rw(cpb); - -static struct freq_attr *powernow_k8_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - &cpb, - NULL, -}; - -static struct cpufreq_driver cpufreq_amd64_driver = { - .verify = powernowk8_verify, - .target = powernowk8_target, - .bios_limit = acpi_processor_get_bios_limit, - .init = powernowk8_cpu_init, - .exit = __devexit_p(powernowk8_cpu_exit), - .get = powernowk8_get, - .name = "powernow-k8", - .owner = THIS_MODULE, - .attr = powernow_k8_attr, -}; - -/* - * Clear the boost-disable flag on the CPU_DOWN path so that this cpu - * cannot block the remaining ones from boosting. On the CPU_UP path we - * simply keep the boost-disable flag in sync with the current global - * state. - */ -static int cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) -{ - unsigned cpu = (long)hcpu; - u32 lo, hi; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - - if (!cpb_enabled) { - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo |= BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - } - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); - lo &= ~BIT(25); - wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block cpb_nb = { - .notifier_call = cpb_notify, -}; - -/* driver entry point for init */ -static int __cpuinit powernowk8_init(void) -{ - unsigned int i, supported_cpus = 0, cpu; - int rv; - - for_each_online_cpu(i) { - int rc; - smp_call_function_single(i, check_supported_cpu, &rc, 1); - if (rc == 0) - supported_cpus++; - } - - if (supported_cpus != num_online_cpus()) - return -ENODEV; - - printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - - cpb_capable = true; - - msrs = msrs_alloc(); - if (!msrs) { - printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); - return -ENOMEM; - } - - register_cpu_notifier(&cpb_nb); - - rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); - - for_each_cpu(cpu, cpu_online_mask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - cpb_enabled |= !(!!(reg->l & BIT(25))); - } - - printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", - (cpb_enabled ? 
"on" : "off")); - } - - rv = cpufreq_register_driver(&cpufreq_amd64_driver); - if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) { - unregister_cpu_notifier(&cpb_nb); - msrs_free(msrs); - msrs = NULL; - } - return rv; -} - -/* driver entry point for term */ -static void __exit powernowk8_exit(void) -{ - pr_debug("exit\n"); - - if (boot_cpu_has(X86_FEATURE_CPB)) { - msrs_free(msrs); - msrs = NULL; - - unregister_cpu_notifier(&cpb_nb); - } - - cpufreq_unregister_driver(&cpufreq_amd64_driver); -} - -MODULE_AUTHOR("Paul Devriendt and " - "Mark Langsdorf "); -MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); -MODULE_LICENSE("GPL"); - -late_initcall(powernowk8_init); -module_exit(powernowk8_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h deleted file mode 100644 index 3744d26cdc2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ /dev/null @@ -1,222 +0,0 @@ -/* - * (c) 2003-2006 Advanced Micro Devices, Inc. - * Your use of this code is subject to the terms and conditions of the - * GNU general public license version 2. See "COPYING" or - * http://www.gnu.org/licenses/gpl.html - */ - -enum pstate { - HW_PSTATE_INVALID = 0xff, - HW_PSTATE_0 = 0, - HW_PSTATE_1 = 1, - HW_PSTATE_2 = 2, - HW_PSTATE_3 = 3, - HW_PSTATE_4 = 4, - HW_PSTATE_5 = 5, - HW_PSTATE_6 = 6, - HW_PSTATE_7 = 7, -}; - -struct powernow_k8_data { - unsigned int cpu; - - u32 numps; /* number of p-states */ - u32 batps; /* number of p-states supported on battery */ - u32 max_hw_pstate; /* maximum legal hardware pstate */ - - /* these values are constant when the PSB is used to determine - * vid/fid pairings, but are modified during the ->target() call - * when ACPI is used */ - u32 rvo; /* ramp voltage offset */ - u32 irt; /* isochronous relief time */ - u32 vidmvs; /* usable value calculated from mvs */ - u32 vstable; /* voltage stabilization time, units 20 us */ - u32 plllock; /* pll lock time, units 1 us */ - u32 exttype; /* extended interface = 1 */ - - /* keep track of the current fid / vid or pstate */ - u32 currvid; - u32 currfid; - enum pstate currpstate; - - /* the powernow_table includes all frequency and vid/fid pairings: - * fid are the lower 8 bits of the index, vid are the upper 8 bits. - * frequency is in kHz */ - struct cpufreq_frequency_table *powernow_table; - - /* the acpi table needs to be kept. it's only available if ACPI was - * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; - - /* we need to keep track of associated cores, but let cpufreq - * handle hotplug events - so just point at cpufreq pol->cpus - * structure */ - struct cpumask *available_cores; -}; - -/* processor's cpuid instruction support */ -#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ -#define CPUID_XFAM 0x0ff00000 /* extended family */ -#define CPUID_XFAM_K8 0 -#define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_MASK 0x000c0000 -#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ -#define CPUID_USE_XFAM_XMOD 0x00000f00 -#define CPUID_GET_MAX_CAPABILITIES 0x80000000 -#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 -#define P_STATE_TRANSITION_CAPABLE 6 - -/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ -/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ -/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ -/* the register number is placed in ecx, and the data is returned in edx:eax. 
*/ - -#define MSR_FIDVID_CTL 0xc0010041 -#define MSR_FIDVID_STATUS 0xc0010042 - -/* Field definitions within the FID VID Low Control MSR : */ -#define MSR_C_LO_INIT_FID_VID 0x00010000 -#define MSR_C_LO_NEW_VID 0x00003f00 -#define MSR_C_LO_NEW_FID 0x0000003f -#define MSR_C_LO_VID_SHIFT 8 - -/* Field definitions within the FID VID High Control MSR : */ -#define MSR_C_HI_STP_GNT_TO 0x000fffff - -/* Field definitions within the FID VID Low Status MSR : */ -#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ -#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 -#define MSR_S_LO_MAX_FID 0x003f0000 -#define MSR_S_LO_START_FID 0x00003f00 -#define MSR_S_LO_CURRENT_FID 0x0000003f - -/* Field definitions within the FID VID High Status MSR : */ -#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 -#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 -#define MSR_S_HI_START_VID 0x00003f00 -#define MSR_S_HI_CURRENT_VID 0x0000003f -#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 - - -/* Hardware Pstate _PSS and MSR definitions */ -#define USE_HW_PSTATE 0x00000080 -#define HW_PSTATE_MASK 0x00000007 -#define HW_PSTATE_VALID_MASK 0x80000000 -#define HW_PSTATE_MAX_MASK 0x000000f0 -#define HW_PSTATE_MAX_SHIFT 4 -#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ -#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ -#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ -#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ - -/* define the two driver architectures */ -#define CPU_OPTERON 0 -#define CPU_HW_PSTATE 1 - - -/* - * There are restrictions frequencies have to follow: - * - only 1 entry in the low fid table ( <=1.4GHz ) - * - lowest entry in the high fid table must be >= 2 * the entry in the - * low fid table - * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry - * in the low fid table - * - the parts can only step at <= 200 MHz intervals, odd fid values are - * supported in revision G and later revisions. - * - lowest frequency must be >= interprocessor hypertransport link speed - * (only applies to MP systems obviously) - */ - -/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ -#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ -#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ - -#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ -#define HI_VCOFREQ_TABLE_BOTTOM 1600 - -#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ - -#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ -#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ - -#define MIN_FREQ 800 /* Min and max freqs, per spec */ -#define MAX_FREQ 5000 - -#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ -#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ - -#define VID_OFF 0x3f - -#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ - -#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ - -#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ -#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ - -/* - * Most values of interest are encoded in a single field of the _PSS - * entries: the "control" value. 
- */ - -#define IRT_SHIFT 30 -#define RVO_SHIFT 28 -#define EXT_TYPE_SHIFT 27 -#define PLL_L_SHIFT 20 -#define MVS_SHIFT 18 -#define VST_SHIFT 11 -#define VID_SHIFT 6 -#define IRT_MASK 3 -#define RVO_MASK 3 -#define EXT_TYPE_MASK 1 -#define PLL_L_MASK 0x7f -#define MVS_MASK 3 -#define VST_MASK 0x7f -#define VID_MASK 0x1f -#define FID_MASK 0x1f -#define EXT_VID_MASK 0x3f -#define EXT_FID_MASK 0x3f - - -/* - * Version 1.4 of the PSB table. This table is constructed by BIOS and is - * to tell the OS's power management driver which VIDs and FIDs are - * supported by this particular processor. - * If the data in the PSB / PST is wrong, then this driver will program the - * wrong values into hardware, which is very likely to lead to a crash. - */ - -#define PSB_ID_STRING "AMDK7PNOW!" -#define PSB_ID_STRING_LEN 10 - -#define PSB_VERSION_1_4 0x14 - -struct psb_s { - u8 signature[10]; - u8 tableversion; - u8 flags1; - u16 vstable; - u8 flags2; - u8 num_tables; - u32 cpuid; - u8 plllocktime; - u8 maxfid; - u8 maxvid; - u8 numps; -}; - -/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */ -struct pst_s { - u8 fid; - u8 vid; -}; - -static int core_voltage_pre_transition(struct powernow_k8_data *data, - u32 reqvid, u32 regfid); -static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid); -static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid); - -static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index); - -static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); -static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table); diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c deleted file mode 100644 index 1e205e6b172..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c +++ /dev/null @@ -1,192 +0,0 @@ -/* - * sc520_freq.c: cpufreq driver for the AMD Elan sc520 - * - * Copyright (C) 2005 Sean Young - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- * - * Based on elanfreq.c - * - * 2005-03-30: - initial revision - */ - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#define MMCR_BASE 0xfffef000 /* The default base address */ -#define OFFS_CPUCTL 0x2 /* CPU Control Register */ - -static __u8 __iomem *cpuctl; - -#define PFX "sc520_freq: " - -static struct cpufreq_frequency_table sc520_freq_table[] = { - {0x01, 100000}, - {0x02, 133000}, - {0, CPUFREQ_TABLE_END}, -}; - -static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) -{ - u8 clockspeed_reg = *cpuctl; - - switch (clockspeed_reg & 0x03) { - default: - printk(KERN_ERR PFX "error: cpuctl register has unexpected " - "value %02x\n", clockspeed_reg); - case 0x01: - return 100000; - case 0x02: - return 133000; - } -} - -static void sc520_freq_set_cpu_state(unsigned int state) -{ - - struct cpufreq_freqs freqs; - u8 clockspeed_reg; - - freqs.old = sc520_freq_get_cpu_frequency(0); - freqs.new = sc520_freq_table[state].frequency; - freqs.cpu = 0; /* AMD Elan is UP */ - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - - pr_debug("attempting to set frequency to %i kHz\n", - sc520_freq_table[state].frequency); - - local_irq_disable(); - - clockspeed_reg = *cpuctl & ~0x03; - *cpuctl = clockspeed_reg | sc520_freq_table[state].index; - - local_irq_enable(); - - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); -}; - -static int sc520_freq_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]); -} - -static int sc520_freq_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - - if (cpufreq_frequency_table_target(policy, sc520_freq_table, - target_freq, relation, &newstate)) - return -EINVAL; - - sc520_freq_set_cpu_state(newstate); - - return 0; -} - - -/* - * Module init and exit code - */ - -static int sc520_freq_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int result; - - /* capability check */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) - return -ENODEV; - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = 1000000; /* 1ms */ - policy->cur = sc520_freq_get_cpu_frequency(0); - - result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table); - if (result) - return result; - - cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu); - - return 0; -} - - -static int sc520_freq_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - - -static struct freq_attr *sc520_freq_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver sc520_freq_driver = { - .get = sc520_freq_get_cpu_frequency, - .verify = sc520_freq_verify, - .target = sc520_freq_target, - .init = sc520_freq_cpu_init, - .exit = sc520_freq_cpu_exit, - .name = "sc520_freq", - .owner = THIS_MODULE, - .attr = sc520_freq_attr, -}; - - -static int __init sc520_freq_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - int err; - - /* Test if we have the right hardware */ - if (c->x86_vendor != X86_VENDOR_AMD || - c->x86 != 4 || c->x86_model != 9) { - pr_debug("no Elan SC520 processor found!\n"); - return -ENODEV; - } - cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1); - if (!cpuctl) { - printk(KERN_ERR "sc520_freq: error: failed to remap memory\n"); - return -ENOMEM; - } - - err = cpufreq_register_driver(&sc520_freq_driver); - if 
(err) - iounmap(cpuctl); - - return err; -} - - -static void __exit sc520_freq_exit(void) -{ - cpufreq_unregister_driver(&sc520_freq_driver); - iounmap(cpuctl); -} - - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Sean Young "); -MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU"); - -module_init(sc520_freq_init); -module_exit(sc520_freq_exit); - diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c deleted file mode 100644 index 6ea3455def2..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ /dev/null @@ -1,633 +0,0 @@ -/* - * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium - * M (part of the Centrino chipset). - * - * Since the original Pentium M, most new Intel CPUs support Enhanced - * SpeedStep. - * - * Despite the "SpeedStep" in the name, this is almost entirely unlike - * traditional SpeedStep. - * - * Modelled on speedstep.c - * - * Copyright (C) 2003 Jeremy Fitzhardinge - */ - -#include -#include -#include -#include -#include /* current */ -#include -#include -#include - -#include -#include -#include - -#define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@vger.kernel.org" - -#define INTEL_MSR_RANGE (0xffff) - -struct cpu_id -{ - __u8 x86; /* CPU family */ - __u8 x86_model; /* model */ - __u8 x86_mask; /* stepping */ -}; - -enum { - CPU_BANIAS, - CPU_DOTHAN_A1, - CPU_DOTHAN_A2, - CPU_DOTHAN_B0, - CPU_MP4HT_D0, - CPU_MP4HT_E0, -}; - -static const struct cpu_id cpu_ids[] = { - [CPU_BANIAS] = { 6, 9, 5 }, - [CPU_DOTHAN_A1] = { 6, 13, 1 }, - [CPU_DOTHAN_A2] = { 6, 13, 2 }, - [CPU_DOTHAN_B0] = { 6, 13, 6 }, - [CPU_MP4HT_D0] = {15, 3, 4 }, - [CPU_MP4HT_E0] = {15, 4, 1 }, -}; -#define N_IDS ARRAY_SIZE(cpu_ids) - -struct cpu_model -{ - const struct cpu_id *cpu_id; - const char *model_name; - unsigned max_freq; /* max clock in kHz */ - - struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ -}; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x); - -/* Operating points for current CPU */ -static DEFINE_PER_CPU(struct cpu_model *, centrino_model); -static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); - -static struct cpufreq_driver centrino_driver; - -#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE - -/* Computes the correct form for IA32_PERF_CTL MSR for a particular - frequency/voltage operating point; frequency in MHz, volts in mV. - This is stored as "index" in the structure. */ -#define OP(mhz, mv) \ - { \ - .frequency = (mhz) * 1000, \ - .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \ - } - -/* - * These voltage tables were derived from the Intel Pentium M - * datasheet, document 25261202.pdf, Table 5. I have verified they - * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium - * M. 
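The OP() macro above packs one operating point into the low 16 bits of the IA32_PERF_CTL value: the bus ratio (MHz/100) in bits 8..15 and the voltage encoded as (mV - 700)/16 in bits 0..7. A standalone sketch that inverts the encoding (decode_op is a hypothetical name):

#include <stdio.h>

/* Invert the OP() encoding above: index -> (MHz, mV). */
static void decode_op(unsigned index, unsigned *mhz, unsigned *mv)
{
	*mhz = (index >> 8) * 100;
	*mv  = (index & 0xff) * 16 + 700;
}

int main(void)
{
	unsigned mhz, mv;

	/* OP(600, 844) encodes as ((600/100) << 8) | ((844 - 700) / 16) */
	decode_op((6 << 8) | 9, &mhz, &mv);
	printf("%u MHz at %u mV\n", mhz, mv);	/* 600 MHz at 844 mV */
	return 0;
}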
- */ - -/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ -static struct cpufreq_frequency_table banias_900[] = -{ - OP(600, 844), - OP(800, 988), - OP(900, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ -static struct cpufreq_frequency_table banias_1000[] = -{ - OP(600, 844), - OP(800, 972), - OP(900, 988), - OP(1000, 1004), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ -static struct cpufreq_frequency_table banias_1100[] = -{ - OP( 600, 956), - OP( 800, 1020), - OP( 900, 1100), - OP(1000, 1164), - OP(1100, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - - -/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ -static struct cpufreq_frequency_table banias_1200[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP( 900, 1020), - OP(1000, 1100), - OP(1100, 1164), - OP(1200, 1180), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.30GHz (Banias) */ -static struct cpufreq_frequency_table banias_1300[] = -{ - OP( 600, 956), - OP( 800, 1260), - OP(1000, 1292), - OP(1200, 1356), - OP(1300, 1388), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.40GHz (Banias) */ -static struct cpufreq_frequency_table banias_1400[] = -{ - OP( 600, 956), - OP( 800, 1180), - OP(1000, 1308), - OP(1200, 1436), - OP(1400, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.50GHz (Banias) */ -static struct cpufreq_frequency_table banias_1500[] = -{ - OP( 600, 956), - OP( 800, 1116), - OP(1000, 1228), - OP(1200, 1356), - OP(1400, 1452), - OP(1500, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.60GHz (Banias) */ -static struct cpufreq_frequency_table banias_1600[] = -{ - OP( 600, 956), - OP( 800, 1036), - OP(1000, 1164), - OP(1200, 1276), - OP(1400, 1420), - OP(1600, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; - -/* Intel Pentium M processor 1.70GHz (Banias) */ -static struct cpufreq_frequency_table banias_1700[] = -{ - OP( 600, 956), - OP( 800, 1004), - OP(1000, 1116), - OP(1200, 1228), - OP(1400, 1308), - OP(1700, 1484), - { .frequency = CPUFREQ_TABLE_END } -}; -#undef OP - -#define _BANIAS(cpuid, max, name) \ -{ .cpu_id = cpuid, \ - .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ - .max_freq = (max)*1000, \ - .op_points = banias_##max, \ -} -#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) - -/* CPU models, their operating frequency range, and freq/voltage - operating points */ -static struct cpu_model models[] = -{ - _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), - BANIAS(1000), - BANIAS(1100), - BANIAS(1200), - BANIAS(1300), - BANIAS(1400), - BANIAS(1500), - BANIAS(1600), - BANIAS(1700), - - /* NULL model_name is a wildcard */ - { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, - { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, - { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, - - { NULL, } -}; -#undef _BANIAS -#undef BANIAS - -static int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - struct cpu_model *model; - - for(model = models; model->cpu_id != NULL; model++) - if (centrino_verify_cpu_id(cpu, model->cpu_id) && - (model->model_name == NULL || - strcmp(cpu->x86_model_id, model->model_name) == 0)) - break; - - if (model->cpu_id == NULL) { - /* No match at all */ - pr_debug("no support for 
CPU model \"%s\": " - "send /proc/cpuinfo to " MAINTAINER "\n", - cpu->x86_model_id); - return -ENOENT; - } - - if (model->op_points == NULL) { - /* Matched a non-match */ - pr_debug("no table support for CPU model \"%s\"\n", - cpu->x86_model_id); - pr_debug("try using the acpi-cpufreq driver\n"); - return -ENOENT; - } - - per_cpu(centrino_model, policy->cpu) = model; - - pr_debug("found \"%s\": max frequency: %dkHz\n", - model->model_name, model->max_freq); - - return 0; -} - -#else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) -{ - return -ENODEV; -} -#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ - -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, - const struct cpu_id *x) -{ - if ((c->x86 == x->x86) && - (c->x86_model == x->x86_model) && - (c->x86_mask == x->x86_mask)) - return 1; - return 0; -} - -/* To be called only after centrino_model is initialized */ -static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) -{ - int i; - - /* - * Extract clock in kHz from PERF_CTL value - * for centrino, as some DSDTs are buggy. - * Ideally, this can be done using the acpi_data structure. - */ - if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || - (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { - msr = (msr >> 8) & 0xff; - return msr * 100000; - } - - if ((!per_cpu(centrino_model, cpu)) || - (!per_cpu(centrino_model, cpu)->op_points)) - return 0; - - msr &= 0xffff; - for (i = 0; - per_cpu(centrino_model, cpu)->op_points[i].frequency - != CPUFREQ_TABLE_END; - i++) { - if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) - return per_cpu(centrino_model, cpu)-> - op_points[i].frequency; - } - if (failsafe) - return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; - else - return 0; -} - -/* Return the current CPU frequency in kHz */ -static unsigned int get_cur_freq(unsigned int cpu) -{ - unsigned l, h; - unsigned clock_freq; - - rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); - clock_freq = extract_clock(l, cpu, 0); - - if (unlikely(clock_freq == 0)) { - /* - * On some CPUs, we can see transient MSR values (which are - * not present in _PSS), while CPU is doing some automatic - * P-state transition (like TM2). Get the last freq set - * in PERF_CTL. - */ - rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); - clock_freq = extract_clock(l, cpu, 1); - } - return clock_freq; -} - - -static int centrino_cpu_init(struct cpufreq_policy *policy) -{ - struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); - unsigned freq; - unsigned l, h; - int ret; - int i; - - /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || - !cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) - centrino_driver.flags |= CPUFREQ_CONST_LOOPS; - - if (policy->cpu != 0) - return -ENODEV; - - for (i = 0; i < N_IDS; i++) - if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) - break; - - if (i != N_IDS) - per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - - if (!per_cpu(centrino_cpu, policy->cpu)) { - pr_debug("found unsupported CPU with " - "Enhanced SpeedStep: send /proc/cpuinfo to " - MAINTAINER "\n"); - return -ENODEV; - } - - if (centrino_cpu_init_table(policy)) { - return -ENODEV; - } - - /* Check to see if Enhanced SpeedStep is enabled, and try to - enable it if not. 
*/ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; - pr_debug("trying to enable Enhanced SpeedStep (%x)\n", l); - wrmsr(MSR_IA32_MISC_ENABLE, l, h); - - /* check to see if it stuck */ - rdmsr(MSR_IA32_MISC_ENABLE, l, h); - if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { - printk(KERN_INFO PFX - "couldn't enable Enhanced SpeedStep\n"); - return -ENODEV; - } - } - - freq = get_cur_freq(policy->cpu); - policy->cpuinfo.transition_latency = 10000; - /* 10uS transition latency */ - policy->cur = freq; - - pr_debug("centrino_cpu_init: cur=%dkHz\n", policy->cur); - - ret = cpufreq_frequency_table_cpuinfo(policy, - per_cpu(centrino_model, policy->cpu)->op_points); - if (ret) - return (ret); - - cpufreq_frequency_table_get_attr( - per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); - - return 0; -} - -static int centrino_cpu_exit(struct cpufreq_policy *policy) -{ - unsigned int cpu = policy->cpu; - - if (!per_cpu(centrino_model, cpu)) - return -ENODEV; - - cpufreq_frequency_table_put_attr(cpu); - - per_cpu(centrino_model, cpu) = NULL; - - return 0; -} - -/** - * centrino_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within this model's frequency range, with at least - * one border included. - */ -static int centrino_verify (struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, - per_cpu(centrino_model, policy->cpu)->op_points); -} - -/** - * centrino_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int centrino_target (struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0; - unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; - struct cpufreq_freqs freqs; - int retval = 0; - unsigned int j, k, first_cpu, tmp; - cpumask_var_t covered_cpus; - - if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) - return -ENOMEM; - - if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { - retval = -ENODEV; - goto out; - } - - if (unlikely(cpufreq_frequency_table_target(policy, - per_cpu(centrino_model, cpu)->op_points, - target_freq, - relation, - &newstate))) { - retval = -EINVAL; - goto out; - } - - first_cpu = 1; - for_each_cpu(j, policy->cpus) { - int good_cpu; - - /* cpufreq holds the hotplug lock, so we are safe here */ - if (!cpu_online(j)) - continue; - - /* - * Support for SMP systems. - * Make sure we are running on the CPU that wants to change freq - */ - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - good_cpu = cpumask_any_and(policy->cpus, - cpu_online_mask); - else - good_cpu = j; - - if (good_cpu >= nr_cpu_ids) { - pr_debug("couldn't limit to CPUs in this domain\n"); - retval = -EAGAIN; - if (first_cpu) { - /* We haven't started the transition yet. 
*/ - goto out; - } - break; - } - - msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; - - if (first_cpu) { - rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h); - if (msr == (oldmsr & 0xffff)) { - pr_debug("no change needed - msr was and needs " - "to be %x\n", oldmsr); - retval = 0; - goto out; - } - - freqs.old = extract_clock(oldmsr, cpu, 0); - freqs.new = extract_clock(msr, cpu, 0); - - pr_debug("target=%dkHz old=%d new=%d msr=%04x\n", - target_freq, freqs.old, freqs.new, msr); - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, - CPUFREQ_PRECHANGE); - } - - first_cpu = 0; - /* all but 16 LSB are reserved, treat them with care */ - oldmsr &= ~0xffff; - msr &= 0xffff; - oldmsr |= msr; - } - - wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h); - if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - break; - - cpumask_set_cpu(j, covered_cpus); - } - - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - if (unlikely(retval)) { - /* - * We have failed halfway through the frequency change. - * We have sent callbacks to policy->cpus and - * MSRs have already been written on covered_cpus. - * Best effort undo.. - */ - - for_each_cpu(j, covered_cpus) - wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h); - - tmp = freqs.new; - freqs.new = freqs.old; - freqs.old = tmp; - for_each_cpu(j, policy->cpus) { - if (!cpu_online(j)) - continue; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - } - retval = 0; - -out: - free_cpumask_var(covered_cpus); - return retval; -} - -static struct freq_attr* centrino_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver centrino_driver = { - .name = "centrino", /* should be speedstep-centrino, - but there's a 16 char limit */ - .init = centrino_cpu_init, - .exit = centrino_cpu_exit, - .verify = centrino_verify, - .target = centrino_target, - .get = get_cur_freq, - .attr = centrino_attr, - .owner = THIS_MODULE, -}; - - -/** - * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver - * - * Initializes the Enhanced SpeedStep support. Returns -ENODEV on - * unsupported devices, -ENOENT if there's no voltage table for this - * particular CPU model, -EINVAL on problems during initialization, - * and zero on success. - * - * This is quite picky. Not only does the CPU have to advertise the - * "est" flag in the cpuid capability flags, we look for a specific - * CPU model and stepping, and we need to have the exact model name in - * our voltage tables. That is, be paranoid about not releasing - * someone's valuable magic smoke. 
- */ -static int __init centrino_init(void) -{ - struct cpuinfo_x86 *cpu = &cpu_data(0); - - if (!cpu_has(cpu, X86_FEATURE_EST)) - return -ENODEV; - - return cpufreq_register_driver(&centrino_driver); -} - -static void __exit centrino_exit(void) -{ - cpufreq_unregister_driver(&centrino_driver); -} - -MODULE_AUTHOR ("Jeremy Fitzhardinge "); -MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors."); -MODULE_LICENSE ("GPL"); - -late_initcall(centrino_init); -module_exit(centrino_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c deleted file mode 100644 index a748ce782fe..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ /dev/null @@ -1,448 +0,0 @@ -/* - * (C) 2001 Dave Jones, Arjan van de ven. - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * Based upon reverse engineered information, and on Intel documentation - * for chipsets ICH2-M and ICH3-M. - * - * Many thanks to Ducrot Bruno for finding and fixing the last - * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler - * for extensive testing. - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include "speedstep-lib.h" - - -/* speedstep_chipset: - * It is necessary to know which chipset is used. As accesses to - * this device occur at various places in this module, we need a - * static struct pci_dev * pointing to that device. - */ -static struct pci_dev *speedstep_chipset_dev; - - -/* speedstep_processor - */ -static enum speedstep_processor speedstep_processor; - -static u32 pmbase; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - - -/** - * speedstep_find_register - read the PMBASE address - * - * Returns: -ENODEV if no register could be found - */ -static int speedstep_find_register(void) -{ - if (!speedstep_chipset_dev) - return -ENODEV; - - /* get PMBASE */ - pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase); - if (!(pmbase & 0x01)) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pmbase &= 0xFFFFFFFE; - if (!pmbase) { - printk(KERN_ERR "speedstep-ich: could not find speedstep register\n"); - return -ENODEV; - } - - pr_debug("pmbase is 0x%x\n", pmbase); - return 0; -} - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - * Tries to change the SpeedStep state. Can be called from - * smp_call_function_single. 
- */ -static void speedstep_set_state(unsigned int state) -{ - u8 pm2_blk; - u8 value; - unsigned long flags; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - /* read state */ - value = inb(pmbase + 0x50); - - pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - /* write new state */ - value &= 0xFE; - value |= state; - - pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase); - - /* Disable bus master arbitration */ - pm2_blk = inb(pmbase + 0x20); - pm2_blk |= 0x01; - outb(pm2_blk, (pmbase + 0x20)); - - /* Actual transition */ - outb(value, (pmbase + 0x50)); - - /* Restore bus master arbitration */ - pm2_blk &= 0xfe; - outb(pm2_blk, (pmbase + 0x20)); - - /* check if transition was successful */ - value = inb(pmbase + 0x50); - - /* Enable IRQs */ - local_irq_restore(flags); - - pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value); - - if (state == (value & 0x1)) - pr_debug("change to %u MHz succeeded\n", - speedstep_get_frequency(speedstep_processor) / 1000); - else - printk(KERN_ERR "cpufreq: change failed - I/O error\n"); - - return; -} - -/* Wrapper for smp_call_function_single. */ -static void _speedstep_set_state(void *_state) -{ - speedstep_set_state(*(unsigned int *)_state); -} - -/** - * speedstep_activate - activate SpeedStep control in the chipset - * - * Tries to activate the SpeedStep status and control registers. - * Returns -EINVAL on an unsupported chipset, and zero on success. - */ -static int speedstep_activate(void) -{ - u16 value = 0; - - if (!speedstep_chipset_dev) - return -EINVAL; - - pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value); - if (!(value & 0x08)) { - value |= 0x08; - pr_debug("activating SpeedStep (TM) registers\n"); - pci_write_config_word(speedstep_chipset_dev, 0x00A0, value); - } - - return 0; -} - - -/** - * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic - * - * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to - * the LPC bridge / PM module which contains all power-management - * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected - * chipset, or zero on failure. - */ -static unsigned int speedstep_detect_chipset(void) -{ - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801DB_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 4; /* 4-M */ - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801CA_12, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) - return 3; /* 3-M */ - - - speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82801BA_10, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - if (speedstep_chipset_dev) { - /* speedstep.c causes lockups on Dell Inspirons 8000 and - * 8100 which use a pretty old revision of the 82815 - * host bridge. Abort on these systems. 
- */ - static struct pci_dev *hostbridge; - - hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_82815_MC, - PCI_ANY_ID, PCI_ANY_ID, - NULL); - - if (!hostbridge) - return 2; /* 2-M */ - - if (hostbridge->revision < 5) { - pr_debug("hostbridge does not support speedstep\n"); - speedstep_chipset_dev = NULL; - pci_dev_put(hostbridge); - return 0; - } - - pci_dev_put(hostbridge); - return 2; /* 2-M */ - } - - return 0; -} - -static void get_freq_data(void *_speed) -{ - unsigned int *speed = _speed; - - *speed = speedstep_get_frequency(speedstep_processor); -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - unsigned int speed; - - /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) - BUG(); - - pr_debug("detected %u kHz as current frequency\n", speed); - return speed; -} - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int newstate = 0, policy_cpu; - struct cpufreq_freqs freqs; - int i; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - freqs.old = speedstep_get(policy_cpu); - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = policy->cpu; - - pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new); - - /* no transition necessary */ - if (freqs.old == freqs.new) - return 0; - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } - - smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, - true); - - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. 
- */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - -struct get_freqs { - struct cpufreq_policy *policy; - int ret; -}; - -static void get_freqs_on_cpu(void *_get_freqs) -{ - struct get_freqs *get_freqs = _get_freqs; - - get_freqs->ret = - speedstep_get_freqs(speedstep_processor, - &speedstep_freqs[SPEEDSTEP_LOW].frequency, - &speedstep_freqs[SPEEDSTEP_HIGH].frequency, - &get_freqs->policy->cpuinfo.transition_latency, - &speedstep_set_state); -} - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int policy_cpu, speed; - struct get_freqs gf; - - /* only run on CPU to be set, or on its sibling */ -#ifdef CONFIG_SMP - cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); -#endif - policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); - - /* detect low and high frequency and transition latency */ - gf.policy = policy; - smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1); - if (gf.ret) - return gf.ret; - - /* get current speed setting */ - speed = speedstep_get(policy_cpu); - if (!speed) - return -EIO; - - pr_debug("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-ich", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * devices, -EINVAL on problems during initialization, and zero on - * success. - */ -static int __init speedstep_init(void) -{ - /* detect processor */ - speedstep_processor = speedstep_detect_processor(); - if (!speedstep_processor) { - pr_debug("Intel(R) SpeedStep(TM) capable processor " - "not found\n"); - return -ENODEV; - } - - /* detect chipset */ - if (!speedstep_detect_chipset()) { - pr_debug("Intel(R) SpeedStep(TM) for this chipset not " - "(yet) available.\n"); - return -ENODEV; - } - - /* activate speedstep support */ - if (speedstep_activate()) { - pci_dev_put(speedstep_chipset_dev); - return -EINVAL; - } - - if (speedstep_find_register()) - return -ENODEV; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. 
- */ -static void __exit speedstep_exit(void) -{ - pci_dev_put(speedstep_chipset_dev); - cpufreq_unregister_driver(&speedstep_driver); -} - - -MODULE_AUTHOR("Dave Jones , " - "Dominik Brodowski "); -MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets " - "with ICH-M southbridges."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c deleted file mode 100644 index 8af2d2fd9d5..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* - */ - -#include -#include -#include -#include -#include - -#include -#include -#include "speedstep-lib.h" - -#define PFX "speedstep-lib: " - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -static int relaxed_check; -#else -#define relaxed_check 0 -#endif - -/********************************************************************* - * GET PROCESSOR CORE SPEED IN KHZ * - *********************************************************************/ - -static unsigned int pentium3_get_frequency(enum speedstep_processor processor) -{ - /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ - struct { - unsigned int ratio; /* Frequency Multiplier (x10) */ - u8 bitmap; /* power on configuration bits - [27, 25:22] (in MSR 0x2a) */ - } msr_decode_mult[] = { - { 30, 0x01 }, - { 35, 0x05 }, - { 40, 0x02 }, - { 45, 0x06 }, - { 50, 0x00 }, - { 55, 0x04 }, - { 60, 0x0b }, - { 65, 0x0f }, - { 70, 0x09 }, - { 75, 0x0d }, - { 80, 0x0a }, - { 85, 0x26 }, - { 90, 0x20 }, - { 100, 0x2b }, - { 0, 0xff } /* error or unknown value */ - }; - - /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */ - struct { - unsigned int value; /* Front Side Bus speed in MHz */ - u8 bitmap; /* power on configuration bits [18: 19] - (in MSR 0x2a) */ - } msr_decode_fsb[] = { - { 66, 0x0 }, - { 100, 0x2 }, - { 133, 0x1 }, - { 0, 0xff} - }; - - u32 msr_lo, msr_tmp; - int i = 0, j = 0; - - /* read MSR 0x2a - we only need the low 32 bits */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - pr_debug("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - msr_tmp = msr_lo; - - /* decode the FSB */ - msr_tmp &= 0x00c0000; - msr_tmp >>= 18; - while (msr_tmp != msr_decode_fsb[i].bitmap) { - if (msr_decode_fsb[i].bitmap == 0xff) - return 0; - i++; - } - - /* decode the multiplier */ - if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) { - pr_debug("workaround for early PIIIs\n"); - msr_lo &= 0x03c00000; - } else - msr_lo &= 0x0bc00000; - msr_lo >>= 22; - while (msr_lo != msr_decode_mult[j].bitmap) { - if (msr_decode_mult[j].bitmap == 0xff) - return 0; - j++; - } - - pr_debug("speed is %u\n", - (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100)); - - return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100; -} - - -static unsigned int pentiumM_get_frequency(void) -{ - u32 msr_lo, msr_tmp; - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - pr_debug("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp); - - /* see table B-2 of 24547212.pdf */ - if (msr_lo & 0x00040000) { - printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n", - msr_lo, msr_tmp); - return 0; - } - - msr_tmp = (msr_lo >> 22) & 0x1f; - pr_debug("bits 22-26 are 0x%x, 
speed is %u\n", - msr_tmp, (msr_tmp * 100 * 1000)); - - return msr_tmp * 100 * 1000; -} - -static unsigned int pentium_core_get_frequency(void) -{ - u32 fsb = 0; - u32 msr_lo, msr_tmp; - int ret; - - rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp); - /* see table B-2 of 25366920.pdf */ - switch (msr_lo & 0x07) { - case 5: - fsb = 100000; - break; - case 1: - fsb = 133333; - break; - case 3: - fsb = 166667; - break; - case 2: - fsb = 200000; - break; - case 0: - fsb = 266667; - break; - case 4: - fsb = 333333; - break; - default: - printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value"); - } - - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp); - pr_debug("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", - msr_lo, msr_tmp); - - msr_tmp = (msr_lo >> 22) & 0x1f; - pr_debug("bits 22-26 are 0x%x, speed is %u\n", - msr_tmp, (msr_tmp * fsb)); - - ret = (msr_tmp * fsb); - return ret; -} - - -static unsigned int pentium4_get_frequency(void) -{ - struct cpuinfo_x86 *c = &boot_cpu_data; - u32 msr_lo, msr_hi, mult; - unsigned int fsb = 0; - unsigned int ret; - u8 fsb_code; - - /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency - * to System Bus Frequency Ratio Field in the Processor Frequency - * Configuration Register of the MSR. Therefore the current - * frequency cannot be calculated and has to be measured. - */ - if (c->x86_model < 2) - return cpu_khz; - - rdmsr(0x2c, msr_lo, msr_hi); - - pr_debug("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi); - - /* decode the FSB: see IA-32 Intel (C) Architecture Software - * Developer's Manual, Volume 3: System Prgramming Guide, - * revision #12 in Table B-1: MSRs in the Pentium 4 and - * Intel Xeon Processors, on page B-4 and B-5. - */ - fsb_code = (msr_lo >> 16) & 0x7; - switch (fsb_code) { - case 0: - fsb = 100 * 1000; - break; - case 1: - fsb = 13333 * 10; - break; - case 2: - fsb = 200 * 1000; - break; - } - - if (!fsb) - printk(KERN_DEBUG PFX "couldn't detect FSB speed. " - "Please send an e-mail to \n"); - - /* Multiplier. */ - mult = msr_lo >> 24; - - pr_debug("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n", - fsb, mult, (fsb * mult)); - - ret = (fsb * mult); - return ret; -} - - -/* Warning: may get called from smp_call_function_single. 
*/ -unsigned int speedstep_get_frequency(enum speedstep_processor processor) -{ - switch (processor) { - case SPEEDSTEP_CPU_PCORE: - return pentium_core_get_frequency(); - case SPEEDSTEP_CPU_PM: - return pentiumM_get_frequency(); - case SPEEDSTEP_CPU_P4D: - case SPEEDSTEP_CPU_P4M: - return pentium4_get_frequency(); - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - return pentium3_get_frequency(processor); - default: - return 0; - }; - return 0; -} -EXPORT_SYMBOL_GPL(speedstep_get_frequency); - - -/********************************************************************* - * DETECT SPEEDSTEP-CAPABLE PROCESSOR * - *********************************************************************/ - -unsigned int speedstep_detect_processor(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - u32 ebx, msr_lo, msr_hi; - - pr_debug("x86: %x, model: %x\n", c->x86, c->x86_model); - - if ((c->x86_vendor != X86_VENDOR_INTEL) || - ((c->x86 != 6) && (c->x86 != 0xF))) - return 0; - - if (c->x86 == 0xF) { - /* Intel Mobile Pentium 4-M - * or Intel Mobile Pentium 4 with 533 MHz FSB */ - if (c->x86_model != 2) - return 0; - - ebx = cpuid_ebx(0x00000001); - ebx &= 0x000000FF; - - pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask); - - switch (c->x86_mask) { - case 4: - /* - * B-stepping [M-P4-M] - * sample has ebx = 0x0f, production has 0x0e. - */ - if ((ebx == 0x0e) || (ebx == 0x0f)) - return SPEEDSTEP_CPU_P4M; - break; - case 7: - /* - * C-stepping [M-P4-M] - * needs to have ebx=0x0e, else it's a celeron: - * cf. 25130917.pdf / page 7, footnote 5 even - * though 25072120.pdf / page 7 doesn't say - * samples are only of B-stepping... - */ - if (ebx == 0x0e) - return SPEEDSTEP_CPU_P4M; - break; - case 9: - /* - * D-stepping [M-P4-M or M-P4/533] - * - * this is totally strange: CPUID 0x0F29 is - * used by M-P4-M, M-P4/533 and(!) Celeron CPUs. - * The latter need to be sorted out as they don't - * support speedstep. - * Celerons with CPUID 0x0F29 may have either - * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything - * specific. - * M-P4-Ms may have either ebx=0xe or 0xf [see above] - * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf] - * also, M-P4M HTs have ebx=0x8, too - * For now, they are distinguished by the model_id - * string - */ - if ((ebx == 0x0e) || - (strstr(c->x86_model_id, - "Mobile Intel(R) Pentium(R) 4") != NULL)) - return SPEEDSTEP_CPU_P4M; - break; - default: - break; - } - return 0; - } - - switch (c->x86_model) { - case 0x0B: /* Intel PIII [Tualatin] */ - /* cpuid_ebx(1) is 0x04 for desktop PIII, - * 0x06 for mobile PIII-M */ - ebx = cpuid_ebx(0x00000001); - pr_debug("ebx is %x\n", ebx); - - ebx &= 0x000000FF; - - if (ebx != 0x06) - return 0; - - /* So far all PIII-M processors support SpeedStep. See - * Intel's 24540640.pdf of June 2003 - */ - return SPEEDSTEP_CPU_PIII_T; - - case 0x08: /* Intel PIII [Coppermine] */ - - /* all mobile PIII Coppermines have FSB 100 MHz - * ==> sort out a few desktop PIIIs. */ - rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi); - pr_debug("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n", - msr_lo, msr_hi); - msr_lo &= 0x00c0000; - if (msr_lo != 0x0080000) - return 0; - - /* - * If the processor is a mobile version, - * platform ID has bit 50 set - * it has SpeedStep technology if either - * bit 56 or 57 is set - */ - rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi); - pr_debug("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n", - msr_lo, msr_hi); - if ((msr_hi & (1<<18)) && - (relaxed_check ? 
1 : (msr_hi & (3<<24)))) { - if (c->x86_mask == 0x01) { - pr_debug("early PIII version\n"); - return SPEEDSTEP_CPU_PIII_C_EARLY; - } else - return SPEEDSTEP_CPU_PIII_C; - } - - default: - return 0; - } -} -EXPORT_SYMBOL_GPL(speedstep_detect_processor); - - -/********************************************************************* - * DETECT SPEEDSTEP SPEEDS * - *********************************************************************/ - -unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)) -{ - unsigned int prev_speed; - unsigned int ret = 0; - unsigned long flags; - struct timeval tv1, tv2; - - if ((!processor) || (!low_speed) || (!high_speed) || (!set_state)) - return -EINVAL; - - pr_debug("trying to determine both speeds\n"); - - /* get current speed */ - prev_speed = speedstep_get_frequency(processor); - if (!prev_speed) - return -EIO; - - pr_debug("previous speed is %u\n", prev_speed); - - local_irq_save(flags); - - /* switch to low state */ - set_state(SPEEDSTEP_LOW); - *low_speed = speedstep_get_frequency(processor); - if (!*low_speed) { - ret = -EIO; - goto out; - } - - pr_debug("low speed is %u\n", *low_speed); - - /* start latency measurement */ - if (transition_latency) - do_gettimeofday(&tv1); - - /* switch to high state */ - set_state(SPEEDSTEP_HIGH); - - /* end latency measurement */ - if (transition_latency) - do_gettimeofday(&tv2); - - *high_speed = speedstep_get_frequency(processor); - if (!*high_speed) { - ret = -EIO; - goto out; - } - - pr_debug("high speed is %u\n", *high_speed); - - if (*low_speed == *high_speed) { - ret = -ENODEV; - goto out; - } - - /* switch to previous state, if necessary */ - if (*high_speed != prev_speed) - set_state(SPEEDSTEP_LOW); - - if (transition_latency) { - *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC + - tv2.tv_usec - tv1.tv_usec; - pr_debug("transition latency is %u uSec\n", *transition_latency); - - /* convert uSec to nSec and add 20% for safety reasons */ - *transition_latency *= 1200; - - /* check if the latency measurement is too high or too low - * and set it to a safe value (500uSec) in that case - */ - if (*transition_latency > 10000000 || - *transition_latency < 50000) { - printk(KERN_WARNING PFX "frequency transition " - "measured seems out of range (%u " - "nSec), falling back to a safe one of" - "%u nSec.\n", - *transition_latency, 500000); - *transition_latency = 500000; - } - } - -out: - local_irq_restore(flags); - return ret; -} -EXPORT_SYMBOL_GPL(speedstep_get_freqs); - -#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK -module_param(relaxed_check, int, 0444); -MODULE_PARM_DESC(relaxed_check, - "Don't do all checks for speedstep capability."); -#endif - -MODULE_AUTHOR("Dominik Brodowski "); -MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers."); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h deleted file mode 100644 index 70d9cea1219..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * (C) 2002 - 2003 Dominik Brodowski - * - * Licensed under the terms of the GNU GPL License version 2. - * - * Library for common functions for Intel SpeedStep v.1 and v.2 support - * - * BIG FAT DISCLAIMER: Work in progress code. 
 Possibly *dangerous* - */ - - - -/* processors */ -enum speedstep_processor { - SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */ - SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */ - SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */ -/* the following processors are not speedstep-capable and are not auto-detected - * in speedstep_detect_processor(). However, their speed can be detected using - * the speedstep_get_frequency() call. */ - SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */ - SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */ - SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */ -}; - -/* speedstep states -- only two of them */ - -#define SPEEDSTEP_HIGH 0x00000000 -#define SPEEDSTEP_LOW 0x00000001 - - -/* detect a speedstep-capable processor */ -extern enum speedstep_processor speedstep_detect_processor(void); - -/* detect the current speed (in kHz) of the processor */ -extern unsigned int speedstep_get_frequency(enum speedstep_processor processor); - - -/* detect the low and high speeds of the processor. The callback - * set_state's first argument is either SPEEDSTEP_HIGH or - * SPEEDSTEP_LOW; the second argument is zero so that no - * cpufreq_notify_transition calls are initiated. - */ -extern unsigned int speedstep_get_freqs(enum speedstep_processor processor, - unsigned int *low_speed, - unsigned int *high_speed, - unsigned int *transition_latency, - void (*set_state) (unsigned int state)); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c deleted file mode 100644 index c76ead3490b..00000000000 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ /dev/null @@ -1,464 +0,0 @@ -/* - * Intel SpeedStep SMI driver. - * - * (C) 2003 Hiroshi Miura - * - * Licensed under the terms of the GNU GPL License version 2. - * - */ - - -/********************************************************************* - * SPEEDSTEP - DEFINITIONS * - *********************************************************************/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "speedstep-lib.h" - -/* speedstep system management interface port/command. - * - * These parameters are obtained from the IST-SMI BIOS call. - * If the user supplies them, the user-supplied values are used instead. - * - */ -static int smi_port; -static int smi_cmd; -static unsigned int smi_sig; - -/* info about the processor */ -static enum speedstep_processor speedstep_processor; - -/* - * There are only two frequency states for each processor. Values - * are in kHz for the time being. - */ -static struct cpufreq_frequency_table speedstep_freqs[] = { - {SPEEDSTEP_HIGH, 0}, - {SPEEDSTEP_LOW, 0}, - {0, CPUFREQ_TABLE_END}, -}; - -#define GET_SPEEDSTEP_OWNER 0 -#define GET_SPEEDSTEP_STATE 1 -#define SET_SPEEDSTEP_STATE 2 -#define GET_SPEEDSTEP_FREQS 4 - -/* how often shall the SMI call be tried if it failed, e.g. because - * of DMA activity going on? 
*/ -#define SMI_TRIES 5 - -/** - * speedstep_smi_ownership - */ -static int speedstep_smi_ownership(void) -{ - u32 command, result, magic, dummy; - u32 function = GET_SPEEDSTEP_OWNER; - unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation"; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - magic = virt_to_phys(magic_data); - - pr_debug("trying to obtain ownership with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=D" (result), - "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy), - "=S" (dummy) - : "a" (command), "b" (function), "c" (0), "d" (smi_port), - "D" (0), "S" (magic) - : "memory" - ); - - pr_debug("result is %x\n", result); - - return result; -} - -/** - * speedstep_smi_get_freqs - get SpeedStep preferred & current freq. - * @low: the low frequency value is placed here - * @high: the high frequency value is placed here - * - * Only available on later SpeedStep-enabled systems, returns false results or - * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing - * shows that the latter occurs if !(ist_info.event & 0xFFFF). - */ -static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high) -{ - u32 command, result = 0, edi, high_mhz, low_mhz, dummy; - u32 state = 0; - u32 function = GET_SPEEDSTEP_FREQS; - - if (!(ist_info.event & 0xFFFF)) { - pr_debug("bug #1422 -- can't read freqs from BIOS\n"); - return -ENODEV; - } - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - pr_debug("trying to determine frequencies with command %x at port %x\n", - command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=a" (result), - "=b" (high_mhz), - "=c" (low_mhz), - "=d" (state), "=D" (edi), "=S" (dummy) - : "a" (command), - "b" (function), - "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - - pr_debug("result %x, low_freq %u, high_freq %u\n", - result, low_mhz, high_mhz); - - /* abort if results are obviously incorrect... 
- */ - if ((high_mhz + low_mhz) < 600) - return -EINVAL; - - *high = high_mhz * 1000; - *low = low_mhz * 1000; - - return result; -} - -/** - * speedstep_get_state - read the current SpeedStep state - * - * Returns the processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH). - */ -static int speedstep_get_state(void) -{ - u32 function = GET_SPEEDSTEP_STATE; - u32 result, state, edi, command, dummy; - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - pr_debug("trying to determine current setting with command %x " - "at port %x\n", command, smi_port); - - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp\n" - : "=a" (result), - "=b" (state), "=D" (edi), - "=c" (dummy), "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (0), - "d" (smi_port), "S" (0), "D" (0) - ); - - pr_debug("state is %x, result is %x\n", state, result); - - return state & 1; -} - - -/** - * speedstep_set_state - set the SpeedStep state - * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH) - * - */ -static void speedstep_set_state(unsigned int state) -{ - unsigned int result = 0, command, new_state, dummy; - unsigned long flags; - unsigned int function = SET_SPEEDSTEP_STATE; - unsigned int retry = 0; - - if (state > 0x1) - return; - - /* Disable IRQs */ - local_irq_save(flags); - - command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff); - - pr_debug("trying to set frequency to state %u " - "with command %x at port %x\n", - state, command, smi_port); - - do { - if (retry) { - pr_debug("retry %u, previous result %u, waiting...\n", - retry, result); - mdelay(retry * 50); - } - retry++; - __asm__ __volatile__( - "push %%ebp\n" - "out %%al, (%%dx)\n" - "pop %%ebp" - : "=b" (new_state), "=D" (result), - "=c" (dummy), "=a" (dummy), - "=d" (dummy), "=S" (dummy) - : "a" (command), "b" (function), "c" (state), - "d" (smi_port), "S" (0), "D" (0) - ); - } while ((new_state != state) && (retry <= SMI_TRIES)); - - /* enable IRQs */ - local_irq_restore(flags); - - if (new_state == state) - pr_debug("change to %u MHz succeeded after %u tries " - "with result %u\n", - (speedstep_freqs[new_state].frequency / 1000), - retry, result); - else - printk(KERN_ERR "cpufreq: change to state %u " - "failed with new_state %u and result %u\n", - state, new_state, result); - - return; -} - - -/** - * speedstep_target - set a new CPUFreq policy - * @policy: new policy - * @target_freq: new freq - * @relation: how that frequency relates to achieved frequency - * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) - * - * Sets a new CPUFreq policy/freq. - */ -static int speedstep_target(struct cpufreq_policy *policy, - unsigned int target_freq, unsigned int relation) -{ - unsigned int newstate = 0; - struct cpufreq_freqs freqs; - - if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], - target_freq, relation, &newstate)) - return -EINVAL; - - freqs.old = speedstep_freqs[speedstep_get_state()].frequency; - freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = 0; /* speedstep.c is UP only driver */ - - if (freqs.old == freqs.new) - return 0; - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - speedstep_set_state(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - - return 0; -} - - -/** - * speedstep_verify - verifies a new CPUFreq policy - * @policy: new policy - * - * Limit must be within speedstep_low_freq and speedstep_high_freq, with - * at least one border included. 
- */ -static int speedstep_verify(struct cpufreq_policy *policy) -{ - return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]); -} - - -static int speedstep_cpu_init(struct cpufreq_policy *policy) -{ - int result; - unsigned int speed, state; - unsigned int *low, *high; - - /* capability check */ - if (policy->cpu != 0) - return -ENODEV; - - result = speedstep_smi_ownership(); - if (result) { - pr_debug("failed to acquire ownership of the SMI interface.\n"); - return -EINVAL; - } - - /* detect low and high frequency */ - low = &speedstep_freqs[SPEEDSTEP_LOW].frequency; - high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency; - - result = speedstep_smi_get_freqs(low, high); - if (result) { - /* fall back to speedstep_lib.c detection mechanism: - * try both states out */ - pr_debug("could not detect low and high frequencies " - "by SMI call.\n"); - result = speedstep_get_freqs(speedstep_processor, - low, high, - NULL, - &speedstep_set_state); - - if (result) { - pr_debug("could not detect two different speeds" - " -- aborting.\n"); - return result; - } else - pr_debug("workaround worked.\n"); - } - - /* get current speed setting */ - state = speedstep_get_state(); - speed = speedstep_freqs[state].frequency; - - pr_debug("currently at %s speed setting - %i MHz\n", - (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency) - ? "low" : "high", - (speed / 1000)); - - /* cpuinfo and default policy values */ - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; - policy->cur = speed; - - result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs); - if (result) - return result; - - cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu); - - return 0; -} - -static int speedstep_cpu_exit(struct cpufreq_policy *policy) -{ - cpufreq_frequency_table_put_attr(policy->cpu); - return 0; -} - -static unsigned int speedstep_get(unsigned int cpu) -{ - if (cpu) - return -ENODEV; - return speedstep_get_frequency(speedstep_processor); -} - - -static int speedstep_resume(struct cpufreq_policy *policy) -{ - int result = speedstep_smi_ownership(); - - if (result) - pr_debug("failed to re-acquire ownership of the SMI interface.\n"); - - return result; -} - -static struct freq_attr *speedstep_attr[] = { - &cpufreq_freq_attr_scaling_available_freqs, - NULL, -}; - -static struct cpufreq_driver speedstep_driver = { - .name = "speedstep-smi", - .verify = speedstep_verify, - .target = speedstep_target, - .init = speedstep_cpu_init, - .exit = speedstep_cpu_exit, - .get = speedstep_get, - .resume = speedstep_resume, - .owner = THIS_MODULE, - .attr = speedstep_attr, -}; - -/** - * speedstep_init - initializes the SpeedStep CPUFreq driver - * - * Initializes the SpeedStep support. Returns -ENODEV on unsupported - * BIOS, -EINVAL on problems during initialization, and zero on - * success. 
- */ -static int __init speedstep_init(void) -{ - speedstep_processor = speedstep_detect_processor(); - - switch (speedstep_processor) { - case SPEEDSTEP_CPU_PIII_T: - case SPEEDSTEP_CPU_PIII_C: - case SPEEDSTEP_CPU_PIII_C_EARLY: - break; - default: - speedstep_processor = 0; - } - - if (!speedstep_processor) { - pr_debug("No supported Intel CPU detected.\n"); - return -ENODEV; - } - - pr_debug("signature:0x%.8ulx, command:0x%.8ulx, " - "event:0x%.8ulx, perf_level:0x%.8ulx.\n", - ist_info.signature, ist_info.command, - ist_info.event, ist_info.perf_level); - - /* Error if no IST-SMI BIOS or no PARM - sig= 'ISGE' aka 'Intel Speedstep Gate E' */ - if ((ist_info.signature != 0x47534943) && ( - (smi_port == 0) || (smi_cmd == 0))) - return -ENODEV; - - if (smi_sig == 1) - smi_sig = 0x47534943; - else - smi_sig = ist_info.signature; - - /* setup smi_port from MODULE_PARM or BIOS */ - if ((smi_port > 0xff) || (smi_port < 0)) - return -EINVAL; - else if (smi_port == 0) - smi_port = ist_info.command & 0xff; - - if ((smi_cmd > 0xff) || (smi_cmd < 0)) - return -EINVAL; - else if (smi_cmd == 0) - smi_cmd = (ist_info.command >> 16) & 0xff; - - return cpufreq_register_driver(&speedstep_driver); -} - - -/** - * speedstep_exit - unregisters SpeedStep support - * - * Unregisters SpeedStep support. - */ -static void __exit speedstep_exit(void) -{ - cpufreq_unregister_driver(&speedstep_driver); -} - -module_param(smi_port, int, 0444); -module_param(smi_cmd, int, 0444); -module_param(smi_sig, uint, 0444); - -MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value " - "-- Intel's default setting is 0xb2"); -MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value " - "-- Intel's default setting is 0x82"); -MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the " - "SMI interface."); - -MODULE_AUTHOR("Hiroshi Miura"); -MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface."); -MODULE_LICENSE("GPL"); - -module_init(speedstep_init); -module_exit(speedstep_exit); diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig index b78baa547ef..9fb84853d8e 100644 --- a/drivers/cpufreq/Kconfig +++ b/drivers/cpufreq/Kconfig @@ -1,3 +1,5 @@ +menu "CPU Frequency scaling" + config CPU_FREQ bool "CPU Frequency scaling" help @@ -177,4 +179,10 @@ config CPU_FREQ_GOV_CONSERVATIVE If in doubt, say N. -endif # CPU_FREQ +menu "x86 CPU frequency scaling drivers" +depends on X86 +source "drivers/cpufreq/Kconfig.x86" +endmenu + +endif +endmenu diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 new file mode 100644 index 00000000000..343f8476048 --- /dev/null +++ b/drivers/cpufreq/Kconfig.x86 @@ -0,0 +1,255 @@ +# +# x86 CPU Frequency scaling drivers +# + +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + . + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + +config X86_ACPI_CPUFREQ + tristate "ACPI Processor P-States driver" + select CPU_FREQ_TABLE + depends on ACPI_PROCESSOR + help + This driver adds a CPUFreq driver which utilizes the ACPI + Processor Performance States. + This driver also supports Intel Enhanced Speedstep. + + To compile this driver as a module, choose M here: the + module will be called acpi-cpufreq. + + For details, take a look at . + + If in doubt, say N. 
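As background for the P-state handling the entry above describes: on Enhanced SpeedStep capable CPUs the current P-state is identified by the low 16 bits of MSR_IA32_PERF_STATUS, which the driver matches against its frequency table (compare extract_msr() in acpi-cpufreq.c later in this patch). The following self-contained, user-space sketch mirrors only that table lookup; the status/frequency pairs are invented for illustration and do not come from any real _PSS table.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

struct freq_entry {
	unsigned int status;	/* control/status value, as a _PSS table would list it */
	unsigned int khz;	/* corresponding frequency in kHz */
};

/* hypothetical P-states, highest frequency first */
static const struct freq_entry table[] = {
	{ 0x0d2e, 1300000 },
	{ 0x0a24, 1000000 },
	{ 0x0612,  600000 },
};

static unsigned int decode_perf_status(unsigned int msr)
{
	unsigned int i;

	msr &= 0xffff;	/* only the low 16 bits identify the state */
	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		if (table[i].status == msr)
			return table[i].khz;
	return table[0].khz;	/* unknown value: fall back to the first entry */
}

int main(void)
{
	printf("%u kHz\n", decode_perf_status(0x0a24));	/* prints 1000000 */
	return 0;
}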
+ +config ELAN_CPUFREQ + tristate "AMD Elan SC400 and SC410" + select CPU_FREQ_TABLE + depends on X86_ELAN + ---help--- + This adds the CPUFreq driver for AMD Elan SC400 and SC410 + processors. + + You need to specify the processor maximum speed as boot + parameter: elanfreq=maxspeed (in kHz) or as module + parameter "max_freq". + + For details, take a look at . + + If in doubt, say N. + +config SC520_CPUFREQ + tristate "AMD Elan SC520" + select CPU_FREQ_TABLE + depends on X86_ELAN + ---help--- + This adds the CPUFreq driver for AMD Elan SC520 processor. + + For details, take a look at . + + If in doubt, say N. + + +config X86_POWERNOW_K6 + tristate "AMD Mobile K6-2/K6-3 PowerNow!" + select CPU_FREQ_TABLE + depends on X86_32 + help + This adds the CPUFreq driver for mobile AMD K6-2+ and mobile + AMD K6-3+ processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_POWERNOW_K7 + tristate "AMD Mobile Athlon/Duron PowerNow!" + select CPU_FREQ_TABLE + depends on X86_32 + help + This adds the CPUFreq driver for mobile AMD K7 processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_POWERNOW_K7_ACPI + bool + depends on X86_POWERNOW_K7 && ACPI_PROCESSOR + depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m) + depends on X86_32 + default y + +config X86_POWERNOW_K8 + tristate "AMD Opteron/Athlon64 PowerNow!" + select CPU_FREQ_TABLE + depends on ACPI && ACPI_PROCESSOR + help + This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors. + + To compile this driver as a module, choose M here: the + module will be called powernow-k8. + + For details, take a look at . + +config X86_GX_SUSPMOD + tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation" + depends on X86_32 && PCI + help + This adds the CPUFreq driver for NatSemi Geode processors which + support suspend modulation. + + For details, take a look at . + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO + tristate "Intel Enhanced SpeedStep (deprecated)" + select CPU_FREQ_TABLE + select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32 + depends on X86_32 || (X86_64 && ACPI_PROCESSOR) + help + This is deprecated and this functionality is now merged into + acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of + speedstep_centrino. + This adds the CPUFreq driver for Enhanced SpeedStep enabled + mobile CPUs. This means Intel Pentium M (Centrino) CPUs + or 64bit enabled Intel Xeons. + + To compile this driver as a module, choose M here: the + module will be called speedstep-centrino. + + For details, take a look at . + + If in doubt, say N. + +config X86_SPEEDSTEP_CENTRINO_TABLE + bool "Built-in tables for Banias CPUs" + depends on X86_32 && X86_SPEEDSTEP_CENTRINO + default y + help + Use built-in tables for Banias CPUs if ACPI encoding + is not available. + + If in doubt, say N. + +config X86_SPEEDSTEP_ICH + tristate "Intel Speedstep on ICH-M chipsets (ioport interface)" + select CPU_FREQ_TABLE + depends on X86_32 + help + This adds the CPUFreq driver for certain mobile Intel Pentium III + (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all + mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2, + ICH3 or ICH4 southbridge. + + For details, take a look at . + + If in doubt, say N. 
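The ioport interface named in the entry above is the one speedstep_set_state() in the deleted speedstep-ich.c drives: the state switch is a read-modify-write of bit 0 of the byte at PMBASE + 0x50, with all other bits preserved. A minimal sketch of just that bit arithmetic, with a plain variable standing in for the I/O register (illustrative only, no real port access):

#include <assert.h>

#define SPEEDSTEP_HIGH	0x0
#define SPEEDSTEP_LOW	0x1

/* returns the register byte to write back for the requested state */
static unsigned char set_state_bits(unsigned char reg, unsigned int state)
{
	reg &= 0xFE;		/* clear bit 0, keep the other bits intact */
	reg |= state & 0x1;	/* bit 0 selects low (1) or high (0) speed */
	return reg;
}

int main(void)
{
	assert(set_state_bits(0xA4, SPEEDSTEP_LOW) == 0xA5);
	assert(set_state_bits(0xA5, SPEEDSTEP_HIGH) == 0xA4);
	return 0;
}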
+ +config X86_SPEEDSTEP_SMI + tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)" + select CPU_FREQ_TABLE + depends on X86_32 && EXPERIMENTAL + help + This adds the CPUFreq driver for certain mobile Intel Pentium III + (Coppermine), all mobile Intel Pentium III-M (Tualatin) + on systems which have an Intel 440BX/ZX/MX southbridge. + + For details, take a look at . + + If in doubt, say N. + +config X86_P4_CLOCKMOD + tristate "Intel Pentium 4 clock modulation" + select CPU_FREQ_TABLE + help + This adds the CPUFreq driver for Intel Pentium 4 / XEON + processors. When enabled it will lower CPU temperature by skipping + clocks. + + This driver should be only used in exceptional + circumstances when very low power is needed because it causes severe + slowdowns and noticeable latencies. Normally Speedstep should be used + instead. + + To compile this driver as a module, choose M here: the + module will be called p4-clockmod. + + For details, take a look at . + + Unless you are absolutely sure say N. + +config X86_CPUFREQ_NFORCE2 + tristate "nVidia nForce2 FSB changing" + depends on X86_32 && EXPERIMENTAL + help + This adds the CPUFreq driver for FSB changing on nVidia nForce2 + platforms. + + For details, take a look at . + + If in doubt, say N. + +config X86_LONGRUN + tristate "Transmeta LongRun" + depends on X86_32 + help + This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors + which support LongRun. + + For details, take a look at . + + If in doubt, say N. + +config X86_LONGHAUL + tristate "VIA Cyrix III Longhaul" + select CPU_FREQ_TABLE + depends on X86_32 && ACPI_PROCESSOR + help + This adds the CPUFreq driver for VIA Samuel/CyrixIII, + VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T + processors. + + For details, take a look at . + + If in doubt, say N. + +config X86_E_POWERSAVER + tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)" + select CPU_FREQ_TABLE + depends on X86_32 && EXPERIMENTAL + help + This adds the CPUFreq driver for VIA C7 processors. However, this driver + does not have any safeguards to prevent operating the CPU out of spec + and is thus considered dangerous. Please use the regular ACPI cpufreq + driver, enabled by CONFIG_X86_ACPI_CPUFREQ. + + If in doubt, say N. + +comment "shared options" + +config X86_SPEEDSTEP_LIB + tristate + default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD) + +config X86_SPEEDSTEP_RELAXED_CAP_CHECK + bool "Relaxed speedstep capability checks" + depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH) + help + Don't perform all checks for a speedstep capable system which would + normally be done. Some ancient or strange systems, though speedstep + capable, don't always indicate that they are speedstep capable. This + option lets the probing code bypass some of those checks if the + parameter "relaxed_check=1" is passed to the module. + diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 71fc3b4173f..c7f1a6f16b6 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -13,3 +13,29 @@ obj-$(CONFIG_CPU_FREQ_GOV_CONSERVATIVE) += cpufreq_conservative.o # CPUfreq cross-arch helpers obj-$(CONFIG_CPU_FREQ_TABLE) += freq_table.o +##################################################################################d +# x86 drivers. +# Link order matters. K8 is preferred to ACPI because of firmware bugs in early +# K8 systems. ACPI is preferred to all other hardware-specific drivers. +# speedstep-* is preferred over p4-clockmod. 
+ +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o +obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o +obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o +obj-$(CONFIG_X86_LONGHAUL) += longhaul.o +obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o +obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o +obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o +obj-$(CONFIG_X86_LONGRUN) += longrun.o +obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o +obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o +obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o +obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o +obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o +obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o +obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o + +##################################################################################d + diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c new file mode 100644 index 00000000000..4e04e127438 --- /dev/null +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -0,0 +1,773 @@ +/* + * acpi-cpufreq.c - ACPI Processor P-States Driver + * + * Copyright (C) 2001, 2002 Andy Grover + * Copyright (C) 2001, 2002 Paul Diefenbaugh + * Copyright (C) 2002 - 2004 Dominik Brodowski + * Copyright (C) 2006 Denis Sadykov + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include "mperf.h" + +MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski"); +MODULE_DESCRIPTION("ACPI Processor P-States Driver"); +MODULE_LICENSE("GPL"); + +enum { + UNDEFINED_CAPABLE = 0, + SYSTEM_INTEL_MSR_CAPABLE, + SYSTEM_IO_CAPABLE, +}; + +#define INTEL_MSR_RANGE (0xffff) + +struct acpi_cpufreq_data { + struct acpi_processor_performance *acpi_data; + struct cpufreq_frequency_table *freq_table; + unsigned int resume; + unsigned int cpu_feature; +}; + +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); + +/* acpi_perf_data is a pointer to percpu data. 
*/ +static struct acpi_processor_performance __percpu *acpi_perf_data; + +static struct cpufreq_driver acpi_cpufreq_driver; + +static unsigned int acpi_pstate_strict; + +static int check_est_cpu(unsigned int cpuid) +{ + struct cpuinfo_x86 *cpu = &cpu_data(cpuid); + + return cpu_has(cpu, X86_FEATURE_EST); +} + +static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data) +{ + struct acpi_processor_performance *perf; + int i; + + perf = data->acpi_data; + + for (i = 0; i < perf->state_count; i++) { + if (value == perf->states[i].status) + return data->freq_table[i].frequency; + } + return 0; +} + +static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data) +{ + int i; + struct acpi_processor_performance *perf; + + msr &= INTEL_MSR_RANGE; + perf = data->acpi_data; + + for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { + if (msr == perf->states[data->freq_table[i].index].status) + return data->freq_table[i].frequency; + } + return data->freq_table[0].frequency; +} + +static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) +{ + switch (data->cpu_feature) { + case SYSTEM_INTEL_MSR_CAPABLE: + return extract_msr(val, data); + case SYSTEM_IO_CAPABLE: + return extract_io(val, data); + default: + return 0; + } +} + +struct msr_addr { + u32 reg; +}; + +struct io_addr { + u16 port; + u8 bit_width; +}; + +struct drv_cmd { + unsigned int type; + const struct cpumask *mask; + union { + struct msr_addr msr; + struct io_addr io; + } addr; + u32 val; +}; + +/* Called via smp_call_function_single(), on the target CPU */ +static void do_drv_read(void *_cmd) +{ + struct drv_cmd *cmd = _cmd; + u32 h; + + switch (cmd->type) { + case SYSTEM_INTEL_MSR_CAPABLE: + rdmsr(cmd->addr.msr.reg, cmd->val, h); + break; + case SYSTEM_IO_CAPABLE: + acpi_os_read_port((acpi_io_address)cmd->addr.io.port, + &cmd->val, + (u32)cmd->addr.io.bit_width); + break; + default: + break; + } +} + +/* Called via smp_call_function_many(), on the target CPUs */ +static void do_drv_write(void *_cmd) +{ + struct drv_cmd *cmd = _cmd; + u32 lo, hi; + + switch (cmd->type) { + case SYSTEM_INTEL_MSR_CAPABLE: + rdmsr(cmd->addr.msr.reg, lo, hi); + lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE); + wrmsr(cmd->addr.msr.reg, lo, hi); + break; + case SYSTEM_IO_CAPABLE: + acpi_os_write_port((acpi_io_address)cmd->addr.io.port, + cmd->val, + (u32)cmd->addr.io.bit_width); + break; + default: + break; + } +} + +static void drv_read(struct drv_cmd *cmd) +{ + int err; + cmd->val = 0; + + err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1); + WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? 
*/ +} + +static void drv_write(struct drv_cmd *cmd) +{ + int this_cpu; + + this_cpu = get_cpu(); + if (cpumask_test_cpu(this_cpu, cmd->mask)) + do_drv_write(cmd); + smp_call_function_many(cmd->mask, do_drv_write, cmd, 1); + put_cpu(); +} + +static u32 get_cur_val(const struct cpumask *mask) +{ + struct acpi_processor_performance *perf; + struct drv_cmd cmd; + + if (unlikely(cpumask_empty(mask))) + return 0; + + switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) { + case SYSTEM_INTEL_MSR_CAPABLE: + cmd.type = SYSTEM_INTEL_MSR_CAPABLE; + cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; + break; + case SYSTEM_IO_CAPABLE: + cmd.type = SYSTEM_IO_CAPABLE; + perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data; + cmd.addr.io.port = perf->control_register.address; + cmd.addr.io.bit_width = perf->control_register.bit_width; + break; + default: + return 0; + } + + cmd.mask = mask; + drv_read(&cmd); + + pr_debug("get_cur_val = %u\n", cmd.val); + + return cmd.val; +} + +static unsigned int get_cur_freq_on_cpu(unsigned int cpu) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); + unsigned int freq; + unsigned int cached_freq; + + pr_debug("get_cur_freq_on_cpu (%d)\n", cpu); + + if (unlikely(data == NULL || + data->acpi_data == NULL || data->freq_table == NULL)) { + return 0; + } + + cached_freq = data->freq_table[data->acpi_data->state].frequency; + freq = extract_freq(get_cur_val(cpumask_of(cpu)), data); + if (freq != cached_freq) { + /* + * The dreaded BIOS frequency change behind our back. + * Force set the frequency on next target call. + */ + data->resume = 1; + } + + pr_debug("cur freq = %u\n", freq); + + return freq; +} + +static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq, + struct acpi_cpufreq_data *data) +{ + unsigned int cur_freq; + unsigned int i; + + for (i = 0; i < 100; i++) { + cur_freq = extract_freq(get_cur_val(mask), data); + if (cur_freq == freq) + return 1; + udelay(10); + } + return 0; +} + +static int acpi_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + struct acpi_processor_performance *perf; + struct cpufreq_freqs freqs; + struct drv_cmd cmd; + unsigned int next_state = 0; /* Index into freq_table */ + unsigned int next_perf_state = 0; /* Index into perf table */ + unsigned int i; + int result = 0; + + pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); + + if (unlikely(data == NULL || + data->acpi_data == NULL || data->freq_table == NULL)) { + return -ENODEV; + } + + perf = data->acpi_data; + result = cpufreq_frequency_table_target(policy, + data->freq_table, + target_freq, + relation, &next_state); + if (unlikely(result)) { + result = -ENODEV; + goto out; + } + + next_perf_state = data->freq_table[next_state].index; + if (perf->state == next_perf_state) { + if (unlikely(data->resume)) { + pr_debug("Called after resume, resetting to P%d\n", + next_perf_state); + data->resume = 0; + } else { + pr_debug("Already at target state (P%d)\n", + next_perf_state); + goto out; + } + } + + switch (data->cpu_feature) { + case SYSTEM_INTEL_MSR_CAPABLE: + cmd.type = SYSTEM_INTEL_MSR_CAPABLE; + cmd.addr.msr.reg = MSR_IA32_PERF_CTL; + cmd.val = (u32) perf->states[next_perf_state].control; + break; + case SYSTEM_IO_CAPABLE: + cmd.type = SYSTEM_IO_CAPABLE; + cmd.addr.io.port = perf->control_register.address; + cmd.addr.io.bit_width = perf->control_register.bit_width; + cmd.val = (u32) 
perf->states[next_perf_state].control; + break; + default: + result = -ENODEV; + goto out; + } + + /* cpufreq holds the hotplug lock, so we are safe from here on */ + if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY) + cmd.mask = policy->cpus; + else + cmd.mask = cpumask_of(policy->cpu); + + freqs.old = perf->states[perf->state].core_frequency * 1000; + freqs.new = data->freq_table[next_state].frequency; + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + drv_write(&cmd); + + if (acpi_pstate_strict) { + if (!check_freqs(cmd.mask, freqs.new, data)) { + pr_debug("acpi_cpufreq_target failed (%d)\n", + policy->cpu); + result = -EAGAIN; + goto out; + } + } + + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + perf->state = next_perf_state; + +out: + return result; +} + +static int acpi_cpufreq_verify(struct cpufreq_policy *policy) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + + pr_debug("acpi_cpufreq_verify\n"); + + return cpufreq_frequency_table_verify(policy, data->freq_table); +} + +static unsigned long +acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu) +{ + struct acpi_processor_performance *perf = data->acpi_data; + + if (cpu_khz) { + /* search the closest match to cpu_khz */ + unsigned int i; + unsigned long freq; + unsigned long freqn = perf->states[0].core_frequency * 1000; + + for (i = 0; i < (perf->state_count-1); i++) { + freq = freqn; + freqn = perf->states[i+1].core_frequency * 1000; + if ((2 * cpu_khz) > (freqn + freq)) { + perf->state = i; + return freq; + } + } + perf->state = perf->state_count-1; + return freqn; + } else { + /* assume CPU is at P0... */ + perf->state = 0; + return perf->states[0].core_frequency * 1000; + } +} + +static void free_acpi_perf_data(void) +{ + unsigned int i; + + /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */ + for_each_possible_cpu(i) + free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) + ->shared_cpu_map); + free_percpu(acpi_perf_data); +} + +/* + * acpi_cpufreq_early_init - initialize ACPI P-States library + * + * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c) + * in order to determine correct frequency and voltage pairings. We can + * do _PDC and _PSD and find out the processor dependency for the + * actual init that will happen later... + */ +static int __init acpi_cpufreq_early_init(void) +{ + unsigned int i; + pr_debug("acpi_cpufreq_early_init\n"); + + acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) { + pr_debug("Memory allocation error for acpi_perf_data.\n"); + return -ENOMEM; + } + for_each_possible_cpu(i) { + if (!zalloc_cpumask_var_node( + &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, + GFP_KERNEL, cpu_to_node(i))) { + + /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ + free_acpi_perf_data(); + return -ENOMEM; + } + } + + /* Do initialization in ACPI core */ + acpi_processor_preregister_performance(acpi_perf_data); + return 0; +} + +#ifdef CONFIG_SMP +/* + * Some BIOSes do SW_ANY coordination internally, either set it up in hw + * or do it in BIOS firmware and won't inform about it to OS. If not + * detected, this has a side effect of making CPU run at a different speed + * than OS intended it to run at. Detect it and handle it cleanly. 
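
Stepping back to acpi_cpufreq_target() above: every driver in this series brackets its hardware write with PRECHANGE/POSTCHANGE notifications for each affected CPU. A condensed sketch of that contract — example_write_hw() is a hypothetical stand-in for the MSR or IO-port write:

#include <linux/cpufreq.h>

static void example_write_hw(unsigned int khz)
{
        /* hypothetical hardware poke (MSR or IO port write) */
}

static int example_transition(struct cpufreq_policy *policy,
                              unsigned int old_khz, unsigned int new_khz)
{
        struct cpufreq_freqs freqs = { .old = old_khz, .new = new_khz };
        unsigned int i;

        for_each_cpu(i, policy->cpus) {
                freqs.cpu = i;
                cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
        }

        example_write_hw(new_khz);

        for_each_cpu(i, policy->cpus) {
                freqs.cpu = i;
                cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
        }
        return 0;
}
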
+ */ +static int bios_with_sw_any_bug; + +static int sw_any_bug_found(const struct dmi_system_id *d) +{ + bios_with_sw_any_bug = 1; + return 0; +} + +static const struct dmi_system_id sw_any_bug_dmi_table[] = { + { + .callback = sw_any_bug_found, + .ident = "Supermicro Server X6DLP", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"), + DMI_MATCH(DMI_BIOS_VERSION, "080010"), + DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"), + }, + }, + { } +}; + +static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) +{ + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf + * AL30: A Machine Check Exception (MCE) Occurring during an + * Enhanced Intel SpeedStep Technology Ratio Change May Cause + * Both Processor Cores to Lock Up. */ + if (c->x86_vendor == X86_VENDOR_INTEL) { + if ((c->x86 == 15) && + (c->x86_model == 6) && + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); + return -ENODEV; + } + } + return 0; +} +#endif + +static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + unsigned int valid_states = 0; + unsigned int cpu = policy->cpu; + struct acpi_cpufreq_data *data; + unsigned int result = 0; + struct cpuinfo_x86 *c = &cpu_data(policy->cpu); + struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif + + pr_debug("acpi_cpufreq_cpu_init\n"); + +#ifdef CONFIG_SMP + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; +#endif + + data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); + per_cpu(acfreq_data, cpu) = data; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) + acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; + + result = acpi_processor_register_performance(data->acpi_data, cpu); + if (result) + goto err_free; + + perf = data->acpi_data; + policy->shared_type = perf->shared_type; + + /* + * Will let policy->cpus know about dependency only when software + * coordination is required. 
+ */ + if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL || + policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) { + cpumask_copy(policy->cpus, perf->shared_cpu_map); + } + cpumask_copy(policy->related_cpus, perf->shared_cpu_map); + +#ifdef CONFIG_SMP + dmi_check_system(sw_any_bug_dmi_table); + if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) { + policy->shared_type = CPUFREQ_SHARED_TYPE_ALL; + cpumask_copy(policy->cpus, cpu_core_mask(cpu)); + } +#endif + + /* capability check */ + if (perf->state_count <= 1) { + pr_debug("No P-States\n"); + result = -ENODEV; + goto err_unreg; + } + + if (perf->control_register.space_id != perf->status_register.space_id) { + result = -ENODEV; + goto err_unreg; + } + + switch (perf->control_register.space_id) { + case ACPI_ADR_SPACE_SYSTEM_IO: + pr_debug("SYSTEM IO addr space\n"); + data->cpu_feature = SYSTEM_IO_CAPABLE; + break; + case ACPI_ADR_SPACE_FIXED_HARDWARE: + pr_debug("HARDWARE addr space\n"); + if (!check_est_cpu(cpu)) { + result = -ENODEV; + goto err_unreg; + } + data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE; + break; + default: + pr_debug("Unknown addr space %d\n", + (u32) (perf->control_register.space_id)); + result = -ENODEV; + goto err_unreg; + } + + data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) * + (perf->state_count+1), GFP_KERNEL); + if (!data->freq_table) { + result = -ENOMEM; + goto err_unreg; + } + + /* detect transition latency */ + policy->cpuinfo.transition_latency = 0; + for (i = 0; i < perf->state_count; i++) { + if ((perf->states[i].transition_latency * 1000) > + policy->cpuinfo.transition_latency) + policy->cpuinfo.transition_latency = + perf->states[i].transition_latency * 1000; + } + + /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */ + if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE && + policy->cpuinfo.transition_latency > 20 * 1000) { + policy->cpuinfo.transition_latency = 20 * 1000; + printk_once(KERN_INFO + "P-state transition latency capped at 20 uS\n"); + } + + /* table init */ + for (i = 0; i < perf->state_count; i++) { + if (i > 0 && perf->states[i].core_frequency >= + data->freq_table[valid_states-1].frequency / 1000) + continue; + + data->freq_table[valid_states].index = i; + data->freq_table[valid_states].frequency = + perf->states[i].core_frequency * 1000; + valid_states++; + } + data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END; + perf->state = 0; + + result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table); + if (result) + goto err_freqfree; + + if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq) + printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n"); + + switch (perf->control_register.space_id) { + case ACPI_ADR_SPACE_SYSTEM_IO: + /* Current speed is unknown and not detectable by IO port */ + policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu); + break; + case ACPI_ADR_SPACE_FIXED_HARDWARE: + acpi_cpufreq_driver.get = get_cur_freq_on_cpu; + policy->cur = get_cur_freq_on_cpu(cpu); + break; + default: + break; + } + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + /* Check for APERF/MPERF support in hardware */ + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; + + pr_debug("CPU%u - ACPI performance management activated.\n", cpu); + for (i = 0; i < perf->state_count; i++) + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", + (i == perf->state ? 
'*' : ' '), i, + (u32) perf->states[i].core_frequency, + (u32) perf->states[i].power, + (u32) perf->states[i].transition_latency); + + cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); + + /* + * the first call to ->target() should result in us actually + * writing something to the appropriate registers. + */ + data->resume = 1; + + return result; + +err_freqfree: + kfree(data->freq_table); +err_unreg: + acpi_processor_unregister_performance(perf, cpu); +err_free: + kfree(data); + per_cpu(acfreq_data, cpu) = NULL; + + return result; +} + +static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + + pr_debug("acpi_cpufreq_cpu_exit\n"); + + if (data) { + cpufreq_frequency_table_put_attr(policy->cpu); + per_cpu(acfreq_data, policy->cpu) = NULL; + acpi_processor_unregister_performance(data->acpi_data, + policy->cpu); + kfree(data->freq_table); + kfree(data); + } + + return 0; +} + +static int acpi_cpufreq_resume(struct cpufreq_policy *policy) +{ + struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu); + + pr_debug("acpi_cpufreq_resume\n"); + + data->resume = 1; + + return 0; +} + +static struct freq_attr *acpi_cpufreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver acpi_cpufreq_driver = { + .verify = acpi_cpufreq_verify, + .target = acpi_cpufreq_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = acpi_cpufreq_cpu_init, + .exit = acpi_cpufreq_cpu_exit, + .resume = acpi_cpufreq_resume, + .name = "acpi-cpufreq", + .owner = THIS_MODULE, + .attr = acpi_cpufreq_attr, +}; + +static int __init acpi_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + pr_debug("acpi_cpufreq_init\n"); + + ret = acpi_cpufreq_early_init(); + if (ret) + return ret; + + ret = cpufreq_register_driver(&acpi_cpufreq_driver); + if (ret) + free_acpi_perf_data(); + + return ret; +} + +static void __exit acpi_cpufreq_exit(void) +{ + pr_debug("acpi_cpufreq_exit\n"); + + cpufreq_unregister_driver(&acpi_cpufreq_driver); + + free_percpu(acpi_perf_data); +} + +module_param(acpi_pstate_strict, uint, 0644); +MODULE_PARM_DESC(acpi_pstate_strict, + "value 0 or non-zero. non-zero -> strict ACPI checks are " + "performed during frequency changes."); + +late_initcall(acpi_cpufreq_init); +module_exit(acpi_cpufreq_exit); + +MODULE_ALIAS("acpi"); diff --git a/drivers/cpufreq/cpufreq-nforce2.c b/drivers/cpufreq/cpufreq-nforce2.c new file mode 100644 index 00000000000..7bac808804f --- /dev/null +++ b/drivers/cpufreq/cpufreq-nforce2.c @@ -0,0 +1,444 @@ +/* + * (C) 2004-2006 Sebastian Witt + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon reverse engineered information + * + * BIG FAT DISCLAIMER: Work in progress code. 
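
(Before the nForce2 code proper: the module glue that just closed acpi-cpufreq is the shape every driver in this series follows — register the cpufreq_driver on load, unregister on unload. A stripped-down sketch, with example_driver as an illustrative name:

#include <linux/cpufreq.h>
#include <linux/module.h>

static struct cpufreq_driver example_driver;    /* .verify/.target/.init elided */

static int __init example_init(void)
{
        return cpufreq_register_driver(&example_driver);
}

static void __exit example_exit(void)
{
        cpufreq_unregister_driver(&example_driver);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

)
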
Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include + +#define NFORCE2_XTAL 25 +#define NFORCE2_BOOTFSB 0x48 +#define NFORCE2_PLLENABLE 0xa8 +#define NFORCE2_PLLREG 0xa4 +#define NFORCE2_PLLADR 0xa0 +#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div) + +#define NFORCE2_MIN_FSB 50 +#define NFORCE2_SAFE_DISTANCE 50 + +/* Delay in ms between FSB changes */ +/* #define NFORCE2_DELAY 10 */ + +/* + * nforce2_chipset: + * FSB is changed using the chipset + */ +static struct pci_dev *nforce2_dev; + +/* fid: + * multiplier * 10 + */ +static int fid; + +/* min_fsb, max_fsb: + * minimum and maximum FSB (= FSB at boot time) + */ +static int min_fsb; +static int max_fsb; + +MODULE_AUTHOR("Sebastian Witt "); +MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver"); +MODULE_LICENSE("GPL"); + +module_param(fid, int, 0444); +module_param(min_fsb, int, 0444); + +MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)"); +MODULE_PARM_DESC(min_fsb, + "Minimum FSB to use, if not defined: current FSB - 50"); + +#define PFX "cpufreq-nforce2: " + +/** + * nforce2_calc_fsb - calculate FSB + * @pll: PLL value + * + * Calculates FSB from PLL value + */ +static int nforce2_calc_fsb(int pll) +{ + unsigned char mul, div; + + mul = (pll >> 8) & 0xff; + div = pll & 0xff; + + if (div > 0) + return NFORCE2_XTAL * mul / div; + + return 0; +} + +/** + * nforce2_calc_pll - calculate PLL value + * @fsb: FSB + * + * Calculate PLL value for given FSB + */ +static int nforce2_calc_pll(unsigned int fsb) +{ + unsigned char xmul, xdiv; + unsigned char mul = 0, div = 0; + int tried = 0; + + /* Try to calculate multiplier and divider up to 4 times */ + while (((mul == 0) || (div == 0)) && (tried <= 3)) { + for (xdiv = 2; xdiv <= 0x80; xdiv++) + for (xmul = 1; xmul <= 0xfe; xmul++) + if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) == + fsb + tried) { + mul = xmul; + div = xdiv; + } + tried++; + } + + if ((mul == 0) || (div == 0)) + return -1; + + return NFORCE2_PLL(mul, div); +} + +/** + * nforce2_write_pll - write PLL value to chipset + * @pll: PLL value + * + * Writes new FSB PLL value to chipset + */ +static void nforce2_write_pll(int pll) +{ + int temp; + + /* Set the pll addr. 
to 0x00 */ + pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0); + + /* Now write the value in all 64 registers */ + for (temp = 0; temp <= 0x3f; temp++) + pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll); + + return; +} + +/** + * nforce2_fsb_read - Read FSB + * + * Read FSB from chipset + * If bootfsb != 0, return FSB at boot-time + */ +static unsigned int nforce2_fsb_read(int bootfsb) +{ + struct pci_dev *nforce2_sub5; + u32 fsb, temp = 0; + + /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */ + nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF, + PCI_ANY_ID, PCI_ANY_ID, NULL); + if (!nforce2_sub5) + return 0; + + pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb); + fsb /= 1000000; + + /* Check if PLL register is already set */ + pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); + + if (bootfsb || !temp) + return fsb; + + /* Use PLL register FSB value */ + pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp); + fsb = nforce2_calc_fsb(temp); + + return fsb; +} + +/** + * nforce2_set_fsb - set new FSB + * @fsb: New FSB + * + * Sets new FSB + */ +static int nforce2_set_fsb(unsigned int fsb) +{ + u32 temp = 0; + unsigned int tfsb; + int diff; + int pll = 0; + + if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) { + printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb); + return -EINVAL; + } + + tfsb = nforce2_fsb_read(0); + if (!tfsb) { + printk(KERN_ERR PFX "Error while reading the FSB\n"); + return -EINVAL; + } + + /* First write? Then set actual value */ + pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp); + if (!temp) { + pll = nforce2_calc_pll(tfsb); + + if (pll < 0) + return -EINVAL; + + nforce2_write_pll(pll); + } + + /* Enable write access */ + temp = 0x01; + pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp); + + diff = tfsb - fsb; + + if (!diff) + return 0; + + while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) { + if (diff < 0) + tfsb++; + else + tfsb--; + + /* Calculate the PLL reg. value */ + pll = nforce2_calc_pll(tfsb); + if (pll == -1) + return -EINVAL; + + nforce2_write_pll(pll); +#ifdef NFORCE2_DELAY + mdelay(NFORCE2_DELAY); +#endif + } + + temp = 0x40; + pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp); + + return 0; +} + +/** + * nforce2_get - get the CPU frequency + * @cpu: CPU number + * + * Returns the CPU frequency + */ +static unsigned int nforce2_get(unsigned int cpu) +{ + if (cpu) + return 0; + return nforce2_fsb_read(0) * fid * 100; +} + +/** + * nforce2_target - set a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * Sets a new CPUFreq policy. 
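
The kHz arithmetic behind nforce2_get() and the target routine documented above is easy to check by hand: with the multiplier stored as fid = multiplier × 10, the frequency in kHz is FSB × fid × 100. A standalone check with assumed values:

#include <stdio.h>

int main(void)
{
        int fsb = 133;  /* MHz, as nforce2_fsb_read() would report */
        int fid = 115;  /* 11.5x multiplier, stored as x10 */

        /* kHz = fsb(MHz) * (fid/10) * 1000 = fsb * fid * 100 */
        printf("%d kHz\n", fsb * fid * 100);    /* 1529500, i.e. ~1.53 GHz */
        return 0;
}
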
+ */ +static int nforce2_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ +/* unsigned long flags; */ + struct cpufreq_freqs freqs; + unsigned int target_fsb; + + if ((target_freq > policy->max) || (target_freq < policy->min)) + return -EINVAL; + + target_fsb = target_freq / (fid * 100); + + freqs.old = nforce2_get(policy->cpu); + freqs.new = target_fsb * fid * 100; + freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ + + if (freqs.old == freqs.new) + return 0; + + pr_debug("Old CPU frequency %d kHz, new %d kHz\n", + freqs.old, freqs.new); + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Disable IRQs */ + /* local_irq_save(flags); */ + + if (nforce2_set_fsb(target_fsb) < 0) + printk(KERN_ERR PFX "Changing FSB to %d failed\n", + target_fsb); + else + pr_debug("Changed FSB successfully to %d\n", + target_fsb); + + /* Enable IRQs */ + /* local_irq_restore(flags); */ + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return 0; +} + +/** + * nforce2_verify - verifies a new CPUFreq policy + * @policy: new policy + */ +static int nforce2_verify(struct cpufreq_policy *policy) +{ + unsigned int fsb_pol_max; + + fsb_pol_max = policy->max / (fid * 100); + + if (policy->min < (fsb_pol_max * fid * 100)) + policy->max = (fsb_pol_max + 1) * fid * 100; + + cpufreq_verify_within_limits(policy, + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static int nforce2_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int fsb; + unsigned int rfid; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + /* Get current FSB */ + fsb = nforce2_fsb_read(0); + + if (!fsb) + return -EIO; + + /* FIX: Get FID from CPU */ + if (!fid) { + if (!cpu_khz) { + printk(KERN_WARNING PFX + "cpu_khz not set, can't calculate multiplier!\n"); + return -ENODEV; + } + + fid = cpu_khz / (fsb * 100); + rfid = fid % 5; + + if (rfid) { + if (rfid > 2) + fid += 5 - rfid; + else + fid -= rfid; + } + } + + printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb, + fid / 10, fid % 10); + + /* Set maximum FSB to FSB at boot time */ + max_fsb = nforce2_fsb_read(1); + + if (!max_fsb) + return -EIO; + + if (!min_fsb) + min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE; + + if (min_fsb < NFORCE2_MIN_FSB) + min_fsb = NFORCE2_MIN_FSB; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = min_fsb * fid * 100; + policy->cpuinfo.max_freq = max_fsb * fid * 100; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = nforce2_get(policy->cpu); + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + + return 0; +} + +static int nforce2_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver nforce2_driver = { + .name = "nforce2", + .verify = nforce2_verify, + .target = nforce2_target, + .get = nforce2_get, + .init = nforce2_cpu_init, + .exit = nforce2_cpu_exit, + .owner = THIS_MODULE, +}; + +/** + * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic + * + * Detects nForce2 A2 and C1 stepping + * + */ +static int nforce2_detect_chipset(void) +{ + nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, + PCI_DEVICE_ID_NVIDIA_NFORCE2, + PCI_ANY_ID, PCI_ANY_ID, NULL); + + if (nforce2_dev == NULL) + return -ENODEV; + + printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n", + nforce2_dev->revision); + printk(KERN_INFO PFX + "FSB changing is maybe unstable and can lead to " + "crashes and data loss.\n"); + + return 0; +} 
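
Note that pci_get_subsys() returns a referenced device, so the reference taken in nforce2_detect_chipset() must eventually be dropped with pci_dev_put() (the gx-suspmod driver later in this patch does this in its exit path). The lookup-and-release discipline, reduced to a sketch:

#include <linux/pci.h>

static struct pci_dev *example_find_chipset(void)
{
        /* On success this takes a reference on the returned device. */
        return pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
                              PCI_DEVICE_ID_NVIDIA_NFORCE2,
                              PCI_ANY_ID, PCI_ANY_ID, NULL);
}

static void example_release_chipset(struct pci_dev *dev)
{
        pci_dev_put(dev);       /* balances pci_get_subsys() */
}
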
+ +/** + * nforce2_init - initializes the nForce2 CPUFreq driver + * + * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported + * devices, -EINVAL on problems during initiatization, and zero on + * success. + */ +static int __init nforce2_init(void) +{ + /* TODO: do we need to detect the processor? */ + + /* detect chipset */ + if (nforce2_detect_chipset()) { + printk(KERN_INFO PFX "No nForce2 chipset.\n"); + return -ENODEV; + } + + return cpufreq_register_driver(&nforce2_driver); +} + +/** + * nforce2_exit - unregisters cpufreq module + * + * Unregisters nForce2 FSB change support. + */ +static void __exit nforce2_exit(void) +{ + cpufreq_unregister_driver(&nforce2_driver); +} + +module_init(nforce2_init); +module_exit(nforce2_exit); + diff --git a/drivers/cpufreq/e_powersaver.c b/drivers/cpufreq/e_powersaver.c new file mode 100644 index 00000000000..35a257dd4bb --- /dev/null +++ b/drivers/cpufreq/e_powersaver.c @@ -0,0 +1,367 @@ +/* + * Based on documentation provided by Dave Jones. Thanks! + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define EPS_BRAND_C7M 0 +#define EPS_BRAND_C7 1 +#define EPS_BRAND_EDEN 2 +#define EPS_BRAND_C3 3 +#define EPS_BRAND_C7D 4 + +struct eps_cpu_data { + u32 fsb; + struct cpufreq_frequency_table freq_table[]; +}; + +static struct eps_cpu_data *eps_cpu[NR_CPUS]; + + +static unsigned int eps_get(unsigned int cpu) +{ + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (cpu) + return 0; + centaur = eps_cpu[cpu]; + if (centaur == NULL) + return 0; + + /* Return current frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + return centaur->fsb * ((lo >> 8) & 0xff); +} + +static int eps_set_state(struct eps_cpu_data *centaur, + unsigned int cpu, + u32 dest_state) +{ + struct cpufreq_freqs freqs; + u32 lo, hi; + int err = 0; + int i; + + freqs.old = eps_get(cpu); + freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Wait while CPU is busy */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i = 0; + while (lo & ((1 << 16) | (1 << 17))) { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } + /* Set new multiplier and voltage */ + wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0); + /* Wait until transition end */ + i = 0; + do { + udelay(16); + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + i++; + if (unlikely(i > 64)) { + err = -ENODEV; + goto postchange; + } + } while (lo & ((1 << 16) | (1 << 17))); + + /* Return current frequency */ +postchange: + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + freqs.new = centaur->fsb * ((lo >> 8) & 0xff); + +#ifdef DEBUG + { + u8 current_multiplier, current_voltage; + + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk(KERN_INFO "eps: Current voltage = %dmV\n", + current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk(KERN_INFO "eps: Current multiplier = %d\n", + current_multiplier); + } +#endif + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + return err; +} + +static int eps_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct eps_cpu_data *centaur; + unsigned int newstate = 0; + unsigned int cpu = policy->cpu; + unsigned int 
dest_state; + int ret; + + if (unlikely(eps_cpu[cpu] == NULL)) + return -ENODEV; + centaur = eps_cpu[cpu]; + + if (unlikely(cpufreq_frequency_table_target(policy, + &eps_cpu[cpu]->freq_table[0], + target_freq, + relation, + &newstate))) { + return -EINVAL; + } + + /* Make frequency transition */ + dest_state = centaur->freq_table[newstate].index & 0xffff; + ret = eps_set_state(centaur, cpu, dest_state); + if (ret) + printk(KERN_ERR "eps: Timeout!\n"); + return ret; +} + +static int eps_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, + &eps_cpu[policy->cpu]->freq_table[0]); +} + +static int eps_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i; + u32 lo, hi; + u64 val; + u8 current_multiplier, current_voltage; + u8 max_multiplier, max_voltage; + u8 min_multiplier, min_voltage; + u8 brand = 0; + u32 fsb; + struct eps_cpu_data *centaur; + struct cpuinfo_x86 *c = &cpu_data(0); + struct cpufreq_frequency_table *f_table; + int k, step, voltage; + int ret; + int states; + + if (policy->cpu != 0) + return -ENODEV; + + /* Check brand */ + printk(KERN_INFO "eps: Detected VIA "); + + switch (c->x86_model) { + case 10: + rdmsr(0x1153, lo, hi); + brand = (((lo >> 2) ^ lo) >> 18) & 3; + printk(KERN_CONT "Model A "); + break; + case 13: + rdmsr(0x1154, lo, hi); + brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff; + printk(KERN_CONT "Model D "); + break; + } + + switch (brand) { + case EPS_BRAND_C7M: + printk(KERN_CONT "C7-M\n"); + break; + case EPS_BRAND_C7: + printk(KERN_CONT "C7\n"); + break; + case EPS_BRAND_EDEN: + printk(KERN_CONT "Eden\n"); + break; + case EPS_BRAND_C7D: + printk(KERN_CONT "C7-D\n"); + break; + case EPS_BRAND_C3: + printk(KERN_CONT "C3\n"); + return -ENODEV; + break; + } + /* Enable Enhanced PowerSaver */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP; + wrmsrl(MSR_IA32_MISC_ENABLE, val); + /* Can be locked at 0 */ + rdmsrl(MSR_IA32_MISC_ENABLE, val); + if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) { + printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n"); + return -ENODEV; + } + } + + /* Print voltage and multiplier */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + current_voltage = lo & 0xff; + printk(KERN_INFO "eps: Current voltage = %dmV\n", + current_voltage * 16 + 700); + current_multiplier = (lo >> 8) & 0xff; + printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier); + + /* Print limits */ + max_voltage = hi & 0xff; + printk(KERN_INFO "eps: Highest voltage = %dmV\n", + max_voltage * 16 + 700); + max_multiplier = (hi >> 8) & 0xff; + printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier); + min_voltage = (hi >> 16) & 0xff; + printk(KERN_INFO "eps: Lowest voltage = %dmV\n", + min_voltage * 16 + 700); + min_multiplier = (hi >> 24) & 0xff; + printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier); + + /* Sanity checks */ + if (current_multiplier == 0 || max_multiplier == 0 + || min_multiplier == 0) + return -EINVAL; + if (current_multiplier > max_multiplier + || max_multiplier <= min_multiplier) + return -EINVAL; + if (current_voltage > 0x1f || max_voltage > 0x1f) + return -EINVAL; + if (max_voltage < min_voltage) + return -EINVAL; + + /* Calc FSB speed */ + fsb = cpu_khz / current_multiplier; + /* Calc number of p-states supported */ + if (brand == EPS_BRAND_C7M) + states = max_multiplier - min_multiplier + 1; + else + states = 2; + + /* Allocate private data and frequency table for current 
cpu */ + centaur = kzalloc(sizeof(struct eps_cpu_data) + + (states + 1) * sizeof(struct cpufreq_frequency_table), + GFP_KERNEL); + if (!centaur) + return -ENOMEM; + eps_cpu[0] = centaur; + + /* Copy basic values */ + centaur->fsb = fsb; + + /* Fill frequency and MSR value table */ + f_table = ¢aur->freq_table[0]; + if (brand != EPS_BRAND_C7M) { + f_table[0].frequency = fsb * min_multiplier; + f_table[0].index = (min_multiplier << 8) | min_voltage; + f_table[1].frequency = fsb * max_multiplier; + f_table[1].index = (max_multiplier << 8) | max_voltage; + f_table[2].frequency = CPUFREQ_TABLE_END; + } else { + k = 0; + step = ((max_voltage - min_voltage) * 256) + / (max_multiplier - min_multiplier); + for (i = min_multiplier; i <= max_multiplier; i++) { + voltage = (k * step) / 256 + min_voltage; + f_table[k].frequency = fsb * i; + f_table[k].index = (i << 8) | voltage; + k++; + } + f_table[k].frequency = CPUFREQ_TABLE_END; + } + + policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */ + policy->cur = fsb * current_multiplier; + + ret = cpufreq_frequency_table_cpuinfo(policy, ¢aur->freq_table[0]); + if (ret) { + kfree(centaur); + return ret; + } + + cpufreq_frequency_table_get_attr(¢aur->freq_table[0], policy->cpu); + return 0; +} + +static int eps_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + struct eps_cpu_data *centaur; + u32 lo, hi; + + if (eps_cpu[cpu] == NULL) + return -ENODEV; + centaur = eps_cpu[cpu]; + + /* Get max frequency */ + rdmsr(MSR_IA32_PERF_STATUS, lo, hi); + /* Set max frequency */ + eps_set_state(centaur, cpu, hi & 0xffff); + /* Bye */ + cpufreq_frequency_table_put_attr(policy->cpu); + kfree(eps_cpu[cpu]); + eps_cpu[cpu] = NULL; + return 0; +} + +static struct freq_attr *eps_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver eps_driver = { + .verify = eps_verify, + .target = eps_target, + .init = eps_cpu_init, + .exit = eps_cpu_exit, + .get = eps_get, + .name = "e_powersaver", + .owner = THIS_MODULE, + .attr = eps_attr, +}; + +static int __init eps_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + /* This driver will work only on Centaur C7 processors with + * Enhanced SpeedStep/PowerSaver registers */ + if (c->x86_vendor != X86_VENDOR_CENTAUR + || c->x86 != 6 || c->x86_model < 10) + return -ENODEV; + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + if (cpufreq_register_driver(&eps_driver)) + return -EINVAL; + return 0; +} + +static void __exit eps_exit(void) +{ + cpufreq_unregister_driver(&eps_driver); +} + +MODULE_AUTHOR("Rafal Bilski "); +MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's."); +MODULE_LICENSE("GPL"); + +module_init(eps_init); +module_exit(eps_exit); diff --git a/drivers/cpufreq/elanfreq.c b/drivers/cpufreq/elanfreq.c new file mode 100644 index 00000000000..c587db472a7 --- /dev/null +++ b/drivers/cpufreq/elanfreq.c @@ -0,0 +1,309 @@ +/* + * elanfreq: cpufreq driver for the AMD ELAN family + * + * (c) Copyright 2002 Robert Schwebel + * + * Parts of this code are (c) Sven Geggus + * + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
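
The C7-M branch of the table fill just shown interpolates the voltage linearly between the MSR-reported limits, one entry per multiplier. A self-contained sketch of the same fill — the names and the kcalloc-based allocation are illustrative, and it assumes max_mult > min_mult, as the driver's sanity checks guarantee:

#include <linux/cpufreq.h>
#include <linux/slab.h>

static struct cpufreq_frequency_table *
example_fill_table(u32 fsb, u8 min_mult, u8 max_mult, u8 min_volt, u8 max_volt)
{
        int states = max_mult - min_mult + 1;
        struct cpufreq_frequency_table *t;
        int i, k = 0, step;

        t = kcalloc(states + 1, sizeof(*t), GFP_KERNEL);
        if (!t)
                return NULL;

        /* 8.8 fixed point: voltage increment per multiplier step. */
        step = ((max_volt - min_volt) * 256) / (max_mult - min_mult);
        for (i = min_mult; i <= max_mult; i++, k++) {
                t[k].frequency = fsb * i;       /* fsb is already in kHz */
                t[k].index = (i << 8) | ((k * step) / 256 + min_volt);
        }
        t[k].frequency = CPUFREQ_TABLE_END;
        return t;
}
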
+ * + * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel + * + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ +#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ + +/* Module parameter */ +static int max_freq; + +struct s_elan_multiplier { + int clock; /* frequency in kHz */ + int val40h; /* PMU Force Mode register */ + int val80h; /* CPU Clock Speed Register */ +}; + +/* + * It is important that the frequencies + * are listed in ascending order here! + */ +static struct s_elan_multiplier elan_multiplier[] = { + {1000, 0x02, 0x18}, + {2000, 0x02, 0x10}, + {4000, 0x02, 0x08}, + {8000, 0x00, 0x00}, + {16000, 0x00, 0x02}, + {33000, 0x00, 0x04}, + {66000, 0x01, 0x04}, + {99000, 0x01, 0x05} +}; + +static struct cpufreq_frequency_table elanfreq_table[] = { + {0, 1000}, + {1, 2000}, + {2, 4000}, + {3, 8000}, + {4, 16000}, + {5, 33000}, + {6, 66000}, + {7, 99000}, + {0, CPUFREQ_TABLE_END}, +}; + + +/** + * elanfreq_get_cpu_frequency: determine current cpu speed + * + * Finds out at which frequency the CPU of the Elan SOC runs + * at the moment. Frequencies from 1 to 33 MHz are generated + * the normal way, 66 and 99 MHz are called "Hyperspeed Mode" + * and have the rest of the chip running with 33 MHz. + */ + +static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) +{ + u8 clockspeed_reg; /* Clock Speed Register */ + + local_irq_disable(); + outb_p(0x80, REG_CSCIR); + clockspeed_reg = inb_p(REG_CSCDR); + local_irq_enable(); + + if ((clockspeed_reg & 0xE0) == 0xE0) + return 0; + + /* Are we in CPU clock multiplied mode (66/99 MHz)? */ + if ((clockspeed_reg & 0xE0) == 0xC0) { + if ((clockspeed_reg & 0x01) == 0) + return 66000; + else + return 99000; + } + + /* 33 MHz is not 32 MHz... */ + if ((clockspeed_reg & 0xE0) == 0xA0) + return 33000; + + return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; +} + + +/** + * elanfreq_set_cpu_frequency: Change the CPU core frequency + * @cpu: cpu number + * @freq: frequency in kHz + * + * This function takes a frequency value and changes the CPU frequency + * according to this. Note that the frequency has to be checked by + * elanfreq_validatespeed() for correctness! + * + * There is no return value. + */ + +static void elanfreq_set_cpu_state(unsigned int state) +{ + struct cpufreq_freqs freqs; + + freqs.old = elanfreq_get_cpu_frequency(0); + freqs.new = elan_multiplier[state].clock; + freqs.cpu = 0; /* elanfreq.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", + elan_multiplier[state].clock); + + + /* + * Access to the Elan's internal registers is indexed via + * 0x22: Chip Setup & Control Register Index Register (CSCI) + * 0x23: Chip Setup & Control Register Data Register (CSCD) + * + */ + + /* + * 0x40 is the Power Management Unit's Force Mode Register. 
+ * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency) + */ + + local_irq_disable(); + outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00, REG_CSCDR); + local_irq_enable(); /* wait till internal pipelines and */ + udelay(1000); /* buffers have cleaned up */ + + local_irq_disable(); + + /* now, set the CPU clock speed register (0x80) */ + outb_p(0x80, REG_CSCIR); + outb_p(elan_multiplier[state].val80h, REG_CSCDR); + + /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ + outb_p(0x40, REG_CSCIR); + outb_p(elan_multiplier[state].val40h, REG_CSCDR); + udelay(10000); + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +}; + + +/** + * elanfreq_validatespeed: test if frequency range is valid + * @policy: the policy to validate + * + * This function checks if a given frequency range in kHz is valid + * for the hardware supported by the driver. + */ + +static int elanfreq_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); +} + +static int elanfreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, &elanfreq_table[0], + target_freq, relation, &newstate)) + return -EINVAL; + + elanfreq_set_cpu_state(newstate); + + return 0; +} + + +/* + * Module init and exit code + */ + +static int elanfreq_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + unsigned int i; + int result; + + /* capability check */ + if ((c->x86_vendor != X86_VENDOR_AMD) || + (c->x86 != 4) || (c->x86_model != 10)) + return -ENODEV; + + /* max freq */ + if (!max_freq) + max_freq = elanfreq_get_cpu_frequency(0); + + /* table init */ + for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if (elanfreq_table[i].frequency > max_freq) + elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; + } + + /* cpuinfo and default policy values */ + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + policy->cur = elanfreq_get_cpu_frequency(0); + + result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); + if (result) + return result; + + cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); + return 0; +} + + +static int elanfreq_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + + +#ifndef MODULE +/** + * elanfreq_setup - elanfreq command line parameter parsing + * + * elanfreq command line parameter. Use: + * elanfreq=66000 + * to set the maximum CPU frequency to 66 MHz. Note that in + * case you do not give this boot parameter, the maximum + * frequency will fall back to _current_ CPU frequency which + * might be lower. If you build this as a module, use the + * max_freq module parameter instead. + */ +static int __init elanfreq_setup(char *str) +{ + max_freq = simple_strtoul(str, &str, 0); + printk(KERN_WARNING "You're using the deprecated elanfreq command line option. 
Use elanfreq.max_freq instead, please!\n"); + return 1; +} +__setup("elanfreq=", elanfreq_setup); +#endif + + +static struct freq_attr *elanfreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + + +static struct cpufreq_driver elanfreq_driver = { + .get = elanfreq_get_cpu_frequency, + .verify = elanfreq_verify, + .target = elanfreq_target, + .init = elanfreq_cpu_init, + .exit = elanfreq_cpu_exit, + .name = "elanfreq", + .owner = THIS_MODULE, + .attr = elanfreq_attr, +}; + + +static int __init elanfreq_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + /* Test if we have the right hardware */ + if ((c->x86_vendor != X86_VENDOR_AMD) || + (c->x86 != 4) || (c->x86_model != 10)) { + printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); + return -ENODEV; + } + return cpufreq_register_driver(&elanfreq_driver); +} + + +static void __exit elanfreq_exit(void) +{ + cpufreq_unregister_driver(&elanfreq_driver); +} + + +module_param(max_freq, int, 0444); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Robert Schwebel , " + "Sven Geggus "); +MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs"); + +module_init(elanfreq_init); +module_exit(elanfreq_exit); diff --git a/drivers/cpufreq/gx-suspmod.c b/drivers/cpufreq/gx-suspmod.c new file mode 100644 index 00000000000..ffe1f2c92ed --- /dev/null +++ b/drivers/cpufreq/gx-suspmod.c @@ -0,0 +1,514 @@ +/* + * Cyrix MediaGX and NatSemi Geode Suspend Modulation + * (C) 2002 Zwane Mwaikambo + * (C) 2002 Hiroshi Miura + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation + * + * The author(s) of this software shall not be held liable for damages + * of any nature resulting due to the use of this software. This + * software is provided AS-IS with no warranties. + * + * Theoretical note: + * + * (see Geode(tm) CS5530 manual (rev.4.1) page.56) + * + * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0 + * are based on Suspend Modulation. + * + * Suspend Modulation works by asserting and de-asserting the SUSP# pin + * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP# + * the CPU enters an idle state. GX1 stops its core clock when SUSP# is + * asserted then power consumption is reduced. + * + * Suspend Modulation's OFF/ON duration are configurable + * with 'Suspend Modulation OFF Count Register' + * and 'Suspend Modulation ON Count Register'. + * These registers are 8bit counters that represent the number of + * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF) + * to the processor. + * + * These counters define a ratio which is the effective frequency + * of operation of the system. + * + * OFF Count + * F_eff = Fgx * ---------------------- + * OFF Count + ON Count + * + * 0 <= On Count, Off Count <= 255 + * + * From these limits, we can get register values + * + * off_duration + on_duration <= MAX_DURATION + * on_duration = off_duration * (stock_freq - freq) / freq + * + * off_duration = (freq * DURATION) / stock_freq + * on_duration = DURATION - off_duration + * + * + *--------------------------------------------------------------------------- + * + * ChangeLog: + * Dec. 12, 2003 Hiroshi Miura + * - fix on/off register mistake + * - fix cpu_khz calc when it stops cpu modulation. + * + * Dec. 11, 2002 Hiroshi Miura + * - rewrite for Cyrix MediaGX Cx5510/5520 and + * NatSemi Geode Cs5530(A). + * + * Jul. 
??, 2002 Zwane Mwaikambo + * - cs5530_mod patch for 2.4.19-rc1. + * + *--------------------------------------------------------------------------- + * + * Todo + * Test on machines with 5510, 5530, 5530A + */ + +/************************************************************************ + * Suspend Modulation - Definitions * + ************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* PCI config registers, all at F0 */ +#define PCI_PMER1 0x80 /* power management enable register 1 */ +#define PCI_PMER2 0x81 /* power management enable register 2 */ +#define PCI_PMER3 0x82 /* power management enable register 3 */ +#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */ +#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */ +#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */ +#define PCI_MODON 0x95 /* suspend modulation ON counter register */ +#define PCI_SUSCFG 0x96 /* suspend configuration register */ + +/* PMER1 bits */ +#define GPM (1<<0) /* global power management */ +#define GIT (1<<1) /* globally enable PM device idle timers */ +#define GTR (1<<2) /* globally enable IO traps */ +#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */ +#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */ + +/* SUSCFG bits */ +#define SUSMOD (1<<0) /* enable/disable suspend modulation */ +/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */ +#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */ + /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */ +#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */ +/* the below is supported only with cs5530A */ +#define PWRSVE_ISA (1<<3) /* stop ISA clock */ +#define PWRSVE (1<<4) /* active idle */ + +struct gxfreq_params { + u8 on_duration; + u8 off_duration; + u8 pci_suscfg; + u8 pci_pmer1; + u8 pci_pmer2; + struct pci_dev *cs55x0; +}; + +static struct gxfreq_params *gx_params; +static int stock_freq; + +/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */ +static int pci_busclk; +module_param(pci_busclk, int, 0444); + +/* maximum duration for which the cpu may be suspended + * (32us * MAX_DURATION). If no parameter is given, this defaults + * to 255. + * Note that this leads to a maximum of 8 ms(!) where the CPU clock + * is suspended -- processing power is just 0.39% of what it used to be, + * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */ +static int max_duration = 255; +module_param(max_duration, int, 0444); + +/* For the default policy, we want at least some processing power + * - let's say 5%. 
(min = maxfreq / POLICY_MIN_DIV) + */ +#define POLICY_MIN_DIV 20 + + +/** + * we can detect a core multipiler from dir0_lsb + * from GX1 datasheet p.56, + * MULT[3:0]: + * 0000 = SYSCLK multiplied by 4 (test only) + * 0001 = SYSCLK multiplied by 10 + * 0010 = SYSCLK multiplied by 4 + * 0011 = SYSCLK multiplied by 6 + * 0100 = SYSCLK multiplied by 9 + * 0101 = SYSCLK multiplied by 5 + * 0110 = SYSCLK multiplied by 7 + * 0111 = SYSCLK multiplied by 8 + * of 33.3MHz + **/ +static int gx_freq_mult[16] = { + 4, 10, 4, 6, 9, 5, 7, 8, + 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +/**************************************************************** + * Low Level chipset interface * + ****************************************************************/ +static struct pci_device_id gx_chipset_tbl[] __initdata = { + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, + { 0, }, +}; + +static void gx_write_byte(int reg, int value) +{ + pci_write_config_byte(gx_params->cs55x0, reg, value); +} + +/** + * gx_detect_chipset: + * + **/ +static __init struct pci_dev *gx_detect_chipset(void) +{ + struct pci_dev *gx_pci = NULL; + + /* check if CPU is a MediaGX or a Geode. */ + if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) && + (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) { + pr_debug("error: no MediaGX/Geode processor found!\n"); + return NULL; + } + + /* detect which companion chip is used */ + for_each_pci_dev(gx_pci) { + if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) + return gx_pci; + } + + pr_debug("error: no supported chipset found!\n"); + return NULL; +} + +/** + * gx_get_cpuspeed: + * + * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi + * Geode CPU runs. + */ +static unsigned int gx_get_cpuspeed(unsigned int cpu) +{ + if ((gx_params->pci_suscfg & SUSMOD) == 0) + return stock_freq; + + return (stock_freq * gx_params->off_duration) + / (gx_params->on_duration + gx_params->off_duration); +} + +/** + * gx_validate_speed: + * determine current cpu speed + * + **/ + +static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, + u8 *off_duration) +{ + unsigned int i; + u8 tmp_on, tmp_off; + int old_tmp_freq = stock_freq; + int tmp_freq; + + *off_duration = 1; + *on_duration = 0; + + for (i = max_duration; i > 0; i--) { + tmp_off = ((khz * i) / stock_freq) & 0xff; + tmp_on = i - tmp_off; + tmp_freq = (stock_freq * tmp_off) / i; + /* if this relation is closer to khz, use this. If it's equal, + * prefer it, too - lower latency */ + if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) { + *on_duration = tmp_on; + *off_duration = tmp_off; + old_tmp_freq = tmp_freq; + } + } + + return old_tmp_freq; +} + + +/** + * gx_set_cpuspeed: + * set cpu speed in khz. 
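
Before gx_set_cpuspeed(): gx_validate_speed() above is just a search over the F_eff relation from the header comment. Plugging in assumed numbers (a 200 MHz part, the default 255-interval duration, a 100 MHz request) makes the rounding visible:

#include <stdio.h>

int main(void)
{
        int stock_khz = 200000; /* assumed 200 MHz MediaGX */
        int duration = 255;     /* max_duration default */
        int target_khz = 100000;

        /* Same per-candidate math as gx_validate_speed(). */
        int off = ((target_khz * duration) / stock_khz) & 0xff; /* 127 */
        int on = duration - off;                                /* 128 */
        int eff = (stock_khz * off) / duration;                 /* 99607 kHz */

        printf("off=%d on=%d eff=%d kHz\n", off, on, eff);
        return 0;
}
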
+ **/ + +static void gx_set_cpuspeed(unsigned int khz) +{ + u8 suscfg, pmer1; + unsigned int new_khz; + unsigned long flags; + struct cpufreq_freqs freqs; + + freqs.cpu = 0; + freqs.old = gx_get_cpuspeed(0); + + new_khz = gx_validate_speed(khz, &gx_params->on_duration, + &gx_params->off_duration); + + freqs.new = new_khz; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + local_irq_save(flags); + + + + if (new_khz != stock_freq) { + /* if new khz == 100% of CPU speed, it is special case */ + switch (gx_params->cs55x0->device) { + case PCI_DEVICE_ID_CYRIX_5530_LEGACY: + pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP; + /* FIXME: need to test other values -- Zwane,Miura */ + /* typical 2 to 4ms */ + gx_write_byte(PCI_IRQTC, 4); + /* typical 50 to 100ms */ + gx_write_byte(PCI_VIDTC, 100); + gx_write_byte(PCI_PMER1, pmer1); + + if (gx_params->cs55x0->revision < 0x10) { + /* CS5530(rev 1.2, 1.3) */ + suscfg = gx_params->pci_suscfg|SUSMOD; + } else { + /* CS5530A,B.. */ + suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE; + } + break; + case PCI_DEVICE_ID_CYRIX_5520: + case PCI_DEVICE_ID_CYRIX_5510: + suscfg = gx_params->pci_suscfg | SUSMOD; + break; + default: + local_irq_restore(flags); + pr_debug("fatal: try to set unknown chipset.\n"); + return; + } + } else { + suscfg = gx_params->pci_suscfg & ~(SUSMOD); + gx_params->off_duration = 0; + gx_params->on_duration = 0; + pr_debug("suspend modulation disabled: cpu runs 100%% speed.\n"); + } + + gx_write_byte(PCI_MODOFF, gx_params->off_duration); + gx_write_byte(PCI_MODON, gx_params->on_duration); + + gx_write_byte(PCI_SUSCFG, suscfg); + pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg); + + local_irq_restore(flags); + + gx_params->pci_suscfg = suscfg; + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", + gx_params->on_duration * 32, gx_params->off_duration * 32); + pr_debug("suspend modulation w/ clock speed: %d kHz.\n", freqs.new); +} + +/**************************************************************** + * High level functions * + ****************************************************************/ + +/* + * cpufreq_gx_verify: test if frequency range is valid + * + * This function checks if a given frequency range in kHz is valid + * for the hardware supported by the driver. + */ + +static int cpufreq_gx_verify(struct cpufreq_policy *policy) +{ + unsigned int tmp_freq = 0; + u8 tmp1, tmp2; + + if (!stock_freq || !policy) + return -EINVAL; + + policy->cpu = 0; + cpufreq_verify_within_limits(policy, (stock_freq / max_duration), + stock_freq); + + /* it needs to be assured that at least one supported frequency is + * within policy->min and policy->max. If it is not, policy->max + * needs to be increased until one freuqency is supported. + * policy->min may not be decreased, though. This way we guarantee a + * specific processing capacity. 
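
Distilled, the guarantee documented above is: quantize policy->min to a reachable step, and if nothing reachable lies inside [min, max], widen max upward rather than lower min. A sketch — example_quantize() is a hypothetical stand-in for gx_validate_speed():

#include <linux/cpufreq.h>

/* Hypothetical: snap to the nearest duty-cycle step the chip can do. */
static unsigned int example_quantize(unsigned int khz)
{
        return khz - khz % 1000;        /* placeholder quantizer */
}

static int example_verify(struct cpufreq_policy *policy, unsigned int step)
{
        unsigned int f = example_quantize(policy->min);

        if (f < policy->min)            /* never lower min... */
                f += step;
        policy->min = f;
        if (policy->min > policy->max)  /* ...raise max instead */
                policy->max = policy->min;

        cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
                                     policy->cpuinfo.max_freq);
        return 0;
}
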
+ */ + tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2); + if (tmp_freq < policy->min) + tmp_freq += stock_freq / max_duration; + policy->min = tmp_freq; + if (policy->min > policy->max) + policy->max = tmp_freq; + tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2); + if (tmp_freq > policy->max) + tmp_freq -= stock_freq / max_duration; + policy->max = tmp_freq; + if (policy->max < policy->min) + policy->max = policy->min; + cpufreq_verify_within_limits(policy, (stock_freq / max_duration), + stock_freq); + + return 0; +} + +/* + * cpufreq_gx_target: + * + */ +static int cpufreq_gx_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + u8 tmp1, tmp2; + unsigned int tmp_freq; + + if (!stock_freq || !policy) + return -EINVAL; + + policy->cpu = 0; + + tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2); + while (tmp_freq < policy->min) { + tmp_freq += stock_freq / max_duration; + tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); + } + while (tmp_freq > policy->max) { + tmp_freq -= stock_freq / max_duration; + tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); + } + + gx_set_cpuspeed(tmp_freq); + + return 0; +} + +static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int maxfreq, curfreq; + + if (!policy || policy->cpu != 0) + return -ENODEV; + + /* determine maximum frequency */ + if (pci_busclk) + maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; + else if (cpu_khz) + maxfreq = cpu_khz; + else + maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f]; + + stock_freq = maxfreq; + curfreq = gx_get_cpuspeed(0); + + pr_debug("cpu max frequency is %d.\n", maxfreq); + pr_debug("cpu current frequency is %dkHz.\n", curfreq); + + /* setup basic struct for cpufreq API */ + policy->cpu = 0; + + if (max_duration < POLICY_MIN_DIV) + policy->min = maxfreq / max_duration; + else + policy->min = maxfreq / POLICY_MIN_DIV; + policy->max = maxfreq; + policy->cur = curfreq; + policy->cpuinfo.min_freq = maxfreq / max_duration; + policy->cpuinfo.max_freq = maxfreq; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + + return 0; +} + +/* + * cpufreq_gx_init: + * MediaGX/Geode GX initialize cpufreq driver + */ +static struct cpufreq_driver gx_suspmod_driver = { + .get = gx_get_cpuspeed, + .verify = cpufreq_gx_verify, + .target = cpufreq_gx_target, + .init = cpufreq_gx_cpu_init, + .name = "gx-suspmod", + .owner = THIS_MODULE, +}; + +static int __init cpufreq_gx_init(void) +{ + int ret; + struct gxfreq_params *params; + struct pci_dev *gx_pci; + + /* Test if we have the right hardware */ + gx_pci = gx_detect_chipset(); + if (gx_pci == NULL) + return -ENODEV; + + /* check whether module parameters are sane */ + if (max_duration > 0xff) + max_duration = 0xff; + + pr_debug("geode suspend modulation available.\n"); + + params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL); + if (params == NULL) + return -ENOMEM; + + params->cs55x0 = gx_pci; + gx_params = params; + + /* keep cs55x0 configurations */ + pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg)); + pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1)); + pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2)); + pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration)); + pci_read_config_byte(params->cs55x0, PCI_MODOFF, + &(params->off_duration)); + + ret = cpufreq_register_driver(&gx_suspmod_driver); + if (ret) { + kfree(params); + return ret; /* register error! 
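
(One detail of cpufreq_gx_init() above deserves emphasis: the driver snapshots every CS55x0 register it will later rewrite, so the modulation state present at load time stays recoverable. The save pattern, reduced to a sketch using the patch's own register offsets:

#include <linux/pci.h>

struct example_saved_cfg {
        u8 suscfg, pmer1, on, off;
};

static void example_save_cfg(struct pci_dev *dev, struct example_saved_cfg *s)
{
        pci_read_config_byte(dev, 0x96, &s->suscfg);    /* PCI_SUSCFG */
        pci_read_config_byte(dev, 0x80, &s->pmer1);     /* PCI_PMER1 */
        pci_read_config_byte(dev, 0x95, &s->on);        /* PCI_MODON */
        pci_read_config_byte(dev, 0x94, &s->off);       /* PCI_MODOFF */
}

)
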
*/ + } + + return 0; +} + +static void __exit cpufreq_gx_exit(void) +{ + cpufreq_unregister_driver(&gx_suspmod_driver); + pci_dev_put(gx_params->cs55x0); + kfree(gx_params); +} + +MODULE_AUTHOR("Hiroshi Miura "); +MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode"); +MODULE_LICENSE("GPL"); + +module_init(cpufreq_gx_init); +module_exit(cpufreq_gx_exit); + diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c new file mode 100644 index 00000000000..f47d26e2a13 --- /dev/null +++ b/drivers/cpufreq/longhaul.c @@ -0,0 +1,1024 @@ +/* + * (C) 2001-2004 Dave Jones. + * (C) 2002 Padraig Brady. + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by VIA. + * + * VIA have currently 3 different versions of Longhaul. + * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147. + * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0. + * Version 2 of longhaul is backward compatible with v1, but adds + * LONGHAUL MSR for purpose of both frequency and voltage scaling. + * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C). + * Version 3 of longhaul got renamed to Powersaver and redesigned + * to use only the POWERSAVER MSR at 0x110a. + * It is present in Ezra-T (C5M), Nehemiah (C5X) and above. + * It's pretty much the same feature wise to longhaul v2, though + * there is provision for scaling FSB too, but this doesn't work + * too well in practice so we don't even try to use this. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "longhaul.h" + +#define PFX "longhaul: " + +#define TYPE_LONGHAUL_V1 1 +#define TYPE_LONGHAUL_V2 2 +#define TYPE_POWERSAVER 3 + +#define CPU_SAMUEL 1 +#define CPU_SAMUEL2 2 +#define CPU_EZRA 3 +#define CPU_EZRA_T 4 +#define CPU_NEHEMIAH 5 +#define CPU_NEHEMIAH_C 6 + +/* Flags */ +#define USE_ACPI_C3 (1 << 1) +#define USE_NORTHBRIDGE (1 << 2) + +static int cpu_model; +static unsigned int numscales = 16; +static unsigned int fsb; + +static const struct mV_pos *vrm_mV_table; +static const unsigned char *mV_vrm_table; + +static unsigned int highest_speed, lowest_speed; /* kHz */ +static unsigned int minmult, maxmult; +static int can_scale_voltage; +static struct acpi_processor *pr; +static struct acpi_processor_cx *cx; +static u32 acpi_regs_addr; +static u8 longhaul_flags; +static unsigned int longhaul_index; + +/* Module parameters */ +static int scale_voltage; +static int disable_acpi_c3; +static int revid_errata; + + +/* Clock ratios multiplied by 10 */ +static int mults[32]; +static int eblcr[32]; +static int longhaul_version; +static struct cpufreq_frequency_table *longhaul_table; + +static char speedbuffer[8]; + +static char *print_speed(int speed) +{ + if (speed < 1000) { + snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed); + return speedbuffer; + } + + if (speed%1000 == 0) + snprintf(speedbuffer, sizeof(speedbuffer), + "%dGHz", speed/1000); + else + snprintf(speedbuffer, sizeof(speedbuffer), + "%d.%dGHz", speed/1000, (speed%1000)/100); + + return speedbuffer; +} + + +static unsigned int calc_speed(int mult) +{ + int khz; + khz = (mult/10)*fsb; + if (mult%10) + khz += fsb/2; + khz *= 1000; + return khz; +} + + +static int longhaul_get_cpu_mult(void) +{ + unsigned long invalue = 0, lo, hi; + + rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi); + invalue = (lo & 
(1<<22|1<<23|1<<24|1<<25))>>22; + if (longhaul_version == TYPE_LONGHAUL_V2 || + longhaul_version == TYPE_POWERSAVER) { + if (lo & (1<<27)) + invalue += 16; + } + return eblcr[invalue]; +} + +/* For processor with BCR2 MSR */ + +static void do_longhaul1(unsigned int mults_index) +{ + union msr_bcr2 bcr2; + + rdmsrl(MSR_VIA_BCR2, bcr2.val); + /* Enable software clock multiplier */ + bcr2.bits.ESOFTBF = 1; + bcr2.bits.CLOCKMUL = mults_index & 0xff; + + /* Sync to timer tick */ + safe_halt(); + /* Change frequency on next halt or sleep */ + wrmsrl(MSR_VIA_BCR2, bcr2.val); + /* Invoke transition */ + ACPI_FLUSH_CPU_CACHE(); + halt(); + + /* Disable software clock multiplier */ + local_irq_disable(); + rdmsrl(MSR_VIA_BCR2, bcr2.val); + bcr2.bits.ESOFTBF = 0; + wrmsrl(MSR_VIA_BCR2, bcr2.val); +} + +/* For processor with Longhaul MSR */ + +static void do_powersaver(int cx_address, unsigned int mults_index, + unsigned int dir) +{ + union msr_longhaul longhaul; + u32 t; + + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Setup new frequency */ + if (!revid_errata) + longhaul.bits.RevisionKey = longhaul.bits.RevisionID; + else + longhaul.bits.RevisionKey = 0; + longhaul.bits.SoftBusRatio = mults_index & 0xf; + longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4; + /* Setup new voltage */ + if (can_scale_voltage) + longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f; + /* Sync to timer tick */ + safe_halt(); + /* Raise voltage if necessary */ + if (can_scale_voltage && dir) { + longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + } + + /* Change frequency on next halt or sleep */ + longhaul.bits.EnableSoftBusRatio = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + /* Disable bus ratio bit */ + longhaul.bits.EnableSoftBusRatio = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + + /* Reduce voltage if necessary */ + if (can_scale_voltage && !dir) { + longhaul.bits.EnableSoftVID = 1; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + /* Change voltage */ + if (!cx_address) { + ACPI_FLUSH_CPU_CACHE(); + halt(); + } else { + ACPI_FLUSH_CPU_CACHE(); + /* Invoke C3 */ + inb(cx_address); + /* Dummy op - must do something useless after P_LVL3 + * read */ + t = inl(acpi_gbl_FADT.xpm_timer_block.address); + } + longhaul.bits.EnableSoftVID = 0; + wrmsrl(MSR_VIA_LONGHAUL, longhaul.val); + } +} + +/** + * longhaul_set_cpu_frequency() + * @mults_index : bitpattern of the new multiplier. + * + * Sets a new clock ratio. 
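+ * + * The low 5 bits of each table "index" select an entry in mults[]; + * when voltage scaling is enabled, bits 8..12 carry the target VID, + * which do_powersaver() copies into the SoftVID field. For example, + * an index of 0x0905 requests mults[5] together with VID code 9.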
+ */ + +static void longhaul_setstate(unsigned int table_index) +{ + unsigned int mults_index; + int speed, mult; + struct cpufreq_freqs freqs; + unsigned long flags; + unsigned int pic1_mask, pic2_mask; + u16 bm_status = 0; + u32 bm_timeout = 1000; + unsigned int dir = 0; + + mults_index = longhaul_table[table_index].index; + /* Safety precautions */ + mult = mults[mults_index & 0x1f]; + if (mult == -1) + return; + speed = calc_speed(mult); + if ((speed > highest_speed) || (speed < lowest_speed)) + return; + /* Voltage transition before frequency transition? */ + if (can_scale_voltage && longhaul_index < table_index) + dir = 1; + + freqs.old = calc_speed(longhaul_get_cpu_mult()); + freqs.new = speed; + freqs.cpu = 0; /* longhaul.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", + fsb, mult/10, mult%10, print_speed(speed/1000)); +retry_loop: + preempt_disable(); + local_irq_save(flags); + + pic2_mask = inb(0xA1); + pic1_mask = inb(0x21); /* works on C3. save mask. */ + outb(0xFF, 0xA1); /* Overkill */ + outb(0xFE, 0x21); /* TMR0 only */ + + /* Wait while PCI bus is busy. */ + if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE + || ((pr != NULL) && pr->flags.bm_control))) { + bm_status = inw(acpi_regs_addr); + bm_status &= 1 << 4; + while (bm_status && bm_timeout) { + outw(1 << 4, acpi_regs_addr); + bm_timeout--; + bm_status = inw(acpi_regs_addr); + bm_status &= 1 << 4; + } + } + + if (longhaul_flags & USE_NORTHBRIDGE) { + /* Disable AGP and PCI arbiters */ + outb(3, 0x22); + } else if ((pr != NULL) && pr->flags.bm_control) { + /* Disable bus master arbitration */ + acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1); + } + switch (longhaul_version) { + + /* + * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B]) + * Software controlled multipliers only. + */ + case TYPE_LONGHAUL_V1: + do_longhaul1(mults_index); + break; + + /* + * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C] + * + * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N]) + * Nehemiah can do FSB scaling too, but this has never been proven + * to work in practice. + */ + case TYPE_LONGHAUL_V2: + case TYPE_POWERSAVER: + if (longhaul_flags & USE_ACPI_C3) { + /* Don't allow wakeup */ + acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0); + do_powersaver(cx->address, mults_index, dir); + } else { + do_powersaver(0, mults_index, dir); + } + break; + } + + if (longhaul_flags & USE_NORTHBRIDGE) { + /* Enable arbiters */ + outb(0, 0x22); + } else if ((pr != NULL) && pr->flags.bm_control) { + /* Enable bus master arbitration */ + acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0); + } + outb(pic2_mask, 0xA1); /* restore mask */ + outb(pic1_mask, 0x21); + + local_irq_restore(flags); + preempt_enable(); + + freqs.new = calc_speed(longhaul_get_cpu_mult()); + /* Check if requested frequency is set. */ + if (unlikely(freqs.new != speed)) { + printk(KERN_INFO PFX "Failed to set requested frequency!\n"); + /* Revision ID = 1 but processor is expecting revision key + * equal to 0. Jumpers at the bottom of processor will change + * multiplier and FSB, but will not change bits in Longhaul + * MSR nor enable voltage scaling. */ + if (!revid_errata) { + printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" " + "option.\n"); + revid_errata = 1; + msleep(200); + goto retry_loop; + } + /* Why ACPI C3 sometimes doesn't work is a mystery for me. + * But it does happen. 
Processor is entering ACPI C3 state, + * but it doesn't change frequency. I tried poking various + * bits in northbridge registers, but without success. */ + if (longhaul_flags & USE_ACPI_C3) { + printk(KERN_INFO PFX "Disabling ACPI C3 support.\n"); + longhaul_flags &= ~USE_ACPI_C3; + if (revid_errata) { + printk(KERN_INFO PFX "Disabling \"Ignore " + "Revision ID\" option.\n"); + revid_errata = 0; + } + msleep(200); + goto retry_loop; + } + /* This shouldn't happen. Longhaul ver. 2 was reported not + * working on processors without voltage scaling, but with + * RevID = 1. RevID errata will make things right. Just + * to be 100% sure. */ + if (longhaul_version == TYPE_LONGHAUL_V2) { + printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n"); + longhaul_version = TYPE_LONGHAUL_V1; + msleep(200); + goto retry_loop; + } + } + /* Report true CPU frequency */ + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + if (!bm_timeout) + printk(KERN_INFO PFX "Warning: Timeout while waiting for " + "idle PCI bus.\n"); +} + +/* + * Centaur decided to make life a little more tricky. + * Only longhaul v1 is allowed to read EBLCR BSEL[0:1]. + * Samuel2 and above have to try and guess what the FSB is. + * We do this by assuming we booted at maximum multiplier, and interpolate + * between that value multiplied by possible FSBs and cpu_mhz which + * was calculated at boot time. Really ugly, but no other way to do this. + */ + +#define ROUNDING 0xf + +static int guess_fsb(int mult) +{ + int speed = cpu_khz / 1000; + int i; + int speeds[] = { 666, 1000, 1333, 2000 }; + int f_max, f_min; + + for (i = 0; i < 4; i++) { + f_max = ((speeds[i] * mult) + 50) / 100; + f_max += (ROUNDING / 2); + f_min = f_max - ROUNDING; + if ((speed <= f_max) && (speed >= f_min)) + return speeds[i] / 10; + } + return 0; +} + + +static int __cpuinit longhaul_get_ranges(void) +{ + unsigned int i, j, k = 0; + unsigned int ratio; + int mult; + + /* Get current frequency */ + mult = longhaul_get_cpu_mult(); + if (mult == -1) { + printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n"); + return -EINVAL; + } + fsb = guess_fsb(mult); + if (fsb == 0) { + printk(KERN_INFO PFX "Invalid (reserved) FSB!\n"); + return -EINVAL; + } + /* Get max multiplier - as we always did. + * Longhaul MSR is useful only when voltage scaling is enabled. + * C3 is booting at max anyway. */ + maxmult = mult; + /* Get min multiplier */ + switch (cpu_model) { + case CPU_NEHEMIAH: + minmult = 50; + break; + case CPU_NEHEMIAH_C: + minmult = 40; + break; + default: + minmult = 30; + break; + } + + pr_debug("MinMult:%d.%dx MaxMult:%d.%dx\n", + minmult/10, minmult%10, maxmult/10, maxmult%10); + + highest_speed = calc_speed(maxmult); + lowest_speed = calc_speed(minmult); + pr_debug("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb, + print_speed(lowest_speed/1000), + print_speed(highest_speed/1000)); + + if (lowest_speed == highest_speed) { + printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n"); + return -EINVAL; + } + if (lowest_speed > highest_speed) { + printk(KERN_INFO PFX "nonsense! 
lowest (%d > %d) !\n", + lowest_speed, highest_speed); + return -EINVAL; + } + + longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table), + GFP_KERNEL); + if (!longhaul_table) + return -ENOMEM; + + for (j = 0; j < numscales; j++) { + ratio = mults[j]; + if (ratio == -1) + continue; + if (ratio > maxmult || ratio < minmult) + continue; + longhaul_table[k].frequency = calc_speed(ratio); + longhaul_table[k].index = j; + k++; + } + if (k <= 1) { + kfree(longhaul_table); + return -ENODEV; + } + /* Sort */ + for (j = 0; j < k - 1; j++) { + unsigned int min_f, min_i; + min_f = longhaul_table[j].frequency; + min_i = j; + for (i = j + 1; i < k; i++) { + if (longhaul_table[i].frequency < min_f) { + min_f = longhaul_table[i].frequency; + min_i = i; + } + } + if (min_i != j) { + swap(longhaul_table[j].frequency, + longhaul_table[min_i].frequency); + swap(longhaul_table[j].index, + longhaul_table[min_i].index); + } + } + + longhaul_table[k].frequency = CPUFREQ_TABLE_END; + + /* Find index we are running on */ + for (j = 0; j < k; j++) { + if (mults[longhaul_table[j].index & 0x1f] == mult) { + longhaul_index = j; + break; + } + } + return 0; +} + + +static void __cpuinit longhaul_setup_voltagescaling(void) +{ + union msr_longhaul longhaul; + struct mV_pos minvid, maxvid, vid; + unsigned int j, speed, pos, kHz_step, numvscales; + int min_vid_speed; + + rdmsrl(MSR_VIA_LONGHAUL, longhaul.val); + if (!(longhaul.bits.RevisionID & 1)) { + printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n"); + return; + } + + if (!longhaul.bits.VRMRev) { + printk(KERN_INFO PFX "VRM 8.5\n"); + vrm_mV_table = &vrm85_mV[0]; + mV_vrm_table = &mV_vrm85[0]; + } else { + printk(KERN_INFO PFX "Mobile VRM\n"); + if (cpu_model < CPU_NEHEMIAH) + return; + vrm_mV_table = &mobilevrm_mV[0]; + mV_vrm_table = &mV_mobilevrm[0]; + } + + minvid = vrm_mV_table[longhaul.bits.MinimumVID]; + maxvid = vrm_mV_table[longhaul.bits.MaximumVID]; + + if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) { + printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. " + "Voltage scaling disabled.\n", + minvid.mV/1000, minvid.mV%1000, + maxvid.mV/1000, maxvid.mV%1000); + return; + } + + if (minvid.mV == maxvid.mV) { + printk(KERN_INFO PFX "Claims to support voltage scaling but " + "min & max are both %d.%03d. 
" + "Voltage scaling disabled\n", + maxvid.mV/1000, maxvid.mV%1000); + return; + } + + /* How many voltage steps*/ + numvscales = maxvid.pos - minvid.pos + 1; + printk(KERN_INFO PFX + "Max VID=%d.%03d " + "Min VID=%d.%03d, " + "%d possible voltage scales\n", + maxvid.mV/1000, maxvid.mV%1000, + minvid.mV/1000, minvid.mV%1000, + numvscales); + + /* Calculate max frequency at min voltage */ + j = longhaul.bits.MinMHzBR; + if (longhaul.bits.MinMHzBR4) + j += 16; + min_vid_speed = eblcr[j]; + if (min_vid_speed == -1) + return; + switch (longhaul.bits.MinMHzFSB) { + case 0: + min_vid_speed *= 13333; + break; + case 1: + min_vid_speed *= 10000; + break; + case 3: + min_vid_speed *= 6666; + break; + default: + return; + break; + } + if (min_vid_speed >= highest_speed) + return; + /* Calculate kHz for one voltage step */ + kHz_step = (highest_speed - min_vid_speed) / numvscales; + + j = 0; + while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) { + speed = longhaul_table[j].frequency; + if (speed > min_vid_speed) + pos = (speed - min_vid_speed) / kHz_step + minvid.pos; + else + pos = minvid.pos; + longhaul_table[j].index |= mV_vrm_table[pos] << 8; + vid = vrm_mV_table[mV_vrm_table[pos]]; + printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n", + speed, j, vid.mV); + j++; + } + + can_scale_voltage = 1; + printk(KERN_INFO PFX "Voltage scaling enabled.\n"); +} + + +static int longhaul_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, longhaul_table); +} + + +static int longhaul_target(struct cpufreq_policy *policy, + unsigned int target_freq, unsigned int relation) +{ + unsigned int table_index = 0; + unsigned int i; + unsigned int dir = 0; + u8 vid, current_vid; + + if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq, + relation, &table_index)) + return -EINVAL; + + /* Don't set same frequency again */ + if (longhaul_index == table_index) + return 0; + + if (!can_scale_voltage) + longhaul_setstate(table_index); + else { + /* On test system voltage transitions exceeding single + * step up or down were turning motherboard off. Both + * "ondemand" and "userspace" are unsafe. C7 is doing + * this in hardware, C3 is old and we need to do this + * in software. 
*/ + i = longhaul_index; + current_vid = (longhaul_table[longhaul_index].index >> 8); + current_vid &= 0x1f; + if (table_index > longhaul_index) + dir = 1; + while (i != table_index) { + vid = (longhaul_table[i].index >> 8) & 0x1f; + if (vid != current_vid) { + longhaul_setstate(i); + current_vid = vid; + msleep(200); + } + if (dir) + i++; + else + i--; + } + longhaul_setstate(table_index); + } + longhaul_index = table_index; + return 0; +} + + +static unsigned int longhaul_get(unsigned int cpu) +{ + if (cpu) + return 0; + return calc_speed(longhaul_get_cpu_mult()); +} + +static acpi_status longhaul_walk_callback(acpi_handle obj_handle, + u32 nesting_level, + void *context, void **return_value) +{ + struct acpi_device *d; + + if (acpi_bus_get_device(obj_handle, &d)) + return 0; + + *return_value = acpi_driver_data(d); + return 1; +} + +/* VIA don't support PM2 reg, but have something similar */ +static int enable_arbiter_disable(void) +{ + struct pci_dev *dev; + int status = 1; + int reg; + u8 pci_cmd; + + /* Find PLE133 host bridge */ + reg = 0x78; + dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0, + NULL); + /* Find PM133/VT8605 host bridge */ + if (dev == NULL) + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_8605_0, NULL); + /* Find CLE266 host bridge */ + if (dev == NULL) { + reg = 0x76; + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_862X_0, NULL); + /* Find CN400 V-Link host bridge */ + if (dev == NULL) + dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL); + } + if (dev != NULL) { + /* Enable access to port 0x22 */ + pci_read_config_byte(dev, reg, &pci_cmd); + if (!(pci_cmd & 1<<7)) { + pci_cmd |= 1<<7; + pci_write_config_byte(dev, reg, pci_cmd); + pci_read_config_byte(dev, reg, &pci_cmd); + if (!(pci_cmd & 1<<7)) { + printk(KERN_ERR PFX + "Can't enable access to port 0x22.\n"); + status = 0; + } + } + pci_dev_put(dev); + return status; + } + return 0; +} + +static int longhaul_setup_southbridge(void) +{ + struct pci_dev *dev; + u8 pci_cmd; + + /* Find VT8235 southbridge */ + dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL); + if (dev == NULL) + /* Find VT8237 southbridge */ + dev = pci_get_device(PCI_VENDOR_ID_VIA, + PCI_DEVICE_ID_VIA_8237, NULL); + if (dev != NULL) { + /* Set transition time to max */ + pci_read_config_byte(dev, 0xec, &pci_cmd); + pci_cmd &= ~(1 << 2); + pci_write_config_byte(dev, 0xec, pci_cmd); + pci_read_config_byte(dev, 0xe4, &pci_cmd); + pci_cmd &= ~(1 << 7); + pci_write_config_byte(dev, 0xe4, pci_cmd); + pci_read_config_byte(dev, 0xe5, &pci_cmd); + pci_cmd |= 1 << 7; + pci_write_config_byte(dev, 0xe5, pci_cmd); + /* Get address of ACPI registers block*/ + pci_read_config_byte(dev, 0x81, &pci_cmd); + if (pci_cmd & 1 << 7) { + pci_read_config_dword(dev, 0x88, &acpi_regs_addr); + acpi_regs_addr &= 0xff00; + printk(KERN_INFO PFX "ACPI I/O at 0x%x\n", + acpi_regs_addr); + } + + pci_dev_put(dev); + return 1; + } + return 0; +} + +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + char *cpuname = NULL; + int ret; + u32 lo, hi; + + /* Check what we have on this motherboard */ + switch (c->x86_model) { + case 6: + cpu_model = CPU_SAMUEL; + cpuname = "C3 'Samuel' [C5A]"; + longhaul_version = TYPE_LONGHAUL_V1; + memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); + memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr)); + break; + + case 7: + switch (c->x86_mask) { + case 0: + longhaul_version = TYPE_LONGHAUL_V1; + cpu_model = CPU_SAMUEL2; + 
cpuname = "C3 'Samuel 2' [C5B]"; + /* Note, this is not a typo, early Samuel2's had + * Samuel1 ratios. */ + memcpy(mults, samuel1_mults, sizeof(samuel1_mults)); + memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); + break; + case 1 ... 15: + longhaul_version = TYPE_LONGHAUL_V2; + if (c->x86_mask < 8) { + cpu_model = CPU_SAMUEL2; + cpuname = "C3 'Samuel 2' [C5B]"; + } else { + cpu_model = CPU_EZRA; + cpuname = "C3 'Ezra' [C5C]"; + } + memcpy(mults, ezra_mults, sizeof(ezra_mults)); + memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr)); + break; + } + break; + + case 8: + cpu_model = CPU_EZRA_T; + cpuname = "C3 'Ezra-T' [C5M]"; + longhaul_version = TYPE_POWERSAVER; + numscales = 32; + memcpy(mults, ezrat_mults, sizeof(ezrat_mults)); + memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr)); + break; + + case 9: + longhaul_version = TYPE_POWERSAVER; + numscales = 32; + memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults)); + memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr)); + switch (c->x86_mask) { + case 0 ... 1: + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah A' [C5XLOE]"; + break; + case 2 ... 4: + cpu_model = CPU_NEHEMIAH; + cpuname = "C3 'Nehemiah B' [C5XLOH]"; + break; + case 5 ... 15: + cpu_model = CPU_NEHEMIAH_C; + cpuname = "C3 'Nehemiah C' [C5P]"; + break; + } + break; + + default: + cpuname = "Unknown"; + break; + } + /* Check Longhaul ver. 2 */ + if (longhaul_version == TYPE_LONGHAUL_V2) { + rdmsr(MSR_VIA_LONGHAUL, lo, hi); + if (lo == 0 && hi == 0) + /* Looks like MSR isn't present */ + longhaul_version = TYPE_LONGHAUL_V1; + } + + printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname); + switch (longhaul_version) { + case TYPE_LONGHAUL_V1: + case TYPE_LONGHAUL_V2: + printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version); + break; + case TYPE_POWERSAVER: + printk(KERN_CONT "Powersaver supported.\n"); + break; + }; + + /* Doesn't hurt */ + longhaul_setup_southbridge(); + + /* Find ACPI data for processor */ + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, &longhaul_walk_callback, NULL, + NULL, (void *)&pr); + + /* Check ACPI support for C3 state */ + if (pr != NULL && longhaul_version == TYPE_POWERSAVER) { + cx = &pr->power.states[ACPI_STATE_C3]; + if (cx->address > 0 && cx->latency <= 1000) + longhaul_flags |= USE_ACPI_C3; + } + /* Disable if it isn't working */ + if (disable_acpi_c3) + longhaul_flags &= ~USE_ACPI_C3; + /* Check if northbridge is friendly */ + if (enable_arbiter_disable()) + longhaul_flags |= USE_NORTHBRIDGE; + + /* Check ACPI support for bus master arbiter disable */ + if (!(longhaul_flags & USE_ACPI_C3 + || longhaul_flags & USE_NORTHBRIDGE) + && ((pr == NULL) || !(pr->flags.bm_control))) { + printk(KERN_ERR PFX + "No ACPI support. 
Unsupported northbridge.\n"); + return -ENODEV; + } + + if (longhaul_flags & USE_NORTHBRIDGE) + printk(KERN_INFO PFX "Using northbridge support.\n"); + if (longhaul_flags & USE_ACPI_C3) + printk(KERN_INFO PFX "Using ACPI support.\n"); + + ret = longhaul_get_ranges(); + if (ret != 0) + return ret; + + if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0)) + longhaul_setup_voltagescaling(); + + policy->cpuinfo.transition_latency = 200000; /* nsec */ + policy->cur = calc_speed(longhaul_get_cpu_mult()); + + ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table); + if (ret) + return ret; + + cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu); + + return 0; +} + +static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static struct freq_attr *longhaul_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver longhaul_driver = { + .verify = longhaul_verify, + .target = longhaul_target, + .get = longhaul_get, + .init = longhaul_cpu_init, + .exit = __devexit_p(longhaul_cpu_exit), + .name = "longhaul", + .owner = THIS_MODULE, + .attr = longhaul_attr, +}; + + +static int __init longhaul_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6) + return -ENODEV; + +#ifdef CONFIG_SMP + if (num_online_cpus() > 1) { + printk(KERN_ERR PFX "More than 1 CPU detected, " + "longhaul disabled.\n"); + return -ENODEV; + } +#endif +#ifdef CONFIG_X86_IO_APIC + if (cpu_has_apic) { + printk(KERN_ERR PFX "APIC detected. Longhaul is currently " + "broken in this configuration.\n"); + return -ENODEV; + } +#endif + switch (c->x86_model) { + case 6 ... 9: + return cpufreq_register_driver(&longhaul_driver); + case 10: + printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n"); + default: + ; + } + + return -ENODEV; +} + + +static void __exit longhaul_exit(void) +{ + int i; + + for (i = 0; i < numscales; i++) { + if (mults[i] == maxmult) { + longhaul_setstate(i); + break; + } + } + + cpufreq_unregister_driver(&longhaul_driver); + kfree(longhaul_table); +} + +/* Even if the BIOS exports an ACPI C3 state and it is used + * successfully when the CPU is idle, in some cases this state + * doesn't trigger the frequency transition. */ +module_param(disable_acpi_c3, int, 0644); +MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); +/* Change CPU voltage with frequency. Very useful to save + * power, but most VIA C3 processors don't support it. */ +module_param(scale_voltage, int, 0644); +MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); +/* Force the revision key to 0 for processors which don't + * support voltage scaling, but present themselves as + * such. */ +module_param(revid_errata, int, 0644); +MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); + +MODULE_AUTHOR("Dave Jones "); +MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors."); +MODULE_LICENSE("GPL"); + +late_initcall(longhaul_init); +module_exit(longhaul_exit); diff --git a/drivers/cpufreq/longhaul.h b/drivers/cpufreq/longhaul.h new file mode 100644 index 00000000000..cbf48fbca88 --- /dev/null +++ b/drivers/cpufreq/longhaul.h @@ -0,0 +1,353 @@ +/* + * longhaul.h + * (C) 2003 Dave Jones. + * + * Licensed under the terms of the GNU GPL License version 2. 
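+ * + * MSR bit layouts and the per-family clock-ratio and voltage (VID) + * tables consumed by longhaul.c.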
+ * + * VIA-specific information + */ + +union msr_bcr2 { + struct { + unsigned Reseved:19, // 18:0 + ESOFTBF:1, // 19 + Reserved2:3, // 22:20 + CLOCKMUL:4, // 26:23 + Reserved3:5; // 31:27 + } bits; + unsigned long val; +}; + +union msr_longhaul { + struct { + unsigned RevisionID:4, // 3:0 + RevisionKey:4, // 7:4 + EnableSoftBusRatio:1, // 8 + EnableSoftVID:1, // 9 + EnableSoftBSEL:1, // 10 + Reserved:3, // 11:13 + SoftBusRatio4:1, // 14 + VRMRev:1, // 15 + SoftBusRatio:4, // 19:16 + SoftVID:5, // 24:20 + Reserved2:3, // 27:25 + SoftBSEL:2, // 29:28 + Reserved3:2, // 31:30 + MaxMHzBR:4, // 35:32 + MaximumVID:5, // 40:36 + MaxMHzFSB:2, // 42:41 + MaxMHzBR4:1, // 43 + Reserved4:4, // 47:44 + MinMHzBR:4, // 51:48 + MinimumVID:5, // 56:52 + MinMHzFSB:2, // 58:57 + MinMHzBR4:1, // 59 + Reserved5:4; // 63:60 + } bits; + unsigned long long val; +}; + +/* + * Clock ratio tables. Div/Mod by 10 to get ratio. + * The eblcr values specify the ratio read from the CPU. + * The mults values specify what to write to the CPU. + */ + +/* + * VIA C3 Samuel 1 & Samuel 2 (stepping 0) + */ +static const int __cpuinitdata samuel1_mults[16] = { + -1, /* 0000 -> RESERVED */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + -1, /* 0011 -> RESERVED */ + -1, /* 0100 -> RESERVED */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + -1, /* 1110 -> RESERVED */ + -1, /* 1111 -> RESERVED */ +}; + +static const int __cpuinitdata samuel1_eblcr[16] = { + 50, /* 0000 -> RESERVED */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + -1, /* 0011 -> RESERVED */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + -1, /* 0111 -> RESERVED */ + -1, /* 1000 -> RESERVED */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + -1, /* 1100 -> RESERVED */ + 75, /* 1101 -> 7.5x */ + -1, /* 1110 -> RESERVED */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 Samuel2 Stepping 1->15 + */ +static const int __cpuinitdata samuel2_eblcr[16] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 110, /* 0111 -> 11.0x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 130, /* 1110 -> 13.0x */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 Ezra + */ +static const int __cpuinitdata ezra_mults[16] = { + 100, /* 0000 -> 10.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ +}; + +static const int __cpuinitdata ezra_eblcr[16] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ +}; + +/* + * VIA C3 (Ezra-T) [C5M]. 
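+ * Ezra-T and Nehemiah use 32-entry tables: the lower 16 entries are + * addressed by the 4-bit SoftBusRatio field alone, the upper 16 by + * the same field with SoftBusRatio4 set (see do_powersaver()).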
+ */ +static const int __cpuinitdata ezrat_mults[32] = { + 100, /* 0000 -> 10.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + + -1, /* 0000 -> RESERVED (10.0x) */ + 110, /* 0001 -> 11.0x */ + -1, /* 0010 -> 12.0x */ + -1, /* 0011 -> RESERVED (9.0x)*/ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + -1, /* 1111 -> RESERVED (12.0x) */ +}; + +static const int __cpuinitdata ezrat_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 30, /* 0001 -> 3.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + 35, /* 0101 -> 3.5x */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + + -1, /* 0000 -> RESERVED (9.0x) */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + -1, /* 0011 -> RESERVED (10.0x)*/ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + -1, /* 1100 -> RESERVED (12.0x) */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145, /* 1111 -> 14.5x */ +}; + +/* + * VIA C3 Nehemiah */ + +static const int __cpuinitdata nehemiah_mults[32] = { + 100, /* 0000 -> 10.0x */ + -1, /* 0001 -> 16.0x */ + 40, /* 0010 -> 4.0x */ + 90, /* 0011 -> 9.0x */ + 95, /* 0100 -> 9.5x */ + -1, /* 0101 -> RESERVED */ + 45, /* 0110 -> 4.5x */ + 55, /* 0111 -> 5.5x */ + 60, /* 1000 -> 6.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 50, /* 1011 -> 5.0x */ + 65, /* 1100 -> 6.5x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 120, /* 1111 -> 12.0x */ + -1, /* 0000 -> 10.0x */ + 110, /* 0001 -> 11.0x */ + -1, /* 0010 -> 12.0x */ + -1, /* 0011 -> 9.0x */ + 105, /* 0100 -> 10.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 135, /* 0111 -> 13.5x */ + 140, /* 1000 -> 14.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 1010 -> 16.0x */ + 130, /* 1011 -> 13.0x */ + 145, /* 1100 -> 14.5x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + -1, /* 1111 -> 12.0x */ +}; + +static const int __cpuinitdata nehemiah_eblcr[32] = { + 50, /* 0000 -> 5.0x */ + 160, /* 0001 -> 16.0x */ + 40, /* 0010 -> 4.0x */ + 100, /* 0011 -> 10.0x */ + 55, /* 0100 -> 5.5x */ + -1, /* 0101 -> RESERVED */ + 45, /* 0110 -> 4.5x */ + 95, /* 0111 -> 9.5x */ + 90, /* 1000 -> 9.0x */ + 70, /* 1001 -> 7.0x */ + 80, /* 1010 -> 8.0x */ + 60, /* 1011 -> 6.0x */ + 120, /* 1100 -> 12.0x */ + 75, /* 1101 -> 7.5x */ + 85, /* 1110 -> 8.5x */ + 65, /* 1111 -> 6.5x */ + 90, /* 0000 -> 9.0x */ + 110, /* 0001 -> 11.0x */ + 120, /* 0010 -> 12.0x */ + 100, /* 0011 -> 10.0x */ + 135, /* 0100 -> 13.5x */ + 115, /* 0101 -> 11.5x */ + 125, /* 0110 -> 12.5x */ + 105, /* 0111 -> 10.5x */ + 130, /* 1000 -> 13.0x */ + 150, /* 1001 -> 15.0x */ + 160, /* 
1010 -> 16.0x */ + 140, /* 1011 -> 14.0x */ + 120, /* 1100 -> 12.0x */ + 155, /* 1101 -> 15.5x */ + -1, /* 1110 -> RESERVED (13.0x) */ + 145 /* 1111 -> 14.5x */ +}; + +/* + * Voltage scales. Div/Mod by 1000 to get actual voltage. + * Which scale to use depends on the VRM type in use. + */ + +struct mV_pos { + unsigned short mV; + unsigned short pos; +}; + +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { + {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, + {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, + {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, + {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10}, + {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3}, + {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27}, + {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19}, + {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} +}; + +static const unsigned char __cpuinitdata mV_vrm85[32] = { + 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, + 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, + 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, + 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 +}; + +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { + {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, + {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, + {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, + {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16}, + {975, 15}, {950, 14}, {925, 13}, {900, 12}, + {875, 11}, {850, 10}, {825, 9}, {800, 8}, + {775, 7}, {750, 6}, {725, 5}, {700, 4}, + {675, 3}, {650, 2}, {625, 1}, {600, 0} +}; + +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, + 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, + 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 +}; + diff --git a/drivers/cpufreq/longrun.c b/drivers/cpufreq/longrun.c new file mode 100644 index 00000000000..34ea359b370 --- /dev/null +++ b/drivers/cpufreq/longrun.c @@ -0,0 +1,324 @@ +/* + * (C) 2002 - 2003 Dominik Brodowski + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include + +#include +#include + +static struct cpufreq_driver longrun_driver; + +/** + * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz + * values into per cent values. 
In TMTA microcode, the following is valid: + * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) + */ +static unsigned int longrun_low_freq, longrun_high_freq; + + +/** + * longrun_get_policy - get the current LongRun policy + * @policy: struct cpufreq_policy where current policy is written into + * + * Reads the current LongRun policy by accessing MSR_TMTA_LONGRUN_FLAGS + * and MSR_TMTA_LONGRUN_CTRL + */ +static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) +{ + u32 msr_lo, msr_hi; + + rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + pr_debug("longrun flags are %x - %x\n", msr_lo, msr_hi); + if (msr_lo & 0x01) + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + pr_debug("longrun ctrl is %x - %x\n", msr_lo, msr_hi); + msr_lo &= 0x0000007F; + msr_hi &= 0x0000007F; + + if (longrun_high_freq <= longrun_low_freq) { + /* Assume degenerate Longrun table */ + policy->min = policy->max = longrun_high_freq; + } else { + policy->min = longrun_low_freq + msr_lo * + ((longrun_high_freq - longrun_low_freq) / 100); + policy->max = longrun_low_freq + msr_hi * + ((longrun_high_freq - longrun_low_freq) / 100); + } + policy->cpu = 0; +} + + +/** + * longrun_set_policy - sets a new CPUFreq policy + * @policy: new policy + * + * Sets a new CPUFreq policy on LongRun-capable processors. This function + * has to be called with cpufreq_driver locked. + */ +static int longrun_set_policy(struct cpufreq_policy *policy) +{ + u32 msr_lo, msr_hi; + u32 pctg_lo, pctg_hi; + + if (!policy) + return -EINVAL; + + if (longrun_high_freq <= longrun_low_freq) { + /* Assume degenerate Longrun table */ + pctg_lo = pctg_hi = 100; + } else { + pctg_lo = (policy->min - longrun_low_freq) / + ((longrun_high_freq - longrun_low_freq) / 100); + pctg_hi = (policy->max - longrun_low_freq) / + ((longrun_high_freq - longrun_low_freq) / 100); + } + + if (pctg_hi > 100) + pctg_hi = 100; + if (pctg_lo > pctg_hi) + pctg_lo = pctg_hi; + + /* performance or economy mode */ + rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + msr_lo &= 0xFFFFFFFE; + switch (policy->policy) { + case CPUFREQ_POLICY_PERFORMANCE: + msr_lo |= 0x00000001; + break; + case CPUFREQ_POLICY_POWERSAVE: + break; + } + wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi); + + /* lower and upper boundary */ + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + msr_lo &= 0xFFFFFF80; + msr_hi &= 0xFFFFFF80; + msr_lo |= pctg_lo; + msr_hi |= pctg_hi; + wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + + return 0; +} + + +/** + * longrun_verify_policy - verifies a new CPUFreq policy + * @policy: the policy to verify + * + * Validates a new CPUFreq policy. This function has to be called with + * cpufreq_driver locked. 
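+ * + * Only CPUFREQ_POLICY_PERFORMANCE and CPUFREQ_POLICY_POWERSAVE are + * accepted: LongRun picks the actual frequency autonomously within + * the percentage window programmed by longrun_set_policy().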
+ */ +static int longrun_verify_policy(struct cpufreq_policy *policy) +{ + if (!policy) + return -EINVAL; + + policy->cpu = 0; + cpufreq_verify_within_limits(policy, + policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + + if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && + (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) + return -EINVAL; + + return 0; +} + +static unsigned int longrun_get(unsigned int cpu) +{ + u32 eax, ebx, ecx, edx; + + if (cpu) + return 0; + + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + pr_debug("cpuid eax is %u\n", eax); + + return eax * 1000; +} + +/** + * longrun_determine_freqs - determines the lowest and highest possible core frequency + * @low_freq: an int to put the lowest frequency into + * @high_freq: an int to put the highest frequency into + * + * Determines the lowest and highest possible core frequencies on this CPU. + * This is necessary to calculate the performance percentage according to + * TMTA rules: + * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) + */ +static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) +{ + u32 msr_lo, msr_hi; + u32 save_lo, save_hi; + u32 eax, ebx, ecx, edx; + u32 try_hi; + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!low_freq || !high_freq) + return -EINVAL; + + if (cpu_has(c, X86_FEATURE_LRTI)) { + /* if the LongRun Table Interface is present, the + * detection is a bit easier: + * For minimum frequency, read out the maximum + * level (msr_hi), write that into "currently + * selected level", and read out the frequency. + * For maximum frequency, read out level zero. + */ + /* minimum */ + rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi); + wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi); + rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); + *low_freq = msr_lo * 1000; /* to kHz */ + + /* maximum */ + wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi); + rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi); + *high_freq = msr_lo * 1000; /* to kHz */ + + pr_debug("longrun table interface told %u - %u kHz\n", + *low_freq, *high_freq); + + if (*low_freq > *high_freq) + *low_freq = *high_freq; + return 0; + } + + /* set the upper border to the value determined during TSC init */ + *high_freq = (cpu_khz / 1000); + *high_freq = *high_freq * 1000; + pr_debug("high frequency is %u kHz\n", *high_freq); + + /* get current borders */ + rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + save_lo = msr_lo & 0x0000007F; + save_hi = msr_hi & 0x0000007F; + + /* if current perf_pctg is larger than 90%, we need to decrease the + * upper limit to make the calculation more accurate. + */ + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + /* try decreasing in 10% steps, some processors react only + * to certain threshold values */ + for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) { + /* set perf_pctg window to 0..try_hi */ + msr_lo &= 0xFFFFFF80; + msr_hi &= 0xFFFFFF80; + msr_hi |= try_hi; + wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi); + + /* read out current core MHz and current perf_pctg */ + cpuid(0x80860007, &eax, &ebx, &ecx, &edx); + + /* restore values */ + wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi); + } + pr_debug("percentage is %u %%, freq is %u MHz\n", ecx, eax); + + /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq) + * which rearranges to + * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg) + * + * high_freq * perf_pctg is stored temporarily into "ebx". 
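+ * + * Solving for low_freq gives + * low_freq = (cur_freq - high_freq * perf_pctg) / (1 - perf_pctg) + * which the integer code below evaluates as + * edx = ((eax - ebx) * 100) / (100 - ecx).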
+ */ + ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */ + + if ((ecx > 95) || (ecx == 0) || (eax < ebx)) + return -EIO; + + edx = ((eax - ebx) * 100) / (100 - ecx); + *low_freq = edx * 1000; /* back to kHz */ + + pr_debug("low frequency is %u kHz\n", *low_freq); + + if (*low_freq > *high_freq) + *low_freq = *high_freq; + + return 0; +} + + +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) +{ + int result = 0; + + /* capability check */ + if (policy->cpu != 0) + return -ENODEV; + + /* detect low and high frequency */ + result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq); + if (result) + return result; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = longrun_low_freq; + policy->cpuinfo.max_freq = longrun_high_freq; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + longrun_get_policy(policy); + + return 0; +} + + +static struct cpufreq_driver longrun_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = longrun_verify_policy, + .setpolicy = longrun_set_policy, + .get = longrun_get, + .init = longrun_cpu_init, + .name = "longrun", + .owner = THIS_MODULE, +}; + + +/** + * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver + * + * Initializes the LongRun support. + */ +static int __init longrun_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor != X86_VENDOR_TRANSMETA || + !cpu_has(c, X86_FEATURE_LONGRUN)) + return -ENODEV; + + return cpufreq_register_driver(&longrun_driver); +} + + +/** + * longrun_exit - unregisters LongRun support + */ +static void __exit longrun_exit(void) +{ + cpufreq_unregister_driver(&longrun_driver); +} + + +MODULE_AUTHOR("Dominik Brodowski "); +MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and " + "Efficeon processors."); +MODULE_LICENSE("GPL"); + +module_init(longrun_init); +module_exit(longrun_exit); diff --git a/drivers/cpufreq/mperf.c b/drivers/cpufreq/mperf.c new file mode 100644 index 00000000000..911e193018a --- /dev/null +++ b/drivers/cpufreq/mperf.c @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include + +#include "mperf.h" + +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); + +/* Called via smp_call_function_single(), on the target CPU */ +static void read_measured_perf_ctrs(void *_cur) +{ + struct aperfmperf *am = _cur; + + get_aperfmperf(am); +} + +/* + * Return the measured active (C0) frequency on this CPU since last call + * to this function. + * Input: cpu number + * Return: Average CPU frequency in terms of max frequency (zero on error) + * + * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance + * over a period of time, while CPU is in C0 state. + * IA32_MPERF counts at the rate of max advertised frequency + * IA32_APERF counts at the rate of actual CPU frequency + * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and + * no meaning should be associated with absolute values of these MSRs. 
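+ * + * A rough sketch of the calculation (see calc_aperfmperf_ratio()): + * ratio = (aperf_delta << APERFMPERF_SHIFT) / mperf_delta + * avg_freq = (cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT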
+ */ +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) +{ + struct aperfmperf perf; + unsigned long ratio; + unsigned int retval; + + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) + return 0; + + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; + + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; + + return retval; +} +EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/mperf.h b/drivers/cpufreq/mperf.h new file mode 100644 index 00000000000..5dbf2950dc2 --- /dev/null +++ b/drivers/cpufreq/mperf.h @@ -0,0 +1,9 @@ +/* + * (c) 2010 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu); diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c new file mode 100644 index 00000000000..6be3e0760c2 --- /dev/null +++ b/drivers/cpufreq/p4-clockmod.c @@ -0,0 +1,329 @@ +/* + * Pentium 4/Xeon CPU on demand clock modulation/speed scaling + * (C) 2002 - 2003 Dominik Brodowski + * (C) 2002 Zwane Mwaikambo + * (C) 2002 Arjan van de Ven + * (C) 2002 Tora T. Engstad + * All Rights Reserved + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The author(s) of this software shall not be held liable for damages + * of any nature resulting from the use of this software. This + * software is provided AS-IS with no warranties. 
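+ * + * Note that clock modulation never changes the actual clock: it + * programs the on-demand duty cycle in MSR_IA32_THERM_CONTROL + * (bit 4 enables modulation, bits 3-1 select the duty cycle in + * 12.5% steps), so the effective frequency is roughly + * stock_freq * N/8.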
+ * + * Date Errata Description + * 20020525 N44, O17 12.5% or 25% DC causes lockup + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "speedstep-lib.h" + +#define PFX "p4-clockmod: " + +/* + * Duty Cycle (3 bits); note DC_DISABLE is not specified in + * Intel docs, I just use it to mean disable + */ +enum { + DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT, + DC_64PT, DC_75PT, DC_88PT, DC_DISABLE +}; + +#define DC_ENTRIES 8 + + +static int has_N44_O17_errata[NR_CPUS]; +static unsigned int stock_freq; +static struct cpufreq_driver p4clockmod_driver; +static unsigned int cpufreq_p4_get(unsigned int cpu); + +static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate) +{ + u32 l, h; + + if (!cpu_online(cpu) || + (newstate > DC_DISABLE) || (newstate == DC_RESV)) + return -EINVAL; + + rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h); + + if (l & 0x01) + pr_debug("CPU#%d currently thermal throttled\n", cpu); + + if (has_N44_O17_errata[cpu] && + (newstate == DC_25PT || newstate == DC_DFLT)) + newstate = DC_38PT; + + rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); + if (newstate == DC_DISABLE) { + pr_debug("CPU#%d disabling modulation\n", cpu); + wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h); + } else { + pr_debug("CPU#%d setting duty cycle to %d%%\n", + cpu, ((125 * newstate) / 10)); + /* bits 63 - 5 : reserved + * bit 4 : enable/disable + * bits 3-1 : duty cycle + * bit 0 : reserved + */ + l = (l & ~14); + l = l | (1<<4) | ((newstate & 0x7)<<1); + wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h); + } + + return 0; +} + + +static struct cpufreq_frequency_table p4clockmod_table[] = { + {DC_RESV, CPUFREQ_ENTRY_INVALID}, + {DC_DFLT, 0}, + {DC_25PT, 0}, + {DC_38PT, 0}, + {DC_50PT, 0}, + {DC_64PT, 0}, + {DC_75PT, 0}, + {DC_88PT, 0}, + {DC_DISABLE, 0}, + {DC_RESV, CPUFREQ_TABLE_END}, +}; + + +static int cpufreq_p4_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = DC_RESV; + struct cpufreq_freqs freqs; + int i; + + if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0], + target_freq, relation, &newstate)) + return -EINVAL; + + freqs.old = cpufreq_p4_get(policy->cpu); + freqs.new = stock_freq * p4clockmod_table[newstate].index / 8; + + if (freqs.new == freqs.old) + return 0; + + /* notifiers */ + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + /* run on each logical CPU, + * see section 13.15.3 of IA32 Intel Architecture Software + * Developer's Manual, Volume 3 + */ + for_each_cpu(i, policy->cpus) + cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); + + /* notifiers */ + for_each_cpu(i, policy->cpus) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + + return 0; +} + + +static int cpufreq_p4_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]); +} + + +static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) +{ + if (c->x86 == 0x06) { + if (cpu_has(c, X86_FEATURE_EST)) + printk_once(KERN_WARNING PFX "Warning: EST-capable " + "CPU detected. The acpi-cpufreq module offers " + "voltage scaling in addition to frequency " + "scaling. 
You should use that instead of " + "p4-clockmod, if possible.\n"); + switch (c->x86_model) { + case 0x0E: /* Core */ + case 0x0F: /* Core Duo */ + case 0x16: /* Celeron Core */ + case 0x1C: /* Atom */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE); + case 0x0D: /* Pentium M (Dothan) */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + /* fall through */ + case 0x09: /* Pentium M (Banias) */ + return speedstep_get_frequency(SPEEDSTEP_CPU_PM); + } + + if (c->x86 != 0xF) + return 0; + + /* on P-4s, the TSC runs with constant frequency independent of whether + * throttling is active or not. */ + p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS; + + if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) { + printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. " + "The speedstep-ich or acpi-cpufreq modules offer " + "voltage scaling in addition to frequency scaling. " + "You should use either one instead of p4-clockmod, " + "if possible.\n"); + return speedstep_get_frequency(SPEEDSTEP_CPU_P4M); + } + + return speedstep_get_frequency(SPEEDSTEP_CPU_P4D); +} + + + +static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *c = &cpu_data(policy->cpu); + int cpuid = 0; + unsigned int i; + +#ifdef CONFIG_SMP + cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu)); +#endif + + /* Errata workaround */ + cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask; + switch (cpuid) { + case 0x0f07: + case 0x0f0a: + case 0x0f11: + case 0x0f12: + has_N44_O17_errata[policy->cpu] = 1; + pr_debug("has errata -- disabling low frequencies\n"); + } + + if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D && + c->x86_model < 2) { + /* switch to maximum frequency and measure result */ + cpufreq_p4_setdc(policy->cpu, DC_DISABLE); + recalibrate_cpu_khz(); + } + /* get max frequency */ + stock_freq = cpufreq_p4_get_frequency(c); + if (!stock_freq) + return -EINVAL; + + /* table init */ + for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) { + if ((i < 2) && (has_N44_O17_errata[policy->cpu])) + p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID; + else + p4clockmod_table[i].frequency = (stock_freq * i)/8; + } + cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu); + + /* cpuinfo and default policy values */ + + /* the transition latency is set to be 1 higher than the maximum + * transition latency of the ondemand governor */ + policy->cpuinfo.transition_latency = 10000001; + policy->cur = stock_freq; + + return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]); +} + + +static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int cpufreq_p4_get(unsigned int cpu) +{ + u32 l, h; + + rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h); + + if (l & 0x10) { + l = l >> 1; + l &= 0x7; + } else + l = DC_DISABLE; + + if (l != DC_DISABLE) + return stock_freq * l / 8; + + return stock_freq; +} + +static struct freq_attr *p4clockmod_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver p4clockmod_driver = { + .verify = cpufreq_p4_verify, + .target = cpufreq_p4_target, + .init = cpufreq_p4_cpu_init, + .exit = cpufreq_p4_cpu_exit, + .get = cpufreq_p4_get, + .name = "p4-clockmod", + .owner = THIS_MODULE, + .attr = p4clockmod_attr, +}; + + +static int __init cpufreq_p4_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int ret; + + /* + * THERM_CONTROL is 
architectural for IA32 now, so + * we can rely on the capability checks + */ + if (c->x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (!test_cpu_cap(c, X86_FEATURE_ACPI) || + !test_cpu_cap(c, X86_FEATURE_ACC)) + return -ENODEV; + + ret = cpufreq_register_driver(&p4clockmod_driver); + if (!ret) + printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock " + "Modulation available\n"); + + return ret; +} + + +static void __exit cpufreq_p4_exit(void) +{ + cpufreq_unregister_driver(&p4clockmod_driver); +} + + +MODULE_AUTHOR("Zwane Mwaikambo "); +MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)"); +MODULE_LICENSE("GPL"); + +late_initcall(cpufreq_p4_init); +module_exit(cpufreq_p4_exit); diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c new file mode 100644 index 00000000000..7b0603eb012 --- /dev/null +++ b/drivers/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. 
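+ * + * Overview: PCC commands are written into a shared memory region + * described by the firmware's PCCH package; the "doorbell" register + * is then rung and the status word polled for CMD_COMPLETE (see + * pcc_cmd() below).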
+ * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define PCC_VERSION "1.10.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x9F, 0x2C, 0x9B, 0x63, 0x91, 0x70, 0x1f, 0x49, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu __percpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + pr_debug("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + pr_debug("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + pr_debug("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%p, contains a value of: 0x%x. 
Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + pr_debug("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return 0; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + pr_debug("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%p\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + pr_debug("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + pr_debug("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + 
in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle, pcch_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + pr_debug("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + pr_debug("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == 
NULL) { + pr_debug("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + pr_debug("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + pr_debug("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + pr_debug("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + pr_debug("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + pr_debug("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto out; + } + + result = pcc_get_offset(cpu); + if (result) { + pr_debug("init: PCCP evaluation failed\n"); + goto out; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + if (!policy->cur) { + pr_debug("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + + pr_debug("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); +out: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = 
CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + pr_debug("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/drivers/cpufreq/powernow-k6.c b/drivers/cpufreq/powernow-k6.c new file mode 100644 index 00000000000..b3379d6a5c5 --- /dev/null +++ b/drivers/cpufreq/powernow-k6.c @@ -0,0 +1,261 @@ +/* + * This file was based upon code in Powertweak Linux (http://powertweak.sf.net) + * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä, + * Dominik Brodowski. + * + * Licensed under the terms of the GNU GPL License version 2. + * + * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous* + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ + +#define PFX "powernow-k6: " +static unsigned int busfreq; /* FSB, in 10 kHz */ +static unsigned int max_multiplier; + + +/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */ +static struct cpufreq_frequency_table clock_ratio[] = { + {45, /* 000 -> 4.5x */ 0}, + {50, /* 001 -> 5.0x */ 0}, + {40, /* 010 -> 4.0x */ 0}, + {55, /* 011 -> 5.5x */ 0}, + {20, /* 100 -> 2.0x */ 0}, + {30, /* 101 -> 3.0x */ 0}, + {60, /* 110 -> 6.0x */ 0}, + {35, /* 111 -> 3.5x */ 0}, + {0, CPUFREQ_TABLE_END} +}; + + +/** + * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier + * + * Returns the current setting of the frequency multiplier. Core clock + * speed is frequency of the Front-Side Bus multiplied with this value. + */ +static int powernow_k6_get_cpu_multiplier(void) +{ + u64 invalue = 0; + u32 msrval; + + msrval = POWERNOW_IOPORT + 0x1; + wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ + invalue = inl(POWERNOW_IOPORT + 0x8); + msrval = POWERNOW_IOPORT + 0x0; + wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ + + return clock_ratio[(invalue >> 5)&7].index; +} + + +/** + * powernow_k6_set_state - set the PowerNow! multiplier + * @best_i: clock_ratio[best_i] is the target multiplier + * + * Tries to change the PowerNow! 
multiplier + */ +static void powernow_k6_set_state(unsigned int best_i) +{ + unsigned long outvalue = 0, invalue = 0; + unsigned long msrval; + struct cpufreq_freqs freqs; + + if (clock_ratio[best_i].index > max_multiplier) { + printk(KERN_ERR PFX "invalid target frequency\n"); + return; + } + + freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); + freqs.new = busfreq * clock_ratio[best_i].index; + freqs.cpu = 0; /* powernow-k6.c is UP only driver */ + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* we now need to transform best_i to the BVC format, see AMD#23446 */ + + outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5); + + msrval = POWERNOW_IOPORT + 0x1; + wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ + invalue = inl(POWERNOW_IOPORT + 0x8); + invalue = invalue & 0xf; + outvalue = outvalue | invalue; + outl(outvalue , (POWERNOW_IOPORT + 0x8)); + msrval = POWERNOW_IOPORT + 0x0; + wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + return; +} + + +/** + * powernow_k6_verify - verifies a new CPUfreq policy + * @policy: new policy + * + * Policy must be within lowest and highest possible CPU Frequency, + * and at least one possible state must be within min and max. + */ +static int powernow_k6_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, &clock_ratio[0]); +} + + +/** + * powernow_k6_setpolicy - sets a new CPUFreq policy + * @policy: new policy + * @target_freq: the target frequency + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * + * sets a new CPUFreq policy + */ +static int powernow_k6_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate = 0; + + if (cpufreq_frequency_table_target(policy, &clock_ratio[0], + target_freq, relation, &newstate)) + return -EINVAL; + + powernow_k6_set_state(newstate); + + return 0; +} + + +static int powernow_k6_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int i, f; + int result; + + if (policy->cpu != 0) + return -ENODEV; + + /* get frequencies */ + max_multiplier = powernow_k6_get_cpu_multiplier(); + busfreq = cpu_khz / max_multiplier; + + /* table init */ + for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + f = clock_ratio[i].index; + if (f > max_multiplier) + clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; + else + clock_ratio[i].frequency = busfreq * f; + } + + /* cpuinfo and default policy values */ + policy->cpuinfo.transition_latency = 200000; + policy->cur = busfreq * max_multiplier; + + result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); + if (result) + return result; + + cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); + + return 0; +} + + +static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) +{ + unsigned int i; + for (i = 0; i < 8; i++) { + if (i == max_multiplier) + powernow_k6_set_state(i); + } + cpufreq_frequency_table_put_attr(policy->cpu); + return 0; +} + +static unsigned int powernow_k6_get(unsigned int cpu) +{ + unsigned int ret; + ret = (busfreq * powernow_k6_get_cpu_multiplier()); + return ret; +} + +static struct freq_attr *powernow_k6_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver powernow_k6_driver = { + .verify = powernow_k6_verify, + .target = powernow_k6_target, + .init = powernow_k6_cpu_init, + .exit = powernow_k6_cpu_exit, + .get = powernow_k6_get, 
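+ /* ->get reports the live speed: FSB (busfreq) times the currently programmed multiplier */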
+ .name = "powernow-k6", + .owner = THIS_MODULE, + .attr = powernow_k6_attr, +}; + + +/** + * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver + * + * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported + * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero + * on success. + */ +static int __init powernow_k6_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) || + ((c->x86_model != 12) && (c->x86_model != 13))) + return -ENODEV; + + if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) { + printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n"); + return -EIO; + } + + if (cpufreq_register_driver(&powernow_k6_driver)) { + release_region(POWERNOW_IOPORT, 16); + return -EINVAL; + } + + return 0; +} + + +/** + * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support + * + * Unregisters AMD K6-2+ / K6-3+ PowerNow! support. + */ +static void __exit powernow_k6_exit(void) +{ + cpufreq_unregister_driver(&powernow_k6_driver); + release_region(POWERNOW_IOPORT, 16); +} + + +MODULE_AUTHOR("Arjan van de Ven, Dave Jones , " + "Dominik Brodowski "); +MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE("GPL"); + +module_init(powernow_k6_init); +module_exit(powernow_k6_exit); diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c new file mode 100644 index 00000000000..d71d9f37235 --- /dev/null +++ b/drivers/cpufreq/powernow-k7.c @@ -0,0 +1,747 @@ +/* + * AMD K7 Powernow driver. + * (C) 2003 Dave Jones on behalf of SuSE Labs. + * (C) 2003-2004 Dave Jones + * + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by AMD. + * + * Errata 5: + * CPU may fail to execute a FID/VID change in presence of interrupt. + * - We cli/sti on stepping A0 CPUs around the FID/VID transition. + * Errata 15: + * CPU with half frequency multipliers may hang upon wakeup from disconnect. + * - We disable half multipliers if ACPI is used on A0 stepping CPUs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* Needed for recalibrate_cpu_khz() */ +#include +#include + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI +#include +#include +#endif + +#include "powernow-k7.h" + +#define PFX "powernow: " + + +struct psb_s { + u8 signature[10]; + u8 tableversion; + u8 flags; + u16 settlingtime; + u8 reserved1; + u8 numpst; +}; + +struct pst_s { + u32 cpuid; + u8 fsbspeed; + u8 maxfid; + u8 startvid; + u8 numpstates; +}; + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI +union powernow_acpi_control_t { + struct { + unsigned long fid:5, + vid:5, + sgtc:20, + res1:2; + } bits; + unsigned long val; +}; +#endif + +/* divide by 1000 to get VCore voltage in V. */ +static const int mobile_vid_table[32] = { + 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650, + 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0, + 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100, + 1075, 1050, 1025, 1000, 975, 950, 925, 0, +}; + +/* divide by 10 to get FID. */ +static const int fid_codes[32] = { + 110, 115, 120, 125, 50, 55, 60, 65, + 70, 75, 80, 85, 90, 95, 100, 105, + 30, 190, 40, 200, 130, 135, 140, 210, + 150, 225, 160, 165, 170, 180, -1, -1, +}; + +/* This parameter is used in order to force ACPI instead of legacy method for + * configuration purpose. 
+ */ + +static int acpi_force; + +static struct cpufreq_frequency_table *powernow_table; + +static unsigned int can_scale_bus; +static unsigned int can_scale_vid; +static unsigned int minimum_speed = -1; +static unsigned int maximum_speed; +static unsigned int number_scales; +static unsigned int fsb; +static unsigned int latency; +static char have_a0; + +static int check_fsb(unsigned int fsbspeed) +{ + int delta; + unsigned int f = fsb / 1000; + + delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed; + return delta < 5; +} + +static int check_powernow(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + unsigned int maxei, eax, ebx, ecx, edx; + + if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) { +#ifdef MODULE + printk(KERN_INFO PFX "This module only works with " + "AMD K7 CPUs\n"); +#endif + return 0; + } + + /* Get maximum capabilities */ + maxei = cpuid_eax(0x80000000); + if (maxei < 0x80000007) { /* Any powernow info ? */ +#ifdef MODULE + printk(KERN_INFO PFX "No powernow capabilities detected\n"); +#endif + return 0; + } + + if ((c->x86_model == 6) && (c->x86_mask == 0)) { + printk(KERN_INFO PFX "K7 660[A0] core detected, " + "enabling errata workarounds\n"); + have_a0 = 1; + } + + cpuid(0x80000007, &eax, &ebx, &ecx, &edx); + + /* Check we can actually do something before we say anything.*/ + if (!(edx & (1 << 1 | 1 << 2))) + return 0; + + printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: "); + + if (edx & 1 << 1) { + printk("frequency"); + can_scale_bus = 1; + } + + if ((edx & (1 << 1 | 1 << 2)) == 0x6) + printk(" and "); + + if (edx & 1 << 2) { + printk("voltage"); + can_scale_vid = 1; + } + + printk(".\n"); + return 1; +} + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI +static void invalidate_entry(unsigned int entry) +{ + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; +} +#endif + +static int get_ranges(unsigned char *pst) +{ + unsigned int j; + unsigned int speed; + u8 fid, vid; + + powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * + (number_scales + 1)), GFP_KERNEL); + if (!powernow_table) + return -ENOMEM; + + for (j = 0 ; j < number_scales; j++) { + fid = *pst++; + + powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10; + powernow_table[j].index = fid; /* lower 8 bits */ + + speed = powernow_table[j].frequency; + + if ((fid_codes[fid] % 10) == 5) { +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + if (have_a0 == 1) + invalidate_entry(j); +#endif + } + + if (speed < minimum_speed) + minimum_speed = speed; + if (speed > maximum_speed) + maximum_speed = speed; + + vid = *pst++; + powernow_table[j].index |= (vid << 8); /* upper 8 bits */ + + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " + "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, + fid_codes[fid] % 10, speed/1000, vid, + mobile_vid_table[vid]/1000, + mobile_vid_table[vid]%1000); + } + powernow_table[number_scales].frequency = CPUFREQ_TABLE_END; + powernow_table[number_scales].index = 0; + + return 0; +} + + +static void change_FID(int fid) +{ + union msr_fidvidctl fidvidctl; + + rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + if (fidvidctl.bits.FID != fid) { + fidvidctl.bits.SGTC = latency; + fidvidctl.bits.FID = fid; + fidvidctl.bits.VIDC = 0; + fidvidctl.bits.FIDC = 1; + wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + } +} + + +static void change_VID(int vid) +{ + union msr_fidvidctl fidvidctl; + + rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + if (fidvidctl.bits.VID != vid) { + fidvidctl.bits.SGTC = latency; + fidvidctl.bits.VID = vid; + fidvidctl.bits.FIDC = 0; + fidvidctl.bits.VIDC = 1; + 
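/* with VIDC set, this MSR write initiates the actual voltage transition */ +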
wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val); + } +} + + +static void change_speed(unsigned int index) +{ + u8 fid, vid; + struct cpufreq_freqs freqs; + union msr_fidvidstatus fidvidstatus; + int cfid; + + /* fid are the lower 8 bits of the index we stored into + * the cpufreq frequency table in powernow_decode_bios, + * vid are the upper 8 bits. + */ + + fid = powernow_table[index].index & 0xFF; + vid = (powernow_table[index].index & 0xFF00) >> 8; + + freqs.cpu = 0; + + rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); + cfid = fidvidstatus.bits.CFID; + freqs.old = fsb * fid_codes[cfid] / 10; + + freqs.new = powernow_table[index].frequency; + + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + /* Now do the magic poking into the MSRs. */ + + if (have_a0 == 1) /* A0 errata 5 */ + local_irq_disable(); + + if (freqs.old > freqs.new) { + /* Going down, so change FID first */ + change_FID(fid); + change_VID(vid); + } else { + /* Going up, so change VID first */ + change_VID(vid); + change_FID(fid); + } + + + if (have_a0 == 1) + local_irq_enable(); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); +} + + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + +static struct acpi_processor_performance *acpi_processor_perf; + +static int powernow_acpi_init(void) +{ + int i; + int retval = 0; + union powernow_acpi_control_t pc; + + if (acpi_processor_perf != NULL && powernow_table != NULL) { + retval = -EINVAL; + goto err0; + } + + acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance), + GFP_KERNEL); + if (!acpi_processor_perf) { + retval = -ENOMEM; + goto err0; + } + + if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map, + GFP_KERNEL)) { + retval = -ENOMEM; + goto err05; + } + + if (acpi_processor_register_performance(acpi_processor_perf, 0)) { + retval = -EIO; + goto err1; + } + + if (acpi_processor_perf->control_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) { + retval = -ENODEV; + goto err2; + } + + if (acpi_processor_perf->status_register.space_id != + ACPI_ADR_SPACE_FIXED_HARDWARE) { + retval = -ENODEV; + goto err2; + } + + number_scales = acpi_processor_perf->state_count; + + if (number_scales < 2) { + retval = -ENODEV; + goto err2; + } + + powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) * + (number_scales + 1)), GFP_KERNEL); + if (!powernow_table) { + retval = -ENOMEM; + goto err2; + } + + pc.val = (unsigned long) acpi_processor_perf->states[0].control; + for (i = 0; i < number_scales; i++) { + u8 fid, vid; + struct acpi_processor_px *state = + &acpi_processor_perf->states[i]; + unsigned int speed, speed_mhz; + + pc.val = (unsigned long) state->control; + pr_debug("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n", + i, + (u32) state->core_frequency, + (u32) state->power, + (u32) state->transition_latency, + (u32) state->control, + pc.bits.sgtc); + + vid = pc.bits.vid; + fid = pc.bits.fid; + + powernow_table[i].frequency = fsb * fid_codes[fid] / 10; + powernow_table[i].index = fid; /* lower 8 bits */ + powernow_table[i].index |= (vid << 8); /* upper 8 bits */ + + speed = powernow_table[i].frequency; + speed_mhz = speed / 1000; + + /* processor_perflib will multiply the MHz value by 1000 to + * get a KHz value (e.g. 1266000). However, powernow-k7 works + * with true KHz values (e.g. 1266768). To ensure that all + * powernow frequencies are available, we must ensure that + * ACPI doesn't restrict them, so we round up the MHz value + * to ensure that perflib's computed KHz value is greater than + * or equal to powernow's KHz value. 
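+ * Worked example: speed = 1266768 kHz gives speed_mhz = 1266, rounded up
+ * to 1267 below, so perflib's limit of 1267000 kHz >= 1266768 kHz.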
+ */ + if (speed % 1000 > 0) + speed_mhz++; + + if ((fid_codes[fid] % 10) == 5) { + if (have_a0 == 1) + invalidate_entry(i); + } + + pr_debug(" FID: 0x%x (%d.%dx [%dMHz]) " + "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10, + fid_codes[fid] % 10, speed_mhz, vid, + mobile_vid_table[vid]/1000, + mobile_vid_table[vid]%1000); + + if (state->core_frequency != speed_mhz) { + state->core_frequency = speed_mhz; + pr_debug(" Corrected ACPI frequency to %d\n", + speed_mhz); + } + + if (latency < pc.bits.sgtc) + latency = pc.bits.sgtc; + + if (speed < minimum_speed) + minimum_speed = speed; + if (speed > maximum_speed) + maximum_speed = speed; + } + + powernow_table[i].frequency = CPUFREQ_TABLE_END; + powernow_table[i].index = 0; + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + return 0; + +err2: + acpi_processor_unregister_performance(acpi_processor_perf, 0); +err1: + free_cpumask_var(acpi_processor_perf->shared_cpu_map); +err05: + kfree(acpi_processor_perf); +err0: + printk(KERN_WARNING PFX "ACPI perflib can not be used on " + "this platform\n"); + acpi_processor_perf = NULL; + return retval; +} +#else +static int powernow_acpi_init(void) +{ + printk(KERN_INFO PFX "no support for ACPI processor found." + " Please recompile your kernel with ACPI processor\n"); + return -EINVAL; +} +#endif + +static void print_pst_entry(struct pst_s *pst, unsigned int j) +{ + pr_debug("PST:%d (@%p)\n", j, pst); + pr_debug(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n", + pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid); +} + +static int powernow_decode_bios(int maxfid, int startvid) +{ + struct psb_s *psb; + struct pst_s *pst; + unsigned int i, j; + unsigned char *p; + unsigned int etuple; + unsigned int ret; + + etuple = cpuid_eax(0x80000001); + + for (i = 0xC0000; i < 0xffff0 ; i += 16) { + + p = phys_to_virt(i); + + if (memcmp(p, "AMDK7PNOW!", 10) == 0) { + pr_debug("Found PSB header at %p\n", p); + psb = (struct psb_s *) p; + pr_debug("Table version: 0x%x\n", psb->tableversion); + if (psb->tableversion != 0x12) { + printk(KERN_INFO PFX "Sorry, only v1.2 tables" + " supported right now\n"); + return -ENODEV; + } + + pr_debug("Flags: 0x%x\n", psb->flags); + if ((psb->flags & 1) == 0) + pr_debug("Mobile voltage regulator\n"); + else + pr_debug("Desktop voltage regulator\n"); + + latency = psb->settlingtime; + if (latency < 100) { + printk(KERN_INFO PFX "BIOS set settling time " + "to %d microseconds. " + "Should be at least 100. " + "Correcting.\n", latency); + latency = 100; + } + pr_debug("Settling Time: %d microseconds.\n", + psb->settlingtime); + pr_debug("Has %d PST tables. 
(Only dumping ones " + "relevant to this CPU).\n", + psb->numpst); + + p += sizeof(struct psb_s); + + pst = (struct pst_s *) p; + + for (j = 0; j < psb->numpst; j++) { + pst = (struct pst_s *) p; + number_scales = pst->numpstates; + + if ((etuple == pst->cpuid) && + check_fsb(pst->fsbspeed) && + (maxfid == pst->maxfid) && + (startvid == pst->startvid)) { + print_pst_entry(pst, j); + p = (char *)pst + sizeof(struct pst_s); + ret = get_ranges(p); + return ret; + } else { + unsigned int k; + p = (char *)pst + sizeof(struct pst_s); + for (k = 0; k < number_scales; k++) + p += 2; + } + } + printk(KERN_INFO PFX "No PST tables match this cpuid " + "(0x%x)\n", etuple); + printk(KERN_INFO PFX "This is indicative of a broken " + "BIOS.\n"); + + return -EINVAL; + } + p++; + } + + return -ENODEV; +} + + +static int powernow_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + unsigned int newstate; + + if (cpufreq_frequency_table_target(policy, powernow_table, target_freq, + relation, &newstate)) + return -EINVAL; + + change_speed(newstate); + + return 0; +} + + +static int powernow_verify(struct cpufreq_policy *policy) +{ + return cpufreq_frequency_table_verify(policy, powernow_table); +} + +/* + * We use the fact that the bus frequency is somehow + * a multiple of 100000/3 khz, then we compute sgtc according + * to this multiple. + * That way, we match more how AMD thinks all of that work. + * We will then get the same kind of behaviour already tested under + * the "well-known" other OS. + */ +static int __cpuinit fixup_sgtc(void) +{ + unsigned int sgtc; + unsigned int m; + + m = fsb / 3333; + if ((m % 10) >= 5) + m += 5; + + m /= 10; + + sgtc = 100 * m * latency; + sgtc = sgtc / 3; + if (sgtc > 0xfffff) { + printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc); + sgtc = 0xfffff; + } + return sgtc; +} + +static unsigned int powernow_get(unsigned int cpu) +{ + union msr_fidvidstatus fidvidstatus; + unsigned int cfid; + + if (cpu) + return 0; + rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); + cfid = fidvidstatus.bits.CFID; + + return fsb * fid_codes[cfid] / 10; +} + + +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) +{ + printk(KERN_WARNING PFX + "%s laptop with broken PST tables in BIOS detected.\n", + d->ident); + printk(KERN_WARNING PFX + "You need to downgrade to 3A21 (09/09/2002), or try a newer " + "BIOS than 3A71 (01/20/2003)\n"); + printk(KERN_WARNING PFX + "cpufreq scaling has been disabled as a result of this.\n"); + return 0; +} + +/* + * Some Athlon laptops have really fucked PST tables. + * A BIOS update is all that can save them. + * Mention this, and disable cpufreq. + */ +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { + { + .callback = acer_cpufreq_pst, + .ident = "Acer Aspire", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"), + DMI_MATCH(DMI_BIOS_VERSION, "3A71"), + }, + }, + { } +}; + +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) +{ + union msr_fidvidstatus fidvidstatus; + int result; + + if (policy->cpu != 0) + return -ENODEV; + + rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); + + recalibrate_cpu_khz(); + + fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID]; + if (!fsb) { + printk(KERN_WARNING PFX "can not determine bus frequency\n"); + return -EINVAL; + } + pr_debug("FSB: %3dMHz\n", fsb/1000); + + if (dmi_check_system(powernow_dmi_table) || acpi_force) { + printk(KERN_INFO PFX "PSB/PST known to be broken. 
" + "Trying ACPI instead\n"); + result = powernow_acpi_init(); + } else { + result = powernow_decode_bios(fidvidstatus.bits.MFID, + fidvidstatus.bits.SVID); + if (result) { + printk(KERN_INFO PFX "Trying ACPI perflib\n"); + maximum_speed = 0; + minimum_speed = -1; + latency = 0; + result = powernow_acpi_init(); + if (result) { + printk(KERN_INFO PFX + "ACPI and legacy methods failed\n"); + } + } else { + /* SGTC use the bus clock as timer */ + latency = fixup_sgtc(); + printk(KERN_INFO PFX "SGTC: %d\n", latency); + } + } + + if (result) + return result; + + printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n", + minimum_speed/1000, maximum_speed/1000); + + policy->cpuinfo.transition_latency = + cpufreq_scale(2000000UL, fsb, latency); + + policy->cur = powernow_get(0); + + cpufreq_frequency_table_get_attr(powernow_table, policy->cpu); + + return cpufreq_frequency_table_cpuinfo(policy, powernow_table); +} + +static int powernow_cpu_exit(struct cpufreq_policy *policy) +{ + cpufreq_frequency_table_put_attr(policy->cpu); + +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + if (acpi_processor_perf) { + acpi_processor_unregister_performance(acpi_processor_perf, 0); + free_cpumask_var(acpi_processor_perf->shared_cpu_map); + kfree(acpi_processor_perf); + } +#endif + + kfree(powernow_table); + return 0; +} + +static struct freq_attr *powernow_table_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + NULL, +}; + +static struct cpufreq_driver powernow_driver = { + .verify = powernow_verify, + .target = powernow_target, + .get = powernow_get, +#ifdef CONFIG_X86_POWERNOW_K7_ACPI + .bios_limit = acpi_processor_get_bios_limit, +#endif + .init = powernow_cpu_init, + .exit = powernow_cpu_exit, + .name = "powernow-k7", + .owner = THIS_MODULE, + .attr = powernow_table_attr, +}; + +static int __init powernow_init(void) +{ + if (check_powernow() == 0) + return -ENODEV; + return cpufreq_register_driver(&powernow_driver); +} + + +static void __exit powernow_exit(void) +{ + cpufreq_unregister_driver(&powernow_driver); +} + +module_param(acpi_force, int, 0444); +MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); + +MODULE_AUTHOR("Dave Jones "); +MODULE_DESCRIPTION("Powernow driver for AMD K7 processors."); +MODULE_LICENSE("GPL"); + +late_initcall(powernow_init); +module_exit(powernow_exit); + diff --git a/drivers/cpufreq/powernow-k7.h b/drivers/cpufreq/powernow-k7.h new file mode 100644 index 00000000000..35fb4eaf6e1 --- /dev/null +++ b/drivers/cpufreq/powernow-k7.h @@ -0,0 +1,43 @@ +/* + * (C) 2003 Dave Jones. + * + * Licensed under the terms of the GNU GPL License version 2. 
+ * + * AMD-specific information + * + */ + +union msr_fidvidctl { + struct { + unsigned FID:5, // 4:0 + reserved1:3, // 7:5 + VID:5, // 12:8 + reserved2:3, // 15:13 + FIDC:1, // 16 + VIDC:1, // 17 + reserved3:2, // 19:18 + FIDCHGRATIO:1, // 20 + reserved4:11, // 31-21 + SGTC:20, // 32:51 + reserved5:12; // 63:52 + } bits; + unsigned long long val; +}; + +union msr_fidvidstatus { + struct { + unsigned CFID:5, // 4:0 + reserved1:3, // 7:5 + SFID:5, // 12:8 + reserved2:3, // 15:13 + MFID:5, // 20:16 + reserved3:11, // 31:21 + CVID:5, // 36:32 + reserved4:3, // 39:37 + SVID:5, // 44:40 + reserved5:3, // 47:45 + MVID:5, // 52:48 + reserved6:11; // 63:53 + } bits; + unsigned long long val; +}; diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c new file mode 100644 index 00000000000..83479b6fb9a --- /dev/null +++ b/drivers/cpufreq/powernow-k8.c @@ -0,0 +1,1607 @@ +/* + * (c) 2003-2010 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + * + * Support : mark.langsdorf@amd.com + * + * Based on the powernow-k7.c module written by Dave Jones. + * (C) 2003 Dave Jones on behalf of SuSE Labs + * (C) 2004 Dominik Brodowski + * (C) 2004 Pavel Machek + * Licensed under the terms of the GNU GPL License version 2. + * Based upon datasheets & sample CPUs kindly provided by AMD. + * + * Valuable input gratefully received from Dave Jones, Pavel Machek, + * Dominik Brodowski, Jacob Shin, and others. + * Originally developed by Paul Devriendt. + * Processor information obtained from Chapter 9 (Power and Thermal Management) + * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD + * Opteron Processors" available for download from www.amd.com + * + * Tables for specific CPUs can be inferred from + * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for current / set_cpus_allowed() */ +#include +#include + +#include + +#include +#include +#include + +#define PFX "powernow-k8: " +#define VERSION "version 2.20.00" +#include "powernow-k8.h" +#include "mperf.h" + +/* serialize freq changes */ +static DEFINE_MUTEX(fidvid_mutex); + +static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); + +static int cpu_family = CPU_OPTERON; + +/* core performance boost */ +static bool cpb_capable, cpb_enabled; +static struct msr __percpu *msrs; + +static struct cpufreq_driver cpufreq_amd64_driver; + +#ifndef CONFIG_SMP +static inline const struct cpumask *cpu_core_mask(int cpu) +{ + return cpumask_of(0); +} +#endif + +/* Return a frequency in MHz, given an input fid */ +static u32 find_freq_from_fid(u32 fid) +{ + return 800 + (fid * 100); +} + +/* Return a frequency in KHz, given an input fid */ +static u32 find_khz_freq_from_fid(u32 fid) +{ + return 1000 * find_freq_from_fid(fid); +} + +static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, + u32 pstate) +{ + return data[pstate].frequency; +} + +/* Return the vco fid for an input fid + * + * Each "low" fid has corresponding "high" fid, and you can get to "low" fids + * only from corresponding high fids. This returns "high" fid corresponding to + * "low" one. + */ +static u32 convert_fid_to_vco_fid(u32 fid) +{ + if (fid < HI_FID_TABLE_BOTTOM) + return 8 + (2 * fid); + else + return fid; +} + +/* + * Return 1 if the pending bit is set. 
Unless we just instructed the processor + * to transition to a new state, seeing this bit set is really bad news. + */ +static int pending_bit_stuck(void) +{ + u32 lo, hi; + + if (cpu_family == CPU_HW_PSTATE) + return 0; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0; +} + +/* + * Update the global current fid / vid values from the status msr. + * Returns 1 on error. + */ +static int query_current_values_with_pending_wait(struct powernow_k8_data *data) +{ + u32 lo, hi; + u32 i = 0; + + if (cpu_family == CPU_HW_PSTATE) { + rdmsr(MSR_PSTATE_STATUS, lo, hi); + i = lo & HW_PSTATE_MASK; + data->currpstate = i; + + /* + * a workaround for family 11h erratum 311 might cause + * an "out-of-range Pstate if the core is in Pstate-0 + */ + if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps)) + data->currpstate = HW_PSTATE_0; + + return 0; + } + do { + if (i++ > 10000) { + pr_debug("detected change pending stuck\n"); + return 1; + } + rdmsr(MSR_FIDVID_STATUS, lo, hi); + } while (lo & MSR_S_LO_CHANGE_PENDING); + + data->currvid = hi & MSR_S_HI_CURRENT_VID; + data->currfid = lo & MSR_S_LO_CURRENT_FID; + + return 0; +} + +/* the isochronous relief time */ +static void count_off_irt(struct powernow_k8_data *data) +{ + udelay((1 << data->irt) * 10); + return; +} + +/* the voltage stabilization time */ +static void count_off_vst(struct powernow_k8_data *data) +{ + udelay(data->vstable * VST_UNITS_20US); + return; +} + +/* need to init the control msr to a safe value (for each cpu) */ +static void fidvid_msr_init(void) +{ + u32 lo, hi; + u8 fid, vid; + + rdmsr(MSR_FIDVID_STATUS, lo, hi); + vid = hi & MSR_S_HI_CURRENT_VID; + fid = lo & MSR_S_LO_CURRENT_FID; + lo = fid | (vid << MSR_C_LO_VID_SHIFT); + hi = MSR_C_HI_STP_GNT_BENIGN; + pr_debug("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi); + wrmsr(MSR_FIDVID_CTL, lo, hi); +} + +/* write the new fid value along with the other control fields to the msr */ +static int write_new_fid(struct powernow_k8_data *data, u32 fid) +{ + u32 lo; + u32 savevid = data->currvid; + u32 i = 0; + + if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) { + printk(KERN_ERR PFX "internal error - overflow on fid write\n"); + return 1; + } + + lo = fid; + lo |= (data->currvid << MSR_C_LO_VID_SHIFT); + lo |= MSR_C_LO_INIT_FID_VID; + + pr_debug("writing fid 0x%x, lo 0x%x, hi 0x%x\n", + fid, lo, data->plllock * PLL_LOCK_CONVERSION); + + do { + wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION); + if (i++ > 100) { + printk(KERN_ERR PFX + "Hardware error - pending bit very stuck - " + "no further pstate changes possible\n"); + return 1; + } + } while (query_current_values_with_pending_wait(data)); + + count_off_irt(data); + + if (savevid != data->currvid) { + printk(KERN_ERR PFX + "vid change on fid trans, old 0x%x, new 0x%x\n", + savevid, data->currvid); + return 1; + } + + if (fid != data->currfid) { + printk(KERN_ERR PFX + "fid trans failed, fid 0x%x, curr 0x%x\n", fid, + data->currfid); + return 1; + } + + return 0; +} + +/* Write a new vid to the hardware */ +static int write_new_vid(struct powernow_k8_data *data, u32 vid) +{ + u32 lo; + u32 savefid = data->currfid; + int i = 0; + + if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) { + printk(KERN_ERR PFX "internal error - overflow on vid write\n"); + return 1; + } + + lo = data->currfid; + lo |= (vid << MSR_C_LO_VID_SHIFT); + lo |= MSR_C_LO_INIT_FID_VID; + + pr_debug("writing vid 0x%x, lo 0x%x, hi 0x%x\n", + vid, lo, STOP_GRANT_5NS); + + do 
{ + wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS); + if (i++ > 100) { + printk(KERN_ERR PFX "internal error - pending bit " + "very stuck - no further pstate " + "changes possible\n"); + return 1; + } + } while (query_current_values_with_pending_wait(data)); + + if (savefid != data->currfid) { + printk(KERN_ERR PFX "fid changed on vid trans, old " + "0x%x new 0x%x\n", + savefid, data->currfid); + return 1; + } + + if (vid != data->currvid) { + printk(KERN_ERR PFX "vid trans failed, vid 0x%x, " + "curr 0x%x\n", + vid, data->currvid); + return 1; + } + + return 0; +} + +/* + * Reduce the vid by the max of step or reqvid. + * Decreasing vid codes represent increasing voltages: + * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. + */ +static int decrease_vid_code_by_step(struct powernow_k8_data *data, + u32 reqvid, u32 step) +{ + if ((data->currvid - reqvid) > step) + reqvid = data->currvid - step; + + if (write_new_vid(data, reqvid)) + return 1; + + count_off_vst(data); + + return 0; +} + +/* Change hardware pstate by single MSR write */ +static int transition_pstate(struct powernow_k8_data *data, u32 pstate) +{ + wrmsr(MSR_PSTATE_CTRL, pstate, 0); + data->currpstate = pstate; + return 0; +} + +/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */ +static int transition_fid_vid(struct powernow_k8_data *data, + u32 reqfid, u32 reqvid) +{ + if (core_voltage_pre_transition(data, reqvid, reqfid)) + return 1; + + if (core_frequency_transition(data, reqfid)) + return 1; + + if (core_voltage_post_transition(data, reqvid)) + return 1; + + if (query_current_values_with_pending_wait(data)) + return 1; + + if ((reqfid != data->currfid) || (reqvid != data->currvid)) { + printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, " + "curr 0x%x 0x%x\n", + smp_processor_id(), + reqfid, reqvid, data->currfid, data->currvid); + return 1; + } + + pr_debug("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n", + smp_processor_id(), data->currfid, data->currvid); + + return 0; +} + +/* Phase 1 - core voltage transition ... 
setup voltage */ +static int core_voltage_pre_transition(struct powernow_k8_data *data, + u32 reqvid, u32 reqfid) +{ + u32 rvosteps = data->rvo; + u32 savefid = data->currfid; + u32 maxvid, lo, rvomult = 1; + + pr_debug("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, " + "reqvid 0x%x, rvo 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid, reqvid, data->rvo); + + if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP)) + rvomult = 2; + rvosteps *= rvomult; + rdmsr(MSR_FIDVID_STATUS, lo, maxvid); + maxvid = 0x1f & (maxvid >> 16); + pr_debug("ph1 maxvid=0x%x\n", maxvid); + if (reqvid < maxvid) /* lower numbers are higher voltages */ + reqvid = maxvid; + + while (data->currvid > reqvid) { + pr_debug("ph1: curr 0x%x, req vid 0x%x\n", + data->currvid, reqvid); + if (decrease_vid_code_by_step(data, reqvid, data->vidmvs)) + return 1; + } + + while ((rvosteps > 0) && + ((rvomult * data->rvo + data->currvid) > reqvid)) { + if (data->currvid == maxvid) { + rvosteps = 0; + } else { + pr_debug("ph1: changing vid for rvo, req 0x%x\n", + data->currvid - 1); + if (decrease_vid_code_by_step(data, data->currvid-1, 1)) + return 1; + rvosteps--; + } + } + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (savefid != data->currfid) { + printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n", + data->currfid); + return 1; + } + + pr_debug("ph1 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +/* Phase 2 - core frequency transition */ +static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid) +{ + u32 vcoreqfid, vcocurrfid, vcofiddiff; + u32 fid_interval, savevid = data->currvid; + + if (data->currfid == reqfid) { + printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n", + data->currfid); + return 0; + } + + pr_debug("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, " + "reqfid 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid, reqfid); + + vcoreqfid = convert_fid_to_vco_fid(reqfid); + vcocurrfid = convert_fid_to_vco_fid(data->currfid); + vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid + : vcoreqfid - vcocurrfid; + + if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP)) + vcofiddiff = 0; + + while (vcofiddiff > 2) { + (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2); + + if (reqfid > data->currfid) { + if (data->currfid > LO_FID_TABLE_TOP) { + if (write_new_fid(data, + data->currfid + fid_interval)) + return 1; + } else { + if (write_new_fid + (data, + 2 + convert_fid_to_vco_fid(data->currfid))) + return 1; + } + } else { + if (write_new_fid(data, data->currfid - fid_interval)) + return 1; + } + + vcocurrfid = convert_fid_to_vco_fid(data->currfid); + vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid + : vcoreqfid - vcocurrfid; + } + + if (write_new_fid(data, reqfid)) + return 1; + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (data->currfid != reqfid) { + printk(KERN_ERR PFX + "ph2: mismatch, failed fid transition, " + "curr 0x%x, req 0x%x\n", + data->currfid, reqfid); + return 1; + } + + if (savevid != data->currvid) { + printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n", + savevid, data->currvid); + return 1; + } + + pr_debug("ph2 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +/* Phase 3 - core voltage transition flow ... jump to the final vid. 
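If reqvid already matches currvid, the voltage write is skipped and only the final sanity checks run.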
*/ +static int core_voltage_post_transition(struct powernow_k8_data *data, + u32 reqvid) +{ + u32 savefid = data->currfid; + u32 savereqvid = reqvid; + + pr_debug("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n", + smp_processor_id(), + data->currfid, data->currvid); + + if (reqvid != data->currvid) { + if (write_new_vid(data, reqvid)) + return 1; + + if (savefid != data->currfid) { + printk(KERN_ERR PFX + "ph3: bad fid change, save 0x%x, curr 0x%x\n", + savefid, data->currfid); + return 1; + } + + if (data->currvid != reqvid) { + printk(KERN_ERR PFX + "ph3: failed vid transition\n, " + "req 0x%x, curr 0x%x", + reqvid, data->currvid); + return 1; + } + } + + if (query_current_values_with_pending_wait(data)) + return 1; + + if (savereqvid != data->currvid) { + pr_debug("ph3 failed, currvid 0x%x\n", data->currvid); + return 1; + } + + if (savefid != data->currfid) { + pr_debug("ph3 failed, currfid changed 0x%x\n", + data->currfid); + return 1; + } + + pr_debug("ph3 complete, currfid 0x%x, currvid 0x%x\n", + data->currfid, data->currvid); + + return 0; +} + +static void check_supported_cpu(void *_rc) +{ + u32 eax, ebx, ecx, edx; + int *rc = _rc; + + *rc = -ENODEV; + + if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD) + return; + + eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) && + ((eax & CPUID_XFAM) < CPUID_XFAM_10H)) + return; + + if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) { + if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || + ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) { + printk(KERN_INFO PFX + "Processor cpuid %x not supported\n", eax); + return; + } + + eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES); + if (eax < CPUID_FREQ_VOLT_CAPABILITIES) { + printk(KERN_INFO PFX + "No frequency change capabilities detected\n"); + return; + } + + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & P_STATE_TRANSITION_CAPABLE) + != P_STATE_TRANSITION_CAPABLE) { + printk(KERN_INFO PFX + "Power state transitions not supported\n"); + return; + } + } else { /* must be a HW Pstate capable processor */ + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE) + cpu_family = CPU_HW_PSTATE; + else + return; + } + + *rc = 0; +} + +static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, + u8 maxvid) +{ + unsigned int j; + u8 lastfid = 0xff; + + for (j = 0; j < data->numps; j++) { + if (pst[j].vid > LEAST_VID) { + printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", + j, pst[j].vid); + return -EINVAL; + } + if (pst[j].vid < data->rvo) { + /* vid + rvo >= 0 */ + printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" + " %d\n", j); + return -ENODEV; + } + if (pst[j].vid < maxvid + data->rvo) { + /* vid + rvo >= maxvid */ + printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" + " %d\n", j); + return -ENODEV; + } + if (pst[j].fid > MAX_FID) { + printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" + " %d\n", j); + return -ENODEV; + } + if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { + /* Only first fid is allowed to be in "low" range */ + printk(KERN_ERR FW_BUG PFX "two low fids - %d : " + "0x%x\n", j, pst[j].fid); + return -EINVAL; + } + if (pst[j].fid < lastfid) + lastfid = pst[j].fid; + } + if (lastfid & 1) { + printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); + return -EINVAL; + } + if (lastfid > LO_FID_TABLE_TOP) + printk(KERN_INFO FW_BUG PFX + "first fid not from lo freq table\n"); + + return 0; +} + +static void invalidate_entry(struct 
cpufreq_frequency_table *powernow_table, + unsigned int entry) +{ + powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID; +} + +static void print_basics(struct powernow_k8_data *data) +{ + int j; + for (j = 0; j < data->numps; j++) { + if (data->powernow_table[j].frequency != + CPUFREQ_ENTRY_INVALID) { + if (cpu_family == CPU_HW_PSTATE) { + printk(KERN_INFO PFX + " %d : pstate %d (%d MHz)\n", j, + data->powernow_table[j].index, + data->powernow_table[j].frequency/1000); + } else { + printk(KERN_INFO PFX + "fid 0x%x (%d MHz), vid 0x%x\n", + data->powernow_table[j].index & 0xff, + data->powernow_table[j].frequency/1000, + data->powernow_table[j].index >> 8); + } + } + } + if (data->batps) + printk(KERN_INFO PFX "Only %d pstates on battery\n", + data->batps); +} + +static u32 freq_from_fid_did(u32 fid, u32 did) +{ + u32 mhz = 0; + + if (boot_cpu_data.x86 == 0x10) + mhz = (100 * (fid + 0x10)) >> did; + else if (boot_cpu_data.x86 == 0x11) + mhz = (100 * (fid + 8)) >> did; + else + BUG(); + + return mhz * 1000; +} + +static int fill_powernow_table(struct powernow_k8_data *data, + struct pst_s *pst, u8 maxvid) +{ + struct cpufreq_frequency_table *powernow_table; + unsigned int j; + + if (data->batps) { + /* use ACPI support to get full speed on mains power */ + printk(KERN_WARNING PFX + "Only %d pstates usable (use ACPI driver for full " + "range\n", data->batps); + data->numps = data->batps; + } + + for (j = 1; j < data->numps; j++) { + if (pst[j-1].fid >= pst[j].fid) { + printk(KERN_ERR PFX "PST out of sequence\n"); + return -EINVAL; + } + } + + if (data->numps < 2) { + printk(KERN_ERR PFX "no p states to transition\n"); + return -ENODEV; + } + + if (check_pst_table(data, pst, maxvid)) + return -EINVAL; + + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->numps + 1)), GFP_KERNEL); + if (!powernow_table) { + printk(KERN_ERR PFX "powernow_table memory alloc failure\n"); + return -ENOMEM; + } + + for (j = 0; j < data->numps; j++) { + int freq; + powernow_table[j].index = pst[j].fid; /* lower 8 bits */ + powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */ + freq = find_khz_freq_from_fid(pst[j].fid); + powernow_table[j].frequency = freq; + } + powernow_table[data->numps].frequency = CPUFREQ_TABLE_END; + powernow_table[data->numps].index = 0; + + if (query_current_values_with_pending_wait(data)) { + kfree(powernow_table); + return -EIO; + } + + pr_debug("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid); + data->powernow_table = powernow_table; + if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) + print_basics(data); + + for (j = 0; j < data->numps; j++) + if ((pst[j].fid == data->currfid) && + (pst[j].vid == data->currvid)) + return 0; + + pr_debug("currfid/vid do not match PST, ignoring\n"); + return 0; +} + +/* Find and validate the PSB/PST table in BIOS. */ +static int find_psb_table(struct powernow_k8_data *data) +{ + struct psb_s *psb; + unsigned int i; + u32 mvs; + u8 maxvid; + u32 cpst = 0; + u32 thiscpuid; + + for (i = 0xc0000; i < 0xffff0; i += 0x10) { + /* Scan BIOS looking for the signature. */ + /* It can not be at ffff0 - it is too big. 
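The 0xc0000-0xffff0 window is the legacy BIOS ROM area, walked in 16-byte steps.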
*/ + + psb = phys_to_virt(i); + if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0) + continue; + + pr_debug("found PSB header at 0x%p\n", psb); + + pr_debug("table vers: 0x%x\n", psb->tableversion); + if (psb->tableversion != PSB_VERSION_1_4) { + printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); + return -ENODEV; + } + + pr_debug("flags: 0x%x\n", psb->flags1); + if (psb->flags1) { + printk(KERN_ERR FW_BUG PFX "unknown flags\n"); + return -ENODEV; + } + + data->vstable = psb->vstable; + pr_debug("voltage stabilization time: %d(*20us)\n", + data->vstable); + + pr_debug("flags2: 0x%x\n", psb->flags2); + data->rvo = psb->flags2 & 3; + data->irt = ((psb->flags2) >> 2) & 3; + mvs = ((psb->flags2) >> 4) & 3; + data->vidmvs = 1 << mvs; + data->batps = ((psb->flags2) >> 6) & 3; + + pr_debug("ramp voltage offset: %d\n", data->rvo); + pr_debug("isochronous relief time: %d\n", data->irt); + pr_debug("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs); + + pr_debug("numpst: 0x%x\n", psb->num_tables); + cpst = psb->num_tables; + if ((psb->cpuid == 0x00000fc0) || + (psb->cpuid == 0x00000fe0)) { + thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + if ((thiscpuid == 0x00000fc0) || + (thiscpuid == 0x00000fe0)) + cpst = 1; + } + if (cpst != 1) { + printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); + return -ENODEV; + } + + data->plllock = psb->plllocktime; + pr_debug("plllocktime: 0x%x (units 1us)\n", psb->plllocktime); + pr_debug("maxfid: 0x%x\n", psb->maxfid); + pr_debug("maxvid: 0x%x\n", psb->maxvid); + maxvid = psb->maxvid; + + data->numps = psb->numps; + pr_debug("numpstates: 0x%x\n", data->numps); + return fill_powernow_table(data, + (struct pst_s *)(psb+1), maxvid); + } + /* + * If you see this message, complain to BIOS manufacturer. If + * he tells you "we do not support Linux" or some similar + * nonsense, remember that Windows 2000 uses the same legacy + * mechanism that the old Linux PSB driver uses. Tell them it + * is broken with Windows 2000. 
+ * + * The reference to the AMD documentation is chapter 9 in the + * BIOS and Kernel Developer's Guide, which is available on + * www.amd.com + */ + printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); + return -ENODEV; +} + +static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, + unsigned int index) +{ + u64 control; + + if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) + return; + + control = data->acpi_data.states[index].control; + data->irt = (control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (control >> VST_SHIFT) & VST_MASK; +} + +static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) +{ + struct cpufreq_frequency_table *powernow_table; + int ret_val = -ENODEV; + u64 control, status; + + if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + pr_debug("register performance failed: bad ACPI data\n"); + return -EIO; + } + + /* verify the data contained in the ACPI structures */ + if (data->acpi_data.state_count <= 1) { + pr_debug("No ACPI P-States\n"); + goto err_out; + } + + control = data->acpi_data.control_register.space_id; + status = data->acpi_data.status_register.space_id; + + if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + pr_debug("Invalid control/status registers (%llx - %llx)\n", + control, status); + goto err_out; + } + + /* fill in data->powernow_table */ + powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) + * (data->acpi_data.state_count + 1)), GFP_KERNEL); + if (!powernow_table) { + pr_debug("powernow_table memory alloc failure\n"); + goto err_out; + } + + /* fill in data */ + data->numps = data->acpi_data.state_count; + powernow_k8_acpi_pst_values(data, 0); + + if (cpu_family == CPU_HW_PSTATE) + ret_val = fill_powernow_table_pstate(data, powernow_table); + else + ret_val = fill_powernow_table_fidvid(data, powernow_table); + if (ret_val) + goto err_out_mem; + + powernow_table[data->acpi_data.state_count].frequency = + CPUFREQ_TABLE_END; + powernow_table[data->acpi_data.state_count].index = 0; + data->powernow_table = powernow_table; + + if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu) + print_basics(data); + + /* notify BIOS that we exist */ + acpi_processor_notify_smm(THIS_MODULE); + + if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) { + printk(KERN_ERR PFX + "unable to alloc powernow_k8_data cpumask\n"); + ret_val = -ENOMEM; + goto err_out_mem; + } + + return 0; + +err_out_mem: + kfree(powernow_table); + +err_out: + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + + /* data->acpi_data.state_count informs us at ->exit() + * whether ACPI was used */ + data->acpi_data.state_count = 0; + + return ret_val; +} + +static int fill_powernow_table_pstate(struct powernow_k8_data *data, + struct cpufreq_frequency_table *powernow_table) +{ + int i; + u32 hi = 0, lo = 0; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + + for (i = 0; i < data->acpi_data.state_count; i++) { + u32 index; + + index = data->acpi_data.states[i].control & HW_PSTATE_MASK; + if (index > data->max_hw_pstate) { 
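+ /* firmware requested a P-state beyond the MSR_PSTATE_CUR_LIMIT ceiling read above; invalidate the entry */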
+ printk(KERN_ERR PFX "invalid pstate %d - " + "bad value %d.\n", i, index); + printk(KERN_ERR PFX "Please report to BIOS " + "manufacturer\n"); + invalidate_entry(powernow_table, i); + continue; + } + rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); + if (!(hi & HW_PSTATE_VALID_MASK)) { + pr_debug("invalid pstate %d, ignoring\n", index); + invalidate_entry(powernow_table, i); + continue; + } + + powernow_table[i].index = index; + + /* Frequency may be rounded for these */ + if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) + || boot_cpu_data.x86 == 0x11) { + powernow_table[i].frequency = + freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); + } else + powernow_table[i].frequency = + data->acpi_data.states[i].core_frequency * 1000; + } + return 0; +} + +static int fill_powernow_table_fidvid(struct powernow_k8_data *data, + struct cpufreq_frequency_table *powernow_table) +{ + int i; + + for (i = 0; i < data->acpi_data.state_count; i++) { + u32 fid; + u32 vid; + u32 freq, index; + u64 status, control; + + if (data->exttype) { + status = data->acpi_data.states[i].status; + fid = status & EXT_FID_MASK; + vid = (status >> VID_SHIFT) & EXT_VID_MASK; + } else { + control = data->acpi_data.states[i].control; + fid = control & FID_MASK; + vid = (control >> VID_SHIFT) & VID_MASK; + } + + pr_debug(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); + + index = fid | (vid<<8); + powernow_table[i].index = index; + + freq = find_khz_freq_from_fid(fid); + powernow_table[i].frequency = freq; + + /* verify frequency is OK */ + if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) { + pr_debug("invalid freq %u kHz, ignoring\n", freq); + invalidate_entry(powernow_table, i); + continue; + } + + /* verify voltage is OK - + * BIOSs are using "off" to indicate invalid */ + if (vid == VID_OFF) { + pr_debug("invalid vid %u, ignoring\n", vid); + invalidate_entry(powernow_table, i); + continue; + } + + if (freq != (data->acpi_data.states[i].core_frequency * 1000)) { + printk(KERN_INFO PFX "invalid freq entries " + "%u kHz vs. %u kHz\n", freq, + (unsigned int) + (data->acpi_data.states[i].core_frequency + * 1000)); + invalidate_entry(powernow_table, i); + continue; + } + } + return 0; +} + +static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) +{ + if (data->acpi_data.state_count) + acpi_processor_unregister_performance(&data->acpi_data, + data->cpu); + free_cpumask_var(data->acpi_data.shared_cpu_map); +} + +static int get_transition_latency(struct powernow_k8_data *data) +{ + int max_latency = 0; + int i; + for (i = 0; i < data->acpi_data.state_count; i++) { + int cur_latency = data->acpi_data.states[i].transition_latency + + data->acpi_data.states[i].bus_master_latency; + if (cur_latency > max_latency) + max_latency = cur_latency; + } + if (max_latency == 0) { + /* + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. 
+ */ + if (boot_cpu_data.x86 < 0x11) + printk(KERN_ERR FW_WARN PFX "Invalid zero transition " + "latency\n"); + max_latency = 1; + } + /* value in usecs, needs to be in nanoseconds */ + return 1000 * max_latency; +} + +/* Take a frequency, and issue the fid/vid transition command */ +static int transition_frequency_fidvid(struct powernow_k8_data *data, + unsigned int index) +{ + u32 fid = 0; + u32 vid = 0; + int res, i; + struct cpufreq_freqs freqs; + + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); + + /* fid/vid correctness check for k8 */ + /* fid are the lower 8 bits of the index we stored into + * the cpufreq frequency table in find_psb_table, vid + * are the upper 8 bits. + */ + fid = data->powernow_table[index].index & 0xFF; + vid = (data->powernow_table[index].index & 0xFF00) >> 8; + + pr_debug("table matched fid 0x%x, giving vid 0x%x\n", fid, vid); + + if (query_current_values_with_pending_wait(data)) + return 1; + + if ((data->currvid == vid) && (data->currfid == fid)) { + pr_debug("target matches current values (fid 0x%x, vid 0x%x)\n", + fid, vid); + return 0; + } + + pr_debug("cpu %d, changing to fid 0x%x, vid 0x%x\n", + smp_processor_id(), fid, vid); + freqs.old = find_khz_freq_from_fid(data->currfid); + freqs.new = find_khz_freq_from_fid(fid); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + res = transition_fid_vid(data, fid, vid); + freqs.new = find_khz_freq_from_fid(data->currfid); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + return res; +} + +/* Take a frequency, and issue the hardware pstate transition command */ +static int transition_frequency_pstate(struct powernow_k8_data *data, + unsigned int index) +{ + u32 pstate = 0; + int res, i; + struct cpufreq_freqs freqs; + + pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); + + /* get MSR index for hardware pstate transition */ + pstate = index & HW_PSTATE_MASK; + if (pstate > data->max_hw_pstate) + return 0; + freqs.old = find_khz_freq_from_pstate(data->powernow_table, + data->currpstate); + freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + } + + res = transition_pstate(data, pstate); + freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); + + for_each_cpu(i, data->available_cores) { + freqs.cpu = i; + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + } + return res; +} + +/* Driver entry point to switch to the target frequency */ +static int powernowk8_target(struct cpufreq_policy *pol, + unsigned targfreq, unsigned relation) +{ + cpumask_var_t oldmask; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); + u32 checkfid; + u32 checkvid; + unsigned int newstate; + int ret = -EIO; + + if (!data) + return -EINVAL; + + checkfid = data->currfid; + checkvid = data->currvid; + + /* only run on specific CPU from here on. 
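+	 * The fid/vid MSRs manipulated below are per-core, so the
+	 * sequence must execute on pol->cpu itself; the check that
+	 * follows verifies the migration actually happened.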
+	 */
+	/* This is poor form: use a workqueue or smp_call_function_single */
+	if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	cpumask_copy(oldmask, tsk_cpus_allowed(current));
+	set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
+
+	if (smp_processor_id() != pol->cpu) {
+		printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
+		goto err_out;
+	}
+
+	if (pending_bit_stuck()) {
+		printk(KERN_ERR PFX "failing targ, change pending bit set\n");
+		goto err_out;
+	}
+
+	pr_debug("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
+		pol->cpu, targfreq, pol->min, pol->max, relation);
+
+	if (query_current_values_with_pending_wait(data))
+		goto err_out;
+
+	if (cpu_family != CPU_HW_PSTATE) {
+		pr_debug("targ: curr fid 0x%x, vid 0x%x\n",
+			data->currfid, data->currvid);
+
+		if ((checkvid != data->currvid) ||
+		    (checkfid != data->currfid)) {
+			printk(KERN_INFO PFX
+				"error - out of sync, fid 0x%x 0x%x, "
+				"vid 0x%x 0x%x\n",
+				checkfid, data->currfid,
+				checkvid, data->currvid);
+		}
+	}
+
+	if (cpufreq_frequency_table_target(pol, data->powernow_table,
+				targfreq, relation, &newstate))
+		goto err_out;
+
+	mutex_lock(&fidvid_mutex);
+
+	powernow_k8_acpi_pst_values(data, newstate);
+
+	if (cpu_family == CPU_HW_PSTATE)
+		ret = transition_frequency_pstate(data, newstate);
+	else
+		ret = transition_frequency_fidvid(data, newstate);
+	if (ret) {
+		printk(KERN_ERR PFX "transition frequency failed\n");
+		ret = 1;
+		mutex_unlock(&fidvid_mutex);
+		goto err_out;
+	}
+	mutex_unlock(&fidvid_mutex);
+
+	if (cpu_family == CPU_HW_PSTATE)
+		pol->cur = find_khz_freq_from_pstate(data->powernow_table,
+				newstate);
+	else
+		pol->cur = find_khz_freq_from_fid(data->currfid);
+	ret = 0;
+
+err_out:
+	set_cpus_allowed_ptr(current, oldmask);
+	free_cpumask_var(oldmask);
+	return ret;
+}
+
+/* Driver entry point to verify the policy and range of frequencies */
+static int powernowk8_verify(struct cpufreq_policy *pol)
+{
+	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
+
+	if (!data)
+		return -EINVAL;
+
+	return cpufreq_frequency_table_verify(pol, data->powernow_table);
+}
+
+struct init_on_cpu {
+	struct powernow_k8_data *data;
+	int rc;
+};
+
+static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
+{
+	struct init_on_cpu *init_on_cpu = _init_on_cpu;
+
+	if (pending_bit_stuck()) {
+		printk(KERN_ERR PFX "failing init, change pending bit set\n");
+		init_on_cpu->rc = -ENODEV;
+		return;
+	}
+
+	if (query_current_values_with_pending_wait(init_on_cpu->data)) {
+		init_on_cpu->rc = -ENODEV;
+		return;
+	}
+
+	if (cpu_family == CPU_OPTERON)
+		fidvid_msr_init();
+
+	init_on_cpu->rc = 0;
+}
+
+/* per CPU init entry point to the driver */
+static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
+{
+	static const char ACPI_PSS_BIOS_BUG_MSG[] =
+		KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
+		FW_BUG PFX "Try again with latest BIOS.\n";
+	struct powernow_k8_data *data;
+	struct init_on_cpu init_on_cpu;
+	int rc;
+	struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
+
+	if (!cpu_online(pol->cpu))
+		return -ENODEV;
+
+	smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
+	if (rc)
+		return -ENODEV;
+
+	data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
+	if (!data) {
+		printk(KERN_ERR PFX "unable to alloc powernow_k8_data\n");
+		return -ENOMEM;
+	}
+
+	data->cpu = pol->cpu;
+	data->currpstate = HW_PSTATE_INVALID;
+
+	if (powernow_k8_cpu_init_acpi(data)) {
+		/*
+		 * Use the PSB BIOS structure.
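+		 * (find_psb_table() locates that BIOS-provided table by
+		 * its PSB_ID_STRING signature, defined in powernow-k8.h.)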
This is only available on + * an UP version, and is deprecated by AMD. + */ + if (num_online_cpus() != 1) { + printk_once(ACPI_PSS_BIOS_BUG_MSG); + goto err_out; + } + if (pol->cpu != 0) { + printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " + "CPU other than CPU0. Complain to your BIOS " + "vendor.\n"); + goto err_out; + } + rc = find_psb_table(data); + if (rc) + goto err_out; + + /* Take a crude guess here. + * That guess was in microseconds, so multiply with 1000 */ + pol->cpuinfo.transition_latency = ( + ((data->rvo + 8) * data->vstable * VST_UNITS_20US) + + ((1 << data->irt) * 30)) * 1000; + } else /* ACPI _PSS objects available */ + pol->cpuinfo.transition_latency = get_transition_latency(data); + + /* only run on specific CPU from here on */ + init_on_cpu.data = data; + smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu, + &init_on_cpu, 1); + rc = init_on_cpu.rc; + if (rc != 0) + goto err_out_exit_acpi; + + if (cpu_family == CPU_HW_PSTATE) + cpumask_copy(pol->cpus, cpumask_of(pol->cpu)); + else + cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu)); + data->available_cores = pol->cpus; + + if (cpu_family == CPU_HW_PSTATE) + pol->cur = find_khz_freq_from_pstate(data->powernow_table, + data->currpstate); + else + pol->cur = find_khz_freq_from_fid(data->currfid); + pr_debug("policy current frequency %d kHz\n", pol->cur); + + /* min/max the cpu is capable of */ + if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { + printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); + powernow_k8_cpu_exit_acpi(data); + kfree(data->powernow_table); + kfree(data); + return -EINVAL; + } + + /* Check for APERF/MPERF support in hardware */ + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; + + cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); + + if (cpu_family == CPU_HW_PSTATE) + pr_debug("cpu_init done, current pstate 0x%x\n", + data->currpstate); + else + pr_debug("cpu_init done, current fid 0x%x, vid 0x%x\n", + data->currfid, data->currvid); + + per_cpu(powernow_data, pol->cpu) = data; + + return 0; + +err_out_exit_acpi: + powernow_k8_cpu_exit_acpi(data); + +err_out: + kfree(data); + return -ENODEV; +} + +static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol) +{ + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); + + if (!data) + return -EINVAL; + + powernow_k8_cpu_exit_acpi(data); + + cpufreq_frequency_table_put_attr(pol->cpu); + + kfree(data->powernow_table); + kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; + + return 0; +} + +static void query_values_on_cpu(void *_err) +{ + int *err = _err; + struct powernow_k8_data *data = __this_cpu_read(powernow_data); + + *err = query_current_values_with_pending_wait(data); +} + +static unsigned int powernowk8_get(unsigned int cpu) +{ + struct powernow_k8_data *data = per_cpu(powernow_data, cpu); + unsigned int khz = 0; + int err; + + if (!data) + return 0; + + smp_call_function_single(cpu, query_values_on_cpu, &err, true); + if (err) + goto out; + + if (cpu_family == CPU_HW_PSTATE) + khz = find_khz_freq_from_pstate(data->powernow_table, + data->currpstate); + else + khz = find_khz_freq_from_fid(data->currfid); + + +out: + return khz; +} + +static void _cpb_toggle_msrs(bool t) +{ + int cpu; + + get_online_cpus(); + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + if (t) + reg->l &= ~BIT(25); + else + reg->l |= BIT(25); + } + 
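+	/*
+	 * MSR_K7_HWCR bit 25 is the boost-disable bit on these CPUs:
+	 * clearing it (t == true) allows core performance boost,
+	 * setting it disables boost on that core.
+	 */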
wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + put_online_cpus(); +} + +/* + * Switch on/off core performance boosting. + * + * 0=disable + * 1=enable. + */ +static void cpb_toggle(bool t) +{ + if (!cpb_capable) + return; + + if (t && !cpb_enabled) { + cpb_enabled = true; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting enabled.\n"); + } else if (!t && cpb_enabled) { + cpb_enabled = false; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting disabled.\n"); + } +} + +static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, + size_t count) +{ + int ret = -EINVAL; + unsigned long val = 0; + + ret = strict_strtoul(buf, 10, &val); + if (!ret && (val == 0 || val == 1) && cpb_capable) + cpb_toggle(val); + else + return -EINVAL; + + return count; +} + +static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) +{ + return sprintf(buf, "%u\n", cpb_enabled); +} + +#define define_one_rw(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +define_one_rw(cpb); + +static struct freq_attr *powernow_k8_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + &cpb, + NULL, +}; + +static struct cpufreq_driver cpufreq_amd64_driver = { + .verify = powernowk8_verify, + .target = powernowk8_target, + .bios_limit = acpi_processor_get_bios_limit, + .init = powernowk8_cpu_init, + .exit = __devexit_p(powernowk8_cpu_exit), + .get = powernowk8_get, + .name = "powernow-k8", + .owner = THIS_MODULE, + .attr = powernow_k8_attr, +}; + +/* + * Clear the boost-disable flag on the CPU_DOWN path so that this cpu + * cannot block the remaining ones from boosting. On the CPU_UP path we + * simply keep the boost-disable flag in sync with the current global + * state. + */ +static int cpb_notify(struct notifier_block *nb, unsigned long action, + void *hcpu) +{ + unsigned cpu = (long)hcpu; + u32 lo, hi; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + + if (!cpb_enabled) { + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo |= BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + } + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo &= ~BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpb_nb = { + .notifier_call = cpb_notify, +}; + +/* driver entry point for init */ +static int __cpuinit powernowk8_init(void) +{ + unsigned int i, supported_cpus = 0, cpu; + int rv; + + for_each_online_cpu(i) { + int rc; + smp_call_function_single(i, check_supported_cpu, &rc, 1); + if (rc == 0) + supported_cpus++; + } + + if (supported_cpus != num_online_cpus()) + return -ENODEV; + + printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", + num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); + + if (boot_cpu_has(X86_FEATURE_CPB)) { + + cpb_capable = true; + + msrs = msrs_alloc(); + if (!msrs) { + printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); + return -ENOMEM; + } + + register_cpu_notifier(&cpb_nb); + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + cpb_enabled |= !(!!(reg->l & BIT(25))); + } + + printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", + (cpb_enabled ? 
"on" : "off")); + } + + rv = cpufreq_register_driver(&cpufreq_amd64_driver); + if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) { + unregister_cpu_notifier(&cpb_nb); + msrs_free(msrs); + msrs = NULL; + } + return rv; +} + +/* driver entry point for term */ +static void __exit powernowk8_exit(void) +{ + pr_debug("exit\n"); + + if (boot_cpu_has(X86_FEATURE_CPB)) { + msrs_free(msrs); + msrs = NULL; + + unregister_cpu_notifier(&cpb_nb); + } + + cpufreq_unregister_driver(&cpufreq_amd64_driver); +} + +MODULE_AUTHOR("Paul Devriendt and " + "Mark Langsdorf "); +MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver."); +MODULE_LICENSE("GPL"); + +late_initcall(powernowk8_init); +module_exit(powernowk8_exit); diff --git a/drivers/cpufreq/powernow-k8.h b/drivers/cpufreq/powernow-k8.h new file mode 100644 index 00000000000..3744d26cdc2 --- /dev/null +++ b/drivers/cpufreq/powernow-k8.h @@ -0,0 +1,222 @@ +/* + * (c) 2003-2006 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +enum pstate { + HW_PSTATE_INVALID = 0xff, + HW_PSTATE_0 = 0, + HW_PSTATE_1 = 1, + HW_PSTATE_2 = 2, + HW_PSTATE_3 = 3, + HW_PSTATE_4 = 4, + HW_PSTATE_5 = 5, + HW_PSTATE_6 = 6, + HW_PSTATE_7 = 7, +}; + +struct powernow_k8_data { + unsigned int cpu; + + u32 numps; /* number of p-states */ + u32 batps; /* number of p-states supported on battery */ + u32 max_hw_pstate; /* maximum legal hardware pstate */ + + /* these values are constant when the PSB is used to determine + * vid/fid pairings, but are modified during the ->target() call + * when ACPI is used */ + u32 rvo; /* ramp voltage offset */ + u32 irt; /* isochronous relief time */ + u32 vidmvs; /* usable value calculated from mvs */ + u32 vstable; /* voltage stabilization time, units 20 us */ + u32 plllock; /* pll lock time, units 1 us */ + u32 exttype; /* extended interface = 1 */ + + /* keep track of the current fid / vid or pstate */ + u32 currvid; + u32 currfid; + enum pstate currpstate; + + /* the powernow_table includes all frequency and vid/fid pairings: + * fid are the lower 8 bits of the index, vid are the upper 8 bits. + * frequency is in kHz */ + struct cpufreq_frequency_table *powernow_table; + + /* the acpi table needs to be kept. it's only available if ACPI was + * used to determine valid frequency/vid/fid states */ + struct acpi_processor_performance acpi_data; + + /* we need to keep track of associated cores, but let cpufreq + * handle hotplug events - so just point at cpufreq pol->cpus + * structure */ + struct cpumask *available_cores; +}; + +/* processor's cpuid instruction support */ +#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ +#define CPUID_XFAM 0x0ff00000 /* extended family */ +#define CPUID_XFAM_K8 0 +#define CPUID_XMOD 0x000f0000 /* extended model */ +#define CPUID_XMOD_REV_MASK 0x000c0000 +#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */ +#define CPUID_USE_XFAM_XMOD 0x00000f00 +#define CPUID_GET_MAX_CAPABILITIES 0x80000000 +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define P_STATE_TRANSITION_CAPABLE 6 + +/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */ +/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */ +/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */ +/* the register number is placed in ecx, and the data is returned in edx:eax. 
*/ + +#define MSR_FIDVID_CTL 0xc0010041 +#define MSR_FIDVID_STATUS 0xc0010042 + +/* Field definitions within the FID VID Low Control MSR : */ +#define MSR_C_LO_INIT_FID_VID 0x00010000 +#define MSR_C_LO_NEW_VID 0x00003f00 +#define MSR_C_LO_NEW_FID 0x0000003f +#define MSR_C_LO_VID_SHIFT 8 + +/* Field definitions within the FID VID High Control MSR : */ +#define MSR_C_HI_STP_GNT_TO 0x000fffff + +/* Field definitions within the FID VID Low Status MSR : */ +#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ +#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 +#define MSR_S_LO_MAX_FID 0x003f0000 +#define MSR_S_LO_START_FID 0x00003f00 +#define MSR_S_LO_CURRENT_FID 0x0000003f + +/* Field definitions within the FID VID High Status MSR : */ +#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 +#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 +#define MSR_S_HI_START_VID 0x00003f00 +#define MSR_S_HI_CURRENT_VID 0x0000003f +#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 + + +/* Hardware Pstate _PSS and MSR definitions */ +#define USE_HW_PSTATE 0x00000080 +#define HW_PSTATE_MASK 0x00000007 +#define HW_PSTATE_VALID_MASK 0x80000000 +#define HW_PSTATE_MAX_MASK 0x000000f0 +#define HW_PSTATE_MAX_SHIFT 4 +#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */ +#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */ +#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */ +#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */ + +/* define the two driver architectures */ +#define CPU_OPTERON 0 +#define CPU_HW_PSTATE 1 + + +/* + * There are restrictions frequencies have to follow: + * - only 1 entry in the low fid table ( <=1.4GHz ) + * - lowest entry in the high fid table must be >= 2 * the entry in the + * low fid table + * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry + * in the low fid table + * - the parts can only step at <= 200 MHz intervals, odd fid values are + * supported in revision G and later revisions. + * - lowest frequency must be >= interprocessor hypertransport link speed + * (only applies to MP systems obviously) + */ + +/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */ +#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */ +#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */ + +#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */ +#define HI_VCOFREQ_TABLE_BOTTOM 1600 + +#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ + +#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ +#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ + +#define MIN_FREQ 800 /* Min and max freqs, per spec */ +#define MAX_FREQ 5000 + +#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */ +#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ + +#define VID_OFF 0x3f + +#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ + +#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */ + +#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */ +#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */ + +/* + * Most values of interest are encoded in a single field of the _PSS + * entries: the "control" value. 
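+ *
+ * Layout, as encoded by the SHIFT/MASK constants below:
+ *   IRT[31:30] RVO[29:28] EXT_TYPE[27] PLL_LOCK[26:20]
+ *   MVS[19:18] VST[17:11] VID[10:6] FID[4:0]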
+ */

+#define IRT_SHIFT 30
+#define RVO_SHIFT 28
+#define EXT_TYPE_SHIFT 27
+#define PLL_L_SHIFT 20
+#define MVS_SHIFT 18
+#define VST_SHIFT 11
+#define VID_SHIFT 6
+#define IRT_MASK 3
+#define RVO_MASK 3
+#define EXT_TYPE_MASK 1
+#define PLL_L_MASK 0x7f
+#define MVS_MASK 3
+#define VST_MASK 0x7f
+#define VID_MASK 0x1f
+#define FID_MASK 0x1f
+#define EXT_VID_MASK 0x3f
+#define EXT_FID_MASK 0x3f
+
+
+/*
+ * Version 1.4 of the PSB table. This table is constructed by BIOS and is
+ * to tell the OS's power management driver which VIDs and FIDs are
+ * supported by this particular processor.
+ * If the data in the PSB / PST is wrong, then this driver will program the
+ * wrong values into hardware, which is very likely to lead to a crash.
+ */
+
+#define PSB_ID_STRING      "AMDK7PNOW!"
+#define PSB_ID_STRING_LEN  10
+
+#define PSB_VERSION_1_4  0x14
+
+struct psb_s {
+	u8 signature[10];
+	u8 tableversion;
+	u8 flags1;
+	u16 vstable;
+	u8 flags2;
+	u8 num_tables;
+	u32 cpuid;
+	u8 plllocktime;
+	u8 maxfid;
+	u8 maxvid;
+	u8 numps;
+};
+
+/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
+struct pst_s {
+	u8 fid;
+	u8 vid;
+};
+
+static int core_voltage_pre_transition(struct powernow_k8_data *data,
+		u32 reqvid, u32 reqfid);
+static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
+static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
+
+static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
+
+static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
+static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/drivers/cpufreq/sc520_freq.c b/drivers/cpufreq/sc520_freq.c
new file mode 100644
index 00000000000..1e205e6b172
--- /dev/null
+++ b/drivers/cpufreq/sc520_freq.c
@@ -0,0 +1,192 @@
+/*
+ *	sc520_freq.c: cpufreq driver for the AMD Elan sc520
+ *
+ *	Copyright (C) 2005 Sean Young <sean@mess.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Based on elanfreq.c
+ *
+ *	2005-03-30: - initial revision
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/delay.h>
+#include <linux/cpufreq.h>
+#include <linux/timex.h>
+#include <linux/io.h>
+
+#include <asm/msr.h>
+
+#define MMCR_BASE	0xfffef000	/* The default base address */
+#define OFFS_CPUCTL	0x2		/* CPU Control Register */
+
+static __u8 __iomem *cpuctl;
+
+#define PFX "sc520_freq: "
+
+static struct cpufreq_frequency_table sc520_freq_table[] = {
+	{0x01,	100000},
+	{0x02,	133000},
+	{0,	CPUFREQ_TABLE_END},
+};
+
+static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
+{
+	u8 clockspeed_reg = *cpuctl;
+
+	switch (clockspeed_reg & 0x03) {
+	default:
+		printk(KERN_ERR PFX "error: cpuctl register has unexpected "
+				"value %02x\n", clockspeed_reg);
+	case 0x01:
+		return 100000;
+	case 0x02:
+		return 133000;
+	}
+}
+
+static void sc520_freq_set_cpu_state(unsigned int state)
+{
+
+	struct cpufreq_freqs	freqs;
+	u8 clockspeed_reg;
+
+	freqs.old = sc520_freq_get_cpu_frequency(0);
+	freqs.new = sc520_freq_table[state].frequency;
+	freqs.cpu = 0; /* AMD Elan is UP */
+
+	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+
+	pr_debug("attempting to set frequency to %i kHz\n",
+			sc520_freq_table[state].frequency);
+
+	local_irq_disable();
+
+	clockspeed_reg = *cpuctl & ~0x03;
+	*cpuctl = clockspeed_reg | sc520_freq_table[state].index;
+
+	local_irq_enable();
+
+	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+}
+
+static int sc520_freq_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
+}
+
+static int sc520_freq_target(struct cpufreq_policy *policy,
+			    unsigned int target_freq,
+			    unsigned int relation)
+{
+	unsigned int newstate = 0;
+
+	if (cpufreq_frequency_table_target(policy, sc520_freq_table,
+				target_freq, relation, &newstate))
+		return -EINVAL;
+
+	sc520_freq_set_cpu_state(newstate);
+
+	return 0;
+}
+
+
+/*
+ *	Module init and exit code
+ */
+
+static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	int result;
+
+	/* capability check */
+	if (c->x86_vendor != X86_VENDOR_AMD ||
+	    c->x86 != 4 || c->x86_model != 9)
+		return -ENODEV;
+
+	/* cpuinfo and default policy values */
+	policy->cpuinfo.transition_latency = 1000000;	/* 1ms */
+	policy->cur = sc520_freq_get_cpu_frequency(0);
+
+	result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
+	if (result)
+		return result;
+
+	cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
+
+	return 0;
+}
+
+
+static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	return 0;
+}
+
+
+static struct freq_attr *sc520_freq_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+
+static struct cpufreq_driver sc520_freq_driver = {
+	.get	= sc520_freq_get_cpu_frequency,
+	.verify	= sc520_freq_verify,
+	.target	= sc520_freq_target,
+	.init	= sc520_freq_cpu_init,
+	.exit	= sc520_freq_cpu_exit,
+	.name	= "sc520_freq",
+	.owner	= THIS_MODULE,
+	.attr	= sc520_freq_attr,
+};
+
+
+static int __init sc520_freq_init(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	int err;
+
+	/* Test if we have the right hardware */
+	if (c->x86_vendor != X86_VENDOR_AMD ||
+	    c->x86 != 4 || c->x86_model != 9) {
+		pr_debug("no Elan SC520 processor found!\n");
+		return -ENODEV;
+	}
+	cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
+	if (!cpuctl) {
+		printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
+		return -ENOMEM;
+	}
+
+	err = cpufreq_register_driver(&sc520_freq_driver);
+	if (err)
+		iounmap(cpuctl);
+
+	return err;
+}
+
+
+static void __exit sc520_freq_exit(void)
+{
+	cpufreq_unregister_driver(&sc520_freq_driver);
+	iounmap(cpuctl);
+}
+
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sean Young <sean@mess.org>");
+MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
+
+module_init(sc520_freq_init);
+module_exit(sc520_freq_exit);
+
diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c
new file mode 100644
index 00000000000..6ea3455def2
--- /dev/null
+++ b/drivers/cpufreq/speedstep-centrino.c
@@ -0,0 +1,633 @@
+/*
+ * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
+ * M (part of the Centrino chipset).
+ *
+ * Since the original Pentium M, most new Intel CPUs support Enhanced
+ * SpeedStep.
+ *
+ * Despite the "SpeedStep" in the name, this is almost entirely unlike
+ * traditional SpeedStep.
+ *
+ * Modelled on speedstep.c
+ *
+ * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/sched.h>	/* current */
+#include <linux/delay.h>
+#include <linux/compiler.h>
+#include <linux/gfp.h>
+
+#include <asm/msr.h>
+#include <asm/processor.h>
+#include <asm/cpufeature.h>
+
+#define PFX		"speedstep-centrino: "
+#define MAINTAINER	"cpufreq@vger.kernel.org"
+
+#define INTEL_MSR_RANGE	(0xffff)
+
+struct cpu_id
+{
+	__u8	x86;		/* CPU family */
+	__u8	x86_model;	/* model */
+	__u8	x86_mask;	/* stepping */
+};
+
+enum {
+	CPU_BANIAS,
+	CPU_DOTHAN_A1,
+	CPU_DOTHAN_A2,
+	CPU_DOTHAN_B0,
+	CPU_MP4HT_D0,
+	CPU_MP4HT_E0,
+};
+
+static const struct cpu_id cpu_ids[] = {
+	[CPU_BANIAS]	= { 6,  9, 5 },
+	[CPU_DOTHAN_A1]	= { 6, 13, 1 },
+	[CPU_DOTHAN_A2]	= { 6, 13, 2 },
+	[CPU_DOTHAN_B0]	= { 6, 13, 6 },
+	[CPU_MP4HT_D0]	= {15,  3, 4 },
+	[CPU_MP4HT_E0]	= {15,  4, 1 },
+};
+#define N_IDS	ARRAY_SIZE(cpu_ids)
+
+struct cpu_model
+{
+	const struct cpu_id *cpu_id;
+	const char	*model_name;
+	unsigned	max_freq; /* max clock in kHz */
+
+	struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
+};
+static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
+				  const struct cpu_id *x);
+
+/* Operating points for current CPU */
+static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
+static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
+
+static struct cpufreq_driver centrino_driver;
+
+#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
+
+/* Computes the correct form for IA32_PERF_CTL MSR for a particular
+   frequency/voltage operating point; frequency in MHz, volts in mV.
+   This is stored as "index" in the structure. */
+#define OP(mhz, mv)							\
+	{								\
+		.frequency = (mhz) * 1000,				\
+		.index = (((mhz)/100) << 8) | ((mv - 700) / 16)		\
+	}
+
+/*
+ * These voltage tables were derived from the Intel Pentium M
+ * datasheet, document 25261202.pdf, Table 5.  I have verified they
+ * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
+ * M.
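+ *
+ * (With the OP() macro above, the 600 MHz / 844 mV operating point,
+ * for example, encodes to index 0x0609: (600/100) << 8 gives 0x0600,
+ * and (844 - 700) / 16 gives 9.)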
+ */ + +/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */ +static struct cpufreq_frequency_table banias_900[] = +{ + OP(600, 844), + OP(800, 988), + OP(900, 1004), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */ +static struct cpufreq_frequency_table banias_1000[] = +{ + OP(600, 844), + OP(800, 972), + OP(900, 988), + OP(1000, 1004), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */ +static struct cpufreq_frequency_table banias_1100[] = +{ + OP( 600, 956), + OP( 800, 1020), + OP( 900, 1100), + OP(1000, 1164), + OP(1100, 1180), + { .frequency = CPUFREQ_TABLE_END } +}; + + +/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */ +static struct cpufreq_frequency_table banias_1200[] = +{ + OP( 600, 956), + OP( 800, 1004), + OP( 900, 1020), + OP(1000, 1100), + OP(1100, 1164), + OP(1200, 1180), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.30GHz (Banias) */ +static struct cpufreq_frequency_table banias_1300[] = +{ + OP( 600, 956), + OP( 800, 1260), + OP(1000, 1292), + OP(1200, 1356), + OP(1300, 1388), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.40GHz (Banias) */ +static struct cpufreq_frequency_table banias_1400[] = +{ + OP( 600, 956), + OP( 800, 1180), + OP(1000, 1308), + OP(1200, 1436), + OP(1400, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.50GHz (Banias) */ +static struct cpufreq_frequency_table banias_1500[] = +{ + OP( 600, 956), + OP( 800, 1116), + OP(1000, 1228), + OP(1200, 1356), + OP(1400, 1452), + OP(1500, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.60GHz (Banias) */ +static struct cpufreq_frequency_table banias_1600[] = +{ + OP( 600, 956), + OP( 800, 1036), + OP(1000, 1164), + OP(1200, 1276), + OP(1400, 1420), + OP(1600, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; + +/* Intel Pentium M processor 1.70GHz (Banias) */ +static struct cpufreq_frequency_table banias_1700[] = +{ + OP( 600, 956), + OP( 800, 1004), + OP(1000, 1116), + OP(1200, 1228), + OP(1400, 1308), + OP(1700, 1484), + { .frequency = CPUFREQ_TABLE_END } +}; +#undef OP + +#define _BANIAS(cpuid, max, name) \ +{ .cpu_id = cpuid, \ + .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \ + .max_freq = (max)*1000, \ + .op_points = banias_##max, \ +} +#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max) + +/* CPU models, their operating frequency range, and freq/voltage + operating points */ +static struct cpu_model models[] = +{ + _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"), + BANIAS(1000), + BANIAS(1100), + BANIAS(1200), + BANIAS(1300), + BANIAS(1400), + BANIAS(1500), + BANIAS(1600), + BANIAS(1700), + + /* NULL model_name is a wildcard */ + { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL }, + { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL }, + { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL }, + { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL }, + { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL }, + + { NULL, } +}; +#undef _BANIAS +#undef BANIAS + +static int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); + struct cpu_model *model; + + for(model = models; model->cpu_id != NULL; model++) + if (centrino_verify_cpu_id(cpu, model->cpu_id) && + (model->model_name == NULL || + strcmp(cpu->x86_model_id, model->model_name) == 0)) + break; + + if (model->cpu_id == NULL) { + /* No match at all */ + pr_debug("no support for 
CPU model \"%s\": " + "send /proc/cpuinfo to " MAINTAINER "\n", + cpu->x86_model_id); + return -ENOENT; + } + + if (model->op_points == NULL) { + /* Matched a non-match */ + pr_debug("no table support for CPU model \"%s\"\n", + cpu->x86_model_id); + pr_debug("try using the acpi-cpufreq driver\n"); + return -ENOENT; + } + + per_cpu(centrino_model, policy->cpu) = model; + + pr_debug("found \"%s\": max frequency: %dkHz\n", + model->model_name, model->max_freq); + + return 0; +} + +#else +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} +#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ + +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) +{ + if ((c->x86 == x->x86) && + (c->x86_model == x->x86_model) && + (c->x86_mask == x->x86_mask)) + return 1; + return 0; +} + +/* To be called only after centrino_model is initialized */ +static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) +{ + int i; + + /* + * Extract clock in kHz from PERF_CTL value + * for centrino, as some DSDTs are buggy. + * Ideally, this can be done using the acpi_data structure. + */ + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { + msr = (msr >> 8) & 0xff; + return msr * 100000; + } + + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) + return 0; + + msr &= 0xffff; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; + } + if (failsafe) + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; + else + return 0; +} + +/* Return the current CPU frequency in kHz */ +static unsigned int get_cur_freq(unsigned int cpu) +{ + unsigned l, h; + unsigned clock_freq; + + rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h); + clock_freq = extract_clock(l, cpu, 0); + + if (unlikely(clock_freq == 0)) { + /* + * On some CPUs, we can see transient MSR values (which are + * not present in _PSS), while CPU is doing some automatic + * P-state transition (like TM2). Get the last freq set + * in PERF_CTL. + */ + rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h); + clock_freq = extract_clock(l, cpu, 1); + } + return clock_freq; +} + + +static int centrino_cpu_init(struct cpufreq_policy *policy) +{ + struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu); + unsigned freq; + unsigned l, h; + int ret; + int i; + + /* Only Intel makes Enhanced Speedstep-capable CPUs */ + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) + return -ENODEV; + + if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) + centrino_driver.flags |= CPUFREQ_CONST_LOOPS; + + if (policy->cpu != 0) + return -ENODEV; + + for (i = 0; i < N_IDS; i++) + if (centrino_verify_cpu_id(cpu, &cpu_ids[i])) + break; + + if (i != N_IDS) + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; + + if (!per_cpu(centrino_cpu, policy->cpu)) { + pr_debug("found unsupported CPU with " + "Enhanced SpeedStep: send /proc/cpuinfo to " + MAINTAINER "\n"); + return -ENODEV; + } + + if (centrino_cpu_init_table(policy)) { + return -ENODEV; + } + + /* Check to see if Enhanced SpeedStep is enabled, and try to + enable it if not. 
+	 */
+	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+
+	if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+		l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
+		pr_debug("trying to enable Enhanced SpeedStep (%x)\n", l);
+		wrmsr(MSR_IA32_MISC_ENABLE, l, h);
+
+		/* check to see if it stuck */
+		rdmsr(MSR_IA32_MISC_ENABLE, l, h);
+		if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
+			printk(KERN_INFO PFX
+				"couldn't enable Enhanced SpeedStep\n");
+			return -ENODEV;
+		}
+	}
+
+	freq = get_cur_freq(policy->cpu);
+	policy->cpuinfo.transition_latency = 10000;
+						/* 10uS transition latency */
+	policy->cur = freq;
+
+	pr_debug("centrino_cpu_init: cur=%dkHz\n", policy->cur);
+
+	ret = cpufreq_frequency_table_cpuinfo(policy,
+		per_cpu(centrino_model, policy->cpu)->op_points);
+	if (ret)
+		return (ret);
+
+	cpufreq_frequency_table_get_attr(
+		per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
+
+	return 0;
+}
+
+static int centrino_cpu_exit(struct cpufreq_policy *policy)
+{
+	unsigned int cpu = policy->cpu;
+
+	if (!per_cpu(centrino_model, cpu))
+		return -ENODEV;
+
+	cpufreq_frequency_table_put_attr(cpu);
+
+	per_cpu(centrino_model, cpu) = NULL;
+
+	return 0;
+}
+
+/**
+ * centrino_verify - verifies a new CPUFreq policy
+ * @policy: new policy
+ *
+ * Limit must be within this model's frequency range, with at least
+ * one border included.
+ */
+static int centrino_verify (struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy,
+			per_cpu(centrino_model, policy->cpu)->op_points);
+}
+
+/**
+ * centrino_target - set a new CPUFreq policy
+ * @policy: new policy
+ * @target_freq: the target frequency
+ * @relation: how that frequency relates to achieved frequency
+ *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ *
+ * Sets a new CPUFreq policy.
+ */
+static int centrino_target (struct cpufreq_policy *policy,
+			    unsigned int target_freq,
+			    unsigned int relation)
+{
+	unsigned int	newstate = 0;
+	unsigned int	msr, oldmsr = 0, h = 0, cpu = policy->cpu;
+	struct cpufreq_freqs	freqs;
+	int			retval = 0;
+	unsigned int		j, k, first_cpu, tmp;
+	cpumask_var_t covered_cpus;
+
+	if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
+		return -ENOMEM;
+
+	if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
+		retval = -ENODEV;
+		goto out;
+	}
+
+	if (unlikely(cpufreq_frequency_table_target(policy,
+			per_cpu(centrino_model, cpu)->op_points,
+			target_freq,
+			relation,
+			&newstate))) {
+		retval = -EINVAL;
+		goto out;
+	}
+
+	first_cpu = 1;
+	for_each_cpu(j, policy->cpus) {
+		int good_cpu;
+
+		/* cpufreq holds the hotplug lock, so we are safe here */
+		if (!cpu_online(j))
+			continue;
+
+		/*
+		 * Support for SMP systems.
+		 * Make sure we are running on CPU that wants to change freq
+		 */
+		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
+			good_cpu = cpumask_any_and(policy->cpus,
+						   cpu_online_mask);
+		else
+			good_cpu = j;
+
+		if (good_cpu >= nr_cpu_ids) {
+			pr_debug("couldn't limit to CPUs in this domain\n");
+			retval = -EAGAIN;
+			if (first_cpu) {
+				/* We haven't started the transition yet.
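+				 * No PRECHANGE notification has been
+				 * sent, so we can bail out without the
+				 * undo pass at the bottom.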
+				 */
+				goto out;
+			}
+			break;
+		}
+
+		msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
+
+		if (first_cpu) {
+			rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
+			if (msr == (oldmsr & 0xffff)) {
+				pr_debug("no change needed - msr was and needs "
+					"to be %x\n", oldmsr);
+				retval = 0;
+				goto out;
+			}
+
+			freqs.old = extract_clock(oldmsr, cpu, 0);
+			freqs.new = extract_clock(msr, cpu, 0);
+
+			pr_debug("target=%dkHz old=%d new=%d msr=%04x\n",
+				target_freq, freqs.old, freqs.new, msr);
+
+			for_each_cpu(k, policy->cpus) {
+				if (!cpu_online(k))
+					continue;
+				freqs.cpu = k;
+				cpufreq_notify_transition(&freqs,
+					CPUFREQ_PRECHANGE);
+			}
+
+			first_cpu = 0;
+			/* all but 16 LSB are reserved, treat them with care */
+			oldmsr &= ~0xffff;
+			msr &= 0xffff;
+			oldmsr |= msr;
+		}
+
+		wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
+		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
+			break;
+
+		cpumask_set_cpu(j, covered_cpus);
+	}
+
+	for_each_cpu(k, policy->cpus) {
+		if (!cpu_online(k))
+			continue;
+		freqs.cpu = k;
+		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	}
+
+	if (unlikely(retval)) {
+		/*
+		 * We have failed halfway through the frequency change.
+		 * We have sent callbacks to policy->cpus and
+		 * MSRs have already been written on covered_cpus.
+		 * Best effort undo..
+		 */
+
+		for_each_cpu(j, covered_cpus)
+			wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
+
+		tmp = freqs.new;
+		freqs.new = freqs.old;
+		freqs.old = tmp;
+		for_each_cpu(j, policy->cpus) {
+			if (!cpu_online(j))
+				continue;
+			cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+			cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+		}
+	}
+	retval = 0;
+
+out:
+	free_cpumask_var(covered_cpus);
+	return retval;
+}
+
+static struct freq_attr* centrino_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+static struct cpufreq_driver centrino_driver = {
+	.name		= "centrino", /* should be speedstep-centrino,
+					 but there's a 16 char limit */
+	.init		= centrino_cpu_init,
+	.exit		= centrino_cpu_exit,
+	.verify		= centrino_verify,
+	.target		= centrino_target,
+	.get		= get_cur_freq,
+	.attr		= centrino_attr,
+	.owner		= THIS_MODULE,
+};
+
+
+/**
+ * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
+ *
+ * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
+ * unsupported devices, -ENOENT if there's no voltage table for this
+ * particular CPU model, -EINVAL on problems during initialization,
+ * and zero on success.
+ *
+ * This is quite picky. Not only does the CPU have to advertise the
+ * "est" flag in the cpuid capability flags, we look for a specific
+ * CPU model and stepping, and we need to have the exact model name in
+ * our voltage tables. That is, be paranoid about not releasing
+ * someone's valuable magic smoke.
+ */
+static int __init centrino_init(void)
+{
+	struct cpuinfo_x86 *cpu = &cpu_data(0);
+
+	if (!cpu_has(cpu, X86_FEATURE_EST))
+		return -ENODEV;
+
+	return cpufreq_register_driver(&centrino_driver);
+}
+
+static void __exit centrino_exit(void)
+{
+	cpufreq_unregister_driver(&centrino_driver);
+}
+
+MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
+MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
+MODULE_LICENSE ("GPL");
+
+late_initcall(centrino_init);
+module_exit(centrino_exit);
diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c
new file mode 100644
index 00000000000..a748ce782fe
--- /dev/null
+++ b/drivers/cpufreq/speedstep-ich.c
@@ -0,0 +1,448 @@
+/*
+ * (C) 2001  Dave Jones, Arjan van de ven.
+ * (C) 2002 - 2003  Dominik Brodowski <linux@brodo.de>
+ *
+ * Licensed under the terms of the GNU GPL License version 2.
+ * Based upon reverse engineered information, and on Intel documentation
+ * for chipsets ICH2-M and ICH3-M.
+ *
+ * Many thanks to Ducrot Bruno for finding and fixing the last
+ * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
+ * for extensive testing.
+ *
+ * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
+ */
+
+
+/*********************************************************************
+ *                        SPEEDSTEP - DEFINITIONS                    *
+ *********************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/pci.h>
+#include <linux/sched.h>
+
+#include "speedstep-lib.h"
+
+
+/* speedstep_chipset:
+ *   It is necessary to know which chipset is used. As accesses to
+ * this device occur at various places in this module, we need a
+ * static struct pci_dev * pointing to that device.
+ */
+static struct pci_dev *speedstep_chipset_dev;
+
+
+/* speedstep_processor
+ */
+static enum speedstep_processor speedstep_processor;
+
+static u32 pmbase;
+
+/*
+ * There are only two frequency states for each processor. Values
+ * are in kHz for the time being.
+ */
+static struct cpufreq_frequency_table speedstep_freqs[] = {
+	{SPEEDSTEP_HIGH,	0},
+	{SPEEDSTEP_LOW,		0},
+	{0,			CPUFREQ_TABLE_END},
+};
+
+
+/**
+ * speedstep_find_register - read the PMBASE address
+ *
+ * Returns: -ENODEV if no register could be found
+ */
+static int speedstep_find_register(void)
+{
+	if (!speedstep_chipset_dev)
+		return -ENODEV;
+
+	/* get PMBASE */
+	pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
+	if (!(pmbase & 0x01)) {
+		printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
+		return -ENODEV;
+	}
+
+	pmbase &= 0xFFFFFFFE;
+	if (!pmbase) {
+		printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
+		return -ENODEV;
+	}
+
+	pr_debug("pmbase is 0x%x\n", pmbase);
+	return 0;
+}
+
+/**
+ * speedstep_set_state - set the SpeedStep state
+ * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
+ *
+ * Tries to change the SpeedStep state. Can be called from
+ * smp_call_function_single.
+ */
+static void speedstep_set_state(unsigned int state)
+{
+	u8 pm2_blk;
+	u8 value;
+	unsigned long flags;
+
+	if (state > 0x1)
+		return;
+
+	/* Disable IRQs */
+	local_irq_save(flags);
+
+	/* read state */
+	value = inb(pmbase + 0x50);
+
+	pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
+
+	/* write new state */
+	value &= 0xFE;
+	value |= state;
+
+	pr_debug("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
+
+	/* Disable bus master arbitration */
+	pm2_blk = inb(pmbase + 0x20);
+	pm2_blk |= 0x01;
+	outb(pm2_blk, (pmbase + 0x20));
+
+	/* Actual transition */
+	outb(value, (pmbase + 0x50));
+
+	/* Restore bus master arbitration */
+	pm2_blk &= 0xfe;
+	outb(pm2_blk, (pmbase + 0x20));
+
+	/* check if transition was successful */
+	value = inb(pmbase + 0x50);
+
+	/* Enable IRQs */
+	local_irq_restore(flags);
+
+	pr_debug("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
+
+	if (state == (value & 0x1))
+		pr_debug("change to %u MHz succeeded\n",
+			speedstep_get_frequency(speedstep_processor) / 1000);
+	else
+		printk(KERN_ERR "cpufreq: change failed - I/O error\n");
+
+	return;
+}
+
+/* Wrapper for smp_call_function_single.
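+   smp_call_function_single() hands the callback a single void *, so
+   the target state is forwarded through that pointer.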
+ */
+static void _speedstep_set_state(void *_state)
+{
+	speedstep_set_state(*(unsigned int *)_state);
+}
+
+/**
+ * speedstep_activate - activate SpeedStep control in the chipset
+ *
+ * Tries to activate the SpeedStep status and control registers.
+ * Returns -EINVAL on an unsupported chipset, and zero on success.
+ */
+static int speedstep_activate(void)
+{
+	u16 value = 0;
+
+	if (!speedstep_chipset_dev)
+		return -EINVAL;
+
+	pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
+	if (!(value & 0x08)) {
+		value |= 0x08;
+		pr_debug("activating SpeedStep (TM) registers\n");
+		pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
+	}
+
+	return 0;
+}
+
+
+/**
+ * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
+ *
+ * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
+ * the LPC bridge / PM module which contains all power-management
+ * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
+ * chipset, or zero on failure.
+ */
+static unsigned int speedstep_detect_chipset(void)
+{
+	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
+			      PCI_DEVICE_ID_INTEL_82801DB_12,
+			      PCI_ANY_ID, PCI_ANY_ID,
+			      NULL);
+	if (speedstep_chipset_dev)
+		return 4; /* 4-M */
+
+	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
+			      PCI_DEVICE_ID_INTEL_82801CA_12,
+			      PCI_ANY_ID, PCI_ANY_ID,
+			      NULL);
+	if (speedstep_chipset_dev)
+		return 3; /* 3-M */
+
+
+	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
+			      PCI_DEVICE_ID_INTEL_82801BA_10,
+			      PCI_ANY_ID, PCI_ANY_ID,
+			      NULL);
+	if (speedstep_chipset_dev) {
+		/* speedstep.c causes lockups on Dell Inspirons 8000 and
+		 * 8100 which use a pretty old revision of the 82815
+		 * host bridge. Abort on these systems.
+		 */
+		static struct pci_dev *hostbridge;
+
+		hostbridge  = pci_get_subsys(PCI_VENDOR_ID_INTEL,
+			      PCI_DEVICE_ID_INTEL_82815_MC,
+			      PCI_ANY_ID, PCI_ANY_ID,
+			      NULL);
+
+		if (!hostbridge)
+			return 2; /* 2-M */
+
+		if (hostbridge->revision < 5) {
+			pr_debug("hostbridge does not support speedstep\n");
+			speedstep_chipset_dev = NULL;
+			pci_dev_put(hostbridge);
+			return 0;
+		}
+
+		pci_dev_put(hostbridge);
+		return 2; /* 2-M */
+	}
+
+	return 0;
+}
+
+static void get_freq_data(void *_speed)
+{
+	unsigned int *speed = _speed;
+
+	*speed = speedstep_get_frequency(speedstep_processor);
+}
+
+static unsigned int speedstep_get(unsigned int cpu)
+{
+	unsigned int speed;
+
+	/* You're supposed to ensure CPU is online. */
+	if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
+		BUG();
+
+	pr_debug("detected %u kHz as current frequency\n", speed);
+	return speed;
+}
+
+/**
+ * speedstep_target - set a new CPUFreq policy
+ * @policy: new policy
+ * @target_freq: the target frequency
+ * @relation: how that frequency relates to achieved frequency
+ *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ *
+ * Sets a new CPUFreq policy.
+ */
+static int speedstep_target(struct cpufreq_policy *policy,
+			     unsigned int target_freq,
+			     unsigned int relation)
+{
+	unsigned int newstate = 0, policy_cpu;
+	struct cpufreq_freqs freqs;
+	int i;
+
+	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
+				target_freq, relation, &newstate))
+		return -EINVAL;
+
+	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
+	freqs.old = speedstep_get(policy_cpu);
+	freqs.new = speedstep_freqs[newstate].frequency;
+	freqs.cpu = policy->cpu;
+
+	pr_debug("transitioning from %u to %u kHz\n", freqs.old, freqs.new);
+
+	/* no transition necessary */
+	if (freqs.old == freqs.new)
+		return 0;
+
+	for_each_cpu(i, policy->cpus) {
+		freqs.cpu = i;
+		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	}
+
+	smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
+				 true);
+
+	for_each_cpu(i, policy->cpus) {
+		freqs.cpu = i;
+		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+	}
+
+	return 0;
+}
+
+
+/**
+ * speedstep_verify - verifies a new CPUFreq policy
+ * @policy: new policy
+ *
+ * Limit must be within speedstep_low_freq and speedstep_high_freq, with
+ * at least one border included.
+ */
+static int speedstep_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
+}
+
+struct get_freqs {
+	struct cpufreq_policy *policy;
+	int ret;
+};
+
+static void get_freqs_on_cpu(void *_get_freqs)
+{
+	struct get_freqs *get_freqs = _get_freqs;
+
+	get_freqs->ret =
+		speedstep_get_freqs(speedstep_processor,
+			    &speedstep_freqs[SPEEDSTEP_LOW].frequency,
+			    &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
+			    &get_freqs->policy->cpuinfo.transition_latency,
+			    &speedstep_set_state);
+}
+
+static int speedstep_cpu_init(struct cpufreq_policy *policy)
+{
+	int result;
+	unsigned int policy_cpu, speed;
+	struct get_freqs gf;
+
+	/* only run on CPU to be set, or on its sibling */
+#ifdef CONFIG_SMP
+	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
+#endif
+	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
+
+	/* detect low and high frequency and transition latency */
+	gf.policy = policy;
+	smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
+	if (gf.ret)
+		return gf.ret;
+
+	/* get current speed setting */
+	speed = speedstep_get(policy_cpu);
+	if (!speed)
+		return -EIO;
+
+	pr_debug("currently at %s speed setting - %i MHz\n",
+		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
+		? "low" : "high",
+		(speed / 1000));
+
+	/* cpuinfo and default policy values */
+	policy->cur = speed;
+
+	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
+	if (result)
+		return result;
+
+	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
+
+	return 0;
+}
+
+
+static int speedstep_cpu_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	return 0;
+}
+
+static struct freq_attr *speedstep_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+
+static struct cpufreq_driver speedstep_driver = {
+	.name	= "speedstep-ich",
+	.verify	= speedstep_verify,
+	.target	= speedstep_target,
+	.init	= speedstep_cpu_init,
+	.exit	= speedstep_cpu_exit,
+	.get	= speedstep_get,
+	.owner	= THIS_MODULE,
+	.attr	= speedstep_attr,
+};
+
+
+/**
+ * speedstep_init - initializes the SpeedStep CPUFreq driver
+ *
+ * Initializes the SpeedStep support. Returns -ENODEV on unsupported
+ * devices, -EINVAL on problems during initialization, and zero on
+ * success.
+ */
+static int __init speedstep_init(void)
+{
+	/* detect processor */
+	speedstep_processor = speedstep_detect_processor();
+	if (!speedstep_processor) {
+		pr_debug("Intel(R) SpeedStep(TM) capable processor "
+				"not found\n");
+		return -ENODEV;
+	}
+
+	/* detect chipset */
+	if (!speedstep_detect_chipset()) {
+		pr_debug("Intel(R) SpeedStep(TM) for this chipset not "
+				"(yet) available.\n");
+		return -ENODEV;
+	}
+
+	/* activate speedstep support */
+	if (speedstep_activate()) {
+		pci_dev_put(speedstep_chipset_dev);
+		return -EINVAL;
+	}
+
+	if (speedstep_find_register())
+		return -ENODEV;
+
+	return cpufreq_register_driver(&speedstep_driver);
+}
+
+
+/**
+ * speedstep_exit - unregisters SpeedStep support
+ *
+ * Unregisters SpeedStep support.
+ */
+static void __exit speedstep_exit(void)
+{
+	pci_dev_put(speedstep_chipset_dev);
+	cpufreq_unregister_driver(&speedstep_driver);
+}
+
+
+MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
+		"Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
+		"with ICH-M southbridges.");
+MODULE_LICENSE("GPL");
+
+module_init(speedstep_init);
+module_exit(speedstep_exit);
diff --git a/drivers/cpufreq/speedstep-lib.c b/drivers/cpufreq/speedstep-lib.c
new file mode 100644
index 00000000000..8af2d2fd9d5
--- /dev/null
+++ b/drivers/cpufreq/speedstep-lib.c
@@ -0,0 +1,478 @@
+/*
+ * (C) 2002 - 2003  Dominik Brodowski <linux@brodo.de>
+ *
+ * Licensed under the terms of the GNU GPL License version 2.
+ *
+ * Library for common functions for Intel SpeedStep v.1 and v.2 support
+ *
+ * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+
+#include <asm/msr.h>
+#include <asm/tsc.h>
+#include "speedstep-lib.h"
+
+#define PFX "speedstep-lib: "
+
+#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
+static int relaxed_check;
+#else
+#define relaxed_check 0
+#endif
+
+/*********************************************************************
+ *                   GET PROCESSOR CORE SPEED IN KHZ                 *
+ *********************************************************************/
+
+static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
+{
+	/* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
+	struct {
+		unsigned int ratio;	/* Frequency Multiplier (x10) */
+		u8 bitmap;		/* power on configuration bits
+					   [27, 25:22] (in MSR 0x2a) */
+	} msr_decode_mult[] = {
+		{ 30, 0x01 },
+		{ 35, 0x05 },
+		{ 40, 0x02 },
+		{ 45, 0x06 },
+		{ 50, 0x00 },
+		{ 55, 0x04 },
+		{ 60, 0x0b },
+		{ 65, 0x0f },
+		{ 70, 0x09 },
+		{ 75, 0x0d },
+		{ 80, 0x0a },
+		{ 85, 0x26 },
+		{ 90, 0x20 },
+		{ 100, 0x2b },
+		{ 0, 0xff }	/* error or unknown value */
+	};
+
+	/* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
+	struct {
+		unsigned int value;	/* Front Side Bus speed in MHz */
+		u8 bitmap;		/* power on configuration bits [18: 19]
+					   (in MSR 0x2a) */
+	} msr_decode_fsb[] = {
+		{  66, 0x0 },
+		{ 100, 0x2 },
+		{ 133, 0x1 },
+		{   0, 0xff}
+	};
+
+	u32 msr_lo, msr_tmp;
+	int i = 0, j = 0;
+
+	/* read MSR 0x2a - we only need the low 32 bits */
+	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
+	pr_debug("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
+	msr_tmp = msr_lo;
+
+	/* decode the FSB */
+	msr_tmp &= 0x00c0000;
+	msr_tmp >>= 18;
+	while (msr_tmp != msr_decode_fsb[i].bitmap) {
+		if (msr_decode_fsb[i].bitmap == 0xff)
+			return 0;
+		i++;
+	}
+
+	/* decode the multiplier */
+	if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
+		pr_debug("workaround for early PIIIs\n");
+		msr_lo &= 0x03c00000;
+	} else
+		msr_lo &= 0x0bc00000;
+	msr_lo >>= 22;
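+	/*
+	 * Walk the multiplier table; a bitmap of 0xff terminates it
+	 * and doubles as the "unknown value" error return.
+	 */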
+static unsigned int pentiumM_get_frequency(void)
+{
+	u32 msr_lo, msr_tmp;
+
+	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
+	pr_debug("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
+
+	/* see table B-2 of 24547212.pdf */
+	if (msr_lo & 0x00040000) {
+		printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
+				msr_lo, msr_tmp);
+		return 0;
+	}
+
+	msr_tmp = (msr_lo >> 22) & 0x1f;
+	pr_debug("bits 22-26 are 0x%x, speed is %u\n",
+			msr_tmp, (msr_tmp * 100 * 1000));
+
+	return msr_tmp * 100 * 1000;
+}
+
+static unsigned int pentium_core_get_frequency(void)
+{
+	u32 fsb = 0;
+	u32 msr_lo, msr_tmp;
+	int ret;
+
+	rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
+	/* see table B-2 of 25366920.pdf */
+	switch (msr_lo & 0x07) {
+	case 5:
+		fsb = 100000;
+		break;
+	case 1:
+		fsb = 133333;
+		break;
+	case 3:
+		fsb = 166667;
+		break;
+	case 2:
+		fsb = 200000;
+		break;
+	case 0:
+		fsb = 266667;
+		break;
+	case 4:
+		fsb = 333333;
+		break;
+	default:
+		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
+	}
+
+	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
+	pr_debug("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
+			msr_lo, msr_tmp);
+
+	msr_tmp = (msr_lo >> 22) & 0x1f;
+	pr_debug("bits 22-26 are 0x%x, speed is %u\n",
+			msr_tmp, (msr_tmp * fsb));
+
+	ret = (msr_tmp * fsb);
+	return ret;
+}
+
+
+static unsigned int pentium4_get_frequency(void)
+{
+	struct cpuinfo_x86 *c = &boot_cpu_data;
+	u32 msr_lo, msr_hi, mult;
+	unsigned int fsb = 0;
+	unsigned int ret;
+	u8 fsb_code;
+
+	/* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
+	 * to System Bus Frequency Ratio Field in the Processor Frequency
+	 * Configuration Register of the MSR. Therefore the current
+	 * frequency cannot be calculated and has to be measured.
+	 */
+	if (c->x86_model < 2)
+		return cpu_khz;
+
+	rdmsr(0x2c, msr_lo, msr_hi);
+
+	pr_debug("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
+
+	/* decode the FSB: see IA-32 Intel (C) Architecture Software
+	 * Developer's Manual, Volume 3: System Programming Guide,
+	 * revision #12 in Table B-1: MSRs in the Pentium 4 and
+	 * Intel Xeon Processors, on page B-4 and B-5.
+	 */
+	fsb_code = (msr_lo >> 16) & 0x7;
+	switch (fsb_code) {
+	case 0:
+		fsb = 100 * 1000;
+		break;
+	case 1:
+		fsb = 13333 * 10;
+		break;
+	case 2:
+		fsb = 200 * 1000;
+		break;
+	}
+
+	if (!fsb)
+		printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
+				"Please send an e-mail to \n");
+
+	/* Multiplier. */
+	mult = msr_lo >> 24;
+
+	pr_debug("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
+			fsb, mult, (fsb * mult));
+
+	ret = (fsb * mult);
+	return ret;
+}
+
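The same arithmetic, hedged likewise, for the Pentium 4 path: MSR 0x2c
carries the FSB code in bits 18:16 and the multiplier in bits 31:24, so an
assumed sample readout would decode as follows (values are invented for
illustration):

    #include <stdio.h>

    /* Illustrative decode of an assumed MSR_EBC_FREQUENCY_ID (0x2c)
     * sample, mirroring pentium4_get_frequency(). */
    int main(void)
    {
        unsigned int msr_lo = 0x12020000;             /* assumed sample */
        unsigned int fsb_code = (msr_lo >> 16) & 0x7; /* 2 -> 200 MHz FSB */
        unsigned int mult = msr_lo >> 24;             /* 0x12 -> 18 */
        unsigned int fsb_khz = (fsb_code == 2) ? 200 * 1000 : 0;

        printf("core speed: %u kHz\n", fsb_khz * mult); /* 3600000 kHz */
        return 0;
    }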
+
+/* Warning: may get called from smp_call_function_single. */
+unsigned int speedstep_get_frequency(enum speedstep_processor processor)
+{
+	switch (processor) {
+	case SPEEDSTEP_CPU_PCORE:
+		return pentium_core_get_frequency();
+	case SPEEDSTEP_CPU_PM:
+		return pentiumM_get_frequency();
+	case SPEEDSTEP_CPU_P4D:
+	case SPEEDSTEP_CPU_P4M:
+		return pentium4_get_frequency();
+	case SPEEDSTEP_CPU_PIII_T:
+	case SPEEDSTEP_CPU_PIII_C:
+	case SPEEDSTEP_CPU_PIII_C_EARLY:
+		return pentium3_get_frequency(processor);
+	default:
+		return 0;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(speedstep_get_frequency);
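A minimal, hedged sketch of how a driver is expected to consume this export
together with speedstep_detect_processor() (declared in speedstep-lib.h and
defined below); example_probe() is a hypothetical helper, not part of this
patch:

    /* Hypothetical caller: detect the CPU, then read its current speed. */
    static int example_probe(void)
    {
        enum speedstep_processor cpu = speedstep_detect_processor();
        unsigned int khz;

        if (!cpu)
            return -ENODEV;  /* no known SpeedStep-aware CPU */

        khz = speedstep_get_frequency(cpu);
        if (!khz)
            return -EIO;     /* MSR decode failed */

        pr_debug("running at %u kHz\n", khz);
        return 0;
    }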
+
+
+/*********************************************************************
+ *                DETECT SPEEDSTEP-CAPABLE PROCESSOR                 *
+ *********************************************************************/
+
+unsigned int speedstep_detect_processor(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	u32 ebx, msr_lo, msr_hi;
+
+	pr_debug("x86: %x, model: %x\n", c->x86, c->x86_model);
+
+	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
+	    ((c->x86 != 6) && (c->x86 != 0xF)))
+		return 0;
+
+	if (c->x86 == 0xF) {
+		/* Intel Mobile Pentium 4-M
+		 * or Intel Mobile Pentium 4 with 533 MHz FSB */
+		if (c->x86_model != 2)
+			return 0;
+
+		ebx = cpuid_ebx(0x00000001);
+		ebx &= 0x000000FF;
+
+		pr_debug("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
+
+		switch (c->x86_mask) {
+		case 4:
+			/*
+			 * B-stepping [M-P4-M]
+			 * sample has ebx = 0x0f, production has 0x0e.
+			 */
+			if ((ebx == 0x0e) || (ebx == 0x0f))
+				return SPEEDSTEP_CPU_P4M;
+			break;
+		case 7:
+			/*
+			 * C-stepping [M-P4-M]
+			 * needs to have ebx=0x0e, else it's a celeron:
+			 * cf. 25130917.pdf / page 7, footnote 5 even
+			 * though 25072120.pdf / page 7 doesn't say
+			 * samples are only of B-stepping...
+			 */
+			if (ebx == 0x0e)
+				return SPEEDSTEP_CPU_P4M;
+			break;
+		case 9:
+			/*
+			 * D-stepping [M-P4-M or M-P4/533]
+			 *
+			 * this is totally strange: CPUID 0x0F29 is
+			 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
+			 * The latter need to be sorted out as they don't
+			 * support speedstep.
+			 * Celerons with CPUID 0x0F29 may have either
+			 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
+			 * specific.
+			 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
+			 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
+			 * also, M-P4M HTs have ebx=0x8, too
+			 * For now, they are distinguished by the model_id
+			 * string
+			 */
+			if ((ebx == 0x0e) ||
+				(strstr(c->x86_model_id,
+				    "Mobile Intel(R) Pentium(R) 4") != NULL))
+				return SPEEDSTEP_CPU_P4M;
+			break;
+		default:
+			break;
+		}
+		return 0;
+	}
+
+	switch (c->x86_model) {
+	case 0x0B: /* Intel PIII [Tualatin] */
+		/* cpuid_ebx(1) is 0x04 for desktop PIII,
+		 * 0x06 for mobile PIII-M */
+		ebx = cpuid_ebx(0x00000001);
+		pr_debug("ebx is %x\n", ebx);
+
+		ebx &= 0x000000FF;
+
+		if (ebx != 0x06)
+			return 0;
+
+		/* So far all PIII-M processors support SpeedStep. See
+		 * Intel's 24540640.pdf of June 2003
+		 */
+		return SPEEDSTEP_CPU_PIII_T;
+
+	case 0x08: /* Intel PIII [Coppermine] */
+
+		/* all mobile PIII Coppermines have FSB 100 MHz
+		 * ==> sort out a few desktop PIIIs. */
+		rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
+		pr_debug("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
+				msr_lo, msr_hi);
+		msr_lo &= 0x00c0000;
+		if (msr_lo != 0x0080000)
+			return 0;
+
+		/*
+		 * If the processor is a mobile version,
+		 * platform ID has bit 50 set;
+		 * it has SpeedStep technology if either
+		 * bit 56 or 57 is set
+		 */
+		rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
+		pr_debug("Coppermine: MSR_IA32_PLATFORM_ID is 0x%x, 0x%x\n",
+				msr_lo, msr_hi);
+		if ((msr_hi & (1<<18)) &&
+		    (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
+			if (c->x86_mask == 0x01) {
+				pr_debug("early PIII version\n");
+				return SPEEDSTEP_CPU_PIII_C_EARLY;
+			} else
+				return SPEEDSTEP_CPU_PIII_C;
+		}
+
+	default:
+		return 0;
+	}
+}
+EXPORT_SYMBOL_GPL(speedstep_detect_processor);
+
+
+/*********************************************************************
+ *                     DETECT SPEEDSTEP SPEEDS                       *
+ *********************************************************************/
+
+unsigned int speedstep_get_freqs(enum speedstep_processor processor,
+				  unsigned int *low_speed,
+				  unsigned int *high_speed,
+				  unsigned int *transition_latency,
+				  void (*set_state) (unsigned int state))
+{
+	unsigned int prev_speed;
+	unsigned int ret = 0;
+	unsigned long flags;
+	struct timeval tv1, tv2;
+
+	if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
+		return -EINVAL;
+
+	pr_debug("trying to determine both speeds\n");
+
+	/* get current speed */
+	prev_speed = speedstep_get_frequency(processor);
+	if (!prev_speed)
+		return -EIO;
+
+	pr_debug("previous speed is %u\n", prev_speed);
+
+	local_irq_save(flags);
+
+	/* switch to low state */
+	set_state(SPEEDSTEP_LOW);
+	*low_speed = speedstep_get_frequency(processor);
+	if (!*low_speed) {
+		ret = -EIO;
+		goto out;
+	}
+
+	pr_debug("low speed is %u\n", *low_speed);
+
+	/* start latency measurement */
+	if (transition_latency)
+		do_gettimeofday(&tv1);
+
+	/* switch to high state */
+	set_state(SPEEDSTEP_HIGH);
+
+	/* end latency measurement */
+	if (transition_latency)
+		do_gettimeofday(&tv2);
+
+	*high_speed = speedstep_get_frequency(processor);
+	if (!*high_speed) {
+		ret = -EIO;
+		goto out;
+	}
+
+	pr_debug("high speed is %u\n", *high_speed);
+
+	if (*low_speed == *high_speed) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	/* switch to previous state, if necessary */
+	if (*high_speed != prev_speed)
+		set_state(SPEEDSTEP_LOW);
+
+	if (transition_latency) {
+		*transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
+			tv2.tv_usec - tv1.tv_usec;
+		pr_debug("transition latency is %u uSec\n", *transition_latency);
+
+		/* convert uSec to nSec and add 20% for safety reasons */
+		*transition_latency *= 1200;
+
+		/* check if the latency measurement is too high or too low
+		 * and set it to a safe value (500 uSec) in that case
+		 */
+		if (*transition_latency > 10000000 ||
+		    *transition_latency < 50000) {
+			printk(KERN_WARNING PFX "frequency transition "
+					"measured seems out of range (%u "
+					"nSec), falling back to a safe one of "
+					"%u nSec.\n",
+					*transition_latency, 500000);
+			*transition_latency = 500000;
+		}
+	}
+
+out:
+	local_irq_restore(flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(speedstep_get_freqs);
+
+#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
+module_param(relaxed_check, int, 0444);
+MODULE_PARM_DESC(relaxed_check,
+		"Don't do all checks for speedstep capability.");
+#endif
+
+MODULE_AUTHOR("Dominik Brodowski ");
+MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
+MODULE_LICENSE("GPL");
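Both drivers in this patch lean on speedstep_get_freqs() for the
try-both-states fallback; a hedged sketch of such a caller follows (the
example_* names are hypothetical, everything else comes from this library):

    static void example_set_state(unsigned int state)
    {
        /* a real driver pokes its chipset or SMI control here */
    }

    static int example_detect_speeds(enum speedstep_processor cpu)
    {
        unsigned int low, high, latency;
        int ret;

        /* probes SPEEDSTEP_LOW then SPEEDSTEP_HIGH, timing the switch */
        ret = speedstep_get_freqs(cpu, &low, &high, &latency,
                                  example_set_state);
        if (ret)
            return ret;

        pr_debug("low %u kHz, high %u kHz, latency %u ns\n",
                 low, high, latency);
        return 0;
    }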
diff --git a/drivers/cpufreq/speedstep-lib.h b/drivers/cpufreq/speedstep-lib.h
new file mode 100644
index 00000000000..70d9cea1219
--- /dev/null
+++ b/drivers/cpufreq/speedstep-lib.h
@@ -0,0 +1,49 @@
+/*
+ * (C) 2002 - 2003 Dominik Brodowski
+ *
+ * Licensed under the terms of the GNU GPL License version 2.
+ *
+ * Library for common functions for Intel SpeedStep v.1 and v.2 support
+ *
+ * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
+ */
+
+
+
+/* processors */
+enum speedstep_processor {
+	SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
+	SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
+	SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
+	SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
+/* the following processors are not speedstep-capable and are not auto-detected
+ * in speedstep_detect_processor(). However, their speed can be detected using
+ * the speedstep_get_frequency() call. */
+	SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
+	SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
+	SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
+};
+
+/* speedstep states -- only two of them */
+#define SPEEDSTEP_HIGH	0x00000000
+#define SPEEDSTEP_LOW	0x00000001
+
+
+/* detect a speedstep-capable processor */
+extern enum speedstep_processor speedstep_detect_processor(void);
+
+/* detect the current speed (in kHz) of the processor */
+extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
+
+
+/* detect the low and high speeds of the processor. The callback
+ * set_state's first argument is either SPEEDSTEP_HIGH or
+ * SPEEDSTEP_LOW; the second argument is zero so that no
+ * cpufreq_notify_transition calls are initiated.
+ */
+extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
+	unsigned int *low_speed,
+	unsigned int *high_speed,
+	unsigned int *transition_latency,
+	void (*set_state) (unsigned int state));
diff --git a/drivers/cpufreq/speedstep-smi.c b/drivers/cpufreq/speedstep-smi.c
new file mode 100644
index 00000000000..c76ead3490b
--- /dev/null
+++ b/drivers/cpufreq/speedstep-smi.c
@@ -0,0 +1,464 @@
+/*
+ * Intel SpeedStep SMI driver.
+ *
+ * (C) 2003 Hiroshi Miura
+ *
+ * Licensed under the terms of the GNU GPL License version 2.
+ *
+ */
+
+
+/*********************************************************************
+ *                   SPEEDSTEP - DEFINITIONS                         *
+ *********************************************************************/
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <asm/ist.h>
+
+#include "speedstep-lib.h"
+
+/* speedstep system management interface port/command.
+ *
+ * These parameters are retrieved from the IST-SMI BIOS call;
+ * user-supplied module parameters, if given, take precedence.
+ */
+static int smi_port;
+static int smi_cmd;
+static unsigned int smi_sig;
+
+/* info about the processor */
+static enum speedstep_processor speedstep_processor;
+
+/*
+ * There are only two frequency states for each processor. Values
+ * are in kHz for the time being.
+ */
+static struct cpufreq_frequency_table speedstep_freqs[] = {
+	{SPEEDSTEP_HIGH,	0},
+	{SPEEDSTEP_LOW,		0},
+	{0,			CPUFREQ_TABLE_END},
+};
+
+#define GET_SPEEDSTEP_OWNER 0
+#define GET_SPEEDSTEP_STATE 1
+#define SET_SPEEDSTEP_STATE 2
+#define GET_SPEEDSTEP_FREQS 4
+
+/* how often shall the SMI call be tried if it failed, e.g. because
+ * of DMA activity going on? */
+#define SMI_TRIES 5
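For orientation, a hedged illustration of how the two-entry table above ends
up populated once the frequencies are known; the kHz values are invented:

    /* after a successful probe (values are examples only): */
    speedstep_freqs[SPEEDSTEP_HIGH].frequency = 1600000; /* 1.6 GHz */
    speedstep_freqs[SPEEDSTEP_LOW].frequency  = 1200000; /* 1.2 GHz */
    /* the third entry keeps CPUFREQ_TABLE_END as the terminator */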
+
+/**
+ * speedstep_smi_ownership - try to take ownership of the SMI interface
+ */
+static int speedstep_smi_ownership(void)
+{
+	u32 command, result, magic, dummy;
+	u32 function = GET_SPEEDSTEP_OWNER;
+	unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
+
+	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
+	magic = virt_to_phys(magic_data);
+
+	pr_debug("trying to obtain ownership with command %x at port %x\n",
+			command, smi_port);
+
+	__asm__ __volatile__(
+		"push %%ebp\n"
+		"out %%al, (%%dx)\n"
+		"pop %%ebp\n"
+		: "=D" (result),
+		  "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
+		  "=S" (dummy)
+		: "a" (command), "b" (function), "c" (0), "d" (smi_port),
+		  "D" (0), "S" (magic)
+		: "memory"
+	);
+
+	pr_debug("result is %x\n", result);
+
+	return result;
+}
+
+/**
+ * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
+ * @low: the low frequency value is placed here
+ * @high: the high frequency value is placed here
+ *
+ * Only available on later SpeedStep-enabled systems, returns false results or
+ * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
+ * shows that the latter occurs if !(ist_info.event & 0xFFFF).
+ */
+static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
+{
+	u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
+	u32 state = 0;
+	u32 function = GET_SPEEDSTEP_FREQS;
+
+	if (!(ist_info.event & 0xFFFF)) {
+		pr_debug("bug #1422 -- can't read freqs from BIOS\n");
+		return -ENODEV;
+	}
+
+	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
+
+	pr_debug("trying to determine frequencies with command %x at port %x\n",
+			command, smi_port);
+
+	__asm__ __volatile__(
+		"push %%ebp\n"
+		"out %%al, (%%dx)\n"
+		"pop %%ebp"
+		: "=a" (result),
+		  "=b" (high_mhz),
+		  "=c" (low_mhz),
+		  "=d" (state), "=D" (edi), "=S" (dummy)
+		: "a" (command),
+		  "b" (function),
+		  "c" (state),
+		  "d" (smi_port), "S" (0), "D" (0)
+	);
+
+	pr_debug("result %x, low_freq %u, high_freq %u\n",
+			result, low_mhz, high_mhz);
+
+	/* abort if results are obviously incorrect... */
+	if ((high_mhz + low_mhz) < 600)
+		return -EINVAL;
+
+	*high = high_mhz * 1000;
+	*low = low_mhz * 1000;
+
+	return result;
+}
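A hedged worked example of the command word both helpers above assemble,
assuming the documented defaults rather than anything this patch guarantees:

    unsigned int sig = 0x47534943;  /* assumed BIOS-reported signature */
    unsigned int cmd = 0x82;        /* Intel's documented default command */
    unsigned int command = (sig & 0xffffff00) | (cmd & 0xff);
    /* 0x47534900 | 0x82 == 0x47534982, passed to the SMI in %eax */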
+
+/**
+ * speedstep_get_state - read the current SpeedStep state
+ *
+ * Returns SPEEDSTEP_LOW or SPEEDSTEP_HIGH.
+ */
+static int speedstep_get_state(void)
+{
+	u32 function = GET_SPEEDSTEP_STATE;
+	u32 result, state, edi, command, dummy;
+
+	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
+
+	pr_debug("trying to determine current setting with command %x "
+			"at port %x\n", command, smi_port);
+
+	__asm__ __volatile__(
+		"push %%ebp\n"
+		"out %%al, (%%dx)\n"
+		"pop %%ebp\n"
+		: "=a" (result),
+		  "=b" (state), "=D" (edi),
+		  "=c" (dummy), "=d" (dummy), "=S" (dummy)
+		: "a" (command), "b" (function), "c" (0),
+		  "d" (smi_port), "S" (0), "D" (0)
+	);
+
+	pr_debug("state is %x, result is %x\n", state, result);
+
+	return state & 1;
+}
+
+
+/**
+ * speedstep_set_state - set the SpeedStep state
+ * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
+ *
+ */
+static void speedstep_set_state(unsigned int state)
+{
+	unsigned int result = 0, command, new_state, dummy;
+	unsigned long flags;
+	unsigned int function = SET_SPEEDSTEP_STATE;
+	unsigned int retry = 0;
+
+	if (state > 0x1)
+		return;
+
+	/* Disable IRQs */
+	local_irq_save(flags);
+
+	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
+
+	pr_debug("trying to set frequency to state %u "
+			"with command %x at port %x\n",
+			state, command, smi_port);
+
+	do {
+		if (retry) {
+			pr_debug("retry %u, previous result %u, waiting...\n",
+					retry, result);
+			mdelay(retry * 50);
+		}
+		retry++;
+		__asm__ __volatile__(
+			"push %%ebp\n"
+			"out %%al, (%%dx)\n"
+			"pop %%ebp"
+			: "=b" (new_state), "=D" (result),
+			  "=c" (dummy), "=a" (dummy),
+			  "=d" (dummy), "=S" (dummy)
+			: "a" (command), "b" (function), "c" (state),
+			  "d" (smi_port), "S" (0), "D" (0)
+		);
+	} while ((new_state != state) && (retry <= SMI_TRIES));
+
+	/* enable IRQs */
+	local_irq_restore(flags);
+
+	if (new_state == state)
+		pr_debug("change to %u MHz succeeded after %u tries "
+				"with result %u\n",
+				(speedstep_freqs[new_state].frequency / 1000),
+				retry, result);
+	else
+		printk(KERN_ERR "cpufreq: change to state %u "
+				"failed with new_state %u and result %u\n",
+				state, new_state, result);
+
+	return;
+}
+
+
+/**
+ * speedstep_target - set a new CPUFreq policy
+ * @policy: new policy
+ * @target_freq: new frequency to set
+ * @relation: how the target frequency relates to the achieved frequency
+ *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ *
+ * Sets a new CPUFreq policy/freq.
+ */
+static int speedstep_target(struct cpufreq_policy *policy,
+			unsigned int target_freq, unsigned int relation)
+{
+	unsigned int newstate = 0;
+	struct cpufreq_freqs freqs;
+
+	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
+				target_freq, relation, &newstate))
+		return -EINVAL;
+
+	freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
+	freqs.new = speedstep_freqs[newstate].frequency;
+	freqs.cpu = 0; /* speedstep.c is UP only driver */
+
+	if (freqs.old == freqs.new)
+		return 0;
+
+	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
+	speedstep_set_state(newstate);
+	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
+
+	return 0;
+}
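To see how the relation argument plays out against a two-point table, a
hedged example with invented frequencies (policy is assumed to be a valid
struct cpufreq_policy covering both points):

    /* hypothetical: table populated with 1200000 and 1600000 kHz */
    unsigned int newstate;

    cpufreq_frequency_table_target(policy, &speedstep_freqs[0], 1400000,
                                   CPUFREQ_RELATION_L, &newstate);
    /* RELATION_L picks the lowest frequency at or above the target,
     * here the 1600000 kHz entry; CPUFREQ_RELATION_H would instead
     * pick the 1200000 kHz entry. */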
+
+
+/**
+ * speedstep_verify - verifies a new CPUFreq policy
+ * @policy: new policy
+ *
+ * Limit must be within speedstep_low_freq and speedstep_high_freq, with
+ * at least one border included.
+ */
+static int speedstep_verify(struct cpufreq_policy *policy)
+{
+	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
+}
+
+
+static int speedstep_cpu_init(struct cpufreq_policy *policy)
+{
+	int result;
+	unsigned int speed, state;
+	unsigned int *low, *high;
+
+	/* capability check */
+	if (policy->cpu != 0)
+		return -ENODEV;
+
+	result = speedstep_smi_ownership();
+	if (result) {
+		pr_debug("failed to acquire ownership of the SMI interface\n");
+		return -EINVAL;
+	}
+
+	/* detect low and high frequency */
+	low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
+	high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
+
+	result = speedstep_smi_get_freqs(low, high);
+	if (result) {
+		/* fall back to the speedstep-lib.c detection mechanism:
+		 * try both states out */
+		pr_debug("could not detect low and high frequencies "
+				"by SMI call.\n");
+		result = speedstep_get_freqs(speedstep_processor,
+				low, high,
+				NULL,
+				&speedstep_set_state);
+
+		if (result) {
+			pr_debug("could not detect two different speeds"
+					" -- aborting.\n");
+			return result;
+		} else
+			pr_debug("workaround worked.\n");
+	}
+
+	/* get current speed setting */
+	state = speedstep_get_state();
+	speed = speedstep_freqs[state].frequency;
+
+	pr_debug("currently at %s speed setting - %i MHz\n",
+		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
+		? "low" : "high",
+		(speed / 1000));
+
+	/* cpuinfo and default policy values */
+	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+	policy->cur = speed;
+
+	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
+	if (result)
+		return result;
+
+	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
+
+	return 0;
+}
+
+static int speedstep_cpu_exit(struct cpufreq_policy *policy)
+{
+	cpufreq_frequency_table_put_attr(policy->cpu);
+	return 0;
+}
+
+static unsigned int speedstep_get(unsigned int cpu)
+{
+	if (cpu)
+		return -ENODEV;
+	return speedstep_get_frequency(speedstep_processor);
+}
+
+
+static int speedstep_resume(struct cpufreq_policy *policy)
+{
+	int result = speedstep_smi_ownership();
+
+	if (result)
+		pr_debug("failed to re-acquire ownership of the SMI interface\n");
+
+	return result;
+}
+
+static struct freq_attr *speedstep_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	NULL,
+};
+
+static struct cpufreq_driver speedstep_driver = {
+	.name		= "speedstep-smi",
+	.verify		= speedstep_verify,
+	.target		= speedstep_target,
+	.init		= speedstep_cpu_init,
+	.exit		= speedstep_cpu_exit,
+	.get		= speedstep_get,
+	.resume		= speedstep_resume,
+	.owner		= THIS_MODULE,
+	.attr		= speedstep_attr,
+};
+
+/**
+ * speedstep_init - initializes the SpeedStep CPUFreq driver
+ *
+ * Initializes the SpeedStep support. Returns -ENODEV on unsupported
+ * BIOS, -EINVAL on problems during initialization, and zero on
+ * success.
+ */
+static int __init speedstep_init(void)
+{
+	speedstep_processor = speedstep_detect_processor();
+
+	switch (speedstep_processor) {
+	case SPEEDSTEP_CPU_PIII_T:
+	case SPEEDSTEP_CPU_PIII_C:
+	case SPEEDSTEP_CPU_PIII_C_EARLY:
+		break;
+	default:
+		speedstep_processor = 0;
+	}
+
+	if (!speedstep_processor) {
+		pr_debug("No supported Intel CPU detected.\n");
+		return -ENODEV;
+	}
+
+	pr_debug("signature:0x%.8x, command:0x%.8x, "
+		"event:0x%.8x, perf_level:0x%.8x.\n",
+		ist_info.signature, ist_info.command,
+		ist_info.event, ist_info.perf_level);
+
+	/* Error if no IST-SMI BIOS or no PARM
+		 sig = 'ISGE' aka 'Intel Speedstep Gate E' */
+	if ((ist_info.signature != 0x47534943) &&
+	    ((smi_port == 0) || (smi_cmd == 0)))
+		return -ENODEV;
+
+	if (smi_sig == 1)
+		smi_sig = 0x47534943;
+	else
+		smi_sig = ist_info.signature;
+
+	/* setup smi_port from MODULE_PARM or BIOS */
+	if ((smi_port > 0xff) || (smi_port < 0))
+		return -EINVAL;
+	else if (smi_port == 0)
+		smi_port = ist_info.command & 0xff;
+
+	if ((smi_cmd > 0xff) || (smi_cmd < 0))
+		return -EINVAL;
+	else if (smi_cmd == 0)
+		smi_cmd = (ist_info.command >> 16) & 0xff;
+
+	return cpufreq_register_driver(&speedstep_driver);
+}
+
+
+/**
+ * speedstep_exit - unregisters SpeedStep support
+ *
+ * Unregisters SpeedStep support.
+ */
+static void __exit speedstep_exit(void)
+{
+	cpufreq_unregister_driver(&speedstep_driver);
+}
+
+module_param(smi_port, int, 0444);
+module_param(smi_cmd, int, 0444);
+module_param(smi_sig, uint, 0444);
+
+MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
+		"-- Intel's default setting is 0xb2");
+MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
+		"-- Intel's default setting is 0x82");
+MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
+		"SMI interface.");
+
+MODULE_AUTHOR("Hiroshi Miura");
+MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
+MODULE_LICENSE("GPL");
+
+module_init(speedstep_init);
+module_exit(speedstep_exit);
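Finally, a hedged worked example of the BIOS fallback in speedstep_init();
the ist_info.command value is invented but shaped like Intel's documented
defaults:

    unsigned int command = 0x008200b2;          /* assumed ist_info.command */
    unsigned int port = command & 0xff;         /* 0xb2 */
    unsigned int cmd  = (command >> 16) & 0xff; /* 0x82 */

Either value can still be overridden at load time with the smi_port= and
smi_cmd= module parameters described above.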