119 files changed, 2710 insertions, 17582 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7338ef2218b..04105574c8e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,7 +24,6 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o		:= $(nostackp)
-CFLAGS_tsc.o		:= $(nostackp)
 CFLAGS_paravirt.o	:= $(nostackp)
 GCOV_PROFILE_vsyscall_64.o	:= n
 GCOV_PROFILE_hpet.o		:= n
@@ -36,10 +35,11 @@ obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
-obj-$(CONFIG_X86_32)	+= probe_roms_32.o
+obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -117,9 +117,8 @@ obj-$(CONFIG_OF)			+= devicetree.o
 ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
-	obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o
+	obj-$(CONFIG_GART_IOMMU)	+= amd_gart_64.o aperture_64.o
 	obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
-	obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9a966c579af..4558f0d0822 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.irqflag = (trigger << 2) | polarity;
 	mp_irq.srcbus = MP_ISA_BUS;
 	mp_irq.srcbusirq = bus_irq;	/* IRQ */
-	mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+	mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
 	mp_irq.dstirq = pin;	/* INTIN# */
 
 	mp_save_irq(&mp_irq);
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		if (ioapic < 0)
 			continue;
 		pin = mp_find_ioapic_pin(ioapic, gsi);
-		dstapic = mp_ioapics[ioapic].apicid;
+		dstapic = mpc_ioapic_id(ioapic);
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 	mp_irq.srcbus = number;
 	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
 	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+	mp_irq.dstapic = mpc_ioapic_id(ioapic);
 	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
 
 	mp_save_irq(&mp_irq);
@@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mp_ioapics[ioapic].apicid,
+		       "%d-%d\n", mpc_ioapic_id(ioapic),
 		       ioapic_pin);
 		return gsi;
 	}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index ead21b66311..b4fd836e405 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -28,6 +28,8 @@ pmode_cr3:	.long	0	/* Saved %cr3 */
 pmode_cr4:	.long	0	/* Saved %cr4 */
 pmode_efer:	.quad	0	/* Saved EFER */
 pmode_gdt:	.quad	0
+pmode_misc_en:	.quad	0	/* Saved MISC_ENABLE MSR */
+pmode_behavior:	.long	0	/* Wakeup behavior flags */
 realmode_flags:	.long	0
 real_magic:	.long	0
 trampoline_segment:	.word 0
@@ -91,6 +93,18 @@ wakeup_code:
 	/* Call the C code */
 	calll	main
 
+	/* Restore MISC_ENABLE before entering protected mode, in case
+	   BIOS decided to clear XD_DISABLE during S3. */
+	movl	pmode_behavior, %eax
+	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
+	jnc	1f
+
+	movl	pmode_misc_en, %eax
+	movl	pmode_misc_en + 4, %edx
+	movl	$MSR_IA32_MISC_ENABLE, %ecx
+	wrmsr
+1:
+
 	/* Do any other stuff... */
 
 #ifndef CONFIG_64BIT
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index e1828c07e79..97a29e1430e 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
 	u32 pmode_efer_low;	/* Protected mode EFER */
 	u32 pmode_efer_high;
 	u64 pmode_gdt;
+	u32 pmode_misc_en_low;	/* Protected mode MISC_ENABLE */
+	u32 pmode_misc_en_high;
+	u32 pmode_behavior;	/* Wakeup routine behavior flags */
 	u32 realmode_flags;
 	u32 real_magic;
 	u16 trampoline_segment;	/* segment with trampoline code, 64-bit only */
@@ -39,4 +42,7 @@ extern struct wakeup_header wakeup_header;
 #define WAKEUP_HEADER_SIGNATURE 0x51ee1111
 #define WAKEUP_END_SIGNATURE	0x65a22c82
 
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0
+
 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ff93bc1b09c..103b6ab368d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -77,6 +77,12 @@ int acpi_suspend_lowlevel(void)
 
 	header->pmode_cr0 = read_cr0();
 	header->pmode_cr4 = read_cr4_safe();
+	header->pmode_behavior = 0;
+	if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
+			&header->pmode_misc_en_low,
+			&header->pmode_misc_en_high))
+		header->pmode_behavior |=
+			(1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
 	header->realmode_flags = acpi_realmode_flags;
 	header->real_magic = 0x12345678;
 
@@ -112,11 +118,6 @@ static int __init acpi_sleep_setup(char *str)
 #ifdef CONFIG_HIBERNATION
 		if (strncmp(str, "s4_nohwsig", 10) == 0)
 			acpi_no_s4_hw_signature();
-		if (strncmp(str, "s4_nonvs", 8) == 0) {
-			pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
-					"please use acpi_sleep=nonvs instead");
-			acpi_nvs_nosave();
-		}
 #endif
 		if (strncmp(str, "nonvs", 5) == 0)
 			acpi_nvs_nosave();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4a234677e21..c6382281624 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
 #include <asm/pgtable.h>
 #include <asm/mce.h>
 #include <asm/nmi.h>
-#include <asm/vsyscall.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
@@ -67,17 +66,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #define DPRINTK(fmt, args...) if (debug_alternative) \
 	printk(KERN_DEBUG fmt, args)
 
+/*
+ * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
+ * that correspond to that nop. Getting from one nop to the next, we
+ * add to the array the offset that is equal to the sum of all sizes of
+ * nops preceding the one we are after.
+ *
+ * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
+ * nice symmetry of sizes of the previous nops.
+ */
 #if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
-/* Use inline assembly to define this because the nops are defined
-   as inline assembly strings in the include files and we cannot
-   get them easily into strings. */
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: "
-	GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
-	GENERIC_NOP7 GENERIC_NOP8
-    "\t.previous");
-extern const unsigned char intelnops[];
-static const unsigned char *const __initconst_or_module
-intel_nops[ASM_NOP_MAX+1] = {
+static const unsigned char intelnops[] =
+{
+	GENERIC_NOP1,
+	GENERIC_NOP2,
+	GENERIC_NOP3,
+	GENERIC_NOP4,
+	GENERIC_NOP5,
+	GENERIC_NOP6,
+	GENERIC_NOP7,
+	GENERIC_NOP8,
+	GENERIC_NOP5_ATOMIC
+};
+static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
+{
 	NULL,
 	intelnops,
 	intelnops + 1,
@@ -87,17 +99,25 @@ intel_nops[ASM_NOP_MAX+1] = {
 	intelnops + 1 + 2 + 3 + 4 + 5,
 	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
 	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 };
 #endif
 
 #ifdef K8_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: "
-	K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
-	K8_NOP7 K8_NOP8
-    "\t.previous");
-extern const unsigned char k8nops[];
-static const unsigned char *const __initconst_or_module
-k8_nops[ASM_NOP_MAX+1] = {
+static const unsigned char k8nops[] =
+{
+	K8_NOP1,
+	K8_NOP2,
+	K8_NOP3,
+	K8_NOP4,
+	K8_NOP5,
+	K8_NOP6,
+	K8_NOP7,
+	K8_NOP8,
+	K8_NOP5_ATOMIC
+};
+static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
+{
 	NULL,
 	k8nops,
 	k8nops + 1,
@@ -107,17 +127,25 @@ k8_nops[ASM_NOP_MAX+1] = {
 	k8nops + 1 + 2 + 3 + 4 + 5,
 	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
 	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 };
 #endif
 
 #if defined(K7_NOP1) && !defined(CONFIG_X86_64)
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: "
-	K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
-	K7_NOP7 K7_NOP8
-    "\t.previous");
-extern const unsigned char k7nops[];
-static const unsigned char *const __initconst_or_module
-k7_nops[ASM_NOP_MAX+1] = {
+static const unsigned char k7nops[] =
+{
+	K7_NOP1,
+	K7_NOP2,
+	K7_NOP3,
+	K7_NOP4,
+	K7_NOP5,
+	K7_NOP6,
+	K7_NOP7,
+	K7_NOP8,
+	K7_NOP5_ATOMIC
+};
+static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
+{
 	NULL,
 	k7nops,
 	k7nops + 1,
@@ -127,17 +155,26 @@ k7_nops[ASM_NOP_MAX+1] = {
 	k7nops + 1 + 2 + 3 + 4 + 5,
 	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
 	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 };
 #endif
 
 #ifdef P6_NOP1
-asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: "
-	P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
-	P6_NOP7 P6_NOP8
-    "\t.previous");
-extern const unsigned char p6nops[];
-static const unsigned char *const __initconst_or_module
-p6_nops[ASM_NOP_MAX+1] = {
+/* Not init data: p6_nops (and ideal_nops, by default on 64-bit) point here. */
+static const unsigned char p6nops[] =
+{
+	P6_NOP1,
+	P6_NOP2,
+	P6_NOP3,
+	P6_NOP4,
+	P6_NOP5,
+	P6_NOP6,
+	P6_NOP7,
+	P6_NOP8,
+	P6_NOP5_ATOMIC
+};
+static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
+{
 	NULL,
 	p6nops,
 	p6nops + 1,
@@ -147,47 +183,72 @@ p6_nops[ASM_NOP_MAX+1] = {
 	p6nops + 1 + 2 + 3 + 4 + 5,
 	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
 	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
 };
 #endif
 
+/* Initialize these to a safe default */
 #ifdef CONFIG_X86_64
+const unsigned char * const *ideal_nops = p6_nops;
+#else
+const unsigned char * const *ideal_nops = intel_nops;
+#endif
 
-extern char __vsyscall_0;
-static const unsigned char *const *__init_or_module find_nop_table(void)
+/*
+ * Pick the NOP table that suits the boot CPU and publish it through
+ * ideal_nops.  Intel CPUs are handled by their own case; every other
+ * vendor takes the default case.
+ */
+void __init arch_init_ideal_nops(void)
 {
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-	    boot_cpu_has(X86_FEATURE_NOPL))
-		return p6_nops;
-	else
-		return k8_nops;
-}
-
-#else /* CONFIG_X86_64 */
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		/*
+		 * Due to a decoder implementation quirk, some
+		 * specific Intel CPUs actually perform better with
+		 * the "k8_nops" than with the SDM-recommended NOPs.
+		 */
+		if (boot_cpu_data.x86 == 6 &&
+		    boot_cpu_data.x86_model >= 0x0f &&
+		    boot_cpu_data.x86_model != 0x1c &&
+		    boot_cpu_data.x86_model != 0x26 &&
+		    boot_cpu_data.x86_model != 0x27 &&
+		    boot_cpu_data.x86_model < 0x30) {
+			ideal_nops = k8_nops;
+		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
+			ideal_nops = p6_nops;
+		} else {
+#ifdef CONFIG_X86_64
+			ideal_nops = k8_nops;
+#else
+			ideal_nops = intel_nops;
+#endif
+		}
+		/* Keep the Intel choice: do not fall through. */
+		break;
 
-static const unsigned char *const *__init_or_module find_nop_table(void)
-{
-	if (boot_cpu_has(X86_FEATURE_K8))
-		return k8_nops;
-	else if (boot_cpu_has(X86_FEATURE_K7))
-		return k7_nops;
-	else if (boot_cpu_has(X86_FEATURE_NOPL))
-		return p6_nops;
-	else
-		return intel_nops;
+	default:
+#ifdef CONFIG_X86_64
+		ideal_nops = k8_nops;
+#else
+		if (boot_cpu_has(X86_FEATURE_K8))
+			ideal_nops = k8_nops;
+		else if (boot_cpu_has(X86_FEATURE_K7))
+			ideal_nops = k7_nops;
+		else
+			ideal_nops = intel_nops;
+#endif
+	}
 }
 
-#endif /* CONFIG_X86_64 */
-
 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
 static void __init_or_module add_nops(void *insns, unsigned int len)
 {
-	const unsigned char *const *noptable = find_nop_table();
-
 	while (len > 0) {
 		unsigned int noplen = len;
 		if (noplen > ASM_NOP_MAX)
 			noplen = ASM_NOP_MAX;
-		memcpy(insns, noptable[noplen], noplen);
+		memcpy(insns, ideal_nops[noplen], noplen);
 		insns += noplen;
 		len -= noplen;
 	}
@@ -207,29 +261,37 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 					 struct alt_instr *end)
 {
 	struct alt_instr *a;
+	u8 *instr, *replacement;
 	u8 insnbuf[MAX_PATCH_LEN];
 
 	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	/*
+	 * The scan order should be from start to end. A later scanned
+	 * alternative code can overwrite a previous scanned alternative code.
+	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
+	 * patch code.
+	 *
+	 * So be careful if you want to change the scan order to any other
+	 * order.
+	 */
 	for (a = start; a < end; a++) {
-		u8 *instr = a->instr;
+		instr = (u8 *)&a->instr_offset + a->instr_offset;
+		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
 		BUG_ON(a->cpuid >= NCAPINTS*32);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
-#ifdef CONFIG_X86_64
-		/* vsyscall code is not mapped yet. resolve it manually. */
-		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
-			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
-			DPRINTK("%s: vsyscall fixup: %p => %p\n",
-				__func__, a->instr, instr);
-		}
-#endif
-		memcpy(insnbuf, a->replacement, a->replacementlen);
+
+		memcpy(insnbuf, replacement, a->replacementlen);
+
+		/* 0xe8 is a relative CALL (rel32); fix the offset. */
 		if (*insnbuf == 0xe8 && a->replacementlen == 5)
-		    *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
+		    *(s32 *)(insnbuf + 1) += replacement - instr;
+
 		add_nops(insnbuf + a->replacementlen,
 			 a->instrlen - a->replacementlen);
+
 		text_poke_early(instr, insnbuf, a->instrlen);
 	}
 }
@@ -678,29 +740,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
 	wrote_text = 0;
 	__stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
 }
-
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
-
-#ifdef CONFIG_X86_64
-unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
-#else
-unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
-#endif
-
-void __init arch_init_ideal_nop5(void)
-{
-	/*
-	 * There is no good nop for all x86 archs.  This selection
-	 * algorithm should be unified with the one in find_nop_table(),
-	 * but this should be good enough for now.
-	 *
-	 * For cases other than the ones below, use the safe (as in
-	 * always functional) defaults above.
-	 */
-#ifdef CONFIG_X86_64
-	/* Don't use these on 32 bits due to broken virtualizers */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		memcpy(ideal_nop5, p6_nops[5], 5);
-#endif
-}
-#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 82ada01625b..8a439d364b9 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -30,7 +30,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/io.h>
 #include <linux/gfp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/mtrr.h>
 #include <asm/pgtable.h>
 #include <asm/proto.h>
@@ -81,6 +81,9 @@ static u32 gart_unmapped_entry;
 #define AGPEXTERN
 #endif
 
+/* GART can only remap to physical addresses < 1TB */
+#define GART_MAX_PHYS_ADDR	(1ULL << 40)
+
 /* backdoor interface to AGP driver */
 AGPEXTERN int agp_memory_reserved;
 AGPEXTERN __u32 *agp_gatt_table;
@@ -212,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 				size_t size, int dir, unsigned long align_mask)
 {
 	unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
-	unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
+	unsigned long iommu_page;
 	int i;
 
+	if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
+		return bad_dma_addr;
+
+	iommu_page = alloc_iommu(dev, npages, align_mask);
 	if (iommu_page == -1) {
 		if (!nonforced_iommu(dev, phys_mem, size))
 			return phys_mem;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 57ca7778722..00000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2635 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define EXIT_LOOP_COUNT 10000000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-/*
- * Domain for untranslated devices - only allocated
- * if iommu=pt passed on kernel cmd line.
- */
-static struct protection_domain *pt_domain;
-
-static struct iommu_ops amd_iommu_ops;
-
-/*
- * general struct to manage commands send to an IOMMU
- */
-struct iommu_cmd {
-	u32 data[4];
-};
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu);
-static void update_domain(struct protection_domain *domain);
-
-/****************************************************************************
- *
- * Helper functions
- *
- ****************************************************************************/
-
-static inline u16 get_device_id(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return calc_devid(pdev->bus->number, pdev->devfn);
-}
-
-static struct iommu_dev_data *get_dev_data(struct device *dev)
-{
-	return dev->archdata.iommu;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
-	struct dma_ops_domain *entry, *ret = NULL;
-	unsigned long flags;
-	u16 alias = amd_iommu_alias_table[devid];
-
-	if (list_empty(&iommu_pd_list))
-		return NULL;
-
-	spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
-	list_for_each_entry(entry, &iommu_pd_list, list) {
-		if (entry->target_dev == devid ||
-		    entry->target_dev == alias) {
-			ret = entry;
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
-	return ret;
-}
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
-	u16 devid;
-
-	if (!dev || !dev->dma_mask)
-		return false;
-
-	/* No device or no PCI device */
-	if (dev->bus != &pci_bus_type)
-		return false;
-
-	devid = get_device_id(dev);
-
-	/* Out of our scope? */
-	if (devid > amd_iommu_last_bdf)
-		return false;
-
-	if (amd_iommu_rlookup_table[devid] == NULL)
-		return false;
-
-	return true;
-}
-
-static int iommu_init_device(struct device *dev)
-{
-	struct iommu_dev_data *dev_data;
-	struct pci_dev *pdev;
-	u16 devid, alias;
-
-	if (dev->archdata.iommu)
-		return 0;
-
-	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
-	if (!dev_data)
-		return -ENOMEM;
-
-	dev_data->dev = dev;
-
-	devid = get_device_id(dev);
-	alias = amd_iommu_alias_table[devid];
-	pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
-	if (pdev)
-		dev_data->alias = &pdev->dev;
-
-	atomic_set(&dev_data->bind, 0);
-
-	dev->archdata.iommu = dev_data;
-
-
-	return 0;
-}
-
-static void iommu_uninit_device(struct device *dev)
-{
-	kfree(dev->archdata.iommu);
-}
-
-void __init amd_iommu_uninit_devices(void)
-{
-	struct pci_dev *pdev = NULL;
-
-	for_each_pci_dev(pdev) {
-
-		if (!check_device(&pdev->dev))
-			continue;
-
-		iommu_uninit_device(&pdev->dev);
-	}
-}
-
-int __init amd_iommu_init_devices(void)
-{
-	struct pci_dev *pdev = NULL;
-	int ret = 0;
-
-	for_each_pci_dev(pdev) {
-
-		if (!check_device(&pdev->dev))
-			continue;
-
-		ret = iommu_init_device(&pdev->dev);
-		if (ret)
-			goto out_free;
-	}
-
-	return 0;
-
-out_free:
-
-	amd_iommu_uninit_devices();
-
-	return ret;
-}
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
-	if (stats_dir == NULL)
-		return;
-
-	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
-				       &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
-	stats_dir = debugfs_create_dir("amd-iommu", NULL);
-	if (stats_dir == NULL)
-		return;
-
-	de_fflush  = debugfs_create_bool("fullflush", 0444, stats_dir,
-					 (u32 *)&amd_iommu_unmap_flush);
-
-	amd_iommu_stats_add(&compl_wait);
-	amd_iommu_stats_add(&cnt_map_single);
-	amd_iommu_stats_add(&cnt_unmap_single);
-	amd_iommu_stats_add(&cnt_map_sg);
-	amd_iommu_stats_add(&cnt_unmap_sg);
-	amd_iommu_stats_add(&cnt_alloc_coherent);
-	amd_iommu_stats_add(&cnt_free_coherent);
-	amd_iommu_stats_add(&cross_page);
-	amd_iommu_stats_add(&domain_flush_single);
-	amd_iommu_stats_add(&domain_flush_all);
-	amd_iommu_stats_add(&alloced_io_mem);
-	amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void dump_dte_entry(u16 devid)
-{
-	int i;
-
-	for (i = 0; i < 8; ++i)
-		pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
-			amd_iommu_dev_table[devid].data[i]);
-}
-
-static void dump_command(unsigned long phys_addr)
-{
-	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
-	int i;
-
-	for (i = 0; i < 4; ++i)
-		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
-}
-
-static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
-{
-	u32 *event = __evt;
-	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
-	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
-	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
-	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
-	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
-	printk(KERN_ERR "AMD-Vi: Event logged [");
-
-	switch (type) {
-	case EVENT_TYPE_ILL_DEV:
-		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		dump_dte_entry(devid);
-		break;
-	case EVENT_TYPE_IO_FAULT:
-		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
-		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       domid, address, flags);
-		break;
-	case EVENT_TYPE_DEV_TAB_ERR:
-		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		break;
-	case EVENT_TYPE_PAGE_TAB_ERR:
-		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
-		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       domid, address, flags);
-		break;
-	case EVENT_TYPE_ILL_CMD:
-		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
-		iommu->reset_in_progress = true;
-		reset_iommu_command_buffer(iommu);
-		dump_command(address);
-		break;
-	case EVENT_TYPE_CMD_HARD_ERR:
-		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
-		       "flags=0x%04x]\n", address, flags);
-		break;
-	case EVENT_TYPE_IOTLB_INV_TO:
-		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
-		       "address=0x%016llx]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address);
-		break;
-	case EVENT_TYPE_INV_DEV_REQ:
-		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		break;
-	default:
-		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
-	}
-}
-
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
-	u32 head, tail;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-
-	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
-	while (head != tail) {
-		iommu_print_event(iommu, iommu->evt_buf + head);
-		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
-	}
-
-	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
-	spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_poll_events(iommu);
-
-	return IRQ_HANDLED;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-/*
- * Writes the command to the IOMMUs command buffer and informs the
- * hardware about the new command. Must be called with iommu->lock held.
- */
-static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
-	u32 tail, head;
-	u8 *target;
-
-	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
-	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-	target = iommu->cmd_buf + tail;
-	memcpy_toio(target, cmd, sizeof(*cmd));
-	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
-	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	if (tail == head)
-		return -ENOMEM;
-	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
-	return 0;
-}
-
-/*
- * General queuing function for commands. Takes iommu->lock and calls
- * __iommu_queue_command().
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
-	unsigned long flags;
-	int ret;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-	ret = __iommu_queue_command(iommu, cmd);
-	if (!ret)
-		iommu->need_sync = true;
-	spin_unlock_irqrestore(&iommu->lock, flags);
-
-	return ret;
-}
-
-/*
- * This function waits until an IOMMU has completed a completion
- * wait command
- */
-static void __iommu_wait_for_completion(struct amd_iommu *iommu)
-{
-	int ready = 0;
-	unsigned status = 0;
-	unsigned long i = 0;
-
-	INC_STATS_COUNTER(compl_wait);
-
-	while (!ready && (i < EXIT_LOOP_COUNT)) {
-		++i;
-		/* wait for the bit to become one */
-		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
-		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
-	}
-
-	/* set bit back to zero */
-	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
-	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
-	if (unlikely(i == EXIT_LOOP_COUNT))
-		iommu->reset_in_progress = true;
-}
-
-/*
- * This function queues a completion wait command into the command
- * buffer of an IOMMU
- */
-static int __iommu_completion_wait(struct amd_iommu *iommu)
-{
-	struct iommu_cmd cmd;
-
-	 memset(&cmd, 0, sizeof(cmd));
-	 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
-	 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
-
-	 return __iommu_queue_command(iommu, &cmd);
-}
-
-/*
- * This function is called whenever we need to ensure that the IOMMU has
- * completed execution of all commands we sent. It sends a
- * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
- * us about that by writing a value to a physical address we pass with
- * the command.
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
-	int ret = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-
-	if (!iommu->need_sync)
-		goto out;
-
-	ret = __iommu_completion_wait(iommu);
-
-	iommu->need_sync = false;
-
-	if (ret)
-		goto out;
-
-	__iommu_wait_for_completion(iommu);
-
-out:
-	spin_unlock_irqrestore(&iommu->lock, flags);
-
-	if (iommu->reset_in_progress)
-		reset_iommu_command_buffer(iommu);
-
-	return 0;
-}
-
-static void iommu_flush_complete(struct protection_domain *domain)
-{
-	int i;
-
-	for (i = 0; i < amd_iommus_present; ++i) {
-		if (!domain->dev_iommu[i])
-			continue;
-
-		/*
-		 * Devices of this domain are behind this IOMMU
-		 * We need to wait for completion of all commands.
-		 */
-		iommu_completion_wait(amd_iommus[i]);
-	}
-}
-
-/*
- * Command send function for invalidating a device table entry
- */
-static int iommu_flush_device(struct device *dev)
-{
-	struct amd_iommu *iommu;
-	struct iommu_cmd cmd;
-	u16 devid;
-
-	devid = get_device_id(dev);
-	iommu = amd_iommu_rlookup_table[devid];
-
-	/* Build command */
-	memset(&cmd, 0, sizeof(cmd));
-	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
-	cmd.data[0] = devid;
-
-	return iommu_queue_command(iommu, &cmd);
-}
-
-static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
-					  u16 domid, int pde, int s)
-{
-	memset(cmd, 0, sizeof(*cmd));
-	address &= PAGE_MASK;
-	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
-	cmd->data[1] |= domid;
-	cmd->data[2] = lower_32_bits(address);
-	cmd->data[3] = upper_32_bits(address);
-	if (s) /* size bit - we flush more than one 4kb page */
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-	if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-/*
- * Generic command send function for invalidaing TLB entries
- */
-static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
-		u64 address, u16 domid, int pde, int s)
-{
-	struct iommu_cmd cmd;
-	int ret;
-
-	__iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
-
-	ret = iommu_queue_command(iommu, &cmd);
-
-	return ret;
-}
-
-/*
- * TLB invalidation function which is called from the mapping functions.
- * It invalidates a single PTE if the range to flush is within a single
- * page. Otherwise it flushes the whole TLB of the IOMMU.
- */
-static void __iommu_flush_pages(struct protection_domain *domain,
-				u64 address, size_t size, int pde)
-{
-	int s = 0, i;
-	unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
-
-	address &= PAGE_MASK;
-
-	if (pages > 1) {
-		/*
-		 * If we have to flush more than one page, flush all
-		 * TLB entries for this domain
-		 */
-		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-		s = 1;
-	}
-
-
-	for (i = 0; i < amd_iommus_present; ++i) {
-		if (!domain->dev_iommu[i])
-			continue;
-
-		/*
-		 * Devices of this domain are behind this IOMMU
-		 * We need a TLB flush
-		 */
-		iommu_queue_inv_iommu_pages(amd_iommus[i], address,
-					    domain->id, pde, s);
-	}
-
-	return;
-}
-
-static void iommu_flush_pages(struct protection_domain *domain,
-			     u64 address, size_t size)
-{
-	__iommu_flush_pages(domain, address, size, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain */
-static void iommu_flush_tlb(struct protection_domain *domain)
-{
-	__iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void iommu_flush_tlb_pde(struct protection_domain *domain)
-{
-	__iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void iommu_flush_domain_devices(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	list_for_each_entry(dev_data, &domain->dev_list, list)
-		iommu_flush_device(dev_data->dev);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-static void iommu_flush_all_domain_devices(void)
-{
-	struct protection_domain *domain;
-	unsigned long flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
-	list_for_each_entry(domain, &amd_iommu_pd_list, list) {
-		iommu_flush_domain_devices(domain);
-		iommu_flush_complete(domain);
-	}
-
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-void amd_iommu_flush_all_devices(void)
-{
-	iommu_flush_all_domain_devices();
-}
-
-/*
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
- */
-void amd_iommu_flush_all_domains(void)
-{
-	struct protection_domain *domain;
-	unsigned long flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-
-	list_for_each_entry(domain, &amd_iommu_pd_list, list) {
-		spin_lock(&domain->lock);
-		iommu_flush_tlb_pde(domain);
-		iommu_flush_complete(domain);
-		spin_unlock(&domain->lock);
-	}
-
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static void reset_iommu_command_buffer(struct amd_iommu *iommu)
-{
-	pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
-
-	if (iommu->reset_in_progress)
-		panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
-
-	amd_iommu_reset_cmd_buffer(iommu);
-	amd_iommu_flush_all_devices();
-	amd_iommu_flush_all_domains();
-
-	iommu->reset_in_progress = false;
-}
-
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
-				   gfp_t gfp)
-{
-	u64 *pte;
-
-	if (domain->mode == PAGE_MODE_6_LEVEL)
-		/* address space already 64 bit large */
-		return false;
-
-	pte = (void *)get_zeroed_page(gfp);
-	if (!pte)
-		return false;
-
-	*pte             = PM_LEVEL_PDE(domain->mode,
-					virt_to_phys(domain->pt_root));
-	domain->pt_root  = pte;
-	domain->mode    += 1;
-	domain->updated  = true;
-
-	return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
-		      unsigned long address,
-		      unsigned long page_size,
-		      u64 **pte_page,
-		      gfp_t gfp)
-{
-	int level, end_lvl;
-	u64 *pte, *page;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	while (address > PM_LEVEL_SIZE(domain->mode))
-		increase_address_space(domain, gfp);
-
-	level   = domain->mode - 1;
-	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-	address = PAGE_SIZE_ALIGN(address, page_size);
-	end_lvl = PAGE_SIZE_LEVEL(page_size);
-
-	while (level > end_lvl) {
-		if (!IOMMU_PTE_PRESENT(*pte)) {
-			page = (u64 *)get_zeroed_page(gfp);
-			if (!page)
-				return NULL;
-			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
-		}
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*pte) != level)
-			return NULL;
-
-		level -= 1;
-
-		pte = IOMMU_PTE_PAGE(*pte);
-
-		if (pte_page && level == end_lvl)
-			*pte_page = pte;
-
-		pte = &pte[PM_LEVEL_INDEX(level, address)];
-	}
-
-	return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-{
-	int level;
-	u64 *pte;
-
-	if (address > PM_LEVEL_SIZE(domain->mode))
-		return NULL;
-
-	level   =  domain->mode - 1;
-	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
-	while (level > 0) {
-
-		/* Not Present */
-		if (!IOMMU_PTE_PRESENT(*pte))
-			return NULL;
-
-		/* Large PTE */
-		if (PM_PTE_LEVEL(*pte) == 0x07) {
-			unsigned long pte_mask, __pte;
-
-			/*
-			 * If we have a series of large PTEs, make
-			 * sure to return a pointer to the first one.
-			 */
-			pte_mask = PTE_PAGE_SIZE(*pte);
-			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
-			__pte    = ((unsigned long)pte) & pte_mask;
-
-			return (u64 *)__pte;
-		}
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*pte) != level)
-			return NULL;
-
-		level -= 1;
-
-		/* Walk to the next level */
-		pte = IOMMU_PTE_PAGE(*pte);
-		pte = &pte[PM_LEVEL_INDEX(level, address)];
-	}
-
-	return pte;
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_map_page(struct protection_domain *dom,
-			  unsigned long bus_addr,
-			  unsigned long phys_addr,
-			  int prot,
-			  unsigned long page_size)
-{
-	u64 __pte, *pte;
-	int i, count;
-
-	if (!(prot & IOMMU_PROT_MASK))
-		return -EINVAL;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-	count     = PAGE_SIZE_PTE_COUNT(page_size);
-	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
-
-	for (i = 0; i < count; ++i)
-		if (IOMMU_PTE_PRESENT(pte[i]))
-			return -EBUSY;
-
-	if (page_size > PAGE_SIZE) {
-		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
-		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
-	} else
-		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
-	if (prot & IOMMU_PROT_IR)
-		__pte |= IOMMU_PTE_IR;
-	if (prot & IOMMU_PROT_IW)
-		__pte |= IOMMU_PTE_IW;
-
-	for (i = 0; i < count; ++i)
-		pte[i] = __pte;
-
-	update_domain(dom);
-
-	return 0;
-}
-
-static unsigned long iommu_unmap_page(struct protection_domain *dom,
-				      unsigned long bus_addr,
-				      unsigned long page_size)
-{
-	unsigned long long unmap_size, unmapped;
-	u64 *pte;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	unmapped = 0;
-
-	while (unmapped < page_size) {
-
-		pte = fetch_pte(dom, bus_addr);
-
-		if (!pte) {
-			/*
-			 * No PTE for this address
-			 * move forward in 4kb steps
-			 */
-			unmap_size = PAGE_SIZE;
-		} else if (PM_PTE_LEVEL(*pte) == 0) {
-			/* 4kb PTE found for this address */
-			unmap_size = PAGE_SIZE;
-			*pte       = 0ULL;
-		} else {
-			int count, i;
-
-			/* Large PTE found which maps this address */
-			unmap_size = PTE_PAGE_SIZE(*pte);
-			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
-			for (i = 0; i < count; i++)
-				pte[i] = 0ULL;
-		}
-
-		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
-		unmapped += unmap_size;
-	}
-
-	BUG_ON(!is_power_of_2(unmapped));
-
-	return unmapped;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
-			       struct unity_map_entry *entry)
-{
-	u16 bdf, i;
-
-	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
-		bdf = amd_iommu_alias_table[i];
-		if (amd_iommu_rlookup_table[bdf] == iommu)
-			return 1;
-	}
-
-	return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
-			     struct unity_map_entry *e)
-{
-	u64 addr;
-	int ret;
-
-	for (addr = e->address_start; addr < e->address_end;
-	     addr += PAGE_SIZE) {
-		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PAGE_SIZE);
-		if (ret)
-			return ret;
-		/*
-		 * if unity mapping is in aperture range mark the page
-		 * as allocated in the aperture
-		 */
-		if (addr < dma_dom->aperture_size)
-			__set_bit(addr >> PAGE_SHIFT,
-				  dma_dom->aperture[0]->bitmap);
-	}
-
-	return 0;
-}
-
-/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
-	struct unity_map_entry *entry;
-	int ret;
-
-	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
-		if (!iommu_for_unity_map(iommu, entry))
-			continue;
-		ret = dma_ops_unity_map(iommu->default_dom, entry);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-/*
- * Inits the unity mappings required for a specific device
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
-					  u16 devid)
-{
-	struct unity_map_entry *e;
-	int ret;
-
-	list_for_each_entry(e, &amd_iommu_unity_map, list) {
-		if (!(devid >= e->devid_start && devid <= e->devid_end))
-			continue;
-		ret = dma_ops_unity_map(dma_dom, e);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. Its basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
-				      unsigned long start_page,
-				      unsigned int pages)
-{
-	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
-	if (start_page + pages > last_page)
-		pages = last_page - start_page;
-
-	for (i = start_page; i < start_page + pages; ++i) {
-		int index = i / APERTURE_RANGE_PAGES;
-		int page  = i % APERTURE_RANGE_PAGES;
-		__set_bit(page, dom->aperture[index]->bitmap);
-	}
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
-			   bool populate, gfp_t gfp)
-{
-	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	struct amd_iommu *iommu;
-	unsigned long i;
-
-#ifdef CONFIG_IOMMU_STRESS
-	populate = false;
-#endif
-
-	if (index >= APERTURE_MAX_RANGES)
-		return -ENOMEM;
-
-	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
-	if (!dma_dom->aperture[index])
-		return -ENOMEM;
-
-	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
-	if (!dma_dom->aperture[index]->bitmap)
-		goto out_free;
-
-	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
-	if (populate) {
-		unsigned long address = dma_dom->aperture_size;
-		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
-		u64 *pte, *pte_page;
-
-		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
-					&pte_page, gfp);
-			if (!pte)
-				goto out_free;
-
-			dma_dom->aperture[index]->pte_pages[i] = pte_page;
-
-			address += APERTURE_RANGE_SIZE / 64;
-		}
-	}
-
-	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
-	/* Initialize the exclusion range if necessary */
-	for_each_iommu(iommu) {
-		if (iommu->exclusion_start &&
-		    iommu->exclusion_start >= dma_dom->aperture[index]->offset
-		    && iommu->exclusion_start < dma_dom->aperture_size) {
-			unsigned long startpage;
-			int pages = iommu_num_pages(iommu->exclusion_start,
-						    iommu->exclusion_length,
-						    PAGE_SIZE);
-			startpage = iommu->exclusion_start >> PAGE_SHIFT;
-			dma_ops_reserve_addresses(dma_dom, startpage, pages);
-		}
-	}
-
-	/*
-	 * Check for areas already mapped as present in the new aperture
-	 * range and mark those pages as reserved in the allocator. Such
-	 * mappings may already exist as a result of requested unity
-	 * mappings for devices.
-	 */
-	for (i = dma_dom->aperture[index]->offset;
-	     i < dma_dom->aperture_size;
-	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i);
-		if (!pte || !IOMMU_PTE_PRESENT(*pte))
-			continue;
-
-		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
-	}
-
-	update_domain(&dma_dom->domain);
-
-	return 0;
-
-out_free:
-	update_domain(&dma_dom->domain);
-
-	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
-	kfree(dma_dom->aperture[index]);
-	dma_dom->aperture[index] = NULL;
-
-	return -ENOMEM;
-}
-
-static unsigned long dma_ops_area_alloc(struct device *dev,
-					struct dma_ops_domain *dom,
-					unsigned int pages,
-					unsigned long align_mask,
-					u64 dma_mask,
-					unsigned long start)
-{
-	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
-	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	int i = start >> APERTURE_RANGE_SHIFT;
-	unsigned long boundary_size;
-	unsigned long address = -1;
-	unsigned long limit;
-
-	next_bit >>= PAGE_SHIFT;
-
-	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-			PAGE_SIZE) >> PAGE_SHIFT;
-
-	for (;i < max_index; ++i) {
-		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
-
-		if (dom->aperture[i]->offset >= dma_mask)
-			break;
-
-		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
-					       dma_mask >> PAGE_SHIFT);
-
-		address = iommu_area_alloc(dom->aperture[i]->bitmap,
-					   limit, next_bit, pages, 0,
-					    boundary_size, align_mask);
-		if (address != -1) {
-			address = dom->aperture[i]->offset +
-				  (address << PAGE_SHIFT);
-			dom->next_address = address + (pages << PAGE_SHIFT);
-			break;
-		}
-
-		next_bit = 0;
-	}
-
-	return address;
-}
-
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
-					     struct dma_ops_domain *dom,
-					     unsigned int pages,
-					     unsigned long align_mask,
-					     u64 dma_mask)
-{
-	unsigned long address;
-
-#ifdef CONFIG_IOMMU_STRESS
-	dom->next_address = 0;
-	dom->need_flush = true;
-#endif
-
-	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-				     dma_mask, dom->next_address);
-
-	if (address == -1) {
-		dom->next_address = 0;
-		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-					     dma_mask, 0);
-		dom->need_flush = true;
-	}
-
-	if (unlikely(address == -1))
-		address = DMA_ERROR_CODE;
-
-	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
-
-	return address;
-}
-
-/*
- * The address free function.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
-				   unsigned long address,
-				   unsigned int pages)
-{
-	unsigned i = address >> APERTURE_RANGE_SHIFT;
-	struct aperture_range *range = dom->aperture[i];
-
-	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
-	if (i < 4)
-		return;
-#endif
-
-	if (address >= dom->next_address)
-		dom->need_flush = true;
-
-	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
-	bitmap_clear(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device get its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-/*
- * This function adds a protection domain to the global protection domain list
- */
-static void add_domain_to_list(struct protection_domain *domain)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-	list_add(&domain->list, &amd_iommu_pd_list);
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-/*
- * This function removes a protection domain to the global
- * protection domain list
- */
-static void del_domain_from_list(struct protection_domain *domain)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-	list_del(&domain->list);
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static u16 domain_id_alloc(void)
-{
-	unsigned long flags;
-	int id;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
-	BUG_ON(id == 0);
-	if (id > 0 && id < MAX_DOMAIN_ID)
-		__set_bit(id, amd_iommu_pd_alloc_bitmap);
-	else
-		id = 0;
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	return id;
-}
-
-static void domain_id_free(int id)
-{
-	unsigned long flags;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	if (id > 0 && id < MAX_DOMAIN_ID)
-		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void free_pagetable(struct protection_domain *domain)
-{
-	int i, j;
-	u64 *p1, *p2, *p3;
-
-	p1 = domain->pt_root;
-
-	if (!p1)
-		return;
-
-	for (i = 0; i < 512; ++i) {
-		if (!IOMMU_PTE_PRESENT(p1[i]))
-			continue;
-
-		p2 = IOMMU_PTE_PAGE(p1[i]);
-		for (j = 0; j < 512; ++j) {
-			if (!IOMMU_PTE_PRESENT(p2[j]))
-				continue;
-			p3 = IOMMU_PTE_PAGE(p2[j]);
-			free_page((unsigned long)p3);
-		}
-
-		free_page((unsigned long)p2);
-	}
-
-	free_page((unsigned long)p1);
-
-	domain->pt_root = NULL;
-}
-
-/*
- * Free a domain, only used if something went wrong in the
- * allocation path and we need to free an already allocated page table
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
-	int i;
-
-	if (!dom)
-		return;
-
-	del_domain_from_list(&dom->domain);
-
-	free_pagetable(&dom->domain);
-
-	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
-		if (!dom->aperture[i])
-			continue;
-		free_page((unsigned long)dom->aperture[i]->bitmap);
-		kfree(dom->aperture[i]);
-	}
-
-	kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(void)
-{
-	struct dma_ops_domain *dma_dom;
-
-	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
-	if (!dma_dom)
-		return NULL;
-
-	spin_lock_init(&dma_dom->domain.lock);
-
-	dma_dom->domain.id = domain_id_alloc();
-	if (dma_dom->domain.id == 0)
-		goto free_dma_dom;
-	INIT_LIST_HEAD(&dma_dom->domain.dev_list);
-	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
-	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
-	dma_dom->domain.flags = PD_DMA_OPS_MASK;
-	dma_dom->domain.priv = dma_dom;
-	if (!dma_dom->domain.pt_root)
-		goto free_dma_dom;
-
-	dma_dom->need_flush = false;
-	dma_dom->target_dev = 0xffff;
-
-	add_domain_to_list(&dma_dom->domain);
-
-	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
-		goto free_dma_dom;
-
-	/*
-	 * mark the first page as allocated so we never return 0 as
-	 * a valid dma-address. So we can use 0 as error value
-	 */
-	dma_dom->aperture[0]->bitmap[0] = 1;
-	dma_dom->next_address = 0;
-
-
-	return dma_dom;
-
-free_dma_dom:
-	dma_ops_domain_free(dma_dom);
-
-	return NULL;
-}
-
-/*
- * little helper function to check whether a given protection domain is a
- * dma_ops domain
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
-	return domain->flags & PD_DMA_OPS_MASK;
-}
-
-static void set_dte_entry(u16 devid, struct protection_domain *domain)
-{
-	u64 pte_root = virt_to_phys(domain->pt_root);
-
-	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
-		    << DEV_ENTRY_MODE_SHIFT;
-	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
-	amd_iommu_dev_table[devid].data[2] = domain->id;
-	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
-	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
-}
-
-static void clear_dte_entry(u16 devid)
-{
-	/* remove entry from the device table seen by the hardware */
-	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
-	amd_iommu_dev_table[devid].data[1] = 0;
-	amd_iommu_dev_table[devid].data[2] = 0;
-
-	amd_iommu_apply_erratum_63(devid);
-}
-
-static void do_attach(struct device *dev, struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-	struct amd_iommu *iommu;
-	u16 devid;
-
-	devid    = get_device_id(dev);
-	iommu    = amd_iommu_rlookup_table[devid];
-	dev_data = get_dev_data(dev);
-
-	/* Update data structures */
-	dev_data->domain = domain;
-	list_add(&dev_data->list, &domain->dev_list);
-	set_dte_entry(devid, domain);
-
-	/* Do reference counting */
-	domain->dev_iommu[iommu->index] += 1;
-	domain->dev_cnt                 += 1;
-
-	/* Flush the DTE entry */
-	iommu_flush_device(dev);
-}
-
-static void do_detach(struct device *dev)
-{
-	struct iommu_dev_data *dev_data;
-	struct amd_iommu *iommu;
-	u16 devid;
-
-	devid    = get_device_id(dev);
-	iommu    = amd_iommu_rlookup_table[devid];
-	dev_data = get_dev_data(dev);
-
-	/* decrease reference counters */
-	dev_data->domain->dev_iommu[iommu->index] -= 1;
-	dev_data->domain->dev_cnt                 -= 1;
-
-	/* Update data structures */
-	dev_data->domain = NULL;
-	list_del(&dev_data->list);
-	clear_dte_entry(devid);
-
-	/* Flush the DTE entry */
-	iommu_flush_device(dev);
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int __attach_device(struct device *dev,
-			   struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data, *alias_data;
-	int ret;
-
-	dev_data   = get_dev_data(dev);
-	alias_data = get_dev_data(dev_data->alias);
-
-	if (!alias_data)
-		return -EINVAL;
-
-	/* lock domain */
-	spin_lock(&domain->lock);
-
-	/* Some sanity checks */
-	ret = -EBUSY;
-	if (alias_data->domain != NULL &&
-	    alias_data->domain != domain)
-		goto out_unlock;
-
-	if (dev_data->domain != NULL &&
-	    dev_data->domain != domain)
-		goto out_unlock;
-
-	/* Do real assignment */
-	if (dev_data->alias != dev) {
-		alias_data = get_dev_data(dev_data->alias);
-		if (alias_data->domain == NULL)
-			do_attach(dev_data->alias, domain);
-
-		atomic_inc(&alias_data->bind);
-	}
-
-	if (dev_data->domain == NULL)
-		do_attach(dev, domain);
-
-	atomic_inc(&dev_data->bind);
-
-	ret = 0;
-
-out_unlock:
-
-	/* ready */
-	spin_unlock(&domain->lock);
-
-	return ret;
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int attach_device(struct device *dev,
-			 struct protection_domain *domain)
-{
-	unsigned long flags;
-	int ret;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	ret = __attach_device(dev, domain);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	/*
-	 * We might boot into a crash-kernel here. The crashed kernel
-	 * left the caches in the IOMMU dirty. So we have to flush
-	 * here to evict all dirty stuff.
-	 */
-	iommu_flush_tlb_pde(domain);
-
-	return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct device *dev)
-{
-	struct iommu_dev_data *dev_data = get_dev_data(dev);
-	struct iommu_dev_data *alias_data;
-	struct protection_domain *domain;
-	unsigned long flags;
-
-	BUG_ON(!dev_data->domain);
-
-	domain = dev_data->domain;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	if (dev_data->alias != dev) {
-		alias_data = get_dev_data(dev_data->alias);
-		if (atomic_dec_and_test(&alias_data->bind))
-			do_detach(dev_data->alias);
-	}
-
-	if (atomic_dec_and_test(&dev_data->bind))
-		do_detach(dev);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	/*
-	 * If we run in passthrough mode the device must be assigned to the
-	 * passthrough domain if it is detached from any other domain.
-	 * Make sure we can deassign from the pt_domain itself.
-	 */
-	if (iommu_pass_through &&
-	    (dev_data->domain == NULL && domain != pt_domain))
-		__attach_device(dev, pt_domain);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct device *dev)
-{
-	unsigned long flags;
-
-	/* lock device table */
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	__detach_device(dev);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(struct device *dev)
-{
-	struct protection_domain *dom;
-	struct iommu_dev_data *dev_data, *alias_data;
-	unsigned long flags;
-	u16 devid, alias;
-
-	devid      = get_device_id(dev);
-	alias      = amd_iommu_alias_table[devid];
-	dev_data   = get_dev_data(dev);
-	alias_data = get_dev_data(dev_data->alias);
-	if (!alias_data)
-		return NULL;
-
-	read_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	dom = dev_data->domain;
-	if (dom == NULL &&
-	    alias_data->domain != NULL) {
-		__attach_device(dev, alias_data->domain);
-		dom = alias_data->domain;
-	}
-
-	read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	return dom;
-}
-
-static int device_change_notifier(struct notifier_block *nb,
-				  unsigned long action, void *data)
-{
-	struct device *dev = data;
-	u16 devid;
-	struct protection_domain *domain;
-	struct dma_ops_domain *dma_domain;
-	struct amd_iommu *iommu;
-	unsigned long flags;
-
-	if (!check_device(dev))
-		return 0;
-
-	devid  = get_device_id(dev);
-	iommu  = amd_iommu_rlookup_table[devid];
-
-	switch (action) {
-	case BUS_NOTIFY_UNBOUND_DRIVER:
-
-		domain = domain_for_device(dev);
-
-		if (!domain)
-			goto out;
-		if (iommu_pass_through)
-			break;
-		detach_device(dev);
-		break;
-	case BUS_NOTIFY_ADD_DEVICE:
-
-		iommu_init_device(dev);
-
-		domain = domain_for_device(dev);
-
-		/* allocate a protection domain if a device is added */
-		dma_domain = find_protection_domain(devid);
-		if (dma_domain)
-			goto out;
-		dma_domain = dma_ops_domain_alloc();
-		if (!dma_domain)
-			goto out;
-		dma_domain->target_dev = devid;
-
-		spin_lock_irqsave(&iommu_pd_list_lock, flags);
-		list_add_tail(&dma_domain->list, &iommu_pd_list);
-		spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
-		break;
-	case BUS_NOTIFY_DEL_DEVICE:
-
-		iommu_uninit_device(dev);
-
-	default:
-		goto out;
-	}
-
-	iommu_flush_device(dev);
-	iommu_completion_wait(iommu);
-
-out:
-	return 0;
-}
-
-static struct notifier_block device_nb = {
-	.notifier_call = device_change_notifier,
-};
-
-void amd_iommu_init_notifier(void)
-{
-	bus_register_notifier(&pci_bus_type, &device_nb);
-}
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain this is also done
- * in this function.
- */
-static struct protection_domain *get_domain(struct device *dev)
-{
-	struct protection_domain *domain;
-	struct dma_ops_domain *dma_dom;
-	u16 devid = get_device_id(dev);
-
-	if (!check_device(dev))
-		return ERR_PTR(-EINVAL);
-
-	domain = domain_for_device(dev);
-	if (domain != NULL && !dma_ops_domain(domain))
-		return ERR_PTR(-EBUSY);
-
-	if (domain != NULL)
-		return domain;
-
-	/* Device not bount yet - bind it */
-	dma_dom = find_protection_domain(devid);
-	if (!dma_dom)
-		dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
-	attach_device(dev, &dma_dom->domain);
-	DUMP_printk("Using protection domain %d for device %s\n",
-		    dma_dom->domain.id, dev_name(dev));
-
-	return &dma_dom->domain;
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-
-	list_for_each_entry(dev_data, &domain->dev_list, list) {
-		u16 devid = get_device_id(dev_data->dev);
-		set_dte_entry(devid, domain);
-	}
-}
-
-static void update_domain(struct protection_domain *domain)
-{
-	if (!domain->updated)
-		return;
-
-	update_device_table(domain);
-	iommu_flush_domain_devices(domain);
-	iommu_flush_tlb_pde(domain);
-
-	domain->updated = false;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
-			    unsigned long address)
-{
-	struct aperture_range *aperture;
-	u64 *pte, *pte_page;
-
-	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
-	if (!aperture)
-		return NULL;
-
-	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
-	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
-				GFP_ATOMIC);
-		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
-	} else
-		pte += PM_LEVEL_INDEX(0, address);
-
-	update_domain(&dom->domain);
-
-	return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4kb page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
-				     unsigned long address,
-				     phys_addr_t paddr,
-				     int direction)
-{
-	u64 *pte, __pte;
-
-	WARN_ON(address > dom->aperture_size);
-
-	paddr &= PAGE_MASK;
-
-	pte  = dma_ops_get_pte(dom, address);
-	if (!pte)
-		return DMA_ERROR_CODE;
-
-	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
-	if (direction == DMA_TO_DEVICE)
-		__pte |= IOMMU_PTE_IR;
-	else if (direction == DMA_FROM_DEVICE)
-		__pte |= IOMMU_PTE_IW;
-	else if (direction == DMA_BIDIRECTIONAL)
-		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
-	WARN_ON(*pte);
-
-	*pte = __pte;
-
-	return (dma_addr_t)address;
-}
-
-/*
- * The generic unmapping function for on page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
-				 unsigned long address)
-{
-	struct aperture_range *aperture;
-	u64 *pte;
-
-	if (address >= dom->aperture_size)
-		return;
-
-	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
-	if (!aperture)
-		return;
-
-	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
-	if (!pte)
-		return;
-
-	pte += PM_LEVEL_INDEX(0, address);
-
-	WARN_ON(!*pte);
-
-	*pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
-			       struct dma_ops_domain *dma_dom,
-			       phys_addr_t paddr,
-			       size_t size,
-			       int dir,
-			       bool align,
-			       u64 dma_mask)
-{
-	dma_addr_t offset = paddr & ~PAGE_MASK;
-	dma_addr_t address, start, ret;
-	unsigned int pages;
-	unsigned long align_mask = 0;
-	int i;
-
-	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
-	paddr &= PAGE_MASK;
-
-	INC_STATS_COUNTER(total_map_requests);
-
-	if (pages > 1)
-		INC_STATS_COUNTER(cross_page);
-
-	if (align)
-		align_mask = (1UL << get_order(size)) - 1;
-
-retry:
-	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
-					  dma_mask);
-	if (unlikely(address == DMA_ERROR_CODE)) {
-		/*
-		 * setting next_address here will let the address
-		 * allocator only scan the new allocated range in the
-		 * first run. This is a small optimization.
-		 */
-		dma_dom->next_address = dma_dom->aperture_size;
-
-		if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
-			goto out;
-
-		/*
-		 * aperture was successfully enlarged by 128 MB, try
-		 * allocation again
-		 */
-		goto retry;
-	}
-
-	start = address;
-	for (i = 0; i < pages; ++i) {
-		ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
-		if (ret == DMA_ERROR_CODE)
-			goto out_unmap;
-
-		paddr += PAGE_SIZE;
-		start += PAGE_SIZE;
-	}
-	address += offset;
-
-	ADD_STATS_COUNTER(alloced_io_mem, size);
-
-	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
-		iommu_flush_tlb(&dma_dom->domain);
-		dma_dom->need_flush = false;
-	} else if (unlikely(amd_iommu_np_cache))
-		iommu_flush_pages(&dma_dom->domain, address, size);
-
-out:
-	return address;
-
-out_unmap:
-
-	for (--i; i >= 0; --i) {
-		start -= PAGE_SIZE;
-		dma_ops_domain_unmap(dma_dom, start);
-	}
-
-	dma_ops_free_addresses(dma_dom, address, pages);
-
-	return DMA_ERROR_CODE;
-}
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct dma_ops_domain *dma_dom,
-			   dma_addr_t dma_addr,
-			   size_t size,
-			   int dir)
-{
-	dma_addr_t flush_addr;
-	dma_addr_t i, start;
-	unsigned int pages;
-
-	if ((dma_addr == DMA_ERROR_CODE) ||
-	    (dma_addr + size > dma_dom->aperture_size))
-		return;
-
-	flush_addr = dma_addr;
-	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
-	dma_addr &= PAGE_MASK;
-	start = dma_addr;
-
-	for (i = 0; i < pages; ++i) {
-		dma_ops_domain_unmap(dma_dom, start);
-		start += PAGE_SIZE;
-	}
-
-	SUB_STATS_COUNTER(alloced_io_mem, size);
-
-	dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
-	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
-		iommu_flush_pages(&dma_dom->domain, flush_addr, size);
-		dma_dom->need_flush = false;
-	}
-}
-
-/*
- * The exported map_single function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
-			   unsigned long offset, size_t size,
-			   enum dma_data_direction dir,
-			   struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-	dma_addr_t addr;
-	u64 dma_mask;
-	phys_addr_t paddr = page_to_phys(page) + offset;
-
-	INC_STATS_COUNTER(cnt_map_single);
-
-	domain = get_domain(dev);
-	if (PTR_ERR(domain) == -EINVAL)
-		return (dma_addr_t)paddr;
-	else if (IS_ERR(domain))
-		return DMA_ERROR_CODE;
-
-	dma_mask = *dev->dma_mask;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	addr = __map_single(dev, domain->priv, paddr, size, dir, false,
-			    dma_mask);
-	if (addr == DMA_ERROR_CODE)
-		goto out;
-
-	iommu_flush_complete(domain);
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return addr;
-}
-
-/*
- * The exported unmap_single function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
-		       enum dma_data_direction dir, struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-
-	INC_STATS_COUNTER(cnt_unmap_single);
-
-	domain = get_domain(dev);
-	if (IS_ERR(domain))
-		return;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	__unmap_single(domain->priv, dma_addr, size, dir);
-
-	iommu_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used if we should map a
- * device which is not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
-			   int nelems, int dir)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sglist, s, nelems, i) {
-		s->dma_address = (dma_addr_t)sg_phys(s);
-		s->dma_length  = s->length;
-	}
-
-	return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
-		  int nelems, enum dma_data_direction dir,
-		  struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-	int i;
-	struct scatterlist *s;
-	phys_addr_t paddr;
-	int mapped_elems = 0;
-	u64 dma_mask;
-
-	INC_STATS_COUNTER(cnt_map_sg);
-
-	domain = get_domain(dev);
-	if (PTR_ERR(domain) == -EINVAL)
-		return map_sg_no_iommu(dev, sglist, nelems, dir);
-	else if (IS_ERR(domain))
-		return 0;
-
-	dma_mask = *dev->dma_mask;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	for_each_sg(sglist, s, nelems, i) {
-		paddr = sg_phys(s);
-
-		s->dma_address = __map_single(dev, domain->priv,
-					      paddr, s->length, dir, false,
-					      dma_mask);
-
-		if (s->dma_address) {
-			s->dma_length = s->length;
-			mapped_elems++;
-		} else
-			goto unmap;
-	}
-
-	iommu_flush_complete(domain);
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return mapped_elems;
-unmap:
-	for_each_sg(sglist, s, mapped_elems, i) {
-		if (s->dma_address)
-			__unmap_single(domain->priv, s->dma_address,
-				       s->dma_length, dir);
-		s->dma_address = s->dma_length = 0;
-	}
-
-	mapped_elems = 0;
-
-	goto out;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
-		     int nelems, enum dma_data_direction dir,
-		     struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-	struct scatterlist *s;
-	int i;
-
-	INC_STATS_COUNTER(cnt_unmap_sg);
-
-	domain = get_domain(dev);
-	if (IS_ERR(domain))
-		return;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	for_each_sg(sglist, s, nelems, i) {
-		__unmap_single(domain->priv, s->dma_address,
-			       s->dma_length, dir);
-		s->dma_address = s->dma_length = 0;
-	}
-
-	iommu_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
-			    dma_addr_t *dma_addr, gfp_t flag)
-{
-	unsigned long flags;
-	void *virt_addr;
-	struct protection_domain *domain;
-	phys_addr_t paddr;
-	u64 dma_mask = dev->coherent_dma_mask;
-
-	INC_STATS_COUNTER(cnt_alloc_coherent);
-
-	domain = get_domain(dev);
-	if (PTR_ERR(domain) == -EINVAL) {
-		virt_addr = (void *)__get_free_pages(flag, get_order(size));
-		*dma_addr = __pa(virt_addr);
-		return virt_addr;
-	} else if (IS_ERR(domain))
-		return NULL;
-
-	dma_mask  = dev->coherent_dma_mask;
-	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-	flag     |= __GFP_ZERO;
-
-	virt_addr = (void *)__get_free_pages(flag, get_order(size));
-	if (!virt_addr)
-		return NULL;
-
-	paddr = virt_to_phys(virt_addr);
-
-	if (!dma_mask)
-		dma_mask = *dev->dma_mask;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	*dma_addr = __map_single(dev, domain->priv, paddr,
-				 size, DMA_BIDIRECTIONAL, true, dma_mask);
-
-	if (*dma_addr == DMA_ERROR_CODE) {
-		spin_unlock_irqrestore(&domain->lock, flags);
-		goto out_free;
-	}
-
-	iommu_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return virt_addr;
-
-out_free:
-
-	free_pages((unsigned long)virt_addr, get_order(size));
-
-	return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
-			  void *virt_addr, dma_addr_t dma_addr)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-
-	INC_STATS_COUNTER(cnt_free_coherent);
-
-	domain = get_domain(dev);
-	if (IS_ERR(domain))
-		goto free_mem;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
-	iommu_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
-	free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
-	return check_device(dev);
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * If the driver core informs the DMA layer if a driver grabs a device
- * we don't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
-	struct pci_dev *dev = NULL;
-	struct dma_ops_domain *dma_dom;
-	u16 devid;
-
-	for_each_pci_dev(dev) {
-
-		/* Do we handle this device? */
-		if (!check_device(&dev->dev))
-			continue;
-
-		/* Is there already any domain for it? */
-		if (domain_for_device(&dev->dev))
-			continue;
-
-		devid = get_device_id(&dev->dev);
-
-		dma_dom = dma_ops_domain_alloc();
-		if (!dma_dom)
-			continue;
-		init_unity_mappings_for_device(dma_dom, devid);
-		dma_dom->target_dev = devid;
-
-		attach_device(&dev->dev, &dma_dom->domain);
-
-		list_add_tail(&dma_dom->list, &iommu_pd_list);
-	}
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
-	.alloc_coherent = alloc_coherent,
-	.free_coherent = free_coherent,
-	.map_page = map_page,
-	.unmap_page = unmap_page,
-	.map_sg = map_sg,
-	.unmap_sg = unmap_sg,
-	.dma_supported = amd_iommu_dma_supported,
-};
-
-/*
- * The function which clues the AMD IOMMU driver into dma_ops.
- */
-
-void __init amd_iommu_init_api(void)
-{
-	register_iommu(&amd_iommu_ops);
-}
-
-int __init amd_iommu_init_dma_ops(void)
-{
-	struct amd_iommu *iommu;
-	int ret;
-
-	/*
-	 * first allocate a default protection domain for every IOMMU we
-	 * found in the system. Devices not assigned to any other
-	 * protection domain will be assigned to the default one.
-	 */
-	for_each_iommu(iommu) {
-		iommu->default_dom = dma_ops_domain_alloc();
-		if (iommu->default_dom == NULL)
-			return -ENOMEM;
-		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
-		ret = iommu_init_unity_mappings(iommu);
-		if (ret)
-			goto free_domains;
-	}
-
-	/*
-	 * Pre-allocate the protection domains for each device.
-	 */
-	prealloc_protection_domains();
-
-	iommu_detected = 1;
-	swiotlb = 0;
-
-	/* Make the driver finally visible to the drivers */
-	dma_ops = &amd_iommu_dma_ops;
-
-	amd_iommu_stats_init();
-
-	return 0;
-
-free_domains:
-
-	for_each_iommu(iommu) {
-		if (iommu->default_dom)
-			dma_ops_domain_free(iommu->default_dom);
-	}
-
-	return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignement of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data, *next;
-	unsigned long flags;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
-	list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
-		struct device *dev = dev_data->dev;
-
-		__detach_device(dev);
-		atomic_set(&dev_data->bind, 0);
-	}
-
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void protection_domain_free(struct protection_domain *domain)
-{
-	if (!domain)
-		return;
-
-	del_domain_from_list(domain);
-
-	if (domain->id)
-		domain_id_free(domain->id);
-
-	kfree(domain);
-}
-
-static struct protection_domain *protection_domain_alloc(void)
-{
-	struct protection_domain *domain;
-
-	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
-	if (!domain)
-		return NULL;
-
-	spin_lock_init(&domain->lock);
-	mutex_init(&domain->api_lock);
-	domain->id = domain_id_alloc();
-	if (!domain->id)
-		goto out_err;
-	INIT_LIST_HEAD(&domain->dev_list);
-
-	add_domain_to_list(domain);
-
-	return domain;
-
-out_err:
-	kfree(domain);
-
-	return NULL;
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
-	struct protection_domain *domain;
-
-	domain = protection_domain_alloc();
-	if (!domain)
-		goto out_free;
-
-	domain->mode    = PAGE_MODE_3_LEVEL;
-	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!domain->pt_root)
-		goto out_free;
-
-	dom->priv = domain;
-
-	return 0;
-
-out_free:
-	protection_domain_free(domain);
-
-	return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
-	struct protection_domain *domain = dom->priv;
-
-	if (!domain)
-		return;
-
-	if (domain->dev_cnt > 0)
-		cleanup_domain(domain);
-
-	BUG_ON(domain->dev_cnt != 0);
-
-	free_pagetable(domain);
-
-	protection_domain_free(domain);
-
-	dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
-				    struct device *dev)
-{
-	struct iommu_dev_data *dev_data = dev->archdata.iommu;
-	struct amd_iommu *iommu;
-	u16 devid;
-
-	if (!check_device(dev))
-		return;
-
-	devid = get_device_id(dev);
-
-	if (dev_data->domain != NULL)
-		detach_device(dev);
-
-	iommu = amd_iommu_rlookup_table[devid];
-	if (!iommu)
-		return;
-
-	iommu_flush_device(dev);
-	iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
-				   struct device *dev)
-{
-	struct protection_domain *domain = dom->priv;
-	struct iommu_dev_data *dev_data;
-	struct amd_iommu *iommu;
-	int ret;
-	u16 devid;
-
-	if (!check_device(dev))
-		return -EINVAL;
-
-	dev_data = dev->archdata.iommu;
-
-	devid = get_device_id(dev);
-
-	iommu = amd_iommu_rlookup_table[devid];
-	if (!iommu)
-		return -EINVAL;
-
-	if (dev_data->domain)
-		detach_device(dev);
-
-	ret = attach_device(dev, domain);
-
-	iommu_completion_wait(iommu);
-
-	return ret;
-}
-
-static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
-			 phys_addr_t paddr, int gfp_order, int iommu_prot)
-{
-	unsigned long page_size = 0x1000UL << gfp_order;
-	struct protection_domain *domain = dom->priv;
-	int prot = 0;
-	int ret;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= IOMMU_PROT_IR;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= IOMMU_PROT_IW;
-
-	mutex_lock(&domain->api_lock);
-	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
-	mutex_unlock(&domain->api_lock);
-
-	return ret;
-}
-
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
-			   int gfp_order)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long page_size, unmap_size;
-
-	page_size  = 0x1000UL << gfp_order;
-
-	mutex_lock(&domain->api_lock);
-	unmap_size = iommu_unmap_page(domain, iova, page_size);
-	mutex_unlock(&domain->api_lock);
-
-	iommu_flush_tlb_pde(domain);
-
-	return get_order(unmap_size);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
-					  unsigned long iova)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long offset_mask;
-	phys_addr_t paddr;
-	u64 *pte, __pte;
-
-	pte = fetch_pte(domain, iova);
-
-	if (!pte || !IOMMU_PTE_PRESENT(*pte))
-		return 0;
-
-	if (PM_PTE_LEVEL(*pte) == 0)
-		offset_mask = PAGE_SIZE - 1;
-	else
-		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
-
-	__pte = *pte & PM_ADDR_MASK;
-	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
-
-	return paddr;
-}
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
-				    unsigned long cap)
-{
-	switch (cap) {
-	case IOMMU_CAP_CACHE_COHERENCY:
-		return 1;
-	}
-
-	return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
-	.domain_init = amd_iommu_domain_init,
-	.domain_destroy = amd_iommu_domain_destroy,
-	.attach_dev = amd_iommu_attach_device,
-	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map,
-	.unmap = amd_iommu_unmap,
-	.iova_to_phys = amd_iommu_iova_to_phys,
-	.domain_has_cap = amd_iommu_domain_has_cap,
-};
-
-/*****************************************************************************
- *
- * The next functions do a basic initialization of IOMMU for pass through
- * mode
- *
- * In passthrough mode the IOMMU is initialized and enabled but not used for
- * DMA-API translation.
- *
- *****************************************************************************/
-
-int __init amd_iommu_init_passthrough(void)
-{
-	struct amd_iommu *iommu;
-	struct pci_dev *dev = NULL;
-	u16 devid;
-
-	/* allocate passthrough domain */
-	pt_domain = protection_domain_alloc();
-	if (!pt_domain)
-		return -ENOMEM;
-
-	pt_domain->mode |= PAGE_MODE_NONE;
-
-	for_each_pci_dev(dev) {
-		if (!check_device(&dev->dev))
-			continue;
-
-		devid = get_device_id(&dev->dev);
-
-		iommu = amd_iommu_rlookup_table[devid];
-		if (!iommu)
-			continue;
-
-		attach_device(&dev->dev, pt_domain);
-	}
-
-	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
-
-	return 0;
-}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index 246d727b65b..00000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/syscore_ops.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE                  0x10
-#define ACPI_IVMD_TYPE_ALL              0x20
-#define ACPI_IVMD_TYPE                  0x21
-#define ACPI_IVMD_TYPE_RANGE            0x22
-
-#define IVHD_DEV_ALL                    0x01
-#define IVHD_DEV_SELECT                 0x02
-#define IVHD_DEV_SELECT_RANGE_START     0x03
-#define IVHD_DEV_RANGE_END              0x04
-#define IVHD_DEV_ALIAS                  0x42
-#define IVHD_DEV_ALIAS_RANGE            0x43
-#define IVHD_DEV_EXT_SELECT             0x46
-#define IVHD_DEV_EXT_SELECT_RANGE       0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
-#define IVHD_FLAG_PASSPW_EN_MASK        0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
-#define IVHD_FLAG_ISOC_EN_MASK          0x08
-
-#define IVMD_FLAG_EXCL_RANGE            0x08
-#define IVMD_FLAG_UNITY_MAP             0x01
-
-#define ACPI_DEVFLAG_INITPASS           0x01
-#define ACPI_DEVFLAG_EXTINT             0x02
-#define ACPI_DEVFLAG_NMI                0x04
-#define ACPI_DEVFLAG_SYSMGT1            0x10
-#define ACPI_DEVFLAG_SYSMGT2            0x20
-#define ACPI_DEVFLAG_LINT0              0x40
-#define ACPI_DEVFLAG_LINT1              0x80
-#define ACPI_DEVFLAG_ATSDIS             0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
-	u8 type;
-	u8 flags;
-	u16 length;
-	u16 devid;
-	u16 cap_ptr;
-	u64 mmio_phys;
-	u16 pci_seg;
-	u16 info;
-	u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
-	u8 type;
-	u16 devid;
-	u8 flags;
-	u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
-	u8 type;
-	u8 flags;
-	u16 length;
-	u16 devid;
-	u16 aux;
-	u64 resv;
-	u64 range_start;
-	u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf;			/* largest PCI device id we have
-					   to handle */
-LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
-					   we find in ACPI */
-bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
-					   system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size;	/* size of the device table */
-static u32 alias_table_size;	/* size of the alias table */
-static u32 rlookup_table_size;	/* size if the rlookup table */
-
-static inline void update_last_devid(u16 devid)
-{
-	if (devid > amd_iommu_last_bdf)
-		amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
-	unsigned shift = PAGE_SHIFT +
-			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
-	return 1UL << shift;
-}
-
-/* Access to l1 and l2 indexed register spaces */
-
-static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
-{
-	u32 val;
-
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-	pci_read_config_dword(iommu->dev, 0xfc, &val);
-	return val;
-}
-
-static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
-{
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
-	pci_write_config_dword(iommu->dev, 0xfc, val);
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-}
-
-static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
-{
-	u32 val;
-
-	pci_write_config_dword(iommu->dev, 0xf0, address);
-	pci_read_config_dword(iommu->dev, 0xf4, &val);
-	return val;
-}
-
-static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
-{
-	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
-	pci_write_config_dword(iommu->dev, 0xf4, val);
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function set the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
-	u64 start = iommu->exclusion_start & PAGE_MASK;
-	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
-	u64 entry;
-
-	if (!iommu->exclusion_start)
-		return;
-
-	entry = start | MMIO_EXCL_ENABLE_MASK;
-	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
-			&entry, sizeof(entry));
-
-	entry = limit;
-	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
-			&entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->mmio_base == NULL);
-
-	entry = virt_to_phys(amd_iommu_dev_table);
-	entry |= (dev_table_size >> 12) - 1;
-	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
-			&entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
-	u32 ctrl;
-
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
-	ctrl |= (1 << bit);
-	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
-	u32 ctrl;
-
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
-	ctrl &= ~(1 << bit);
-	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
-	printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n",
-	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
-	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
-	/* Disable command buffer */
-	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
-	/* Disable event logging and event interrupts */
-	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
-	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
-	/* Disable IOMMU hardware itself */
-	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
-	u8 *ret;
-
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
-		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
-			address);
-		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
-		return NULL;
-	}
-
-	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
-	if (ret != NULL)
-		return ret;
-
-	release_mem_region(address, MMIO_REGION_LENGTH);
-
-	return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
-	if (iommu->mmio_base)
-		iounmap(iommu->mmio_base);
-	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Upon this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
-	return 0x04 << (*ivhd >> 6);
-}
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
-	u32 cap;
-
-	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
-	return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function looks if there is a higher device id defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
-	u8 *p = (void *)h, *end = (void *)h;
-	struct ivhd_entry *dev;
-
-	p += sizeof(*h);
-	end += h->length;
-
-	find_last_devid_on_pci(PCI_BUS(h->devid),
-			PCI_SLOT(h->devid),
-			PCI_FUNC(h->devid),
-			h->cap_ptr);
-
-	while (p < end) {
-		dev = (struct ivhd_entry *)p;
-		switch (dev->type) {
-		case IVHD_DEV_SELECT:
-		case IVHD_DEV_RANGE_END:
-		case IVHD_DEV_ALIAS:
-		case IVHD_DEV_EXT_SELECT:
-			/* all the above subfield types refer to device ids */
-			update_last_devid(dev->devid);
-			break;
-		default:
-			break;
-		}
-		p += ivhd_entry_length(p);
-	}
-
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
-	int i;
-	u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
-	struct ivhd_header *h;
-
-	/*
-	 * Validate checksum here so we don't need to do it when
-	 * we actually parse the table
-	 */
-	for (i = 0; i < table->length; ++i)
-		checksum += p[i];
-	if (checksum != 0) {
-		/* ACPI table corrupt */
-		amd_iommu_init_err = -ENODEV;
-		return 0;
-	}
-
-	p += IVRS_HEADER_LENGTH;
-
-	end += table->length;
-	while (p < end) {
-		h = (struct ivhd_header *)p;
-		switch (h->type) {
-		case ACPI_IVHD_TYPE:
-			find_last_devid_from_ivhd(h);
-			break;
-		default:
-			break;
-		}
-		p += h->length;
-	}
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong the the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
-	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-			get_order(CMD_BUFFER_SIZE));
-
-	if (cmd_buf == NULL)
-		return NULL;
-
-	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
-	return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
-	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
-	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->cmd_buf == NULL);
-
-	entry = (u64)virt_to_phys(iommu->cmd_buf);
-	entry |= MMIO_CMD_SIZE_512;
-
-	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-		    &entry, sizeof(entry));
-
-	amd_iommu_reset_cmd_buffer(iommu);
-	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
-	free_pages((unsigned long)iommu->cmd_buf,
-		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
-	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-						get_order(EVT_BUFFER_SIZE));
-
-	if (iommu->evt_buf == NULL)
-		return NULL;
-
-	iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-	return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->evt_buf == NULL);
-
-	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
-	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
-		    &entry, sizeof(entry));
-
-	/* set head and tail to zero manually */
-	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
-	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
-	int i = (bit >> 5) & 0x07;
-	int _bit = bit & 0x1f;
-
-	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
-	int i = (bit >> 5) & 0x07;
-	int _bit = bit & 0x1f;
-
-	return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
-	int sysmgt;
-
-	sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
-		 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
-	if (sysmgt == 0x01)
-		set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-	amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
-					   u16 devid, u32 flags, u32 ext_flags)
-{
-	if (flags & ACPI_DEVFLAG_INITPASS)
-		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
-	if (flags & ACPI_DEVFLAG_EXTINT)
-		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
-	if (flags & ACPI_DEVFLAG_NMI)
-		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
-	if (flags & ACPI_DEVFLAG_SYSMGT1)
-		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
-	if (flags & ACPI_DEVFLAG_SYSMGT2)
-		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
-	if (flags & ACPI_DEVFLAG_LINT0)
-		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
-	if (flags & ACPI_DEVFLAG_LINT1)
-		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
-	amd_iommu_apply_erratum_63(devid);
-
-	set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initialize IOMMU with
- * it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
-	if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
-		return;
-
-	if (iommu) {
-		/*
-		 * We only can configure exclusion ranges per IOMMU, not
-		 * per device. But we can enable the exclusion range per
-		 * device. This is done here
-		 */
-		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
-		iommu->exclusion_start = m->range_start;
-		iommu->exclusion_length = m->range_length;
-	}
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
-	int cap_ptr = iommu->cap_ptr;
-	u32 range, misc;
-	int i, j;
-
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
-			      &iommu->cap);
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
-			      &range);
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
-			      &misc);
-
-	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
-					 MMIO_GET_FD(range));
-	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
-					MMIO_GET_LD(range));
-	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-
-	if (!is_rd890_iommu(iommu->dev))
-		return;
-
-	/*
-	 * Some rd890 systems may not be fully reconfigured by the BIOS, so
-	 * it's necessary for us to store this information so it can be
-	 * reprogrammed on resume
-	 */
-
-	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			      &iommu->stored_addr_lo);
-	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
-			      &iommu->stored_addr_hi);
-
-	/* Low bit locks writes to configuration space */
-	iommu->stored_addr_lo &= ~1;
-
-	for (i = 0; i < 6; i++)
-		for (j = 0; j < 0x12; j++)
-			iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
-
-	for (i = 0; i < 0x83; i++)
-		iommu->stored_l2[i] = iommu_read_l2(iommu, i);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
-					struct ivhd_header *h)
-{
-	u8 *p = (u8 *)h;
-	u8 *end = p, flags = 0;
-	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
-	u32 ext_flags = 0;
-	bool alias = false;
-	struct ivhd_entry *e;
-
-	/*
-	 * First save the recommended feature enable bits from ACPI
-	 */
-	iommu->acpi_flags = h->flags;
-
-	/*
-	 * Done. Now parse the device entries
-	 */
-	p += sizeof(struct ivhd_header);
-	end += h->length;
-
-
-	while (p < end) {
-		e = (struct ivhd_entry *)p;
-		switch (e->type) {
-		case IVHD_DEV_ALL:
-
-			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
-				    " last device %02x:%02x.%x flags: %02x\n",
-				    PCI_BUS(iommu->first_device),
-				    PCI_SLOT(iommu->first_device),
-				    PCI_FUNC(iommu->first_device),
-				    PCI_BUS(iommu->last_device),
-				    PCI_SLOT(iommu->last_device),
-				    PCI_FUNC(iommu->last_device),
-				    e->flags);
-
-			for (dev_i = iommu->first_device;
-					dev_i <= iommu->last_device; ++dev_i)
-				set_dev_entry_from_acpi(iommu, dev_i,
-							e->flags, 0);
-			break;
-		case IVHD_DEV_SELECT:
-
-			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
-				    "flags: %02x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags);
-
-			devid = e->devid;
-			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
-			break;
-		case IVHD_DEV_SELECT_RANGE_START:
-
-			DUMP_printk("  DEV_SELECT_RANGE_START\t "
-				    "devid: %02x:%02x.%x flags: %02x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags);
-
-			devid_start = e->devid;
-			flags = e->flags;
-			ext_flags = 0;
-			alias = false;
-			break;
-		case IVHD_DEV_ALIAS:
-
-			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
-				    "flags: %02x devid_to: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags,
-				    PCI_BUS(e->ext >> 8),
-				    PCI_SLOT(e->ext >> 8),
-				    PCI_FUNC(e->ext >> 8));
-
-			devid = e->devid;
-			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(iommu, devid   , e->flags, 0);
-			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
-			amd_iommu_alias_table[devid] = devid_to;
-			break;
-		case IVHD_DEV_ALIAS_RANGE:
-
-			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
-				    "devid: %02x:%02x.%x flags: %02x "
-				    "devid_to: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags,
-				    PCI_BUS(e->ext >> 8),
-				    PCI_SLOT(e->ext >> 8),
-				    PCI_FUNC(e->ext >> 8));
-
-			devid_start = e->devid;
-			flags = e->flags;
-			devid_to = e->ext >> 8;
-			ext_flags = 0;
-			alias = true;
-			break;
-		case IVHD_DEV_EXT_SELECT:
-
-			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
-				    "flags: %02x ext: %08x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags, e->ext);
-
-			devid = e->devid;
-			set_dev_entry_from_acpi(iommu, devid, e->flags,
-						e->ext);
-			break;
-		case IVHD_DEV_EXT_SELECT_RANGE:
-
-			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
-				    "%02x:%02x.%x flags: %02x ext: %08x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags, e->ext);
-
-			devid_start = e->devid;
-			flags = e->flags;
-			ext_flags = e->ext;
-			alias = false;
-			break;
-		case IVHD_DEV_RANGE_END:
-
-			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid));
-
-			devid = e->devid;
-			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
-				if (alias) {
-					amd_iommu_alias_table[dev_i] = devid_to;
-					set_dev_entry_from_acpi(iommu,
-						devid_to, flags, ext_flags);
-				}
-				set_dev_entry_from_acpi(iommu, dev_i,
-							flags, ext_flags);
-			}
-			break;
-		default:
-			break;
-		}
-
-		p += ivhd_entry_length(p);
-	}
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
-	u16 i;
-
-	for (i = iommu->first_device; i <= iommu->last_device; ++i)
-		set_iommu_for_device(iommu, i);
-
-	return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
-	free_command_buffer(iommu);
-	free_event_buffer(iommu);
-	iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
-	struct amd_iommu *iommu, *next;
-
-	for_each_iommu_safe(iommu, next) {
-		list_del(&iommu->list);
-		free_iommu_one(iommu);
-		kfree(iommu);
-	}
-}
-
-/*
- * This function clues the initialization function for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
-	spin_lock_init(&iommu->lock);
-
-	/* Add IOMMU to internal data structures */
-	list_add_tail(&iommu->list, &amd_iommu_list);
-	iommu->index             = amd_iommus_present++;
-
-	if (unlikely(iommu->index >= MAX_IOMMUS)) {
-		WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
-		return -ENOSYS;
-	}
-
-	/* Index is fine - add IOMMU to the array */
-	amd_iommus[iommu->index] = iommu;
-
-	/*
-	 * Copy data from ACPI table entry to the iommu struct
-	 */
-	iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
-	if (!iommu->dev)
-		return 1;
-
-	iommu->cap_ptr = h->cap_ptr;
-	iommu->pci_seg = h->pci_seg;
-	iommu->mmio_phys = h->mmio_phys;
-	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
-	if (!iommu->mmio_base)
-		return -ENOMEM;
-
-	iommu->cmd_buf = alloc_command_buffer(iommu);
-	if (!iommu->cmd_buf)
-		return -ENOMEM;
-
-	iommu->evt_buf = alloc_event_buffer(iommu);
-	if (!iommu->evt_buf)
-		return -ENOMEM;
-
-	iommu->int_enabled = false;
-
-	init_iommu_from_pci(iommu);
-	init_iommu_from_acpi(iommu, h);
-	init_iommu_devices(iommu);
-
-	if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
-		amd_iommu_np_cache = true;
-
-	return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
-	u8 *p = (u8 *)table, *end = (u8 *)table;
-	struct ivhd_header *h;
-	struct amd_iommu *iommu;
-	int ret;
-
-	end += table->length;
-	p += IVRS_HEADER_LENGTH;
-
-	while (p < end) {
-		h = (struct ivhd_header *)p;
-		switch (*p) {
-		case ACPI_IVHD_TYPE:
-
-			DUMP_printk("device: %02x:%02x.%01x cap: %04x "
-				    "seg: %d flags: %01x info %04x\n",
-				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
-				    PCI_FUNC(h->devid), h->cap_ptr,
-				    h->pci_seg, h->flags, h->info);
-			DUMP_printk("       mmio-addr: %016llx\n",
-				    h->mmio_phys);
-
-			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
-			if (iommu == NULL) {
-				amd_iommu_init_err = -ENOMEM;
-				return 0;
-			}
-
-			ret = init_iommu_one(iommu, h);
-			if (ret) {
-				amd_iommu_init_err = ret;
-				return 0;
-			}
-			break;
-		default:
-			break;
-		}
-		p += h->length;
-
-	}
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. Its a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
-	int r;
-
-	if (pci_enable_msi(iommu->dev))
-		return 1;
-
-	r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
-			IRQF_SAMPLE_RANDOM,
-			"AMD-Vi",
-			NULL);
-
-	if (r) {
-		pci_disable_msi(iommu->dev);
-		return 1;
-	}
-
-	iommu->int_enabled = true;
-	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
-	return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
-	if (iommu->int_enabled)
-		return 0;
-
-	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
-		return iommu_setup_msi(iommu);
-
-	return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping reanges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
-	struct unity_map_entry *entry, *next;
-
-	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
-	int i;
-
-	switch (m->type) {
-	case ACPI_IVMD_TYPE:
-		set_device_exclusion_range(m->devid, m);
-		break;
-	case ACPI_IVMD_TYPE_ALL:
-		for (i = 0; i <= amd_iommu_last_bdf; ++i)
-			set_device_exclusion_range(i, m);
-		break;
-	case ACPI_IVMD_TYPE_RANGE:
-		for (i = m->devid; i <= m->aux; ++i)
-			set_device_exclusion_range(i, m);
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
-	struct unity_map_entry *e = 0;
-	char *s;
-
-	e = kzalloc(sizeof(*e), GFP_KERNEL);
-	if (e == NULL)
-		return -ENOMEM;
-
-	switch (m->type) {
-	default:
-		kfree(e);
-		return 0;
-	case ACPI_IVMD_TYPE:
-		s = "IVMD_TYPEi\t\t\t";
-		e->devid_start = e->devid_end = m->devid;
-		break;
-	case ACPI_IVMD_TYPE_ALL:
-		s = "IVMD_TYPE_ALL\t\t";
-		e->devid_start = 0;
-		e->devid_end = amd_iommu_last_bdf;
-		break;
-	case ACPI_IVMD_TYPE_RANGE:
-		s = "IVMD_TYPE_RANGE\t\t";
-		e->devid_start = m->devid;
-		e->devid_end = m->aux;
-		break;
-	}
-	e->address_start = PAGE_ALIGN(m->range_start);
-	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
-	e->prot = m->flags >> 1;
-
-	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
-		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
-		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
-		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
-		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
-		    e->address_start, e->address_end, m->flags);
-
-	list_add_tail(&e->list, &amd_iommu_unity_map);
-
-	return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
-	u8 *p = (u8 *)table, *end = (u8 *)table;
-	struct ivmd_header *m;
-
-	end += table->length;
-	p += IVRS_HEADER_LENGTH;
-
-	while (p < end) {
-		m = (struct ivmd_header *)p;
-		if (m->flags & IVMD_FLAG_EXCL_RANGE)
-			init_exclusion_range(m);
-		else if (m->flags & IVMD_FLAG_UNITY_MAP)
-			init_unity_map_range(m);
-
-		p += m->length;
-	}
-
-	return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
-	u16 devid;
-
-	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
-		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
-		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
-	}
-}
-
-static void iommu_init_flags(struct amd_iommu *iommu)
-{
-	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
-		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
-		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
-	/*
-	 * make IOMMU memory accesses cache coherent
-	 */
-	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-}
-
-static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
-{
-	int i, j;
-	u32 ioc_feature_control;
-	struct pci_dev *pdev = NULL;
-
-	/* RD890 BIOSes may not have completely reconfigured the iommu */
-	if (!is_rd890_iommu(iommu->dev))
-		return;
-
-	/*
-	 * First, we need to ensure that the iommu is enabled. This is
-	 * controlled by a register in the northbridge
-	 */
-	pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
-
-	if (!pdev)
-		return;
-
-	/* Select Northbridge indirect register 0x75 and enable writing */
-	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
-	pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
-
-	/* Enable the iommu */
-	if (!(ioc_feature_control & 0x1))
-		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
-
-	pci_dev_put(pdev);
-
-	/* Restore the iommu BAR */
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			       iommu->stored_addr_lo);
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
-			       iommu->stored_addr_hi);
-
-	/* Restore the l1 indirect regs for each of the 6 l1s */
-	for (i = 0; i < 6; i++)
-		for (j = 0; j < 0x12; j++)
-			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
-
-	/* Restore the l2 indirect regs */
-	for (i = 0; i < 0x83; i++)
-		iommu_write_l2(iommu, i, iommu->stored_l2[i]);
-
-	/* Lock PCI setup registers */
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			       iommu->stored_addr_lo | 1);
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu) {
-		iommu_disable(iommu);
-		iommu_init_flags(iommu);
-		iommu_set_device_table(iommu);
-		iommu_enable_command_buffer(iommu);
-		iommu_enable_event_buffer(iommu);
-		iommu_set_exclusion_range(iommu);
-		iommu_init_msi(iommu);
-		iommu_enable(iommu);
-	}
-}
-
-static void disable_iommus(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static void amd_iommu_resume(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_apply_resume_quirks(iommu);
-
-	/* re-load the hardware */
-	enable_iommus();
-
-	/*
-	 * we have to flush after the IOMMUs are enabled because a
-	 * disabled IOMMU will never execute the commands we send
-	 */
-	amd_iommu_flush_all_devices();
-	amd_iommu_flush_all_domains();
-}
-
-static int amd_iommu_suspend(void)
-{
-	/* disable IOMMUs to go out of the way for BIOS */
-	disable_iommus();
-
-	return 0;
-}
-
-static struct syscore_ops amd_iommu_syscore_ops = {
-	.suspend = amd_iommu_suspend,
-	.resume = amd_iommu_resume,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- *	1 pass) Find the highest PCI device id the driver has to handle.
- *		Upon this information the size of the data structures is
- *		determined that needs to be allocated.
- *
- *	2 pass) Initialize the data structures just allocated with the
- *		information in the ACPI table about available AMD IOMMUs
- *		in the system. It also maps the PCI devices in the
- *		system to specific IOMMUs
- *
- *	3 pass) After the basic data structures are allocated and
- *		initialized we update them with information about memory
- *		remapping requirements parsed out of the ACPI table in
- *		this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
-	int i, ret = 0;
-
-	/*
-	 * First parse ACPI tables to find the largest Bus/Dev/Func
-	 * we need to handle. Upon this information the shared data
-	 * structures for the IOMMUs in the system will be allocated
-	 */
-	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
-		return -ENODEV;
-
-	ret = amd_iommu_init_err;
-	if (ret)
-		goto out;
-
-	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
-	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
-	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
-	ret = -ENOMEM;
-
-	/* Device table - directly used by all IOMMUs */
-	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-				      get_order(dev_table_size));
-	if (amd_iommu_dev_table == NULL)
-		goto out;
-
-	/*
-	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
-	 * IOMMU see for that device
-	 */
-	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
-			get_order(alias_table_size));
-	if (amd_iommu_alias_table == NULL)
-		goto free;
-
-	/* IOMMU rlookup table - find the IOMMU for a specific device */
-	amd_iommu_rlookup_table = (void *)__get_free_pages(
-			GFP_KERNEL | __GFP_ZERO,
-			get_order(rlookup_table_size));
-	if (amd_iommu_rlookup_table == NULL)
-		goto free;
-
-	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
-					    GFP_KERNEL | __GFP_ZERO,
-					    get_order(MAX_DOMAIN_ID/8));
-	if (amd_iommu_pd_alloc_bitmap == NULL)
-		goto free;
-
-	/* init the device table */
-	init_device_table();
-
-	/*
-	 * let all alias entries point to itself
-	 */
-	for (i = 0; i <= amd_iommu_last_bdf; ++i)
-		amd_iommu_alias_table[i] = i;
-
-	/*
-	 * never allocate domain 0 because its used as the non-allocated and
-	 * error value placeholder
-	 */
-	amd_iommu_pd_alloc_bitmap[0] = 1;
-
-	spin_lock_init(&amd_iommu_pd_lock);
-
-	/*
-	 * now the data structures are allocated and basically initialized
-	 * start the real acpi table scan
-	 */
-	ret = -ENODEV;
-	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
-		goto free;
-
-	if (amd_iommu_init_err) {
-		ret = amd_iommu_init_err;
-		goto free;
-	}
-
-	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
-		goto free;
-
-	if (amd_iommu_init_err) {
-		ret = amd_iommu_init_err;
-		goto free;
-	}
-
-	ret = amd_iommu_init_devices();
-	if (ret)
-		goto free;
-
-	enable_iommus();
-
-	if (iommu_pass_through)
-		ret = amd_iommu_init_passthrough();
-	else
-		ret = amd_iommu_init_dma_ops();
-
-	if (ret)
-		goto free_disable;
-
-	amd_iommu_init_api();
-
-	amd_iommu_init_notifier();
-
-	register_syscore_ops(&amd_iommu_syscore_ops);
-
-	if (iommu_pass_through)
-		goto out;
-
-	if (amd_iommu_unmap_flush)
-		printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
-	else
-		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
-	x86_platform.iommu_shutdown = disable_iommus;
-out:
-	return ret;
-
-free_disable:
-	disable_iommus();
-
-free:
-	amd_iommu_uninit_devices();
-
-	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
-		   get_order(MAX_DOMAIN_ID/8));
-
-	free_pages((unsigned long)amd_iommu_rlookup_table,
-		   get_order(rlookup_table_size));
-
-	free_pages((unsigned long)amd_iommu_alias_table,
-		   get_order(alias_table_size));
-
-	free_pages((unsigned long)amd_iommu_dev_table,
-		   get_order(dev_table_size));
-
-	free_iommu_all();
-
-	free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
-	/*
-	 * We failed to initialize the AMD IOMMU - try fallback to GART
-	 * if possible.
-	 */
-	gart_iommu_init();
-
-#endif
-
-	goto out;
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
-	return 0;
-}
-
-int __init amd_iommu_detect(void)
-{
-	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
-		return -ENODEV;
-
-	if (amd_iommu_disabled)
-		return -ENODEV;
-
-	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
-		iommu_detected = 1;
-		amd_iommu_detected = 1;
-		x86_init.iommu.iommu_init = amd_iommu_init;
-
-		/* Make sure ACS will be enabled */
-		pci_request_acs();
-		return 1;
-	}
-	return -ENODEV;
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
-	amd_iommu_dump = true;
-
-	return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
-	for (; *str; ++str) {
-		if (strncmp(str, "fullflush", 9) == 0)
-			amd_iommu_unmap_flush = true;
-		if (strncmp(str, "off", 3) == 0)
-			amd_iommu_disabled = true;
-	}
-
-	return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
-
-IOMMU_INIT_FINISH(amd_iommu_detect,
-		  gart_iommu_hole_init,
-		  0,
-		  0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cd1ffed4ee2..afdc3f756de 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
  * timer, but by default APB timer has higher rating than local APIC timers.
  */
 
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
 #include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
 #include <linux/slab.h>
 #include <linux/pm.h>
-#include <linux/pci.h>
 #include <linux/sfi.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
@@ -44,76 +41,48 @@
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
 #include <asm/mrst.h>
+#include <asm/time.h>
 
-#define APBT_MASK			CLOCKSOURCE_MASK(32)
-#define APBT_SHIFT			22
 #define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
-#define APBT_MIN_DELTA_USEC		200
 
-#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
 #define APBT_CLOCKEVENT0_NUM   (0)
-#define APBT_CLOCKEVENT1_NUM   (1)
 #define APBT_CLOCKSOURCE_NUM   (2)
 
-static unsigned long apbt_address;
+static phys_addr_t apbt_address;
 static int apb_timer_block_enabled;
 static void __iomem *apbt_virt_address;
-static int phy_cs_timer_id;
 
 /*
  * Common DW APB timer info
  */
-static uint64_t apbt_freq;
-
-static void apbt_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt);
-static int apbt_next_event(unsigned long delta,
-			   struct clock_event_device *evt);
-static cycle_t apbt_read_clocksource(struct clocksource *cs);
-static void apbt_restart_clocksource(struct clocksource *cs);
+static unsigned long apbt_freq;
 
 struct apbt_dev {
-	struct clock_event_device evt;
-	unsigned int num;
-	int cpu;
-	unsigned int irq;
-	unsigned int tick;
-	unsigned int count;
-	unsigned int flags;
-	char name[10];
+	struct dw_apb_clock_event_device	*timer;
+	unsigned int				num;
+	int					cpu;
+	unsigned int				irq;
+	char					name[10];
 };
 
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+static struct dw_apb_clocksource *clocksource_apbt;
 
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-static struct apbt_dev *apbt_devs;
-#endif
-
-static	inline unsigned long apbt_readl_reg(unsigned long a)
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
 {
-	return readl(apbt_virt_address + a);
+	return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
 }
 
-static inline void apbt_writel_reg(unsigned long d, unsigned long a)
-{
-	writel(d, apbt_virt_address + a);
-}
-
-static inline unsigned long apbt_readl(int n, unsigned long a)
-{
-	return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
-static inline void apbt_writel(int n, unsigned long d, unsigned long a)
-{
-	writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
 
 static inline void apbt_set_mapping(void)
 {
 	struct sfi_timer_table_entry *mtmr;
+	int phy_cs_timer_id = 0;
 
 	if (apbt_virt_address) {
 		pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
 		       APBT_CLOCKEVENT0_NUM);
 		return;
 	}
-	apbt_address = (unsigned long)mtmr->phys_addr;
+	apbt_address = (phys_addr_t)mtmr->phys_addr;
 	if (!apbt_address) {
 		printk(KERN_WARNING "No timer base from SFI, use default\n");
 		apbt_address = APBT_DEFAULT_BASE;
 	}
 	apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
-	if (apbt_virt_address) {
-		pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
-			 (void *)apbt_address, (void *)apbt_virt_address);
-	} else {
-		pr_debug("Failed mapping APBT phy address at %p\n",\
-			 (void *)apbt_address);
+	if (!apbt_virt_address) {
+		pr_debug("Failed mapping APBT phy address at %lu\n",\
+			 (unsigned long)apbt_address);
 		goto panic_noapbt;
 	}
-	apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+	apbt_freq = mtmr->freq_hz;
 	sfi_free_mtmr(mtmr);
 
 	/* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
 		goto panic_noapbt;
 
 	/* Now figure out the physical timer id */
-	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
-		/ APBTMRS_REG_SIZE;
-	pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+	pr_debug("Use timer %d for clocksource\n",
+		 (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+		APBTMRS_REG_SIZE;
+
+	clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+		"apbt0", apbt_virt_address + phy_cs_timer_id *
+		APBTMRS_REG_SIZE, apbt_freq);
 	return;
 
 panic_noapbt:
@@ -172,83 +143,6 @@ static inline int is_apbt_capable(void)
 	return apbt_virt_address ? 1 : 0;
 }
 
-static struct clocksource clocksource_apbt = {
-	.name		= "apbt",
-	.rating		= APBT_CLOCKSOURCE_RATING,
-	.read		= apbt_read_clocksource,
-	.mask		= APBT_MASK,
-	.shift		= APBT_SHIFT,
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-	.resume		= apbt_restart_clocksource,
-};
-
-/* boot APB clock event device */
-static struct clock_event_device apbt_clockevent = {
-	.name		= "apbt0",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= apbt_set_mode,
-	.set_next_event = apbt_next_event,
-	.shift		= APBT_SHIFT,
-	.irq		= 0,
-	.rating		= APBT_CLOCKEVENT_RATING,
-};
-
-/*
- * start count down from 0xffff_ffff. this is done by toggling the enable bit
- * then load initial load count to ~0.
- */
-static void apbt_start_counter(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
-	ctrl &= ~APBTMR_CONTROL_ENABLE;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
-	/* enable, mask interrupt */
-	ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-	ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	/* read it once to get cached counter value initialized */
-	apbt_read_clocksource(&clocksource_apbt);
-}
-
-static irqreturn_t apbt_interrupt_handler(int irq, void *data)
-{
-	struct apbt_dev *dev = (struct apbt_dev *)data;
-	struct clock_event_device *aevt = &dev->evt;
-
-	if (!aevt->event_handler) {
-		printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
-		       dev->num);
-		return IRQ_NONE;
-	}
-	aevt->event_handler(aevt);
-	return IRQ_HANDLED;
-}
-
-static void apbt_restart_clocksource(struct clocksource *cs)
-{
-	apbt_start_counter(phy_cs_timer_id);
-}
-
-static void apbt_enable_int(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-	/* clear pending intr */
-	apbt_readl(n, APBTMR_N_EOI);
-	ctrl &= ~APBTMR_CONTROL_INT;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-static void apbt_disable_int(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
-	ctrl |= APBTMR_CONTROL_INT;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-
 static int __init apbt_clockevent_register(void)
 {
 	struct sfi_timer_table_entry *mtmr;
@@ -261,45 +155,21 @@ static int __init apbt_clockevent_register(void)
 		return -ENODEV;
 	}
 
-	/*
-	 * We need to calculate the scaled math multiplication factor for
-	 * nanosecond to apbt tick conversion.
-	 * mult = (nsec/cycle)*2^APBT_SHIFT
-	 */
-	apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
-				      , NSEC_PER_SEC, APBT_SHIFT);
-
-	/* Calculate the min / max delta */
-	apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
-							   &apbt_clockevent);
-	apbt_clockevent.min_delta_ns = clockevent_delta2ns(
-		APBT_MIN_DELTA_USEC*apbt_freq,
-		&apbt_clockevent);
-	/*
-	 * Start apbt with the boot cpu mask and make it
-	 * global if not used for per cpu timer.
-	 */
-	apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
 	adev->num = smp_processor_id();
-	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+	adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+		mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+		APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+		adev_virt_addr(adev), 0, apbt_freq);
+	/* Firmware does EOI handling for us. */
+	adev->timer->eoi = NULL;
 
 	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
-		adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
-		global_clock_event = &adev->evt;
+		global_clock_event = &adev->timer->ced;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
 		       global_clock_event->name);
 	}
 
-	if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
-			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
-			apbt_clockevent.name, adev)) {
-		printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-		       apbt_clockevent.irq);
-	}
-
-	clockevents_register_device(&adev->evt);
-	/* Start APBT 0 interrupts */
-	apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+	dw_apb_clockevent_register(adev->timer);
 
 	sfi_free_mtmr(mtmr);
 	return 0;
@@ -317,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
 	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 	/* APB timer irqs are set up as mp_irqs, timer is edge type */
 	__irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
-
-	if (system_state == SYSTEM_BOOTING) {
-		if (request_irq(adev->irq, apbt_interrupt_handler,
-					IRQF_TIMER | IRQF_DISABLED |
-					IRQF_NOBALANCING,
-					adev->name, adev)) {
-			printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-			       adev->num);
-		}
-	} else
-		enable_irq(adev->irq);
 }
 
 /* Should be called with per cpu */
 void apbt_setup_secondary_clock(void)
 {
 	struct apbt_dev *adev;
-	struct clock_event_device *aevt;
 	int cpu;
 
 	/* Don't register boot CPU clockevent */
 	cpu = smp_processor_id();
 	if (!cpu)
 		return;
-	/*
-	 * We need to calculate the scaled math multiplication factor for
-	 * nanosecond to apbt tick conversion.
-	 * mult = (nsec/cycle)*2^APBT_SHIFT
-	 */
-	printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
-	adev = &per_cpu(cpu_apbt_dev, cpu);
-	aevt = &adev->evt;
 
-	memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
-	aevt->cpumask = cpumask_of(cpu);
-	aevt->name = adev->name;
-	aevt->mode = CLOCK_EVT_MODE_UNUSED;
+	adev = &__get_cpu_var(cpu_apbt_dev);
+	if (!adev->timer) {
+		adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+			APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+			adev->irq, apbt_freq);
+		adev->timer->eoi = NULL;
+	} else {
+		dw_apb_clockevent_resume(adev->timer);
+	}
 
-	printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
-	       cpu, aevt->name, *(u32 *)aevt->cpumask);
+	printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+	       cpu, adev->name, adev->cpu);
 
 	apbt_setup_irq(adev);
-
-	clockevents_register_device(aevt);
-
-	apbt_enable_int(cpu);
+	dw_apb_clockevent_register(adev->timer);
 
 	return;
 }
@@ -385,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 	switch (action & 0xf) {
 	case CPU_DEAD:
-		disable_irq(adev->irq);
-		apbt_disable_int(cpu);
+		dw_apb_clockevent_pause(adev->timer);
 		if (system_state == SYSTEM_RUNNING) {
 			pr_debug("skipping APBT CPU %lu offline\n", cpu);
 		} else if (adev) {
 			pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
-			free_irq(adev->irq, adev);
+			dw_apb_clockevent_stop(adev->timer);
 		}
 		break;
 	default:
@@ -416,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
 
 #endif /* CONFIG_SMP */
 
-static void apbt_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt)
-{
-	unsigned long ctrl;
-	uint64_t delta;
-	int timer_num;
-	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
-	BUG_ON(!apbt_virt_address);
-
-	timer_num = adev->num;
-	pr_debug("%s CPU %d timer %d mode=%d\n",
-		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
-		delta >>= apbt_clockevent.shift;
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		/*
-		 * DW APB p. 46, have to disable timer before load counter,
-		 * may cause sync problem.
-		 */
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		udelay(1);
-		pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
-		apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
-		ctrl |= APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-		/* APB timer does not have one-shot mode, use free running mode */
-	case CLOCK_EVT_MODE_ONESHOT:
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		/*
-		 * set free running mode, this mode will let timer reload max
-		 * timeout which will give time (3min on 25MHz clock) to rearm
-		 * the next event, therefore emulate the one-shot mode.
-		 */
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		/* write again to set free running mode */
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-
-		/*
-		 * DW APB p. 46, load counter with all 1s before starting free
-		 * running mode.
-		 */
-		apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
-		ctrl &= ~APBTMR_CONTROL_INT;
-		ctrl |= APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		apbt_disable_int(timer_num);
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-
-	case CLOCK_EVT_MODE_RESUME:
-		apbt_enable_int(timer_num);
-		break;
-	}
-}
-
-static int apbt_next_event(unsigned long delta,
-			   struct clock_event_device *evt)
-{
-	unsigned long ctrl;
-	int timer_num;
-
-	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
-	timer_num = adev->num;
-	/* Disable timer */
-	ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-	ctrl &= ~APBTMR_CONTROL_ENABLE;
-	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-	/* write new count */
-	apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
-	ctrl |= APBTMR_CONTROL_ENABLE;
-	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-	return 0;
-}
-
-static cycle_t apbt_read_clocksource(struct clocksource *cs)
-{
-	unsigned long current_count;
-
-	current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
-	return (cycle_t)~current_count;
-}
-
 static int apbt_clocksource_register(void)
 {
 	u64 start, now;
 	cycle_t t1;
 
 	/* Start the counter, use timer 2 as source, timer 0/1 for event */
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
 	/* Verify whether apbt counter works */
-	t1 = apbt_read_clocksource(&clocksource_apbt);
+	t1 = dw_apb_clocksource_read(clocksource_apbt);
 	rdtscll(start);
 
 	/*
@@ -540,17 +291,10 @@ static int apbt_clocksource_register(void)
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
-	if (t1 == apbt_read_clocksource(&clocksource_apbt))
+	if (t1 == dw_apb_clocksource_read(clocksource_apbt))
 		panic("APBT counter not counting. APBT disabled\n");
 
-	/*
-	 * initialize and register APBT clocksource
-	 * convert that to ns/clock cycle
-	 * mult = (ns/c) * 2^APBT_SHIFT
-	 */
-	clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
-				       (unsigned long) apbt_freq, APBT_SHIFT);
-	clocksource_register(&clocksource_apbt);
+	dw_apb_clocksource_register(clocksource_apbt);
 
 	return 0;
 }
@@ -574,10 +318,7 @@ void __init apbt_time_init(void)
 	if (apb_timer_block_enabled)
 		return;
 	apbt_set_mapping();
-	if (apbt_virt_address) {
-		pr_debug("Found APBT version 0x%lx\n",\
-			 apbt_readl_reg(APBTMRS_COMP_VERSION));
-	} else
+	if (!apbt_virt_address)
 		goto out_noapbt;
 	/*
 	 * Read the frequency and check for a sane value, for ESL model
@@ -585,7 +326,7 @@ void __init apbt_time_init(void)
 	 */
 
 	if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
-		pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+		pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
 		goto out_noapbt;
 	}
 	if (apbt_clocksource_register()) {
@@ -611,30 +352,20 @@ void __init apbt_time_init(void)
 	} else {
 		percpu_timer = 0;
 		apbt_num_timers_used = 1;
-		adev = &per_cpu(cpu_apbt_dev, 0);
-		adev->flags &= ~APBT_DEV_USED;
 	}
 	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
 
 	/* here we set up per CPU timer data structure */
-	apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
-			    GFP_KERNEL);
-	if (!apbt_devs) {
-		printk(KERN_ERR "Failed to allocate APB timer devices\n");
-		return;
-	}
 	for (i = 0; i < apbt_num_timers_used; i++) {
 		adev = &per_cpu(cpu_apbt_dev, i);
 		adev->num = i;
 		adev->cpu = i;
 		p_mtmr = sfi_get_mtmr(i);
-		if (p_mtmr) {
-			adev->tick = p_mtmr->freq_hz;
+		if (p_mtmr)
 			adev->irq = p_mtmr->irq;
-		} else
+		else
 			printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
-		adev->count = 0;
-		sprintf(adev->name, "apbt%d", i);
+		snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
 	}
 #endif
 
@@ -646,17 +377,8 @@ out_noapbt:
 	panic("failed to enable APB timer\n");
 }
 
-static inline void apbt_disable(int n)
-{
-	if (is_apbt_capable()) {
-		unsigned long ctrl =  apbt_readl(n, APBTMR_N_CONTROL);
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	}
-}
-
 /* called before apb_timer_enable, use early map */
-unsigned long apbt_quick_calibrate()
+unsigned long apbt_quick_calibrate(void)
 {
 	int i, scale;
 	u64 old, new;
@@ -665,31 +387,31 @@ unsigned long apbt_quick_calibrate()
 	u32 loop, shift;
 
 	apbt_set_mapping();
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
 	/* check if the timer can count down, otherwise return */
-	old = apbt_read_clocksource(&clocksource_apbt);
+	old = dw_apb_clocksource_read(clocksource_apbt);
 	i = 10000;
 	while (--i) {
-		if (old != apbt_read_clocksource(&clocksource_apbt))
+		if (old != dw_apb_clocksource_read(clocksource_apbt))
 			break;
 	}
 	if (!i)
 		goto failed;
 
 	/* count 16 ms */
-	loop = (apbt_freq * 1000) << 4;
+	loop = (apbt_freq / 1000) << 4;
 
 	/* restart the timer to ensure it won't get to 0 in the calibration */
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
-	old = apbt_read_clocksource(&clocksource_apbt);
+	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
 	t1 = __native_read_tsc();
 
 	do {
-		new = apbt_read_clocksource(&clocksource_apbt);
+		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
 	t2 = __native_read_tsc();
@@ -701,7 +423,7 @@ unsigned long apbt_quick_calibrate()
 		return 0;
 	}
 	scale = (int)div_u64((t2 - t1), loop >> shift);
-	khz = (scale * apbt_freq * 1000) >> shift;
+	khz = (scale * (apbt_freq / 1000)) >> shift;
 	printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
 	return khz;
 failed:
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 86d1ad4962a..3d2661ca654 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -30,6 +30,22 @@
 #include <asm/amd_nb.h>
 #include <asm/x86_init.h>
 
+/*
+ * Use 512M as the lower bound, in case kexec loads a kernel_big that
+ * decompresses in place and could then overlap with the GART aperture
+ * that is still in use.
+ * Sequence:
+ * kernel_small
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_small (gart area becomes e820_reserved)
+ * ==> kexec (with kdump trigger path or gart still enabled)
+ * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
+ * So don't place the GART IOMMU below 512M; leave that space for
+ * kernel code, to be safe.
+ */
+#define GART_MIN_ADDR	(512ULL << 20)
+#define GART_MAX_ADDR	(1ULL   << 32)
+
 int gart_iommu_aperture;
 int gart_iommu_aperture_disabled __initdata;
 int gart_iommu_aperture_allowed __initdata;
@@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void)
 	 * memory. Unfortunately we cannot move it up because that would
 	 * make the IOMMU useless.
 	 */
-	/*
-	 * using 512M as goal, in case kexec will load kernel_big
-	 * that will do the on position decompress, and  could overlap with
-	 * that position with gart that is used.
-	 * sequende:
-	 * kernel_small
-	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
-	 * ==> kernel_small(gart area become e820_reserved)
-	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
-	 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
-	 * so don't use 512M below as gart iommu, leave the space for kernel
-	 * code for safe
-	 */
-	addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
-	if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
+	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
+				      aper_size, aper_size);
+	if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
 		printk(KERN_ERR
 			"Cannot allocate aperture memory hole (%lx,%uK)\n",
 				addr, aper_size>>10);
@@ -499,7 +503,7 @@ out:
 		 * Don't enable translation yet but enable GART IO and CPU
 		 * accesses and set DISTLBWALKPRB since GART table memory is UC.
 		 */
-		u32 ctl = DISTLBWALKPRB | aper_order << 1;
+		u32 ctl = aper_order << 1;
 
 		bus = amd_nb_bus_dev_ranges[i].bus;
 		dev_base = amd_nb_bus_dev_ranges[i].dev_base;
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3966b564ea4..767fd04f284 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,20 +2,25 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o probe_$(BITS).o ipi.o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o ipi.o
 obj-y				+= hw_nmi.o
 
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_SMP)		+= ipi.o
 
 ifeq ($(CONFIG_X86_64),y)
-obj-y				+= apic_flat_64.o
-obj-$(CONFIG_X86_X2APIC)	+= x2apic_cluster.o
-obj-$(CONFIG_X86_X2APIC)	+= x2apic_phys.o
+# APIC probe will depend on the listing order here
 obj-$(CONFIG_X86_UV)		+= x2apic_uv_x.o
+obj-$(CONFIG_X86_X2APIC)	+= x2apic_phys.o
+obj-$(CONFIG_X86_X2APIC)	+= x2apic_cluster.o
+obj-y				+= apic_flat_64.o
 endif
 
-obj-$(CONFIG_X86_BIGSMP)	+= bigsmp_32.o
+# APIC probe will depend on the listing order here
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
-obj-$(CONFIG_X86_ES7000)	+= es7000_32.o
 obj-$(CONFIG_X86_SUMMIT)	+= summit_32.o
+obj-$(CONFIG_X86_BIGSMP)	+= bigsmp_32.o
+obj-$(CONFIG_X86_ES7000)	+= es7000_32.o
+
+# For 32bit, probe_32 needs to be listed last
+obj-$(CONFIG_X86_LOCAL_APIC)	+= probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01eff77..52fa56399a5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/timex.h>
+#include <linux/i8253.h>
 #include <linux/dmar.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
@@ -37,9 +38,8 @@
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/mpspec.h>
-#include <asm/i8253.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
@@ -48,6 +48,7 @@
 #include <asm/hpet.h>
 #include <asm/idle.h>
 #include <asm/mtrr.h>
+#include <asm/time.h>
 #include <asm/smp.h>
 #include <asm/mce.h>
 #include <asm/tsc.h>
@@ -390,7 +391,8 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
 
 /*
  * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
  */
 
 int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
@@ -505,7 +507,7 @@ static void __cpuinit setup_APIC_timer(void)
 {
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
-	if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
+	if (this_cpu_has(X86_FEATURE_ARAT)) {
 		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
 		/* Make LAPIC timer preferrable over percpu HPET */
 		lapic_clockevent.rating = 150;
@@ -1237,6 +1239,17 @@ void __cpuinit setup_local_APIC(void)
 	/* always use the value from LDR */
 	early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
 		logical_smp_processor_id();
+
+	/*
+	 * Some NUMA implementations (NUMAQ) don't initialize apicid to
+	 * node mapping during NUMA init.  Now that logical apicid is
+	 * guaranteed to be known, give it another chance.  This is already
+	 * a bit too late - percpu allocation has already happened without
+	 * proper NUMA affinity.
+	 */
+	if (apic->x86_32_numa_cpu_node)
+		set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
+				   apic->x86_32_numa_cpu_node(cpu));
 #endif
 
 	/*
@@ -1417,7 +1430,7 @@ void enable_x2apic(void)
 	rdmsr(MSR_IA32_APICBASE, msr, msr2);
 	if (!(msr & X2APIC_ENABLE)) {
 		printk_once(KERN_INFO "Enabling x2apic\n");
-		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
 	}
 }
 #endif /* CONFIG_X86_X2APIC */
@@ -1450,7 +1463,6 @@ int __init enable_IR(void)
 void __init enable_IR_x2apic(void)
 {
 	unsigned long flags;
-	struct IO_APIC_route_entry **ioapic_entries;
 	int ret, x2apic_enabled = 0;
 	int dmar_table_init_ret;
 
@@ -1458,13 +1470,7 @@ void __init enable_IR_x2apic(void)
 	if (dmar_table_init_ret && !x2apic_supported())
 		return;
 
-	ioapic_entries = alloc_ioapic_entries();
-	if (!ioapic_entries) {
-		pr_err("Allocate ioapic_entries failed\n");
-		goto out;
-	}
-
-	ret = save_IO_APIC_setup(ioapic_entries);
+	ret = save_ioapic_entries();
 	if (ret) {
 		pr_info("Saving IO-APIC state failed: %d\n", ret);
 		goto out;
@@ -1472,7 +1478,7 @@ void __init enable_IR_x2apic(void)
 
 	local_irq_save(flags);
 	legacy_pic->mask_all();
-	mask_IO_APIC_setup(ioapic_entries);
+	mask_ioapic_entries();
 
 	if (dmar_table_init_ret)
 		ret = 0;
@@ -1503,14 +1509,11 @@ void __init enable_IR_x2apic(void)
 
 nox2apic:
 	if (!ret) /* IR enabling failed */
-		restore_IO_APIC_setup(ioapic_entries);
+		restore_ioapic_entries();
 	legacy_pic->restore_mask();
 	local_irq_restore(flags);
 
 out:
-	if (ioapic_entries)
-		free_ioapic_entries(ioapic_entries);
-
 	if (x2apic_enabled)
 		return;
 
@@ -1812,30 +1815,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
  */
 void smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v, v1;
+	u32 v0, v1;
+	u32 i = 0;
+	static const char * const error_interrupt_reason[] = {
+		"Send CS error",		/* APIC Error Bit 0 */
+		"Receive CS error",		/* APIC Error Bit 1 */
+		"Send accept error",		/* APIC Error Bit 2 */
+		"Receive accept error",		/* APIC Error Bit 3 */
+		"Redirectable IPI",		/* APIC Error Bit 4 */
+		"Send illegal vector",		/* APIC Error Bit 5 */
+		"Received illegal vector",	/* APIC Error Bit 6 */
+		"Illegal register address",	/* APIC Error Bit 7 */
+	};
 
 	exit_idle();
 	irq_enter();
 	/* First tickle the hardware, only then report what went on. -- REW */
-	v = apic_read(APIC_ESR);
+	v0 = apic_read(APIC_ESR);
 	apic_write(APIC_ESR, 0);
 	v1 = apic_read(APIC_ESR);
 	ack_APIC_irq();
 	atomic_inc(&irq_err_count);
 
-	/*
-	 * Here is what the APIC error bits mean:
-	 * 0: Send CS error
-	 * 1: Receive CS error
-	 * 2: Send accept error
-	 * 3: Receive accept error
-	 * 4: Reserved
-	 * 5: Send illegal vector
-	 * 6: Received illegal vector
-	 * 7: Illegal register address
-	 */
-	pr_debug("APIC error on CPU%d: %02x(%02x)\n",
-		smp_processor_id(), v , v1);
+	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
+		    smp_processor_id(), v0 , v1);
+
+	/* Append the reason string for each bit set in the low byte of v1. */
+	for (v1 &= 0xff; v1; v1 >>= 1, i++) {
+		if (!(v1 & 0x1))
+			continue;
+		apic_printk(APIC_DEBUG, KERN_CONT " : %s",
+			    error_interrupt_reason[i]);
+	}
+
+	apic_printk(APIC_DEBUG, KERN_CONT "\n");
+
 	irq_exit();
 }
 
@@ -1930,10 +1944,28 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 
 void __cpuinit generic_processor_info(int apicid, int version)
 {
-	int cpu;
+	int cpu, max = nr_cpu_ids;
+	bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
+				phys_cpu_present_map);
+
+	/*
+	 * If boot cpu has not been detected yet, then only allow up to
+	 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
+	 */
+	if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
+	    apicid != boot_cpu_physical_apicid) {
+		int thiscpu = max + disabled_cpus - 1;
+
+		pr_warning(
+			"ACPI: NR_CPUS/possible_cpus limit of %i almost"
+			" reached. Keeping one slot for boot cpu."
+			"  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+		disabled_cpus++;
+		return;
+	}
 
 	if (num_processors >= nr_cpu_ids) {
-		int max = nr_cpu_ids;
 		int thiscpu = max + disabled_cpus;
 
 		pr_warning(
@@ -2003,21 +2035,6 @@ void default_init_apic_ldr(void)
 	apic_write(APIC_LDR, val);
 }
 
-#ifdef CONFIG_X86_32
-int default_x86_32_numa_cpu_node(int cpu)
-{
-#ifdef CONFIG_NUMA
-	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
-
-	if (apicid != BAD_APICID)
-		return __apicid_to_node[apicid];
-	return NUMA_NO_NODE;
-#else
-	return 0;
-#endif
-}
-#endif
-
 /*
  * Power management
  */
@@ -2088,28 +2105,20 @@ static void lapic_resume(void)
 {
 	unsigned int l, h;
 	unsigned long flags;
-	int maxlvt, ret;
-	struct IO_APIC_route_entry **ioapic_entries = NULL;
+	int maxlvt;
 
 	if (!apic_pm_state.active)
 		return;
 
 	local_irq_save(flags);
 	if (intr_remapping_enabled) {
-		ioapic_entries = alloc_ioapic_entries();
-		if (!ioapic_entries) {
-			WARN(1, "Alloc ioapic_entries in lapic resume failed.");
-			goto restore;
-		}
-
-		ret = save_IO_APIC_setup(ioapic_entries);
-		if (ret) {
-			WARN(1, "Saving IO-APIC state failed: %d\n", ret);
-			free_ioapic_entries(ioapic_entries);
-			goto restore;
-		}
-
-		mask_IO_APIC_setup(ioapic_entries);
+		/*
+		 * IO-APIC and PIC have their own resume routines.
+		 * We just mask them here to make sure the interrupt
+		 * subsystem is completely quiet while we enable x2apic
+		 * and interrupt-remapping.
+		 */
+		mask_ioapic_entries();
 		legacy_pic->mask_all();
 	}
 
@@ -2152,13 +2161,9 @@ static void lapic_resume(void)
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 
-	if (intr_remapping_enabled) {
+	if (intr_remapping_enabled)
 		reenable_intr_remapping(x2apic_mode);
-		legacy_pic->restore_mask();
-		restore_IO_APIC_setup(ioapic_entries);
-		free_ioapic_entries(ioapic_entries);
-	}
-restore:
+
 	local_irq_restore(flags);
 }
 
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5652d31fe10..f7a41e4cae4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/hardirq.h>
+#include <linux/module.h>
 #include <asm/smp.h>
 #include <asm/apic.h>
 #include <asm/ipi.h>
@@ -24,6 +25,12 @@
 #include <acpi/acpi_bus.h>
 #endif
 
+static struct apic apic_physflat;
+static struct apic apic_flat;
+
+struct apic __read_mostly *apic = &apic_flat;
+EXPORT_SYMBOL_GPL(apic);
+
 static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 	return initial_apic_id >> index_msb;
 }
 
-struct apic apic_flat =  {
+static struct apic apic_flat =  {
 	.name				= "flat",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= flat_acpi_madt_oem_check,
@@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	return per_cpu(x86_cpu_to_apicid, cpu);
 }
 
-struct apic apic_physflat =  {
+/* Probe hook: 1 if physflat is already selected or > 8 possible CPUs. */
+static int physflat_probe(void)
+{
+	if (apic != &apic_physflat && num_possible_cpus() <= 8)
+		return 0;
+	return 1;
+}
+
+static struct apic apic_physflat =  {
 
 	.name				= "physical flat",
-	.probe				= NULL,
+	.probe				= physflat_probe,
 	.acpi_madt_oem_check		= physflat_acpi_madt_oem_check,
 	.apic_id_registered		= flat_apic_id_registered,
 
@@ -369,3 +384,8 @@ struct apic apic_physflat =  {
 	.wait_icr_idle			= native_apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
+
+/*
+ * We need to check for physflat first, so this order is important.
+ */
+apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f1baa2dc087..775b82bc655 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
 	WARN_ON_ONCE(cpu_has_apic && !disable_apic);
 }
 
-#ifdef CONFIG_X86_32
-static int noop_x86_32_numa_cpu_node(int cpu)
-{
-	/* we're always on node 0 */
-	return 0;
-}
-#endif
-
 struct apic apic_noop = {
 	.name				= "noop",
 	.probe				= noop_probe,
@@ -195,6 +187,5 @@ struct apic apic_noop = {
 
 #ifdef CONFIG_X86_32
 	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
-	.x86_32_numa_cpu_node		= noop_x86_32_numa_cpu_node,
 #endif
 };
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 541a2e43165..efd737e827f 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -193,7 +193,7 @@ static int probe_bigsmp(void)
 	return dmi_bigsmp;
 }
 
-struct apic apic_bigsmp = {
+static struct apic apic_bigsmp = {
 
 	.name				= "bigsmp",
 	.probe				= probe_bigsmp,
@@ -253,5 +253,14 @@ struct apic apic_bigsmp = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= bigsmp_early_logical_apicid,
-	.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
 };
+
+/* Return the bigsmp driver when probe_bigsmp() is non-zero, else NULL. */
+struct apic * __init generic_bigsmp_probe(void)
+{
+	struct apic *chosen = probe_bigsmp() ? &apic_bigsmp : NULL;
+
+	return chosen;
+}
+
+apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 3e9de4854c5..5d513bc47b6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -48,7 +48,7 @@
 #include <linux/io.h>
 
 #include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
@@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
 		nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
 }
 
-static int es7000_numa_cpu_node(int cpu)
-{
-	return 0;
-}
-
 static int es7000_cpu_present_to_apicid(int mps_cpu)
 {
 	if (!mps_cpu)
@@ -625,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
 }
 
 /* We've been warned by a false positive warning.Use __refdata to keep calm. */
-struct apic __refdata apic_es7000_cluster = {
+static struct apic __refdata apic_es7000_cluster = {
 
 	.name				= "es7000",
 	.probe				= probe_es7000,
@@ -688,10 +683,9 @@ struct apic __refdata apic_es7000_cluster = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= es7000_early_logical_apicid,
-	.x86_32_numa_cpu_node		= es7000_numa_cpu_node,
 };
 
-struct apic __refdata apic_es7000 = {
+static struct apic __refdata apic_es7000 = {
 
 	.name				= "es7000",
 	.probe				= probe_es7000,
@@ -752,5 +746,10 @@ struct apic __refdata apic_es7000 = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= es7000_early_logical_apicid,
-	.x86_32_numa_cpu_node		= es7000_numa_cpu_node,
 };
+
+/*
+ * Need to check for es7000 followed by es7000_cluster, so this order
+ * in apic_drivers is important.
+ */
+apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb..d5e57db0f7b 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
 #include <linux/delay.h>
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(void)
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
-	return (u64)(cpu_khz) * 1000 * 60;
+	return (u64)(cpu_khz) * 1000 * watchdog_thresh;
 }
 #endif
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 68df09bba92..8eb863e27ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -76,17 +76,40 @@ int sis_apic_bug = -1;
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_RAW_SPINLOCK(vector_lock);
 
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
+static struct ioapic {
+	/* Number of IRQ routing registers (== number of pins) */
+	int nr_registers;
+	/* Saved state during suspend/resume, or while enabling intr-remap. */
+	struct IO_APIC_route_entry *saved_registers;
+	/* I/O APIC config */
+	struct mpc_ioapic mp_config;
+	/* IO APIC gsi routing info */
+	struct mp_ioapic_gsi  gsi_config;
+	/*
+	 * Pins already programmed by io_apic_setup_irq_pin_once(); a set bit
+	 * makes that function skip reprogramming the pin.
+	 */
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} ioapics[MAX_IO_APICS];
 
-/* I/O APIC entries */
-struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
+#define mpc_ioapic_ver(id)		ioapics[id].mp_config.apicver
+
+int mpc_ioapic_id(int id)
+{
+	return ioapics[id].mp_config.apicid;
+}
 
-/* IO APIC gsi routing info */
-struct mp_ioapic_gsi  mp_gsi_routing[MAX_IO_APICS];
+unsigned int mpc_ioapic_addr(int id)
+{
+	return ioapics[id].mp_config.apicaddr;
+}
+
+struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
+{
+	return &ioapics[id].gsi_config;
+}
+
+int nr_ioapics;
 
 /* The one past the highest gsi number used */
 u32 gsi_top;
@@ -128,8 +151,8 @@ static int __init parse_noapic(char *str)
 }
 early_param("noapic", parse_noapic);
 
-static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
-				      struct io_apic_irq_attr *attr);
+static int io_apic_setup_irq_pin(unsigned int irq, int node,
+				 struct io_apic_irq_attr *attr);
 
 /* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
 void mp_save_irq(struct mpc_intsrc *m)
@@ -179,6 +202,14 @@ int __init arch_early_irq_init(void)
 		io_apic_irqs = ~0UL;
 	}
 
+	for (i = 0; i < nr_ioapics; i++) {
+		ioapics[i].saved_registers =
+			kzalloc(sizeof(struct IO_APIC_route_entry) *
+				ioapics[i].nr_registers, GFP_KERNEL);
+		if (!ioapics[i].saved_registers)
+			pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
+	}
+
 	cfg = irq_cfgx;
 	count = ARRAY_SIZE(irq_cfgx);
 	node = cpu_to_node(0);
@@ -297,7 +328,7 @@ struct io_apic {
 static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 {
 	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].apicaddr & ~PAGE_MASK);
+		+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
 }
 
 static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -573,7 +604,7 @@ static void clear_IO_APIC (void)
 	int apic, pin;
 
 	for (apic = 0; apic < nr_ioapics; apic++)
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
 			clear_IO_APIC_pin(apic, pin);
 }
 
@@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
 __setup("pirq=", ioapic_pirq_setup);
 #endif /* CONFIG_X86_32 */
 
-struct IO_APIC_route_entry **alloc_ioapic_entries(void)
-{
-	int apic;
-	struct IO_APIC_route_entry **ioapic_entries;
-
-	ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
-				GFP_KERNEL);
-	if (!ioapic_entries)
-		return 0;
-
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		ioapic_entries[apic] =
-			kzalloc(sizeof(struct IO_APIC_route_entry) *
-				nr_ioapic_registers[apic], GFP_KERNEL);
-		if (!ioapic_entries[apic])
-			goto nomem;
-	}
-
-	return ioapic_entries;
-
-nomem:
-	while (--apic >= 0)
-		kfree(ioapic_entries[apic]);
-	kfree(ioapic_entries);
-
-	return 0;
-}
-
 /*
  * Saves all the IO-APIC RTE's
  */
-int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int save_ioapic_entries(void)
 {
 	int apic, pin;
-
-	if (!ioapic_entries)
-		return -ENOMEM;
+	int err = 0;
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
-		if (!ioapic_entries[apic])
-			return -ENOMEM;
+		if (!ioapics[apic].saved_registers) {
+			err = -ENOMEM;
+			continue;
+		}
 
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
-			ioapic_entries[apic][pin] =
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
+			ioapics[apic].saved_registers[pin] =
 				ioapic_read_entry(apic, pin);
 	}
 
-	return 0;
+	return err;
 }
 
 /*
  * Mask all IO APIC entries.
  */
-void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+void mask_ioapic_entries(void)
 {
 	int apic, pin;
 
-	if (!ioapic_entries)
-		return;
-
 	for (apic = 0; apic < nr_ioapics; apic++) {
-		if (!ioapic_entries[apic])
-			break;
+		if (!ioapics[apic].saved_registers)
+			continue;
 
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
 			struct IO_APIC_route_entry entry;
 
-			entry = ioapic_entries[apic][pin];
+			entry = ioapics[apic].saved_registers[pin];
 			if (!entry.mask) {
 				entry.mask = 1;
 				ioapic_write_entry(apic, pin, entry);
@@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
 }
 
 /*
- * Restore IO APIC entries which was saved in ioapic_entries.
+ * Restore IO APIC entries which were saved in the ioapic structure.
  */
-int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
+int restore_ioapic_entries(void)
 {
 	int apic, pin;
 
-	if (!ioapic_entries)
-		return -ENOMEM;
-
 	for (apic = 0; apic < nr_ioapics; apic++) {
-		if (!ioapic_entries[apic])
-			return -ENOMEM;
+		if (!ioapics[apic].saved_registers)
+			continue;
 
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
 			ioapic_write_entry(apic, pin,
-					ioapic_entries[apic][pin]);
+					   ioapics[apic].saved_registers[pin]);
 	}
 	return 0;
 }
 
-void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
-{
-	int apic;
-
-	for (apic = 0; apic < nr_ioapics; apic++)
-		kfree(ioapic_entries[apic]);
-
-	kfree(ioapic_entries);
-}
-
 /*
  * Find the IRQ entry number of a certain pin.
  */
@@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
 
 	for (i = 0; i < mp_irq_entries; i++)
 		if (mp_irqs[i].irqtype == type &&
-		    (mp_irqs[i].dstapic == mp_ioapics[apic].apicid ||
+		    (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
 		     mp_irqs[i].dstapic == MP_APIC_ALL) &&
 		    mp_irqs[i].dstirq == pin)
 			return i;
@@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
 	if (i < mp_irq_entries) {
 		int apic;
 		for(apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic)
+			if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
 				return apic;
 		}
 	}
@@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq;
 	int bus = mp_irqs[idx].srcbus;
+	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
 
 	/*
 	 * Debugging check, we are in big trouble if this message pops up!
@@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
 	if (test_bit(bus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
 	} else {
-		u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
+		u32 gsi = gsi_cfg->gsi_base + pin;
 
 		if (gsi >= NR_IRQS_LEGACY)
 			irq = gsi;
@@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 		int lbus = mp_irqs[i].srcbus;
 
 		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+			if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
 			    mp_irqs[i].dstapic == MP_APIC_ALL)
 				break;
 
@@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq)
 	int apic, idx, pin;
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
 			idx = find_irq_entry(apic, pin, mp_INT);
 			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
 				return irq_trigger(idx);
@@ -1307,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
 		 * irq handler will do the explicit EOI to the io-apic.
 		 */
 		ir_entry->vector = pin;
+
+		apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+			"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+			"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+			"Avail:%X Vector:%02X Dest:%08X "
+			"SID:%04X SQ:%X SVT:%X)\n",
+			apic_id, irte.present, irte.fpd, irte.dst_mode,
+			irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+			irte.avail, irte.vector, irte.dest_id,
+			irte.sid, irte.sq, irte.svt);
 	} else {
 		entry->delivery_mode = apic->irq_delivery_mode;
 		entry->dest_mode = apic->irq_dest_mode;
@@ -1349,15 +1347,15 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
 
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-		    "IRQ %d Mode:%i Active:%i)\n",
-		    apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector,
-		    irq, trigger, polarity);
+		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
+		    apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
+		    irq, trigger, polarity, dest);
 
 
-	if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry,
+	if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
 			       dest, trigger, polarity, cfg->vector, pin)) {
 		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-		       mp_ioapics[apic_id].apicid, pin);
+		       mpc_ioapic_id(apic_id), pin);
 		__clear_irq_vector(irq, cfg);
 		return;
 	}
@@ -1369,17 +1367,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
 	ioapic_write_entry(apic_id, pin, entry);
 }
 
-static struct {
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
 static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
 {
 	if (idx != -1)
 		return false;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
-		    mp_ioapics[apic_id].apicid, pin);
+		    mpc_ioapic_id(apic_id), pin);
 	return true;
 }
 
@@ -1389,7 +1383,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id)
 	struct io_apic_irq_attr attr;
 	unsigned int pin, irq;
 
-	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+	for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
 		idx = find_irq_entry(apic_id, pin, mp_INT);
 		if (io_apic_pin_not_connected(idx, apic_id, pin))
 			continue;
@@ -1511,7 +1505,7 @@ __apicdebuginit(void) print_IO_APIC(void)
 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].apicid, nr_ioapic_registers[i]);
+		       mpc_ioapic_id(i), ioapics[i].nr_registers);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -1531,17 +1525,19 @@ __apicdebuginit(void) print_IO_APIC(void)
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
+	printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
 	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
 	printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
 
 	printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
-	printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
+	printk(KERN_DEBUG ".......     : max redirection entries: %02X\n",
+		reg_01.bits.entries);
 
 	printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
-	printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
+	printk(KERN_DEBUG ".......     : IO APIC version: %02X\n",
+		reg_01.bits.version);
 
 	/*
 	 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1566,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
 
-	printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
-			  " Stat Dmod Deli Vect:\n");
+	if (intr_remapping_enabled) {
+		printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+			" Pol Stat Indx2 Zero Vect:\n");
+	} else {
+		printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+			" Stat Dmod Deli Vect:\n");
+	}
 
 	for (i = 0; i <= reg_01.bits.entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		printk(KERN_DEBUG " %02x %03X ",
-			i,
-			entry.dest
-		);
+		if (intr_remapping_enabled) {
+			struct IO_APIC_route_entry entry;
+			struct IR_IO_APIC_route_entry *ir_entry;
+
+			entry = ioapic_read_entry(apic, i);
+			ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+			printk(KERN_DEBUG " %02x %04X ",
+				i,
+				ir_entry->index
+			);
+			printk("%1d   %1d    %1d    %1d   %1d   "
+				"%1d    %1d     %X    %02X\n",
+				ir_entry->format,
+				ir_entry->mask,
+				ir_entry->trigger,
+				ir_entry->irr,
+				ir_entry->polarity,
+				ir_entry->delivery_status,
+				ir_entry->index2,
+				ir_entry->zero,
+				ir_entry->vector
+			);
+		} else {
+			struct IO_APIC_route_entry entry;
 
-		printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector
-		);
+			entry = ioapic_read_entry(apic, i);
+			printk(KERN_DEBUG " %02x %02X  ",
+				i,
+				entry.dest
+			);
+			printk("%1d    %1d    %1d   %1d   %1d    "
+				"%1d    %1d    %02X\n",
+				entry.mask,
+				entry.trigger,
+				entry.irr,
+				entry.polarity,
+				entry.delivery_status,
+				entry.dest_mode,
+				entry.delivery_mode,
+				entry.vector
+			);
+		}
 	}
 	}
+
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
@@ -1808,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
 	return 0;
 }
 
-fs_initcall(print_ICs);
+late_initcall(print_ICs);
 
 
 /* Where if anywhere is the i8259 connect in external int mode */
@@ -1825,7 +1850,7 @@ void __init enable_IO_APIC(void)
 	for(apic = 0; apic < nr_ioapics; apic++) {
 		int pin;
 		/* See if any of the pins is in ExtINT mode */
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+		for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
 			struct IO_APIC_route_entry entry;
 			entry = ioapic_read_entry(apic, pin);
 
@@ -1949,14 +1974,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 		reg_00.raw = io_apic_read(apic_id, 0);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 
-		old_id = mp_ioapics[apic_id].apicid;
+		old_id = mpc_ioapic_id(apic_id);
 
-		if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) {
+		if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-				apic_id, mp_ioapics[apic_id].apicid);
+				apic_id, mpc_ioapic_id(apic_id));
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				reg_00.bits.ID);
-			mp_ioapics[apic_id].apicid = reg_00.bits.ID;
+			ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
 		}
 
 		/*
@@ -1965,9 +1990,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 		 * 'stuck on smp_invalidate_needed IPI wait' messages.
 		 */
 		if (apic->check_apicid_used(&phys_id_present_map,
-					mp_ioapics[apic_id].apicid)) {
+					    mpc_ioapic_id(apic_id))) {
 			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-				apic_id, mp_ioapics[apic_id].apicid);
+				apic_id, mpc_ioapic_id(apic_id));
 			for (i = 0; i < get_physical_broadcast(); i++)
 				if (!physid_isset(i, phys_id_present_map))
 					break;
@@ -1976,13 +2001,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
 				i);
 			physid_set(i, phys_id_present_map);
-			mp_ioapics[apic_id].apicid = i;
+			ioapics[apic_id].mp_config.apicid = i;
 		} else {
 			physid_mask_t tmp;
-			apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
+			apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
+						    &tmp);
 			apic_printk(APIC_VERBOSE, "Setting %d in the "
 					"phys_id_present_map\n",
-					mp_ioapics[apic_id].apicid);
+					mpc_ioapic_id(apic_id));
 			physids_or(phys_id_present_map, phys_id_present_map, tmp);
 		}
 
@@ -1990,24 +2016,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 		 * We need to adjust the IRQ routing table
 		 * if the ID changed.
 		 */
-		if (old_id != mp_ioapics[apic_id].apicid)
+		if (old_id != mpc_ioapic_id(apic_id))
 			for (i = 0; i < mp_irq_entries; i++)
 				if (mp_irqs[i].dstapic == old_id)
 					mp_irqs[i].dstapic
-						= mp_ioapics[apic_id].apicid;
+						= mpc_ioapic_id(apic_id);
 
 		/*
 		 * Update the ID register according to the right value
 		 * from the MPC table if they are different.
 		 */
-		if (mp_ioapics[apic_id].apicid == reg_00.bits.ID)
+		if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
 			continue;
 
 		apic_printk(APIC_VERBOSE, KERN_INFO
 			"...changing IO-APIC physical APIC ID to %d ...",
-			mp_ioapics[apic_id].apicid);
+			mpc_ioapic_id(apic_id));
 
-		reg_00.bits.ID = mp_ioapics[apic_id].apicid;
+		reg_00.bits.ID = mpc_ioapic_id(apic_id);
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic_id, 0, reg_00.raw);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2018,7 +2044,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
 		reg_00.raw = io_apic_read(apic_id, 0);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
+		if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
 			printk("could not set ID!\n");
 		else
 			apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2404,7 +2430,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		if (mp_ioapics[entry->apic].apicver >= 0x20) {
+		if (mpc_ioapic_ver(entry->apic) >= 0x20) {
 			/*
 			 * Intr-remapping uses pin number as the virtual vector
 			 * in the RTE. Actual vector is programmed in
@@ -2918,49 +2944,19 @@ static int __init io_apic_bug_finalize(void)
 
 late_initcall(io_apic_bug_finalize);
 
-static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS];
-
-static void suspend_ioapic(int ioapic_id)
+static void resume_ioapic_id(int ioapic_id)
 {
-	struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
-	int i;
-
-	if (!saved_data)
-		return;
-
-	for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
-		saved_data[i] = ioapic_read_entry(ioapic_id, i);
-}
-
-static int ioapic_suspend(void)
-{
-	int ioapic_id;
-
-	for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
-		suspend_ioapic(ioapic_id);
-
-	return 0;
-}
-
-static void resume_ioapic(int ioapic_id)
-{
-	struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
 	unsigned long flags;
 	union IO_APIC_reg_00 reg_00;
-	int i;
 
-	if (!saved_data)
-		return;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	reg_00.raw = io_apic_read(ioapic_id, 0);
-	if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) {
-		reg_00.bits.ID = mp_ioapics[ioapic_id].apicid;
+	if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
+		reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
 		io_apic_write(ioapic_id, 0, reg_00.raw);
 	}
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
-		ioapic_write_entry(ioapic_id, i, saved_data[i]);
 }
 
 static void ioapic_resume(void)
@@ -2968,28 +2964,18 @@ static void ioapic_resume(void)
 	int ioapic_id;
 
 	for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
-		resume_ioapic(ioapic_id);
+		resume_ioapic_id(ioapic_id);
+
+	restore_ioapic_entries();
 }
 
 static struct syscore_ops ioapic_syscore_ops = {
-	.suspend = ioapic_suspend,
+	.suspend = save_ioapic_entries,
 	.resume = ioapic_resume,
 };
 
 static int __init ioapic_init_ops(void)
 {
-	int i;
-
-	for (i = 0; i < nr_ioapics; i++) {
-		unsigned int size;
-
-		size = nr_ioapic_registers[i]
-			* sizeof(struct IO_APIC_route_entry);
-		ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
-		if (!ioapic_saved_data[i])
-			pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
-	}
-
 	register_syscore_ops(&ioapic_syscore_ops);
 
 	return 0;
@@ -3570,7 +3556,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 }
 #endif /* CONFIG_HT_IRQ */
 
-int
+static int
 io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
 {
 	struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
@@ -3585,21 +3571,21 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
 	return ret;
 }
 
-static int io_apic_setup_irq_pin_once(unsigned int irq, int node,
-				      struct io_apic_irq_attr *attr)
+int io_apic_setup_irq_pin_once(unsigned int irq, int node,
+			       struct io_apic_irq_attr *attr)
 {
 	unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
 	int ret;
 
 	/* Avoid redundant programming */
-	if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) {
+	if (test_bit(pin, ioapics[id].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapics[id].apicid, pin);
+			 mpc_ioapic_id(id), pin);
 		return 0;
 	}
 	ret = io_apic_setup_irq_pin(irq, node, attr);
 	if (!ret)
-		set_bit(pin, mp_ioapic_routing[id].pin_programmed);
+		set_bit(pin, ioapics[id].pin_programmed);
 	return ret;
 }
 
@@ -3764,8 +3750,7 @@ static u8 __init io_apic_unique_id(u8 id)
 
 	bitmap_zero(used, 256);
 	for (i = 0; i < nr_ioapics; i++) {
-		struct mpc_ioapic *ia = &mp_ioapics[i];
-		__set_bit(ia->apicid, used);
+		__set_bit(mpc_ioapic_id(i), used);
 	}
 	if (!test_bit(id, used))
 		return id;
@@ -3825,7 +3810,7 @@ void __init setup_ioapic_dest(void)
 		return;
 
 	for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
-	for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+	for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
 		irq_entry = find_irq_entry(ioapic, pin, mp_INT);
 		if (irq_entry == -1)
 			continue;
@@ -3896,7 +3881,7 @@ void __init ioapic_and_gsi_init(void)
 	ioapic_res = ioapic_setup_resources(nr_ioapics);
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
-			ioapic_phys = mp_ioapics[i].apicaddr;
+			ioapic_phys = mpc_ioapic_addr(i);
 #ifdef CONFIG_X86_32
 			if (!ioapic_phys) {
 				printk(KERN_ERR
@@ -3956,8 +3941,9 @@ int mp_find_ioapic(u32 gsi)
 
 	/* Find the IOAPIC that manages this GSI. */
 	for (i = 0; i < nr_ioapics; i++) {
-		if ((gsi >= mp_gsi_routing[i].gsi_base)
-		    && (gsi <= mp_gsi_routing[i].gsi_end))
+		struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
+		if ((gsi >= gsi_cfg->gsi_base)
+		    && (gsi <= gsi_cfg->gsi_end))
 			return i;
 	}
 
@@ -3967,12 +3953,16 @@ int mp_find_ioapic(u32 gsi)
 
 int mp_find_ioapic_pin(int ioapic, u32 gsi)
 {
+	struct mp_ioapic_gsi *gsi_cfg;
+
 	if (WARN_ON(ioapic == -1))
 		return -1;
-	if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end))
+
+	gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+	if (WARN_ON(gsi > gsi_cfg->gsi_end))
 		return -1;
 
-	return gsi - mp_gsi_routing[ioapic].gsi_base;
+	return gsi - gsi_cfg->gsi_base;
 }
 
 static __init int bad_ioapic(unsigned long address)
@@ -3994,40 +3984,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 {
 	int idx = 0;
 	int entries;
+	struct mp_ioapic_gsi *gsi_cfg;
 
 	if (bad_ioapic(address))
 		return;
 
 	idx = nr_ioapics;
 
-	mp_ioapics[idx].type = MP_IOAPIC;
-	mp_ioapics[idx].flags = MPC_APIC_USABLE;
-	mp_ioapics[idx].apicaddr = address;
+	ioapics[idx].mp_config.type = MP_IOAPIC;
+	ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
+	ioapics[idx].mp_config.apicaddr = address;
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-	mp_ioapics[idx].apicid = io_apic_unique_id(id);
-	mp_ioapics[idx].apicver = io_apic_get_version(idx);
+	ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
+	ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
 
 	/*
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
 	entries = io_apic_get_redir_entries(idx);
-	mp_gsi_routing[idx].gsi_base = gsi_base;
-	mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+	gsi_cfg = mp_ioapic_gsi_routing(idx);
+	gsi_cfg->gsi_base = gsi_base;
+	gsi_cfg->gsi_end = gsi_base + entries - 1;
 
 	/*
 	 * The number of IO-APIC IRQ registers (== #pins):
 	 */
-	nr_ioapic_registers[idx] = entries;
+	ioapics[idx].nr_registers = entries;
 
-	if (mp_gsi_routing[idx].gsi_end >= gsi_top)
-		gsi_top = mp_gsi_routing[idx].gsi_end + 1;
+	if (gsi_cfg->gsi_end >= gsi_top)
+		gsi_top = gsi_cfg->gsi_end + 1;
 
 	printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
-	       "GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
-	       mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr,
-	       mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end);
+	       "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
+	       mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
+	       gsi_cfg->gsi_base, gsi_cfg->gsi_end);
 
 	nr_ioapics++;
 }
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134..c4a61ca1349 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -48,8 +48,6 @@
 #include <asm/e820.h>
 #include <asm/ipi.h>
 
-#define	MB_TO_PAGES(addr)		((addr) << (20 - PAGE_SHIFT))
-
 int found_numaq;
 
 /*
@@ -79,31 +77,20 @@ int					quad_local_to_mp_bus_id[NR_CPUS/4][4];
 static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
 {
 	struct eachquadmem *eq = scd->eq + node;
+	u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
+	u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
+	int ret;
 
-	node_set_online(node);
-
-	/* Convert to pages */
-	node_start_pfn[node] =
-		 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
-
-	node_end_pfn[node] =
-		 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
-
-	memblock_x86_register_active_regions(node, node_start_pfn[node],
-						node_end_pfn[node]);
-
-	memory_present(node, node_start_pfn[node], node_end_pfn[node]);
-
-	node_remap_size[node] = node_memmap_size_bytes(node,
-					node_start_pfn[node],
-					node_end_pfn[node]);
+	node_set(node, numa_nodes_parsed);
+	ret = numa_add_memblk(node, start, end);
+	BUG_ON(ret < 0);
 }
 
 /*
  * Function: smp_dump_qct()
  *
  * Description: gets memory layout from the quad config table.  This
- * function also updates node_online_map with the nodes (quads) present.
+ * function also updates numa_nodes_parsed with the nodes (quads) present.
  */
 static void __init smp_dump_qct(void)
 {
@@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
 
 	scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
 
-	nodes_clear(node_online_map);
 	for_each_node(node) {
 		if (scd->quads_present31_0 & (1 << node))
 			numaq_register_node(node, scd);
@@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
 	}
 }
 
-int __init get_memcfg_numaq(void)
+int __init numaq_numa_init(void)
 {
 	early_check_numaq();
 	if (!found_numaq)
-		return 0;
+		return -ENOENT;
 	smp_dump_qct();
 
-	return 1;
+	return 0;
 }
 
 #define NUMAQ_APIC_DFR_VALUE	(APIC_DFR_CLUSTER)
@@ -486,8 +472,8 @@ static void numaq_setup_portio_remap(void)
 		(u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
 }
 
-/* Use __refdata to keep false positive warning calm.	*/
-struct apic __refdata apic_numaq = {
+/* Use __refdata to silence a false-positive warning. */
+static struct apic __refdata apic_numaq = {
 
 	.name				= "NUMAQ",
 	.probe				= probe_numaq,
@@ -551,3 +537,5 @@ struct apic __refdata apic_numaq = {
 	.x86_32_early_logical_apicid	= noop_x86_32_early_logical_apicid,
 	.x86_32_numa_cpu_node		= numaq_numa_cpu_node,
 };
+
+apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fc84c7b6110..b5254ad044a 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,31 +52,6 @@ static int __init print_ipi_mode(void)
 }
 late_initcall(print_ipi_mode);
 
-void __init default_setup_apic_routing(void)
-{
-	int version = apic_version[boot_cpu_physical_apicid];
-
-	if (num_possible_cpus() > 8) {
-		switch (boot_cpu_data.x86_vendor) {
-		case X86_VENDOR_INTEL:
-			if (!APIC_XAPIC(version)) {
-				def_to_bigsmp = 0;
-				break;
-			}
-			/* If P4 and above fall through */
-		case X86_VENDOR_AMD:
-			def_to_bigsmp = 1;
-		}
-	}
-
-#ifdef CONFIG_X86_BIGSMP
-	generic_bigsmp_probe();
-#endif
-
-	if (apic->setup_apic_routing)
-		apic->setup_apic_routing();
-}
-
 static int default_x86_32_early_logical_apicid(int cpu)
 {
 	return 1 << cpu;
@@ -112,7 +87,7 @@ static int probe_default(void)
 	return 1;
 }
 
-struct apic apic_default = {
+static struct apic apic_default = {
 
 	.name				= "default",
 	.probe				= probe_default,
@@ -172,47 +147,24 @@ struct apic apic_default = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= default_x86_32_early_logical_apicid,
-	.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
 };
 
-extern struct apic apic_numaq;
-extern struct apic apic_summit;
-extern struct apic apic_bigsmp;
-extern struct apic apic_es7000;
-extern struct apic apic_es7000_cluster;
+apic_driver(apic_default);
 
 struct apic *apic = &apic_default;
 EXPORT_SYMBOL_GPL(apic);
 
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_NUMAQ
-	&apic_numaq,
-#endif
-#ifdef CONFIG_X86_SUMMIT
-	&apic_summit,
-#endif
-#ifdef CONFIG_X86_BIGSMP
-	&apic_bigsmp,
-#endif
-#ifdef CONFIG_X86_ES7000
-	&apic_es7000,
-	&apic_es7000_cluster,
-#endif
-	&apic_default,	/* must be last */
-	NULL,
-};
-
 static int cmdline_apic __initdata;
 static int __init parse_apic(char *arg)
 {
-	int i;
+	struct apic **drv;
 
 	if (!arg)
 		return -EINVAL;
 
-	for (i = 0; apic_probe[i]; i++) {
-		if (!strcmp(apic_probe[i]->name, arg)) {
-			apic = apic_probe[i];
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if (!strcmp((*drv)->name, arg)) {
+			apic = *drv;
 			cmdline_apic = 1;
 			return 0;
 		}
@@ -223,38 +175,58 @@ static int __init parse_apic(char *arg)
 }
 early_param("apic", parse_apic);
 
-void __init generic_bigsmp_probe(void)
+void __init default_setup_apic_routing(void)
 {
+	int version = apic_version[boot_cpu_physical_apicid];
+
+	if (num_possible_cpus() > 8) {
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			if (!APIC_XAPIC(version)) {
+				def_to_bigsmp = 0;
+				break;
+			}
+			/* If P4 and above fall through */
+		case X86_VENDOR_AMD:
+			def_to_bigsmp = 1;
+		}
+	}
+
 #ifdef CONFIG_X86_BIGSMP
 	/*
-	 * This routine is used to switch to bigsmp mode when
+	 * This is used to switch to bigsmp mode when
 	 * - There is no apic= option specified by the user
 	 * - generic_apic_probe() has chosen apic_default as the sub_arch
 	 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
 	 */
 
 	if (!cmdline_apic && apic == &apic_default) {
-		if (apic_bigsmp.probe()) {
-			apic = &apic_bigsmp;
+		struct apic *bigsmp = generic_bigsmp_probe();
+		if (bigsmp) {
+			apic = bigsmp;
 			printk(KERN_INFO "Overriding APIC driver with %s\n",
 			       apic->name);
 		}
 	}
 #endif
+
+	if (apic->setup_apic_routing)
+		apic->setup_apic_routing();
 }
 
 void __init generic_apic_probe(void)
 {
 	if (!cmdline_apic) {
-		int i;
-		for (i = 0; apic_probe[i]; i++) {
-			if (apic_probe[i]->probe()) {
-				apic = apic_probe[i];
+		struct apic **drv;
+
+		for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+			if ((*drv)->probe()) {
+				apic = *drv;
 				break;
 			}
 		}
 		/* Not visible without early console */
-		if (!apic_probe[i])
+		if (drv == __apicdrivers_end)
 			panic("Didn't find an APIC driver");
 	}
 	printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -265,16 +237,16 @@ void __init generic_apic_probe(void)
 int __init
 generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
 {
-	int i;
+	struct apic **drv;
 
-	for (i = 0; apic_probe[i]; ++i) {
-		if (!apic_probe[i]->mps_oem_check)
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if (!((*drv)->mps_oem_check))
 			continue;
-		if (!apic_probe[i]->mps_oem_check(mpc, oem, productid))
+		if (!(*drv)->mps_oem_check(mpc, oem, productid))
 			continue;
 
 		if (!cmdline_apic) {
-			apic = apic_probe[i];
+			apic = *drv;
 			printk(KERN_INFO "Switched to APIC driver `%s'.\n",
 			       apic->name);
 		}
@@ -285,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
 
 int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int i;
+	struct apic **drv;
 
-	for (i = 0; apic_probe[i]; ++i) {
-		if (!apic_probe[i]->acpi_madt_oem_check)
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if (!(*drv)->acpi_madt_oem_check)
 			continue;
-		if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id))
+		if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
 			continue;
 
 		if (!cmdline_apic) {
-			apic = apic_probe[i];
+			apic = *drv;
 			printk(KERN_INFO "Switched to APIC driver `%s'.\n",
 			       apic->name);
 		}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index d8c4a6feb28..3fe98669892 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
 #include <asm/ipi.h>
 #include <asm/setup.h>
 
-extern struct apic apic_flat;
-extern struct apic apic_physflat;
-extern struct apic apic_x2xpic_uv_x;
-extern struct apic apic_x2apic_phys;
-extern struct apic apic_x2apic_cluster;
-
-struct apic __read_mostly *apic = &apic_flat;
-EXPORT_SYMBOL_GPL(apic);
-
-static struct apic *apic_probe[] __initdata = {
-#ifdef CONFIG_X86_UV
-	&apic_x2apic_uv_x,
-#endif
-#ifdef CONFIG_X86_X2APIC
-	&apic_x2apic_phys,
-	&apic_x2apic_cluster,
-#endif
-	&apic_physflat,
-	NULL,
-};
-
 static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
 {
 	return hard_smp_processor_id() >> index_msb;
@@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
  */
 void __init default_setup_apic_routing(void)
 {
+	struct apic **drv;
 
 	enable_IR_x2apic();
 
-#ifdef CONFIG_X86_X2APIC
-	if (x2apic_mode
-#ifdef CONFIG_X86_UV
-		       && apic != &apic_x2apic_uv_x
-#endif
-		       ) {
-		if (x2apic_phys)
-			apic = &apic_x2apic_phys;
-		else
-			apic = &apic_x2apic_cluster;
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if ((*drv)->probe && (*drv)->probe()) {
+			if (apic != *drv) {
+				apic = *drv;
+				pr_info("Switched APIC routing to %s.\n",
+					apic->name);
+			}
+			break;
+		}
 	}
-#endif
-
-	if (apic == &apic_flat && num_possible_cpus() > 8)
-			apic = &apic_physflat;
-
-	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 
 	if (is_vsmp_box()) {
 		/* need to update phys_pkg_id */
@@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector)
 
 int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int i;
+	struct apic **drv;
 
-	for (i = 0; apic_probe[i]; ++i) {
-		if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
-			apic = apic_probe[i];
-			printk(KERN_INFO "Setting APIC routing to %s.\n",
-				apic->name);
+	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
+		if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
+			if (apic != *drv) {
+				apic = *drv;
+				pr_info("Setting APIC routing to %s.\n",
+					apic->name);
+			}
 			return 1;
 		}
 	}
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index e4b8059b414..19114423c58 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -491,7 +491,7 @@ void setup_summit(void)
 }
 #endif
 
-struct apic apic_summit = {
+static struct apic apic_summit = {
 
 	.name				= "summit",
 	.probe				= probe_summit,
@@ -551,5 +551,6 @@ struct apic apic_summit = {
 	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 
 	.x86_32_early_logical_apicid	= summit_early_logical_apicid,
-	.x86_32_numa_cpu_node		= default_x86_32_numa_cpu_node,
 };
+
+apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 90949bbd566..50079587582 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>
 
 #include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
 
 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	return x2apic_enabled();
 }
 
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
+static inline u32 x2apic_cluster(int cpu)
 {
-	return cpu_online_mask;
-}
-
-/*
- * for now each logical cpu is in its own vector allocation domain.
- */
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
+	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
 }
 
 static void
- __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 {
-	unsigned long cfg;
+	struct cpumask *cpus_in_cluster_ptr;
+	struct cpumask *ipi_mask_ptr;
+	unsigned int cpu, this_cpu;
+	unsigned long flags;
+	u32 dest;
+
+	x2apic_wrmsr_fence();
+
+	local_irq_save(flags);
 
-	cfg = __prepare_ICR(0, vector, dest);
+	this_cpu = smp_processor_id();
 
 	/*
-	 * send the IPI.
+	 * We are going to modify the mask, so we need our own copy
+	 * and must be sure it is manipulated with irqs off.
 	 */
-	native_x2apic_icr_write(cfg, apicid);
-}
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
 
-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
-{
-	unsigned long query_cpu;
-	unsigned long flags;
+	/*
+	 * The idea is to send one IPI per cluster.
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
 
-	x2apic_wrmsr_fence();
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+		dest = 0;
 
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
+		/* Collect cpus in cluster. */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
+			if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+		}
+
+		if (!dest)
+			continue;
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * Cluster sibling cpus should be discarded now so
+		 * we do not send them the IPI a second time.
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
+
 	local_irq_restore(flags);
 }
 
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
 static void
  x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
 }
 
 static void x2apic_send_IPI_allbutself(int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
-	return 1;
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
 }
 
 static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	return per_cpu(x86_cpu_to_logical_apicid, cpu);
 }
 
-static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
+static void init_x2apic_ldr(void)
 {
-	unsigned int id;
+	unsigned int this_cpu = smp_processor_id();
+	unsigned int cpu;
 
-	id = x;
-	return id;
+	per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
+
+	__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+	for_each_online_cpu(cpu) {
+		if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+			continue;
+		__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+		__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+	}
 }
 
-static unsigned long set_apic_id(unsigned int id)
+ /*
+  * At CPU state changes, update the x2apic cluster sibling info.
+  */
+static int __cpuinit
+update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
-	unsigned long x;
+	unsigned int this_cpu = (unsigned long)hcpu;
+	unsigned int cpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
+					GFP_KERNEL)) {
+			err = -ENOMEM;
+		} else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
+					       GFP_KERNEL)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+			err = -ENOMEM;
+		}
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+		free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
+		free_cpumask_var(per_cpu(ipi_mask, this_cpu));
+		break;
+	}
 
-	x = id;
-	return x;
+	return notifier_from_errno(err);
 }
 
-static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb)
-{
-	return initial_apicid >> index_msb;
-}
+static struct notifier_block __refdata x2apic_cpu_notifier = {
+	.notifier_call = update_clusterinfo,
+};
 
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_init_cpu_notifier(void)
 {
-	apic_write(APIC_SELF_IPI, vector);
+	int cpu = smp_processor_id();
+
+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+
+	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	register_hotcpu_notifier(&x2apic_cpu_notifier);
+	return 1;
 }
 
-static void init_x2apic_ldr(void)
+static int x2apic_cluster_probe(void)
 {
-	int cpu = smp_processor_id();
-
-	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+	if (x2apic_mode)
+		return x2apic_init_cpu_notifier();
+	else
+		return 0;
 }
 
-struct apic apic_x2apic_cluster = {
+static struct apic apic_x2apic_cluster = {
 
 	.name				= "cluster x2apic",
-	.probe				= NULL,
+	.probe				= x2apic_cluster_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
@@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = {
 	.setup_portio_remap		= NULL,
 	.check_phys_apicid_present	= default_check_phys_apicid_present,
 	.enable_apic_mode		= NULL,
-	.phys_pkg_id			= x2apic_cluster_phys_pkg_id,
+	.phys_pkg_id			= x2apic_phys_pkg_id,
 	.mps_oem_check			= NULL,
 
-	.get_apic_id			= x2apic_cluster_phys_get_apic_id,
-	.set_apic_id			= set_apic_id,
+	.get_apic_id			= x2apic_get_apic_id,
+	.set_apic_id			= x2apic_set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
 	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,
@@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = {
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,
 };
+
+apic_driver(apic_x2apic_cluster);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c7e6d6645bf..f5373dfde21 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/apic.h>
-#include <asm/ipi.h>
+#include <asm/x2apic.h>
 
 int x2apic_phys;
 
+static struct apic apic_x2apic_phys;
+
 static int set_x2apic_phys_mode(char *arg)
 {
 	x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 		return 0;
 }
 
-/*
- * need to use more than cpu 0, because we need more vectors when
- * MSI-X are used.
- */
-static const struct cpumask *x2apic_target_cpus(void)
-{
-	return cpu_online_mask;
-}
-
-static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
-{
-	cpumask_clear(retmask);
-	cpumask_set_cpu(cpu, retmask);
-}
-
-static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
-				   unsigned int dest)
-{
-	unsigned long cfg;
-
-	cfg = __prepare_ICR(0, vector, dest);
-
-	/*
-	 * send the IPI.
-	 */
-	native_x2apic_icr_write(cfg, apicid);
-}
-
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static void
+__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
 {
 	unsigned long query_cpu;
+	unsigned long this_cpu;
 	unsigned long flags;
 
 	x2apic_wrmsr_fence();
 
 	local_irq_save(flags);
+
+	this_cpu = smp_processor_id();
 	for_each_cpu(query_cpu, mask) {
+		if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
+			continue;
 		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
 				       vector, APIC_DEST_PHYSICAL);
 	}
 	local_irq_restore(flags);
 }
 
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
+}
+
 static void
  x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu != this_cpu)
-			__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_apicid, query_cpu),
-				vector, APIC_DEST_PHYSICAL);
-	}
-	local_irq_restore(flags);
+	__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
 }
 
 static void x2apic_send_IPI_allbutself(int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
-				       vector, APIC_DEST_PHYSICAL);
-	}
-	local_irq_restore(flags);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
 }
 
 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
-}
-
-static int x2apic_apic_id_registered(void)
-{
-	return 1;
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
 }
 
 static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
 	return per_cpu(x86_cpu_to_apicid, cpu);
 }
 
-static unsigned int x2apic_phys_get_apic_id(unsigned long x)
-{
-	return x;
-}
-
-static unsigned long set_apic_id(unsigned int id)
-{
-	return id;
-}
-
-static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
+static void init_x2apic_ldr(void)
 {
-	return initial_apicid >> index_msb;
 }
 
-static void x2apic_send_IPI_self(int vector)
+static int x2apic_phys_probe(void)
 {
-	apic_write(APIC_SELF_IPI, vector);
-}
+	if (x2apic_mode && x2apic_phys)
+		return 1;
 
-static void init_x2apic_ldr(void)
-{
+	return apic == &apic_x2apic_phys;
 }
 
-struct apic apic_x2apic_phys = {
+static struct apic apic_x2apic_phys = {
 
 	.name				= "physical x2apic",
-	.probe				= NULL,
+	.probe				= x2apic_phys_probe,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
 	.apic_id_registered		= x2apic_apic_id_registered,
 
@@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = {
 	.phys_pkg_id			= x2apic_phys_pkg_id,
 	.mps_oem_check			= NULL,
 
-	.get_apic_id			= x2apic_phys_get_apic_id,
-	.set_apic_id			= set_apic_id,
+	.get_apic_id			= x2apic_get_apic_id,
+	.set_apic_id			= x2apic_set_apic_id,
 	.apic_id_mask			= 0xFFFFFFFFu,
 
 	.cpu_mask_to_apicid		= x2apic_cpu_mask_to_apicid,
@@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = {
 	.wait_icr_idle			= native_x2apic_wait_icr_idle,
 	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,
 };
+
+apic_driver(apic_x2apic_phys);
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 33b10a0fc09..adc66c3a1fe 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -37,6 +37,13 @@
 #include <asm/smp.h>
 #include <asm/x86_init.h>
 #include <asm/emergency-restart.h>
+#include <asm/nmi.h>
+
+/* BMC sets a bit in this MMR (making it non-zero) before sending an NMI */
+#define UVH_NMI_MMR				UVH_SCRATCH5
+#define UVH_NMI_MMR_CLEAR			(UVH_NMI_MMR + 8)
+#define UV_NMI_PENDING_MASK			(1UL << 63)
+DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
 
 DEFINE_PER_CPU(int, x2apic_extra_bits);
 
@@ -51,6 +58,8 @@ unsigned int uv_apicid_hibits;
 EXPORT_SYMBOL_GPL(uv_apicid_hibits);
 static DEFINE_SPINLOCK(uv_nmi_lock);
 
+static struct apic apic_x2apic_uv_x;
+
 static unsigned long __init uv_early_read_mmr(unsigned long addr)
 {
 	unsigned long val, *mmr;
@@ -82,6 +91,10 @@ static int __init early_get_pnodeid(void)
 	m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
 	uv_min_hub_revision_id = node_id.s.revision;
 
+	if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
+		uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
+
+	uv_hub_info->hub_revision = uv_min_hub_revision_id;
 	pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
 	return pnode;
 }
@@ -103,17 +116,25 @@ static void __init early_get_apic_pnode_shift(void)
  */
 static void __init uv_set_apicid_hibit(void)
 {
-	union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
+	union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
 
-	apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
-	uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
+	if (is_uv1_hub()) {
+		apicid_mask.v =
+			uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+		uv_apicid_hibits =
+			apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
+	}
 }
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int pnodeid;
+	int pnodeid, is_uv1, is_uv2;
 
-	if (!strcmp(oem_id, "SGI")) {
+	is_uv1 = !strcmp(oem_id, "SGI");
+	is_uv2 = !strcmp(oem_id, "SGI2");
+	if (is_uv1 || is_uv2) {
+		uv_hub_info->hub_revision =
+			is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
 		pnodeid = early_get_pnodeid();
 		early_get_apic_pnode_shift();
 		x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
@@ -319,10 +340,15 @@ static void uv_send_IPI_self(int vector)
 	apic_write(APIC_SELF_IPI, vector);
 }
 
-struct apic __refdata apic_x2apic_uv_x = {
+static int uv_probe(void)
+{
+	return apic == &apic_x2apic_uv_x;
+}
+
+static struct apic __refdata apic_x2apic_uv_x = {
 
 	.name				= "UV large system",
-	.probe				= NULL,
+	.probe				= uv_probe,
 	.acpi_madt_oem_check		= uv_acpi_madt_oem_check,
 	.apic_id_registered		= uv_apic_id_registered,
 
@@ -470,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
 static __init void map_mmioh_high(int max_pnode)
 {
 	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
-	int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+	int shift;
 
 	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
-	if (mmioh.s.enable)
-		map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
+	if (is_uv1_hub() && mmioh.s1.enable) {
+		shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
+			max_pnode, map_uc);
+	}
+	if (is_uv2_hub() && mmioh.s2.enable) {
+		shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
+		map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
 			max_pnode, map_uc);
+	}
 }
 
 static __init void map_low_mmrs(void)
@@ -599,14 +632,14 @@ late_initcall(uv_init_heartbeat);
 
 /* Direct Legacy VGA I/O traffic to designated IOH */
 int uv_set_vga_state(struct pci_dev *pdev, bool decode,
-		      unsigned int command_bits, bool change_bridge)
+		      unsigned int command_bits, u32 flags)
 {
 	int domain, bus, rc;
 
-	PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
-			pdev->devfn, decode, command_bits, change_bridge);
+	PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
+			pdev->devfn, decode, command_bits, flags);
 
-	if (!change_bridge)
+	if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
 		return 0;
 
 	if ((command_bits & PCI_COMMAND_IO) == 0)
@@ -642,18 +675,46 @@ void __cpuinit uv_cpu_init(void)
  */
 int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 {
+	unsigned long real_uv_nmi;
+	int bid;
+
 	if (reason != DIE_NMIUNKNOWN)
 		return NOTIFY_OK;
 
 	if (in_crash_kexec)
 		/* do nothing if entering the crash kernel */
 		return NOTIFY_OK;
+
 	/*
-	 * Use a lock so only one cpu prints at a time
-	 * to prevent intermixed output.
+	 * Each blade has an MMR that indicates when an NMI has been sent
+	 * to cpus on the blade. If an NMI is detected, atomically
+	 * clear the MMR and update a per-blade NMI count used to
+	 * cause each cpu on the blade to notice a new NMI.
+	 */
+	bid = uv_numa_blade_id();
+	real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+
+	if (unlikely(real_uv_nmi)) {
+		spin_lock(&uv_blade_info[bid].nmi_lock);
+		real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
+		if (real_uv_nmi) {
+			uv_blade_info[bid].nmi_count++;
+			uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
+		}
+		spin_unlock(&uv_blade_info[bid].nmi_lock);
+	}
+
+	if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
+		return NOTIFY_DONE;
+
+	__get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
+
+	/*
+	 * Use a lock so only one cpu prints at a time.
+	 * This prevents intermixed output.
 	 */
 	spin_lock(&uv_nmi_lock);
-	pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
+	pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
 	dump_stack();
 	spin_unlock(&uv_nmi_lock);
 
@@ -661,7 +722,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
 }
 
 static struct notifier_block uv_dump_stack_nmi_nb = {
-	.notifier_call	= uv_handle_nmi
+	.notifier_call	= uv_handle_nmi,
+	.priority = NMI_LOCAL_LOW_PRIOR - 1,
 };
 
 void uv_register_nmi_notifier(void)
@@ -693,13 +755,14 @@ void __init uv_system_init(void)
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask, pnode_io_mask;
 
+	printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
 	map_low_mmrs();
 
 	m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
 	m_val = m_n_config.s.m_skt;
 	n_val = m_n_config.s.n_skt;
 	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
-	n_io = mmioh.s.n_io;
+	n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
@@ -720,8 +783,9 @@ void __init uv_system_init(void)
 	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
-	uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+	uv_blade_info = kzalloc(bytes, GFP_KERNEL);
 	BUG_ON(!uv_blade_info);
+
 	for (blade = 0; blade < uv_num_possible_blades(); blade++)
 		uv_blade_info[blade].memory_nid = -1;
 
@@ -747,6 +811,7 @@ void __init uv_system_init(void)
 			uv_blade_info[blade].pnode = pnode;
 			uv_blade_info[blade].nr_possible_cpus = 0;
 			uv_blade_info[blade].nr_online_cpus = 0;
+			spin_lock_init(&uv_blade_info[blade].nmi_lock);
 			max_pnode = max(pnode, max_pnode);
 			blade++;
 		}
@@ -766,6 +831,8 @@ void __init uv_system_init(void)
 		 */
 		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
+		uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
+
 		pnode = uv_apicid_to_pnode(apicid);
 		blade = boot_pnode_to_blade(pnode);
 		lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -821,3 +888,5 @@ void __init uv_system_init(void)
 	if (is_kdump_kernel())
 		reboot_type = BOOT_ACPI;
 }
+
+apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 0b4be431c62..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,11 +228,12 @@
 #include <linux/kthread.h>
 #include <linux/jiffies.h>
 #include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <linux/i8253.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
-#include <asm/i8253.h>
 #include <asm/olpc.h>
 #include <asm/paravirt.h>
 #include <asm/reboot.h>
@@ -360,6 +361,7 @@ struct apm_user {
  * idle percentage above which bios idle calls are done
  */
 #ifdef CONFIG_APM_CPU_IDLE
+#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
 #define DEFAULT_IDLE_THRESHOLD	95
 #else
 #define DEFAULT_IDLE_THRESHOLD	100
@@ -903,6 +905,7 @@ static void apm_cpu_idle(void)
 	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
 	unsigned int bucket;
 
+	WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
 recalc:
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
@@ -1217,11 +1220,11 @@ static void reinit_timer(void)
 
 	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
-	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
+	outb_p(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
-	outb_pit(LATCH & 0xff, PIT_CH0);	/* LSB */
+	outb_p(LATCH & 0xff, PIT_CH0);	/* LSB */
 	udelay(10);
-	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
+	outb_p(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
 	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
@@ -1237,7 +1240,7 @@ static int suspend(int vetoable)
 	dpm_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
-	sysdev_suspend(PMSG_SUSPEND);
+	syscore_suspend();
 
 	local_irq_enable();
 
@@ -1255,7 +1258,7 @@ static int suspend(int vetoable)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
 
-	sysdev_resume();
+	syscore_resume();
 	local_irq_enable();
 
 	dpm_resume_noirq(PMSG_RESUME);
@@ -1279,7 +1282,7 @@ static void standby(void)
 	dpm_suspend_noirq(PMSG_SUSPEND);
 
 	local_irq_disable();
-	sysdev_suspend(PMSG_SUSPEND);
+	syscore_suspend();
 	local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
@@ -1287,7 +1290,7 @@ static void standby(void)
 		apm_error("standby", err);
 
 	local_irq_disable();
-	sysdev_resume();
+	syscore_resume();
 	local_irq_enable();
 
 	dpm_resume_noirq(PMSG_RESUME);
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6f..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a0..6042981d030 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
-obj-$(CONFIG_CPU_FREQ)			+= cpufreq/
 
 obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 3532d3bf810..b13ed393dfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -612,8 +612,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	}
 #endif
 
-	/* As a rule processors have APIC timer running in deep C states */
-	if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400))
+	/*
+	 * Family 0x12 and above processors have APIC timer
+	 * running in deep C states.
+	 */
+	if (c->x86 > 0x11)
 		set_cpu_cap(c, X86_FEATURE_ARAT);
 
 	/*
@@ -629,10 +632,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 		 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
 		 */
 		u64 mask;
+		int err;
 
-		rdmsrl(MSR_AMD64_MCx_MASK(4), mask);
-		mask |= (1 << 10);
-		wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+		err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
+		if (err == 0) {
+			mask |= (1 << 10);
+			checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
+		}
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb301..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
 
 static int __init no_halt(char *s)
 {
+	WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
 	boot_cpu_data.hlt_works_ok = 0;
 	return 1;
 }
@@ -61,6 +62,8 @@ static void __init check_fpu(void)
 		return;
 	}
 
+	kernel_fpu_begin();
+
 	/*
 	 * trap_init() enabled FXSR and company _before_ testing for FP
 	 * problems here.
@@ -79,6 +82,8 @@ static void __init check_fpu(void)
 		: "=m" (*&fdiv_bug)
 		: "m" (*&x), "m" (*&y));
 
+	kernel_fpu_end();
+
 	boot_cpu_data.fdiv_bug = fdiv_bug;
 	if (boot_cpu_data.fdiv_bug)
 		printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a4..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
 #include <linux/topology.h>
 #include <linux/cpumask.h>
 #include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 }
 #endif
 
+static int disable_smep __cpuinitdata;
+static __init int setup_disable_smep(char *arg)
+{
+	disable_smep = 1;
+	return 1;
+}
+__setup("nosmep", setup_disable_smep);
+
+static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_SMEP)) {
+		if (unlikely(disable_smep)) {
+			setup_clear_cpu_cap(X86_FEATURE_SMEP);
+			clear_in_cr4(X86_CR4_SMEP);
+		} else
+			set_in_cr4(X86_CR4_SMEP);
+	}
+}
+
 /*
  * Some CPU features depend on higher CPUID levels, which may not always
  * be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 	if (smp_num_siblings <= 1)
 		goto out;
 
-	if (smp_num_siblings > nr_cpu_ids) {
-		pr_warning("CPU: Unsupported number of siblings %d",
-			   smp_num_siblings);
-		smp_num_siblings = 1;
-		return;
-	}
-
 	index_msb = get_count_order(smp_num_siblings);
 	c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
 
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 
 		cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
 
-		if (eax > 0)
-			c->x86_capability[9] = ebx;
+		c->x86_capability[9] = ebx;
 	}
 
 	/* AMD-defined flags: level 0x80000001 */
@@ -668,6 +679,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	c->cpu_index = 0;
 #endif
 	filter_cpuid_features(c, false);
+
+	setup_smep(c);
 }
 
 void __init early_cpu_init(void)
@@ -753,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 #endif
 	}
 
+	setup_smep(c);
+
 	get_model_name(c); /* Default name */
 
 	detect_nopl(c);
@@ -887,7 +902,7 @@ static void vgetcpu_set_mode(void)
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
-	init_c1e_mask();
+	init_amd_e400_c1e_mask();
 #ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad2..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
-#
-# CPU Frequency scaling
-#
-
-menu "CPU Frequency scaling"
-
-source "drivers/cpufreq/Kconfig"
-
-if CPU_FREQ
-
-comment "CPUFreq processor drivers"
-
-config X86_PCC_CPUFREQ
-	tristate "Processor Clocking Control interface driver"
-	depends on ACPI && ACPI_PROCESSOR
-	help
-	  This driver adds support for the PCC interface.
-
-	  For details, take a look at:
-	  <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called pcc-cpufreq.
-
-	  If in doubt, say N.
-
-config X86_ACPI_CPUFREQ
-	tristate "ACPI Processor P-States driver"
-	select CPU_FREQ_TABLE
-	depends on ACPI_PROCESSOR
-	help
-	  This driver adds a CPUFreq driver which utilizes the ACPI
-	  Processor Performance States.
-	  This driver also supports Intel Enhanced Speedstep.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called acpi-cpufreq.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config ELAN_CPUFREQ
-	tristate "AMD Elan SC400 and SC410"
-	select CPU_FREQ_TABLE
-	depends on X86_ELAN
-	---help---
-	  This adds the CPUFreq driver for AMD Elan SC400 and SC410
-	  processors.
-
-	  You need to specify the processor maximum speed as boot
-	  parameter: elanfreq=maxspeed (in kHz) or as module
-	  parameter "max_freq".
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config SC520_CPUFREQ
-	tristate "AMD Elan SC520"
-	select CPU_FREQ_TABLE
-	depends on X86_ELAN
-	---help---
-	  This adds the CPUFreq driver for AMD Elan SC520 processor.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-
-config X86_POWERNOW_K6
-	tristate "AMD Mobile K6-2/K6-3 PowerNow!"
-	select CPU_FREQ_TABLE
-	depends on X86_32
-	help
-	  This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
-	  AMD K6-3+ processors.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_POWERNOW_K7
-	tristate "AMD Mobile Athlon/Duron PowerNow!"
-	select CPU_FREQ_TABLE
-	depends on X86_32
-	help
-	  This adds the CPUFreq driver for mobile AMD K7 mobile processors.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_POWERNOW_K7_ACPI
-	bool
-	depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
-	depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
-	depends on X86_32
-	default y
-
-config X86_POWERNOW_K8
-	tristate "AMD Opteron/Athlon64 PowerNow!"
-	select CPU_FREQ_TABLE
-	depends on ACPI && ACPI_PROCESSOR
-	help
-	  This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called powernow-k8.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-config X86_GX_SUSPMOD
-	tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
-	depends on X86_32 && PCI
-	help
-	 This add the CPUFreq driver for NatSemi Geode processors which
-	 support suspend modulation.
-
-	 For details, take a look at <file:Documentation/cpu-freq/>.
-
-	 If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO
-	tristate "Intel Enhanced SpeedStep (deprecated)"
-	select CPU_FREQ_TABLE
-	select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
-	depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
-	help
-	  This is deprecated and this functionality is now merged into
-	  acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
-	  speedstep_centrino.
-	  This adds the CPUFreq driver for Enhanced SpeedStep enabled
-	  mobile CPUs.  This means Intel Pentium M (Centrino) CPUs
-	  or 64bit enabled Intel Xeons.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called speedstep-centrino.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_SPEEDSTEP_CENTRINO_TABLE
-	bool "Built-in tables for Banias CPUs"
-	depends on X86_32 && X86_SPEEDSTEP_CENTRINO
-	default y
-	help
-	  Use built-in tables for Banias CPUs if ACPI encoding
-	  is not available.
-
-	  If in doubt, say N.
-
-config X86_SPEEDSTEP_ICH
-	tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
-	select CPU_FREQ_TABLE
-	depends on X86_32
-	help
-	  This adds the CPUFreq driver for certain mobile Intel Pentium III
-	  (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
-	  mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
-	  ICH3 or ICH4 southbridge.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_SPEEDSTEP_SMI
-	tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
-	select CPU_FREQ_TABLE
-	depends on X86_32 && EXPERIMENTAL
-	help
-	  This adds the CPUFreq driver for certain mobile Intel Pentium III
-	  (Coppermine), all mobile Intel Pentium III-M (Tualatin)
-	  on systems which have an Intel 440BX/ZX/MX southbridge.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_P4_CLOCKMOD
-	tristate "Intel Pentium 4 clock modulation"
-	select CPU_FREQ_TABLE
-	help
-	  This adds the CPUFreq driver for Intel Pentium 4 / XEON
-	  processors.  When enabled it will lower CPU temperature by skipping
-	  clocks.
-
-	  This driver should be only used in exceptional
-	  circumstances when very low power is needed because it causes severe
-	  slowdowns and noticeable latencies.  Normally Speedstep should be used
-	  instead.
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called p4-clockmod.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  Unless you are absolutely sure say N.
-
-config X86_CPUFREQ_NFORCE2
-	tristate "nVidia nForce2 FSB changing"
-	depends on X86_32 && EXPERIMENTAL
-	help
-	  This adds the CPUFreq driver for FSB changing on nVidia nForce2
-	  platforms.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_LONGRUN
-	tristate "Transmeta LongRun"
-	depends on X86_32
-	help
-	  This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
-	  which support LongRun.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_LONGHAUL
-	tristate "VIA Cyrix III Longhaul"
-	select CPU_FREQ_TABLE
-	depends on X86_32 && ACPI_PROCESSOR
-	help
-	  This adds the CPUFreq driver for VIA Samuel/CyrixIII,
-	  VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
-	  processors.
-
-	  For details, take a look at <file:Documentation/cpu-freq/>.
-
-	  If in doubt, say N.
-
-config X86_E_POWERSAVER
-	tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
-	select CPU_FREQ_TABLE
-	depends on X86_32 && EXPERIMENTAL
-	help
-	  This adds the CPUFreq driver for VIA C7 processors.  However, this driver
-	  does not have any safeguards to prevent operating the CPU out of spec
-	  and is thus considered dangerous.  Please use the regular ACPI cpufreq
-	  driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
-
-	  If in doubt, say N.
-
-comment "shared options"
-
-config X86_SPEEDSTEP_LIB
-	tristate
-	default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
-
-config X86_SPEEDSTEP_RELAXED_CAP_CHECK
-	bool "Relaxed speedstep capability checks"
-	depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
-	help
-	  Don't perform all checks for a speedstep capable system which would
-	  normally be done. Some ancient or strange systems, though speedstep
-	  capable, don't always indicate that they are speedstep capable. This
-	  option lets the probing code bypass some of those checks if the
-	  parameter "relaxed_check=1" is passed to the module.
-
-endif	# CPU_FREQ
-
-endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6f..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
-# K8 systems. ACPI is preferred to all other hardware-specific drivers.
-# speedstep-* is preferred over p4-clockmod.
-
-obj-$(CONFIG_X86_POWERNOW_K8)		+= powernow-k8.o mperf.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ)		+= acpi-cpufreq.o mperf.o
-obj-$(CONFIG_X86_PCC_CPUFREQ)		+= pcc-cpufreq.o
-obj-$(CONFIG_X86_POWERNOW_K6)		+= powernow-k6.o
-obj-$(CONFIG_X86_POWERNOW_K7)		+= powernow-k7.o
-obj-$(CONFIG_X86_LONGHAUL)		+= longhaul.o
-obj-$(CONFIG_X86_E_POWERSAVER)		+= e_powersaver.o
-obj-$(CONFIG_ELAN_CPUFREQ)		+= elanfreq.o
-obj-$(CONFIG_SC520_CPUFREQ)		+= sc520_freq.o
-obj-$(CONFIG_X86_LONGRUN)		+= longrun.o  
-obj-$(CONFIG_X86_GX_SUSPMOD)		+= gx-suspmod.o
-obj-$(CONFIG_X86_SPEEDSTEP_ICH)		+= speedstep-ich.o
-obj-$(CONFIG_X86_SPEEDSTEP_LIB)		+= speedstep-lib.o
-obj-$(CONFIG_X86_SPEEDSTEP_SMI)		+= speedstep-smi.o
-obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO)	+= speedstep-centrino.o
-obj-$(CONFIG_X86_P4_CLOCKMOD)		+= p4-clockmod.o
-obj-$(CONFIG_X86_CPUFREQ_NFORCE2)	+= cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index a2baafb2fe6..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * acpi-cpufreq.c - ACPI Processor P-States Driver
- *
- *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
- *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
- *  Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
- *  Copyright (C) 2006       Denis Sadykov <denis.m.sadykov@intel.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or (at
- *  your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- *  General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/dmi.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-#include "mperf.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"acpi-cpufreq", msg)
-
-MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
-MODULE_DESCRIPTION("ACPI Processor P-States Driver");
-MODULE_LICENSE("GPL");
-
-enum {
-	UNDEFINED_CAPABLE = 0,
-	SYSTEM_INTEL_MSR_CAPABLE,
-	SYSTEM_IO_CAPABLE,
-};
-
-#define INTEL_MSR_RANGE		(0xffff)
-
-struct acpi_cpufreq_data {
-	struct acpi_processor_performance *acpi_data;
-	struct cpufreq_frequency_table *freq_table;
-	unsigned int resume;
-	unsigned int cpu_feature;
-};
-
-static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-
-/* acpi_perf_data is a pointer to percpu data. */
-static struct acpi_processor_performance __percpu *acpi_perf_data;
-
-static struct cpufreq_driver acpi_cpufreq_driver;
-
-static unsigned int acpi_pstate_strict;
-
-static int check_est_cpu(unsigned int cpuid)
-{
-	struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
-
-	return cpu_has(cpu, X86_FEATURE_EST);
-}
-
-static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-{
-	struct acpi_processor_performance *perf;
-	int i;
-
-	perf = data->acpi_data;
-
-	for (i = 0; i < perf->state_count; i++) {
-		if (value == perf->states[i].status)
-			return data->freq_table[i].frequency;
-	}
-	return 0;
-}
-
-static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
-{
-	int i;
-	struct acpi_processor_performance *perf;
-
-	msr &= INTEL_MSR_RANGE;
-	perf = data->acpi_data;
-
-	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
-		if (msr == perf->states[data->freq_table[i].index].status)
-			return data->freq_table[i].frequency;
-	}
-	return data->freq_table[0].frequency;
-}
-
-static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
-{
-	switch (data->cpu_feature) {
-	case SYSTEM_INTEL_MSR_CAPABLE:
-		return extract_msr(val, data);
-	case SYSTEM_IO_CAPABLE:
-		return extract_io(val, data);
-	default:
-		return 0;
-	}
-}
-
-struct msr_addr {
-	u32 reg;
-};
-
-struct io_addr {
-	u16 port;
-	u8 bit_width;
-};
-
-struct drv_cmd {
-	unsigned int type;
-	const struct cpumask *mask;
-	union {
-		struct msr_addr msr;
-		struct io_addr io;
-	} addr;
-	u32 val;
-};
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void do_drv_read(void *_cmd)
-{
-	struct drv_cmd *cmd = _cmd;
-	u32 h;
-
-	switch (cmd->type) {
-	case SYSTEM_INTEL_MSR_CAPABLE:
-		rdmsr(cmd->addr.msr.reg, cmd->val, h);
-		break;
-	case SYSTEM_IO_CAPABLE:
-		acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
-				&cmd->val,
-				(u32)cmd->addr.io.bit_width);
-		break;
-	default:
-		break;
-	}
-}
-
-/* Called via smp_call_function_many(), on the target CPUs */
-static void do_drv_write(void *_cmd)
-{
-	struct drv_cmd *cmd = _cmd;
-	u32 lo, hi;
-
-	switch (cmd->type) {
-	case SYSTEM_INTEL_MSR_CAPABLE:
-		rdmsr(cmd->addr.msr.reg, lo, hi);
-		lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
-		wrmsr(cmd->addr.msr.reg, lo, hi);
-		break;
-	case SYSTEM_IO_CAPABLE:
-		acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
-				cmd->val,
-				(u32)cmd->addr.io.bit_width);
-		break;
-	default:
-		break;
-	}
-}
-
-static void drv_read(struct drv_cmd *cmd)
-{
-	int err;
-	cmd->val = 0;
-
-	err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
-	WARN_ON_ONCE(err);	/* smp_call_function_any() was buggy? */
-}
-
-static void drv_write(struct drv_cmd *cmd)
-{
-	int this_cpu;
-
-	this_cpu = get_cpu();
-	if (cpumask_test_cpu(this_cpu, cmd->mask))
-		do_drv_write(cmd);
-	smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
-	put_cpu();
-}
-
-static u32 get_cur_val(const struct cpumask *mask)
-{
-	struct acpi_processor_performance *perf;
-	struct drv_cmd cmd;
-
-	if (unlikely(cpumask_empty(mask)))
-		return 0;
-
-	switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
-	case SYSTEM_INTEL_MSR_CAPABLE:
-		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
-		cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
-		break;
-	case SYSTEM_IO_CAPABLE:
-		cmd.type = SYSTEM_IO_CAPABLE;
-		perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
-		cmd.addr.io.port = perf->control_register.address;
-		cmd.addr.io.bit_width = perf->control_register.bit_width;
-		break;
-	default:
-		return 0;
-	}
-
-	cmd.mask = mask;
-	drv_read(&cmd);
-
-	dprintk("get_cur_val = %u\n", cmd.val);
-
-	return cmd.val;
-}
-
-static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
-{
-	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
-	unsigned int freq;
-	unsigned int cached_freq;
-
-	dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
-
-	if (unlikely(data == NULL ||
-		     data->acpi_data == NULL || data->freq_table == NULL)) {
-		return 0;
-	}
-
-	cached_freq = data->freq_table[data->acpi_data->state].frequency;
-	freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
-	if (freq != cached_freq) {
-		/*
-		 * The dreaded BIOS frequency change behind our back.
-		 * Force set the frequency on next target call.
-		 */
-		data->resume = 1;
-	}
-
-	dprintk("cur freq = %u\n", freq);
-
-	return freq;
-}
-
-static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
-				struct acpi_cpufreq_data *data)
-{
-	unsigned int cur_freq;
-	unsigned int i;
-
-	for (i = 0; i < 100; i++) {
-		cur_freq = extract_freq(get_cur_val(mask), data);
-		if (cur_freq == freq)
-			return 1;
-		udelay(10);
-	}
-	return 0;
-}
-
-static int acpi_cpufreq_target(struct cpufreq_policy *policy,
-			       unsigned int target_freq, unsigned int relation)
-{
-	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-	struct acpi_processor_performance *perf;
-	struct cpufreq_freqs freqs;
-	struct drv_cmd cmd;
-	unsigned int next_state = 0; /* Index into freq_table */
-	unsigned int next_perf_state = 0; /* Index into perf table */
-	unsigned int i;
-	int result = 0;
-
-	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
-
-	if (unlikely(data == NULL ||
-	     data->acpi_data == NULL || data->freq_table == NULL)) {
-		return -ENODEV;
-	}
-
-	perf = data->acpi_data;
-	result = cpufreq_frequency_table_target(policy,
-						data->freq_table,
-						target_freq,
-						relation, &next_state);
-	if (unlikely(result)) {
-		result = -ENODEV;
-		goto out;
-	}
-
-	next_perf_state = data->freq_table[next_state].index;
-	if (perf->state == next_perf_state) {
-		if (unlikely(data->resume)) {
-			dprintk("Called after resume, resetting to P%d\n",
-				next_perf_state);
-			data->resume = 0;
-		} else {
-			dprintk("Already at target state (P%d)\n",
-				next_perf_state);
-			goto out;
-		}
-	}
-
-	switch (data->cpu_feature) {
-	case SYSTEM_INTEL_MSR_CAPABLE:
-		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
-		cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
-		cmd.val = (u32) perf->states[next_perf_state].control;
-		break;
-	case SYSTEM_IO_CAPABLE:
-		cmd.type = SYSTEM_IO_CAPABLE;
-		cmd.addr.io.port = perf->control_register.address;
-		cmd.addr.io.bit_width = perf->control_register.bit_width;
-		cmd.val = (u32) perf->states[next_perf_state].control;
-		break;
-	default:
-		result = -ENODEV;
-		goto out;
-	}
-
-	/* cpufreq holds the hotplug lock, so we are safe from here on */
-	if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
-		cmd.mask = policy->cpus;
-	else
-		cmd.mask = cpumask_of(policy->cpu);
-
-	freqs.old = perf->states[perf->state].core_frequency * 1000;
-	freqs.new = data->freq_table[next_state].frequency;
-	for_each_cpu(i, policy->cpus) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	}
-
-	drv_write(&cmd);
-
-	if (acpi_pstate_strict) {
-		if (!check_freqs(cmd.mask, freqs.new, data)) {
-			dprintk("acpi_cpufreq_target failed (%d)\n",
-				policy->cpu);
-			result = -EAGAIN;
-			goto out;
-		}
-	}
-
-	for_each_cpu(i, policy->cpus) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	}
-	perf->state = next_perf_state;
-
-out:
-	return result;
-}
-
-static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
-{
-	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
-	dprintk("acpi_cpufreq_verify\n");
-
-	return cpufreq_frequency_table_verify(policy, data->freq_table);
-}
-
-static unsigned long
-acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
-{
-	struct acpi_processor_performance *perf = data->acpi_data;
-
-	if (cpu_khz) {
-		/* search the closest match to cpu_khz */
-		unsigned int i;
-		unsigned long freq;
-		unsigned long freqn = perf->states[0].core_frequency * 1000;
-
-		for (i = 0; i < (perf->state_count-1); i++) {
-			freq = freqn;
-			freqn = perf->states[i+1].core_frequency * 1000;
-			if ((2 * cpu_khz) > (freqn + freq)) {
-				perf->state = i;
-				return freq;
-			}
-		}
-		perf->state = perf->state_count-1;
-		return freqn;
-	} else {
-		/* assume CPU is at P0... */
-		perf->state = 0;
-		return perf->states[0].core_frequency * 1000;
-	}
-}
-
-static void free_acpi_perf_data(void)
-{
-	unsigned int i;
-
-	/* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
-	for_each_possible_cpu(i)
-		free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
-				 ->shared_cpu_map);
-	free_percpu(acpi_perf_data);
-}
-
-/*
- * acpi_cpufreq_early_init - initialize ACPI P-States library
- *
- * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
- * in order to determine correct frequency and voltage pairings. We can
- * do _PDC and _PSD and find out the processor dependency for the
- * actual init that will happen later...
- */
-static int __init acpi_cpufreq_early_init(void)
-{
-	unsigned int i;
-	dprintk("acpi_cpufreq_early_init\n");
-
-	acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
-	if (!acpi_perf_data) {
-		dprintk("Memory allocation error for acpi_perf_data.\n");
-		return -ENOMEM;
-	}
-	for_each_possible_cpu(i) {
-		if (!zalloc_cpumask_var_node(
-			&per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
-			GFP_KERNEL, cpu_to_node(i))) {
-
-			/* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
-			free_acpi_perf_data();
-			return -ENOMEM;
-		}
-	}
-
-	/* Do initialization in ACPI core */
-	acpi_processor_preregister_performance(acpi_perf_data);
-	return 0;
-}
-
-#ifdef CONFIG_SMP
-/*
- * Some BIOSes do SW_ANY coordination internally, either set it up in hw
- * or do it in BIOS firmware and won't inform about it to OS. If not
- * detected, this has a side effect of making CPU run at a different speed
- * than OS intended it to run at. Detect it and handle it cleanly.
- */
-static int bios_with_sw_any_bug;
-
-static int sw_any_bug_found(const struct dmi_system_id *d)
-{
-	bios_with_sw_any_bug = 1;
-	return 0;
-}
-
-static const struct dmi_system_id sw_any_bug_dmi_table[] = {
-	{
-		.callback = sw_any_bug_found,
-		.ident = "Supermicro Server X6DLP",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
-			DMI_MATCH(DMI_BIOS_VERSION, "080010"),
-			DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
-		},
-	},
-	{ }
-};
-
-static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
-{
-	/* Intel Xeon Processor 7100 Series Specification Update
-	 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
-	 * AL30: A Machine Check Exception (MCE) Occurring during an
-	 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
-	 * Both Processor Cores to Lock Up. */
-	if (c->x86_vendor == X86_VENDOR_INTEL) {
-		if ((c->x86 == 15) &&
-		    (c->x86_model == 6) &&
-		    (c->x86_mask == 8)) {
-			printk(KERN_INFO "acpi-cpufreq: Intel(R) "
-			    "Xeon(R) 7100 Errata AL30, processors may "
-			    "lock up on frequency changes: disabling "
-			    "acpi-cpufreq.\n");
-			return -ENODEV;
-		    }
-		}
-	return 0;
-}
-#endif
-
-static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
-	unsigned int i;
-	unsigned int valid_states = 0;
-	unsigned int cpu = policy->cpu;
-	struct acpi_cpufreq_data *data;
-	unsigned int result = 0;
-	struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
-	struct acpi_processor_performance *perf;
-#ifdef CONFIG_SMP
-	static int blacklisted;
-#endif
-
-	dprintk("acpi_cpufreq_cpu_init\n");
-
-#ifdef CONFIG_SMP
-	if (blacklisted)
-		return blacklisted;
-	blacklisted = acpi_cpufreq_blacklist(c);
-	if (blacklisted)
-		return blacklisted;
-#endif
-
-	data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
-	per_cpu(acfreq_data, cpu) = data;
-
-	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
-		acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
-
-	result = acpi_processor_register_performance(data->acpi_data, cpu);
-	if (result)
-		goto err_free;
-
-	perf = data->acpi_data;
-	policy->shared_type = perf->shared_type;
-
-	/*
-	 * Will let policy->cpus know about dependency only when software
-	 * coordination is required.
-	 */
-	if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
-	    policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
-		cpumask_copy(policy->cpus, perf->shared_cpu_map);
-	}
-	cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
-
-#ifdef CONFIG_SMP
-	dmi_check_system(sw_any_bug_dmi_table);
-	if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
-		policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
-		cpumask_copy(policy->cpus, cpu_core_mask(cpu));
-	}
-#endif
-
-	/* capability check */
-	if (perf->state_count <= 1) {
-		dprintk("No P-States\n");
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	if (perf->control_register.space_id != perf->status_register.space_id) {
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	switch (perf->control_register.space_id) {
-	case ACPI_ADR_SPACE_SYSTEM_IO:
-		dprintk("SYSTEM IO addr space\n");
-		data->cpu_feature = SYSTEM_IO_CAPABLE;
-		break;
-	case ACPI_ADR_SPACE_FIXED_HARDWARE:
-		dprintk("HARDWARE addr space\n");
-		if (!check_est_cpu(cpu)) {
-			result = -ENODEV;
-			goto err_unreg;
-		}
-		data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
-		break;
-	default:
-		dprintk("Unknown addr space %d\n",
-			(u32) (perf->control_register.space_id));
-		result = -ENODEV;
-		goto err_unreg;
-	}
-
-	data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
-		    (perf->state_count+1), GFP_KERNEL);
-	if (!data->freq_table) {
-		result = -ENOMEM;
-		goto err_unreg;
-	}
-
-	/* detect transition latency */
-	policy->cpuinfo.transition_latency = 0;
-	for (i = 0; i < perf->state_count; i++) {
-		if ((perf->states[i].transition_latency * 1000) >
-		    policy->cpuinfo.transition_latency)
-			policy->cpuinfo.transition_latency =
-			    perf->states[i].transition_latency * 1000;
-	}
-
-	/* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
-	if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
-	    policy->cpuinfo.transition_latency > 20 * 1000) {
-		policy->cpuinfo.transition_latency = 20 * 1000;
-		printk_once(KERN_INFO
-			    "P-state transition latency capped at 20 uS\n");
-	}
-
-	/* table init */
-	for (i = 0; i < perf->state_count; i++) {
-		if (i > 0 && perf->states[i].core_frequency >=
-		    data->freq_table[valid_states-1].frequency / 1000)
-			continue;
-
-		data->freq_table[valid_states].index = i;
-		data->freq_table[valid_states].frequency =
-		    perf->states[i].core_frequency * 1000;
-		valid_states++;
-	}
-	data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
-	perf->state = 0;
-
-	result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
-	if (result)
-		goto err_freqfree;
-
-	if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
-		printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
-
-	switch (perf->control_register.space_id) {
-	case ACPI_ADR_SPACE_SYSTEM_IO:
-		/* Current speed is unknown and not detectable by IO port */
-		policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
-		break;
-	case ACPI_ADR_SPACE_FIXED_HARDWARE:
-		acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
-		policy->cur = get_cur_freq_on_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-
-	/* notify BIOS that we exist */
-	acpi_processor_notify_smm(THIS_MODULE);
-
-	/* Check for APERF/MPERF support in hardware */
-	if (cpu_has(c, X86_FEATURE_APERFMPERF))
-		acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
-
-	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
-	for (i = 0; i < perf->state_count; i++)
-		dprintk("     %cP%d: %d MHz, %d mW, %d uS\n",
-			(i == perf->state ? '*' : ' '), i,
-			(u32) perf->states[i].core_frequency,
-			(u32) perf->states[i].power,
-			(u32) perf->states[i].transition_latency);
-
-	cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
-
-	/*
-	 * the first call to ->target() should result in us actually
-	 * writing something to the appropriate registers.
-	 */
-	data->resume = 1;
-
-	return result;
-
-err_freqfree:
-	kfree(data->freq_table);
-err_unreg:
-	acpi_processor_unregister_performance(perf, cpu);
-err_free:
-	kfree(data);
-	per_cpu(acfreq_data, cpu) = NULL;
-
-	return result;
-}
-
-static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
-	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
-	dprintk("acpi_cpufreq_cpu_exit\n");
-
-	if (data) {
-		cpufreq_frequency_table_put_attr(policy->cpu);
-		per_cpu(acfreq_data, policy->cpu) = NULL;
-		acpi_processor_unregister_performance(data->acpi_data,
-						      policy->cpu);
-		kfree(data->freq_table);
-		kfree(data);
-	}
-
-	return 0;
-}
-
-static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
-{
-	struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
-
-	dprintk("acpi_cpufreq_resume\n");
-
-	data->resume = 1;
-
-	return 0;
-}
-
-static struct freq_attr *acpi_cpufreq_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver acpi_cpufreq_driver = {
-	.verify		= acpi_cpufreq_verify,
-	.target		= acpi_cpufreq_target,
-	.bios_limit	= acpi_processor_get_bios_limit,
-	.init		= acpi_cpufreq_cpu_init,
-	.exit		= acpi_cpufreq_cpu_exit,
-	.resume		= acpi_cpufreq_resume,
-	.name		= "acpi-cpufreq",
-	.owner		= THIS_MODULE,
-	.attr		= acpi_cpufreq_attr,
-};
-
-static int __init acpi_cpufreq_init(void)
-{
-	int ret;
-
-	if (acpi_disabled)
-		return 0;
-
-	dprintk("acpi_cpufreq_init\n");
-
-	ret = acpi_cpufreq_early_init();
-	if (ret)
-		return ret;
-
-	ret = cpufreq_register_driver(&acpi_cpufreq_driver);
-	if (ret)
-		free_acpi_perf_data();
-
-	return ret;
-}
-
-static void __exit acpi_cpufreq_exit(void)
-{
-	dprintk("acpi_cpufreq_exit\n");
-
-	cpufreq_unregister_driver(&acpi_cpufreq_driver);
-
-	free_percpu(acpi_perf_data);
-}
-
-module_param(acpi_pstate_strict, uint, 0644);
-MODULE_PARM_DESC(acpi_pstate_strict,
-	"value 0 or non-zero. non-zero -> strict ACPI checks are "
-	"performed during frequency changes.");
-
-late_initcall(acpi_cpufreq_init);
-module_exit(acpi_cpufreq_exit);
-
-MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 141abebc451..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * (C) 2004-2006  Sebastian Witt <se.witt@gmx.net>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *  Based upon reverse engineered information
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-
-#define NFORCE2_XTAL 25
-#define NFORCE2_BOOTFSB 0x48
-#define NFORCE2_PLLENABLE 0xa8
-#define NFORCE2_PLLREG 0xa4
-#define NFORCE2_PLLADR 0xa0
-#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
-
-#define NFORCE2_MIN_FSB 50
-#define NFORCE2_SAFE_DISTANCE 50
-
-/* Delay in ms between FSB changes */
-/* #define NFORCE2_DELAY 10 */
-
-/*
- * nforce2_chipset:
- * FSB is changed using the chipset
- */
-static struct pci_dev *nforce2_dev;
-
-/* fid:
- * multiplier * 10
- */
-static int fid;
-
-/* min_fsb, max_fsb:
- * minimum and maximum FSB (= FSB at boot time)
- */
-static int min_fsb;
-static int max_fsb;
-
-MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
-MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
-MODULE_LICENSE("GPL");
-
-module_param(fid, int, 0444);
-module_param(min_fsb, int, 0444);
-
-MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
-MODULE_PARM_DESC(min_fsb,
-		"Minimum FSB to use, if not defined: current FSB - 50");
-
-#define PFX "cpufreq-nforce2: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"cpufreq-nforce2", msg)
-
-/**
- * nforce2_calc_fsb - calculate FSB
- * @pll: PLL value
- *
- *   Calculates FSB from PLL value
- */
-static int nforce2_calc_fsb(int pll)
-{
-	unsigned char mul, div;
-
-	mul = (pll >> 8) & 0xff;
-	div = pll & 0xff;
-
-	if (div > 0)
-		return NFORCE2_XTAL * mul / div;
-
-	return 0;
-}
-
-/**
- * nforce2_calc_pll - calculate PLL value
- * @fsb: FSB
- *
- *   Calculate PLL value for given FSB
- */
-static int nforce2_calc_pll(unsigned int fsb)
-{
-	unsigned char xmul, xdiv;
-	unsigned char mul = 0, div = 0;
-	int tried = 0;
-
-	/* Try to calculate multiplier and divider up to 4 times */
-	while (((mul == 0) || (div == 0)) && (tried <= 3)) {
-		for (xdiv = 2; xdiv <= 0x80; xdiv++)
-			for (xmul = 1; xmul <= 0xfe; xmul++)
-				if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
-				    fsb + tried) {
-					mul = xmul;
-					div = xdiv;
-				}
-		tried++;
-	}
-
-	if ((mul == 0) || (div == 0))
-		return -1;
-
-	return NFORCE2_PLL(mul, div);
-}
-
-/**
- * nforce2_write_pll - write PLL value to chipset
- * @pll: PLL value
- *
- *   Writes new FSB PLL value to chipset
- */
-static void nforce2_write_pll(int pll)
-{
-	int temp;
-
-	/* Set the pll addr. to 0x00 */
-	pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
-
-	/* Now write the value in all 64 registers */
-	for (temp = 0; temp <= 0x3f; temp++)
-		pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
-
-	return;
-}
-
-/**
- * nforce2_fsb_read - Read FSB
- *
- *   Read FSB from chipset
- *   If bootfsb != 0, return FSB at boot-time
- */
-static unsigned int nforce2_fsb_read(int bootfsb)
-{
-	struct pci_dev *nforce2_sub5;
-	u32 fsb, temp = 0;
-
-	/* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
-	nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
-				PCI_ANY_ID, PCI_ANY_ID, NULL);
-	if (!nforce2_sub5)
-		return 0;
-
-	pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
-	fsb /= 1000000;
-
-	/* Check if PLL register is already set */
-	pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
-
-	if (bootfsb || !temp)
-		return fsb;
-
-	/* Use PLL register FSB value */
-	pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
-	fsb = nforce2_calc_fsb(temp);
-
-	return fsb;
-}
-
-/**
- * nforce2_set_fsb - set new FSB
- * @fsb: New FSB
- *
- *   Sets new FSB
- */
-static int nforce2_set_fsb(unsigned int fsb)
-{
-	u32 temp = 0;
-	unsigned int tfsb;
-	int diff;
-	int pll = 0;
-
-	if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
-		printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
-		return -EINVAL;
-	}
-
-	tfsb = nforce2_fsb_read(0);
-	if (!tfsb) {
-		printk(KERN_ERR PFX "Error while reading the FSB\n");
-		return -EINVAL;
-	}
-
-	/* First write? Then set actual value */
-	pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
-	if (!temp) {
-		pll = nforce2_calc_pll(tfsb);
-
-		if (pll < 0)
-			return -EINVAL;
-
-		nforce2_write_pll(pll);
-	}
-
-	/* Enable write access */
-	temp = 0x01;
-	pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
-
-	diff = tfsb - fsb;
-
-	if (!diff)
-		return 0;
-
-	while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
-		if (diff < 0)
-			tfsb++;
-		else
-			tfsb--;
-
-		/* Calculate the PLL reg. value */
-		pll = nforce2_calc_pll(tfsb);
-		if (pll == -1)
-			return -EINVAL;
-
-		nforce2_write_pll(pll);
-#ifdef NFORCE2_DELAY
-		mdelay(NFORCE2_DELAY);
-#endif
-	}
-
-	temp = 0x40;
-	pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
-
-	return 0;
-}
-
-/**
- * nforce2_get - get the CPU frequency
- * @cpu: CPU number
- *
- * Returns the CPU frequency
- */
-static unsigned int nforce2_get(unsigned int cpu)
-{
-	if (cpu)
-		return 0;
-	return nforce2_fsb_read(0) * fid * 100;
-}
-
-/**
- * nforce2_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- *  (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int nforce2_target(struct cpufreq_policy *policy,
-			  unsigned int target_freq, unsigned int relation)
-{
-/*        unsigned long         flags; */
-	struct cpufreq_freqs freqs;
-	unsigned int target_fsb;
-
-	if ((target_freq > policy->max) || (target_freq < policy->min))
-		return -EINVAL;
-
-	target_fsb = target_freq / (fid * 100);
-
-	freqs.old = nforce2_get(policy->cpu);
-	freqs.new = target_fsb * fid * 100;
-	freqs.cpu = 0;		/* Only one CPU on nForce2 platforms */
-
-	if (freqs.old == freqs.new)
-		return 0;
-
-	dprintk("Old CPU frequency %d kHz, new %d kHz\n",
-	       freqs.old, freqs.new);
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	/* Disable IRQs */
-	/* local_irq_save(flags); */
-
-	if (nforce2_set_fsb(target_fsb) < 0)
-		printk(KERN_ERR PFX "Changing FSB to %d failed\n",
-			target_fsb);
-	else
-		dprintk("Changed FSB successfully to %d\n",
-			target_fsb);
-
-	/* Enable IRQs */
-	/* local_irq_restore(flags); */
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
-	return 0;
-}
-
-/**
- * nforce2_verify - verifies a new CPUFreq policy
- * @policy: new policy
- */
-static int nforce2_verify(struct cpufreq_policy *policy)
-{
-	unsigned int fsb_pol_max;
-
-	fsb_pol_max = policy->max / (fid * 100);
-
-	if (policy->min < (fsb_pol_max * fid * 100))
-		policy->max = (fsb_pol_max + 1) * fid * 100;
-
-	cpufreq_verify_within_limits(policy,
-				     policy->cpuinfo.min_freq,
-				     policy->cpuinfo.max_freq);
-	return 0;
-}
-
-static int nforce2_cpu_init(struct cpufreq_policy *policy)
-{
-	unsigned int fsb;
-	unsigned int rfid;
-
-	/* capability check */
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	/* Get current FSB */
-	fsb = nforce2_fsb_read(0);
-
-	if (!fsb)
-		return -EIO;
-
-	/* FIX: Get FID from CPU */
-	if (!fid) {
-		if (!cpu_khz) {
-			printk(KERN_WARNING PFX
-			"cpu_khz not set, can't calculate multiplier!\n");
-			return -ENODEV;
-		}
-
-		fid = cpu_khz / (fsb * 100);
-		rfid = fid % 5;
-
-		if (rfid) {
-			if (rfid > 2)
-				fid += 5 - rfid;
-			else
-				fid -= rfid;
-		}
-	}
-
-	printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
-	       fid / 10, fid % 10);
-
-	/* Set maximum FSB to FSB at boot time */
-	max_fsb = nforce2_fsb_read(1);
-
-	if (!max_fsb)
-		return -EIO;
-
-	if (!min_fsb)
-		min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
-
-	if (min_fsb < NFORCE2_MIN_FSB)
-		min_fsb = NFORCE2_MIN_FSB;
-
-	/* cpuinfo and default policy values */
-	policy->cpuinfo.min_freq = min_fsb * fid * 100;
-	policy->cpuinfo.max_freq = max_fsb * fid * 100;
-	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-	policy->cur = nforce2_get(policy->cpu);
-	policy->min = policy->cpuinfo.min_freq;
-	policy->max = policy->cpuinfo.max_freq;
-
-	return 0;
-}
-
-static int nforce2_cpu_exit(struct cpufreq_policy *policy)
-{
-	return 0;
-}
-
-static struct cpufreq_driver nforce2_driver = {
-	.name = "nforce2",
-	.verify = nforce2_verify,
-	.target = nforce2_target,
-	.get = nforce2_get,
-	.init = nforce2_cpu_init,
-	.exit = nforce2_cpu_exit,
-	.owner = THIS_MODULE,
-};
-
-/**
- * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
- *
- * Detects nForce2 A2 and C1 stepping
- *
- */
-static int nforce2_detect_chipset(void)
-{
-	nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
-					PCI_DEVICE_ID_NVIDIA_NFORCE2,
-					PCI_ANY_ID, PCI_ANY_ID, NULL);
-
-	if (nforce2_dev == NULL)
-		return -ENODEV;
-
-	printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
-	       nforce2_dev->revision);
-	printk(KERN_INFO PFX
-	       "FSB changing is maybe unstable and can lead to "
-	       "crashes and data loss.\n");
-
-	return 0;
-}
-
-/**
- * nforce2_init - initializes the nForce2 CPUFreq driver
- *
- * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init nforce2_init(void)
-{
-	/* TODO: do we need to detect the processor? */
-
-	/* detect chipset */
-	if (nforce2_detect_chipset()) {
-		printk(KERN_INFO PFX "No nForce2 chipset.\n");
-		return -ENODEV;
-	}
-
-	return cpufreq_register_driver(&nforce2_driver);
-}
-
-/**
- * nforce2_exit - unregisters cpufreq module
- *
- *   Unregisters nForce2 FSB change support.
- */
-static void __exit nforce2_exit(void)
-{
-	cpufreq_unregister_driver(&nforce2_driver);
-}
-
-module_init(nforce2_init);
-module_exit(nforce2_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
-/*
- *  Based on documentation provided by Dave Jones. Thanks!
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/slab.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-
-#define EPS_BRAND_C7M	0
-#define EPS_BRAND_C7	1
-#define EPS_BRAND_EDEN	2
-#define EPS_BRAND_C3	3
-#define EPS_BRAND_C7D	4
-
-struct eps_cpu_data {
-	u32 fsb;
-	struct cpufreq_frequency_table freq_table[];
-};
-
-static struct eps_cpu_data *eps_cpu[NR_CPUS];
-
-
-static unsigned int eps_get(unsigned int cpu)
-{
-	struct eps_cpu_data *centaur;
-	u32 lo, hi;
-
-	if (cpu)
-		return 0;
-	centaur = eps_cpu[cpu];
-	if (centaur == NULL)
-		return 0;
-
-	/* Return current frequency */
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	return centaur->fsb * ((lo >> 8) & 0xff);
-}
-
-static int eps_set_state(struct eps_cpu_data *centaur,
-			 unsigned int cpu,
-			 u32 dest_state)
-{
-	struct cpufreq_freqs freqs;
-	u32 lo, hi;
-	int err = 0;
-	int i;
-
-	freqs.old = eps_get(cpu);
-	freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
-	freqs.cpu = cpu;
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	/* Wait while CPU is busy */
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	i = 0;
-	while (lo & ((1 << 16) | (1 << 17))) {
-		udelay(16);
-		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-		i++;
-		if (unlikely(i > 64)) {
-			err = -ENODEV;
-			goto postchange;
-		}
-	}
-	/* Set new multiplier and voltage */
-	wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
-	/* Wait until transition end */
-	i = 0;
-	do {
-		udelay(16);
-		rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-		i++;
-		if (unlikely(i > 64)) {
-			err = -ENODEV;
-			goto postchange;
-		}
-	} while (lo & ((1 << 16) | (1 << 17)));
-
-	/* Return current frequency */
-postchange:
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
-
-#ifdef DEBUG
-	{
-	u8 current_multiplier, current_voltage;
-
-	/* Print voltage and multiplier */
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	current_voltage = lo & 0xff;
-	printk(KERN_INFO "eps: Current voltage = %dmV\n",
-		current_voltage * 16 + 700);
-	current_multiplier = (lo >> 8) & 0xff;
-	printk(KERN_INFO "eps: Current multiplier = %d\n",
-		current_multiplier);
-	}
-#endif
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	return err;
-}
-
-static int eps_target(struct cpufreq_policy *policy,
-			       unsigned int target_freq,
-			       unsigned int relation)
-{
-	struct eps_cpu_data *centaur;
-	unsigned int newstate = 0;
-	unsigned int cpu = policy->cpu;
-	unsigned int dest_state;
-	int ret;
-
-	if (unlikely(eps_cpu[cpu] == NULL))
-		return -ENODEV;
-	centaur = eps_cpu[cpu];
-
-	if (unlikely(cpufreq_frequency_table_target(policy,
-			&eps_cpu[cpu]->freq_table[0],
-			target_freq,
-			relation,
-			&newstate))) {
-		return -EINVAL;
-	}
-
-	/* Make frequency transition */
-	dest_state = centaur->freq_table[newstate].index & 0xffff;
-	ret = eps_set_state(centaur, cpu, dest_state);
-	if (ret)
-		printk(KERN_ERR "eps: Timeout!\n");
-	return ret;
-}
-
-static int eps_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy,
-			&eps_cpu[policy->cpu]->freq_table[0]);
-}
-
-static int eps_cpu_init(struct cpufreq_policy *policy)
-{
-	unsigned int i;
-	u32 lo, hi;
-	u64 val;
-	u8 current_multiplier, current_voltage;
-	u8 max_multiplier, max_voltage;
-	u8 min_multiplier, min_voltage;
-	u8 brand = 0;
-	u32 fsb;
-	struct eps_cpu_data *centaur;
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	struct cpufreq_frequency_table *f_table;
-	int k, step, voltage;
-	int ret;
-	int states;
-
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	/* Check brand */
-	printk(KERN_INFO "eps: Detected VIA ");
-
-	switch (c->x86_model) {
-	case 10:
-		rdmsr(0x1153, lo, hi);
-		brand = (((lo >> 2) ^ lo) >> 18) & 3;
-		printk(KERN_CONT "Model A ");
-		break;
-	case 13:
-		rdmsr(0x1154, lo, hi);
-		brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
-		printk(KERN_CONT "Model D ");
-		break;
-	}
-
-	switch (brand) {
-	case EPS_BRAND_C7M:
-		printk(KERN_CONT "C7-M\n");
-		break;
-	case EPS_BRAND_C7:
-		printk(KERN_CONT "C7\n");
-		break;
-	case EPS_BRAND_EDEN:
-		printk(KERN_CONT "Eden\n");
-		break;
-	case EPS_BRAND_C7D:
-		printk(KERN_CONT "C7-D\n");
-		break;
-	case EPS_BRAND_C3:
-		printk(KERN_CONT "C3\n");
-		return -ENODEV;
-		break;
-	}
-	/* Enable Enhanced PowerSaver */
-	rdmsrl(MSR_IA32_MISC_ENABLE, val);
-	if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
-		val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
-		wrmsrl(MSR_IA32_MISC_ENABLE, val);
-		/* Can be locked at 0 */
-		rdmsrl(MSR_IA32_MISC_ENABLE, val);
-		if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
-			printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
-			return -ENODEV;
-		}
-	}
-
-	/* Print voltage and multiplier */
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	current_voltage = lo & 0xff;
-	printk(KERN_INFO "eps: Current voltage = %dmV\n",
-			current_voltage * 16 + 700);
-	current_multiplier = (lo >> 8) & 0xff;
-	printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
-
-	/* Print limits */
-	max_voltage = hi & 0xff;
-	printk(KERN_INFO "eps: Highest voltage = %dmV\n",
-			max_voltage * 16 + 700);
-	max_multiplier = (hi >> 8) & 0xff;
-	printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
-	min_voltage = (hi >> 16) & 0xff;
-	printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
-			min_voltage * 16 + 700);
-	min_multiplier = (hi >> 24) & 0xff;
-	printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
-
-	/* Sanity checks */
-	if (current_multiplier == 0 || max_multiplier == 0
-	    || min_multiplier == 0)
-		return -EINVAL;
-	if (current_multiplier > max_multiplier
-	    || max_multiplier <= min_multiplier)
-		return -EINVAL;
-	if (current_voltage > 0x1f || max_voltage > 0x1f)
-		return -EINVAL;
-	if (max_voltage < min_voltage)
-		return -EINVAL;
-
-	/* Calc FSB speed */
-	fsb = cpu_khz / current_multiplier;
-	/* Calc number of p-states supported */
-	if (brand == EPS_BRAND_C7M)
-		states = max_multiplier - min_multiplier + 1;
-	else
-		states = 2;
-
-	/* Allocate private data and frequency table for current cpu */
-	centaur = kzalloc(sizeof(struct eps_cpu_data)
-		    + (states + 1) * sizeof(struct cpufreq_frequency_table),
-		    GFP_KERNEL);
-	if (!centaur)
-		return -ENOMEM;
-	eps_cpu[0] = centaur;
-
-	/* Copy basic values */
-	centaur->fsb = fsb;
-
-	/* Fill frequency and MSR value table */
-	f_table = &centaur->freq_table[0];
-	if (brand != EPS_BRAND_C7M) {
-		f_table[0].frequency = fsb * min_multiplier;
-		f_table[0].index = (min_multiplier << 8) | min_voltage;
-		f_table[1].frequency = fsb * max_multiplier;
-		f_table[1].index = (max_multiplier << 8) | max_voltage;
-		f_table[2].frequency = CPUFREQ_TABLE_END;
-	} else {
-		k = 0;
-		step = ((max_voltage - min_voltage) * 256)
-			/ (max_multiplier - min_multiplier);
-		for (i = min_multiplier; i <= max_multiplier; i++) {
-			voltage = (k * step) / 256 + min_voltage;
-			f_table[k].frequency = fsb * i;
-			f_table[k].index = (i << 8) | voltage;
-			k++;
-		}
-		f_table[k].frequency = CPUFREQ_TABLE_END;
-	}
-
-	policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
-	policy->cur = fsb * current_multiplier;
-
-	ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
-	if (ret) {
-		kfree(centaur);
-		return ret;
-	}
-
-	cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
-	return 0;
-}
-
-static int eps_cpu_exit(struct cpufreq_policy *policy)
-{
-	unsigned int cpu = policy->cpu;
-	struct eps_cpu_data *centaur;
-	u32 lo, hi;
-
-	if (eps_cpu[cpu] == NULL)
-		return -ENODEV;
-	centaur = eps_cpu[cpu];
-
-	/* Get max frequency */
-	rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
-	/* Set max frequency */
-	eps_set_state(centaur, cpu, hi & 0xffff);
-	/* Bye */
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	kfree(eps_cpu[cpu]);
-	eps_cpu[cpu] = NULL;
-	return 0;
-}
-
-static struct freq_attr *eps_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver eps_driver = {
-	.verify		= eps_verify,
-	.target		= eps_target,
-	.init		= eps_cpu_init,
-	.exit		= eps_cpu_exit,
-	.get		= eps_get,
-	.name		= "e_powersaver",
-	.owner		= THIS_MODULE,
-	.attr		= eps_attr,
-};
-
-static int __init eps_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	/* This driver will work only on Centaur C7 processors with
-	 * Enhanced SpeedStep/PowerSaver registers */
-	if (c->x86_vendor != X86_VENDOR_CENTAUR
-	    || c->x86 != 6 || c->x86_model < 10)
-		return -ENODEV;
-	if (!cpu_has(c, X86_FEATURE_EST))
-		return -ENODEV;
-
-	if (cpufreq_register_driver(&eps_driver))
-		return -EINVAL;
-	return 0;
-}
-
-static void __exit eps_exit(void)
-{
-	cpufreq_unregister_driver(&eps_driver);
-}
-
-MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
-MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
-MODULE_LICENSE("GPL");
-
-module_init(eps_init);
-module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a7..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- *	elanfreq:	cpufreq driver for the AMD ELAN family
- *
- *	(c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
- *
- *	Parts of this code are (c) Sven Geggus <sven@geggus.net>
- *
- *      All Rights Reserved.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#define REG_CSCIR 0x22		/* Chip Setup and Control Index Register    */
-#define REG_CSCDR 0x23		/* Chip Setup and Control Data  Register    */
-
-/* Module parameter */
-static int max_freq;
-
-struct s_elan_multiplier {
-	int clock;		/* frequency in kHz                         */
-	int val40h;		/* PMU Force Mode register                  */
-	int val80h;		/* CPU Clock Speed Register                 */
-};
-
-/*
- * It is important that the frequencies
- * are listed in ascending order here!
- */
-static struct s_elan_multiplier elan_multiplier[] = {
-	{1000,	0x02,	0x18},
-	{2000,	0x02,	0x10},
-	{4000,	0x02,	0x08},
-	{8000,	0x00,	0x00},
-	{16000,	0x00,	0x02},
-	{33000,	0x00,	0x04},
-	{66000,	0x01,	0x04},
-	{99000,	0x01,	0x05}
-};
-
-static struct cpufreq_frequency_table elanfreq_table[] = {
-	{0,	1000},
-	{1,	2000},
-	{2,	4000},
-	{3,	8000},
-	{4,	16000},
-	{5,	33000},
-	{6,	66000},
-	{7,	99000},
-	{0,	CPUFREQ_TABLE_END},
-};
-
-
-/**
- *	elanfreq_get_cpu_frequency: determine current cpu speed
- *
- *	Finds out at which frequency the CPU of the Elan SOC runs
- *	at the moment. Frequencies from 1 to 33 MHz are generated
- *	the normal way, 66 and 99 MHz are called "Hyperspeed Mode"
- *	and have the rest of the chip running with 33 MHz.
- */
-
-static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
-{
-	u8 clockspeed_reg;    /* Clock Speed Register */
-
-	local_irq_disable();
-	outb_p(0x80, REG_CSCIR);
-	clockspeed_reg = inb_p(REG_CSCDR);
-	local_irq_enable();
-
-	if ((clockspeed_reg & 0xE0) == 0xE0)
-		return 0;
-
-	/* Are we in CPU clock multiplied mode (66/99 MHz)? */
-	if ((clockspeed_reg & 0xE0) == 0xC0) {
-		if ((clockspeed_reg & 0x01) == 0)
-			return 66000;
-		else
-			return 99000;
-	}
-
-	/* 33 MHz is not 32 MHz... */
-	if ((clockspeed_reg & 0xE0) == 0xA0)
-		return 33000;
-
-	return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
-}
-
-
-/**
- *	elanfreq_set_cpu_frequency: Change the CPU core frequency
- *	@cpu: cpu number
- *	@freq: frequency in kHz
- *
- *	This function takes a frequency value and changes the CPU frequency
- *	according to this. Note that the frequency has to be checked by
- *	elanfreq_validatespeed() for correctness!
- *
- *	There is no return value.
- */
-
-static void elanfreq_set_cpu_state(unsigned int state)
-{
-	struct cpufreq_freqs    freqs;
-
-	freqs.old = elanfreq_get_cpu_frequency(0);
-	freqs.new = elan_multiplier[state].clock;
-	freqs.cpu = 0; /* elanfreq.c is UP only driver */
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
-			elan_multiplier[state].clock);
-
-
-	/*
-	 * Access to the Elan's internal registers is indexed via
-	 * 0x22: Chip Setup & Control Register Index Register (CSCI)
-	 * 0x23: Chip Setup & Control Register Data  Register (CSCD)
-	 *
-	 */
-
-	/*
-	 * 0x40 is the Power Management Unit's Force Mode Register.
-	 * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
-	 */
-
-	local_irq_disable();
-	outb_p(0x40, REG_CSCIR);		/* Disable hyperspeed mode */
-	outb_p(0x00, REG_CSCDR);
-	local_irq_enable();		/* wait till internal pipelines and */
-	udelay(1000);			/* buffers have cleaned up          */
-
-	local_irq_disable();
-
-	/* now, set the CPU clock speed register (0x80) */
-	outb_p(0x80, REG_CSCIR);
-	outb_p(elan_multiplier[state].val80h, REG_CSCDR);
-
-	/* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
-	outb_p(0x40, REG_CSCIR);
-	outb_p(elan_multiplier[state].val40h, REG_CSCDR);
-	udelay(10000);
-	local_irq_enable();
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-
-/**
- *	elanfreq_validatespeed: test if frequency range is valid
- *	@policy: the policy to validate
- *
- *	This function checks if a given frequency range in kHz is valid
- *	for the hardware supported by the driver.
- */
-
-static int elanfreq_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
-}
-
-static int elanfreq_target(struct cpufreq_policy *policy,
-			    unsigned int target_freq,
-			    unsigned int relation)
-{
-	unsigned int newstate = 0;
-
-	if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
-				target_freq, relation, &newstate))
-		return -EINVAL;
-
-	elanfreq_set_cpu_state(newstate);
-
-	return 0;
-}
-
-
-/*
- *	Module init and exit code
- */
-
-static int elanfreq_cpu_init(struct cpufreq_policy *policy)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	unsigned int i;
-	int result;
-
-	/* capability check */
-	if ((c->x86_vendor != X86_VENDOR_AMD) ||
-	    (c->x86 != 4) || (c->x86_model != 10))
-		return -ENODEV;
-
-	/* max freq */
-	if (!max_freq)
-		max_freq = elanfreq_get_cpu_frequency(0);
-
-	/* table init */
-	for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
-		if (elanfreq_table[i].frequency > max_freq)
-			elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
-	}
-
-	/* cpuinfo and default policy values */
-	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-	policy->cur = elanfreq_get_cpu_frequency(0);
-
-	result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
-	if (result)
-		return result;
-
-	cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
-	return 0;
-}
-
-
-static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-
-#ifndef MODULE
-/**
- * elanfreq_setup - elanfreq command line parameter parsing
- *
- * elanfreq command line parameter.  Use:
- *  elanfreq=66000
- * to set the maximum CPU frequency to 66 MHz. Note that in
- * case you do not give this boot parameter, the maximum
- * frequency will fall back to _current_ CPU frequency which
- * might be lower. If you build this as a module, use the
- * max_freq module parameter instead.
- */
-static int __init elanfreq_setup(char *str)
-{
-	max_freq = simple_strtoul(str, &str, 0);
-	printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
-	return 1;
-}
-__setup("elanfreq=", elanfreq_setup);
-#endif
-
-
-static struct freq_attr *elanfreq_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-
-static struct cpufreq_driver elanfreq_driver = {
-	.get		= elanfreq_get_cpu_frequency,
-	.verify		= elanfreq_verify,
-	.target		= elanfreq_target,
-	.init		= elanfreq_cpu_init,
-	.exit		= elanfreq_cpu_exit,
-	.name		= "elanfreq",
-	.owner		= THIS_MODULE,
-	.attr		= elanfreq_attr,
-};
-
-
-static int __init elanfreq_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	/* Test if we have the right hardware */
-	if ((c->x86_vendor != X86_VENDOR_AMD) ||
-		(c->x86 != 4) || (c->x86_model != 10)) {
-		printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
-		return -ENODEV;
-	}
-	return cpufreq_register_driver(&elanfreq_driver);
-}
-
-
-static void __exit elanfreq_exit(void)
-{
-	cpufreq_unregister_driver(&elanfreq_driver);
-}
-
-
-module_param(max_freq, int, 0444);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
-		"Sven Geggus <sven@geggus.net>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
-
-module_init(elanfreq_init);
-module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf8423..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- *	Cyrix MediaGX and NatSemi Geode Suspend Modulation
- *	(C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- *	(C) 2002 Hiroshi Miura   <miura@da-cha.org>
- *	All Rights Reserved
- *
- *	This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      version 2 as published by the Free Software Foundation
- *
- *      The author(s) of this software shall not be held liable for damages
- *      of any nature resulting due to the use of this software. This
- *      software is provided AS-IS with no warranties.
- *
- * Theoretical note:
- *
- *	(see Geode(tm) CS5530 manual (rev.4.1) page.56)
- *
- *	CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
- *	are based on Suspend Modulation.
- *
- *	Suspend Modulation works by asserting and de-asserting the SUSP# pin
- *	to CPU(GX1/GXLV) for configurable durations. When asserting SUSP#
- *	the CPU enters an idle state. GX1 stops its core clock when SUSP# is
- *	asserted then power consumption is reduced.
- *
- *	Suspend Modulation's OFF/ON duration are configurable
- *	with 'Suspend Modulation OFF Count Register'
- *	and 'Suspend Modulation ON Count Register'.
- *	These registers are 8bit counters that represent the number of
- *	32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
- *	to the processor.
- *
- *	These counters define a ratio which is the effective frequency
- *	of operation of the system.
- *
- *			       OFF Count
- *	F_eff = Fgx * ----------------------
- *	                OFF Count + ON Count
- *
- *	0 <= On Count, Off Count <= 255
- *
- *	From these limits, we can get register values
- *
- *	off_duration + on_duration <= MAX_DURATION
- *	on_duration = off_duration * (stock_freq - freq) / freq
- *
- *      off_duration  =  (freq * DURATION) / stock_freq
- *      on_duration = DURATION - off_duration
- *
- *
- *---------------------------------------------------------------------------
- *
- * ChangeLog:
- *	Dec. 12, 2003	Hiroshi Miura <miura@da-cha.org>
- *		- fix on/off register mistake
- *		- fix cpu_khz calc when it stops cpu modulation.
- *
- *	Dec. 11, 2002	Hiroshi Miura <miura@da-cha.org>
- *		- rewrite for Cyrix MediaGX Cx5510/5520 and
- *		  NatSemi Geode Cs5530(A).
- *
- *	Jul. ??, 2002  Zwane Mwaikambo <zwane@commfireservices.com>
- *		- cs5530_mod patch for 2.4.19-rc1.
- *
- *---------------------------------------------------------------------------
- *
- * Todo
- *	Test on machines with 5510, 5530, 5530A
- */
-
-/************************************************************************
- *			Suspend Modulation - Definitions		*
- ************************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-
-#include <asm/processor-cyrix.h>
-
-/* PCI config registers, all at F0 */
-#define PCI_PMER1	0x80	/* power management enable register 1 */
-#define PCI_PMER2	0x81	/* power management enable register 2 */
-#define PCI_PMER3	0x82	/* power management enable register 3 */
-#define PCI_IRQTC	0x8c	/* irq speedup timer counter register:typical 2 to 4ms */
-#define PCI_VIDTC	0x8d	/* video speedup timer counter register: typical 50 to 100ms */
-#define PCI_MODOFF	0x94	/* suspend modulation OFF counter register, 1 = 32us */
-#define PCI_MODON	0x95	/* suspend modulation ON counter register */
-#define PCI_SUSCFG	0x96	/* suspend configuration register */
-
-/* PMER1 bits */
-#define GPM		(1<<0)	/* global power management */
-#define GIT		(1<<1)	/* globally enable PM device idle timers */
-#define GTR		(1<<2)	/* globally enable IO traps */
-#define IRQ_SPDUP	(1<<3)	/* disable clock throttle during interrupt handling */
-#define VID_SPDUP	(1<<4)	/* disable clock throttle during vga video handling */
-
-/* SUSCFG bits */
-#define SUSMOD		(1<<0)	/* enable/disable suspend modulation */
-/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
-#define SMISPDUP	(1<<1)	/* select how SMI re-enable suspend modulation: */
-				/* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
-#define SUSCFG		(1<<2)	/* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
-/* the below is supported only with cs5530A */
-#define PWRSVE_ISA	(1<<3)	/* stop ISA clock  */
-#define PWRSVE		(1<<4)	/* active idle */
-
-struct gxfreq_params {
-	u8 on_duration;
-	u8 off_duration;
-	u8 pci_suscfg;
-	u8 pci_pmer1;
-	u8 pci_pmer2;
-	struct pci_dev *cs55x0;
-};
-
-static struct gxfreq_params *gx_params;
-static int stock_freq;
-
-/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
-static int pci_busclk;
-module_param(pci_busclk, int, 0444);
-
-/* maximum duration for which the cpu may be suspended
- * (32us * MAX_DURATION). If no parameter is given, this defaults
- * to 255.
- * Note that this leads to a maximum of 8 ms(!) where the CPU clock
- * is suspended -- processing power is just 0.39% of what it used to be,
- * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
-static int max_duration = 255;
-module_param(max_duration, int, 0444);
-
-/* For the default policy, we want at least some processing power
- * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
- */
-#define POLICY_MIN_DIV 20
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"gx-suspmod", msg)
-
-/**
- * we can detect a core multipiler from dir0_lsb
- * from GX1 datasheet p.56,
- *	MULT[3:0]:
- *	0000 = SYSCLK multiplied by 4 (test only)
- *	0001 = SYSCLK multiplied by 10
- *	0010 = SYSCLK multiplied by 4
- *	0011 = SYSCLK multiplied by 6
- *	0100 = SYSCLK multiplied by 9
- *	0101 = SYSCLK multiplied by 5
- *	0110 = SYSCLK multiplied by 7
- *	0111 = SYSCLK multiplied by 8
- *              of 33.3MHz
- **/
-static int gx_freq_mult[16] = {
-		4, 10, 4, 6, 9, 5, 7, 8,
-		0, 0, 0, 0, 0, 0, 0, 0
-};
-
-
-/****************************************************************
- *	Low Level chipset interface				*
- ****************************************************************/
-static struct pci_device_id gx_chipset_tbl[] __initdata = {
-	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
-	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
-	{ PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
-	{ 0, },
-};
-
-static void gx_write_byte(int reg, int value)
-{
-	pci_write_config_byte(gx_params->cs55x0, reg, value);
-}
-
-/**
- * gx_detect_chipset:
- *
- **/
-static __init struct pci_dev *gx_detect_chipset(void)
-{
-	struct pci_dev *gx_pci = NULL;
-
-	/* check if CPU is a MediaGX or a Geode. */
-	if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
-	    (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
-		dprintk("error: no MediaGX/Geode processor found!\n");
-		return NULL;
-	}
-
-	/* detect which companion chip is used */
-	for_each_pci_dev(gx_pci) {
-		if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
-			return gx_pci;
-	}
-
-	dprintk("error: no supported chipset found!\n");
-	return NULL;
-}
-
-/**
- * gx_get_cpuspeed:
- *
- * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
- * Geode CPU runs.
- */
-static unsigned int gx_get_cpuspeed(unsigned int cpu)
-{
-	if ((gx_params->pci_suscfg & SUSMOD) == 0)
-		return stock_freq;
-
-	return (stock_freq * gx_params->off_duration)
-		/ (gx_params->on_duration + gx_params->off_duration);
-}
-
-/**
- *      gx_validate_speed:
- *      determine current cpu speed
- *
- **/
-
-static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
-		u8 *off_duration)
-{
-	unsigned int i;
-	u8 tmp_on, tmp_off;
-	int old_tmp_freq = stock_freq;
-	int tmp_freq;
-
-	*off_duration = 1;
-	*on_duration = 0;
-
-	for (i = max_duration; i > 0; i--) {
-		tmp_off = ((khz * i) / stock_freq) & 0xff;
-		tmp_on = i - tmp_off;
-		tmp_freq = (stock_freq * tmp_off) / i;
-		/* if this relation is closer to khz, use this. If it's equal,
-		 * prefer it, too - lower latency */
-		if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
-			*on_duration = tmp_on;
-			*off_duration = tmp_off;
-			old_tmp_freq = tmp_freq;
-		}
-	}
-
-	return old_tmp_freq;
-}
-
-
-/**
- * gx_set_cpuspeed:
- * set cpu speed in khz.
- **/
-
-static void gx_set_cpuspeed(unsigned int khz)
-{
-	u8 suscfg, pmer1;
-	unsigned int new_khz;
-	unsigned long flags;
-	struct cpufreq_freqs freqs;
-
-	freqs.cpu = 0;
-	freqs.old = gx_get_cpuspeed(0);
-
-	new_khz = gx_validate_speed(khz, &gx_params->on_duration,
-			&gx_params->off_duration);
-
-	freqs.new = new_khz;
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	local_irq_save(flags);
-
-
-
-	if (new_khz != stock_freq) {
-		/* if new khz == 100% of CPU speed, it is special case */
-		switch (gx_params->cs55x0->device) {
-		case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
-			pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
-			/* FIXME: need to test other values -- Zwane,Miura */
-			/* typical 2 to 4ms */
-			gx_write_byte(PCI_IRQTC, 4);
-			/* typical 50 to 100ms */
-			gx_write_byte(PCI_VIDTC, 100);
-			gx_write_byte(PCI_PMER1, pmer1);
-
-			if (gx_params->cs55x0->revision < 0x10) {
-				/* CS5530(rev 1.2, 1.3) */
-				suscfg = gx_params->pci_suscfg|SUSMOD;
-			} else {
-				/* CS5530A,B.. */
-				suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
-			}
-			break;
-		case PCI_DEVICE_ID_CYRIX_5520:
-		case PCI_DEVICE_ID_CYRIX_5510:
-			suscfg = gx_params->pci_suscfg | SUSMOD;
-			break;
-		default:
-			local_irq_restore(flags);
-			dprintk("fatal: try to set unknown chipset.\n");
-			return;
-		}
-	} else {
-		suscfg = gx_params->pci_suscfg & ~(SUSMOD);
-		gx_params->off_duration = 0;
-		gx_params->on_duration = 0;
-		dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
-	}
-
-	gx_write_byte(PCI_MODOFF, gx_params->off_duration);
-	gx_write_byte(PCI_MODON, gx_params->on_duration);
-
-	gx_write_byte(PCI_SUSCFG, suscfg);
-	pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
-
-	local_irq_restore(flags);
-
-	gx_params->pci_suscfg = suscfg;
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
-	dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
-		gx_params->on_duration * 32, gx_params->off_duration * 32);
-	dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
-}
-
-/****************************************************************
- *             High level functions                             *
- ****************************************************************/
-
-/*
- *	cpufreq_gx_verify: test if frequency range is valid
- *
- *	This function checks if a given frequency range in kHz is valid
- *      for the hardware supported by the driver.
- */
-
-static int cpufreq_gx_verify(struct cpufreq_policy *policy)
-{
-	unsigned int tmp_freq = 0;
-	u8 tmp1, tmp2;
-
-	if (!stock_freq || !policy)
-		return -EINVAL;
-
-	policy->cpu = 0;
-	cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
-			stock_freq);
-
-	/* it needs to be assured that at least one supported frequency is
-	 * within policy->min and policy->max. If it is not, policy->max
-	 * needs to be increased until one freuqency is supported.
-	 * policy->min may not be decreased, though. This way we guarantee a
-	 * specific processing capacity.
-	 */
-	tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
-	if (tmp_freq < policy->min)
-		tmp_freq += stock_freq / max_duration;
-	policy->min = tmp_freq;
-	if (policy->min > policy->max)
-		policy->max = tmp_freq;
-	tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
-	if (tmp_freq > policy->max)
-		tmp_freq -= stock_freq / max_duration;
-	policy->max = tmp_freq;
-	if (policy->max < policy->min)
-		policy->max = policy->min;
-	cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
-			stock_freq);
-
-	return 0;
-}
-
-/*
- *      cpufreq_gx_target:
- *
- */
-static int cpufreq_gx_target(struct cpufreq_policy *policy,
-			     unsigned int target_freq,
-			     unsigned int relation)
-{
-	u8 tmp1, tmp2;
-	unsigned int tmp_freq;
-
-	if (!stock_freq || !policy)
-		return -EINVAL;
-
-	policy->cpu = 0;
-
-	tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
-	while (tmp_freq < policy->min) {
-		tmp_freq += stock_freq / max_duration;
-		tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
-	}
-	while (tmp_freq > policy->max) {
-		tmp_freq -= stock_freq / max_duration;
-		tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
-	}
-
-	gx_set_cpuspeed(tmp_freq);
-
-	return 0;
-}
-
-static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
-{
-	unsigned int maxfreq, curfreq;
-
-	if (!policy || policy->cpu != 0)
-		return -ENODEV;
-
-	/* determine maximum frequency */
-	if (pci_busclk)
-		maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-	else if (cpu_khz)
-		maxfreq = cpu_khz;
-	else
-		maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
-
-	stock_freq = maxfreq;
-	curfreq = gx_get_cpuspeed(0);
-
-	dprintk("cpu max frequency is %d.\n", maxfreq);
-	dprintk("cpu current frequency is %dkHz.\n", curfreq);
-
-	/* setup basic struct for cpufreq API */
-	policy->cpu = 0;
-
-	if (max_duration < POLICY_MIN_DIV)
-		policy->min = maxfreq / max_duration;
-	else
-		policy->min = maxfreq / POLICY_MIN_DIV;
-	policy->max = maxfreq;
-	policy->cur = curfreq;
-	policy->cpuinfo.min_freq = maxfreq / max_duration;
-	policy->cpuinfo.max_freq = maxfreq;
-	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-
-	return 0;
-}
-
-/*
- * cpufreq_gx_init:
- *   MediaGX/Geode GX initialize cpufreq driver
- */
-static struct cpufreq_driver gx_suspmod_driver = {
-	.get		= gx_get_cpuspeed,
-	.verify		= cpufreq_gx_verify,
-	.target		= cpufreq_gx_target,
-	.init		= cpufreq_gx_cpu_init,
-	.name		= "gx-suspmod",
-	.owner		= THIS_MODULE,
-};
-
-static int __init cpufreq_gx_init(void)
-{
-	int ret;
-	struct gxfreq_params *params;
-	struct pci_dev *gx_pci;
-
-	/* Test if we have the right hardware */
-	gx_pci = gx_detect_chipset();
-	if (gx_pci == NULL)
-		return -ENODEV;
-
-	/* check whether module parameters are sane */
-	if (max_duration > 0xff)
-		max_duration = 0xff;
-
-	dprintk("geode suspend modulation available.\n");
-
-	params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
-	if (params == NULL)
-		return -ENOMEM;
-
-	params->cs55x0 = gx_pci;
-	gx_params = params;
-
-	/* keep cs55x0 configurations */
-	pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
-	pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
-	pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
-	pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
-	pci_read_config_byte(params->cs55x0, PCI_MODOFF,
-			&(params->off_duration));
-
-	ret = cpufreq_register_driver(&gx_suspmod_driver);
-	if (ret) {
-		kfree(params);
-		return ret;                   /* register error! */
-	}
-
-	return 0;
-}
-
-static void __exit cpufreq_gx_exit(void)
-{
-	cpufreq_unregister_driver(&gx_suspmod_driver);
-	pci_dev_put(gx_params->cs55x0);
-	kfree(gx_params);
-}
-
-MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
-MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
-MODULE_LICENSE("GPL");
-
-module_init(cpufreq_gx_init);
-module_exit(cpufreq_gx_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index cf48cdd6907..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
-/*
- *  (C) 2001-2004  Dave Jones. <davej@redhat.com>
- *  (C) 2002  Padraig Brady. <padraig@antefacto.com>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *  Based upon datasheets & sample CPUs kindly provided by VIA.
- *
- *  VIA have currently 3 different versions of Longhaul.
- *  Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
- *   It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
- *  Version 2 of longhaul is backward compatible with v1, but adds
- *   LONGHAUL MSR for purpose of both frequency and voltage scaling.
- *   Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
- *  Version 3 of longhaul got renamed to Powersaver and redesigned
- *   to use only the POWERSAVER MSR at 0x110a.
- *   It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
- *   It's pretty much the same feature wise to longhaul v2, though
- *   there is provision for scaling FSB too, but this doesn't work
- *   too well in practice so we don't even try to use this.
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-#include <linux/acpi.h>
-
-#include <asm/msr.h>
-#include <acpi/processor.h>
-
-#include "longhaul.h"
-
-#define PFX "longhaul: "
-
-#define TYPE_LONGHAUL_V1	1
-#define TYPE_LONGHAUL_V2	2
-#define TYPE_POWERSAVER		3
-
-#define	CPU_SAMUEL	1
-#define	CPU_SAMUEL2	2
-#define	CPU_EZRA	3
-#define	CPU_EZRA_T	4
-#define	CPU_NEHEMIAH	5
-#define	CPU_NEHEMIAH_C	6
-
-/* Flags */
-#define USE_ACPI_C3		(1 << 1)
-#define USE_NORTHBRIDGE		(1 << 2)
-
-static int cpu_model;
-static unsigned int numscales = 16;
-static unsigned int fsb;
-
-static const struct mV_pos *vrm_mV_table;
-static const unsigned char *mV_vrm_table;
-
-static unsigned int highest_speed, lowest_speed; /* kHz */
-static unsigned int minmult, maxmult;
-static int can_scale_voltage;
-static struct acpi_processor *pr;
-static struct acpi_processor_cx *cx;
-static u32 acpi_regs_addr;
-static u8 longhaul_flags;
-static unsigned int longhaul_index;
-
-/* Module parameters */
-static int scale_voltage;
-static int disable_acpi_c3;
-static int revid_errata;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"longhaul", msg)
-
-
-/* Clock ratios multiplied by 10 */
-static int mults[32];
-static int eblcr[32];
-static int longhaul_version;
-static struct cpufreq_frequency_table *longhaul_table;
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-static char speedbuffer[8];
-
-static char *print_speed(int speed)
-{
-	if (speed < 1000) {
-		snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
-		return speedbuffer;
-	}
-
-	if (speed%1000 == 0)
-		snprintf(speedbuffer, sizeof(speedbuffer),
-			"%dGHz", speed/1000);
-	else
-		snprintf(speedbuffer, sizeof(speedbuffer),
-			"%d.%dGHz", speed/1000, (speed%1000)/100);
-
-	return speedbuffer;
-}
-#endif
-
-
-static unsigned int calc_speed(int mult)
-{
-	int khz;
-	khz = (mult/10)*fsb;
-	if (mult%10)
-		khz += fsb/2;
-	khz *= 1000;
-	return khz;
-}
-
-
-static int longhaul_get_cpu_mult(void)
-{
-	unsigned long invalue = 0, lo, hi;
-
-	rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
-	invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
-	if (longhaul_version == TYPE_LONGHAUL_V2 ||
-	    longhaul_version == TYPE_POWERSAVER) {
-		if (lo & (1<<27))
-			invalue += 16;
-	}
-	return eblcr[invalue];
-}
-
-/* For processor with BCR2 MSR */
-
-static void do_longhaul1(unsigned int mults_index)
-{
-	union msr_bcr2 bcr2;
-
-	rdmsrl(MSR_VIA_BCR2, bcr2.val);
-	/* Enable software clock multiplier */
-	bcr2.bits.ESOFTBF = 1;
-	bcr2.bits.CLOCKMUL = mults_index & 0xff;
-
-	/* Sync to timer tick */
-	safe_halt();
-	/* Change frequency on next halt or sleep */
-	wrmsrl(MSR_VIA_BCR2, bcr2.val);
-	/* Invoke transition */
-	ACPI_FLUSH_CPU_CACHE();
-	halt();
-
-	/* Disable software clock multiplier */
-	local_irq_disable();
-	rdmsrl(MSR_VIA_BCR2, bcr2.val);
-	bcr2.bits.ESOFTBF = 0;
-	wrmsrl(MSR_VIA_BCR2, bcr2.val);
-}
-
-/* For processor with Longhaul MSR */
-
-static void do_powersaver(int cx_address, unsigned int mults_index,
-			  unsigned int dir)
-{
-	union msr_longhaul longhaul;
-	u32 t;
-
-	rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-	/* Setup new frequency */
-	if (!revid_errata)
-		longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
-	else
-		longhaul.bits.RevisionKey = 0;
-	longhaul.bits.SoftBusRatio = mults_index & 0xf;
-	longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
-	/* Setup new voltage */
-	if (can_scale_voltage)
-		longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
-	/* Sync to timer tick */
-	safe_halt();
-	/* Raise voltage if necessary */
-	if (can_scale_voltage && dir) {
-		longhaul.bits.EnableSoftVID = 1;
-		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-		/* Change voltage */
-		if (!cx_address) {
-			ACPI_FLUSH_CPU_CACHE();
-			halt();
-		} else {
-			ACPI_FLUSH_CPU_CACHE();
-			/* Invoke C3 */
-			inb(cx_address);
-			/* Dummy op - must do something useless after P_LVL3
-			 * read */
-			t = inl(acpi_gbl_FADT.xpm_timer_block.address);
-		}
-		longhaul.bits.EnableSoftVID = 0;
-		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-	}
-
-	/* Change frequency on next halt or sleep */
-	longhaul.bits.EnableSoftBusRatio = 1;
-	wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-	if (!cx_address) {
-		ACPI_FLUSH_CPU_CACHE();
-		halt();
-	} else {
-		ACPI_FLUSH_CPU_CACHE();
-		/* Invoke C3 */
-		inb(cx_address);
-		/* Dummy op - must do something useless after P_LVL3 read */
-		t = inl(acpi_gbl_FADT.xpm_timer_block.address);
-	}
-	/* Disable bus ratio bit */
-	longhaul.bits.EnableSoftBusRatio = 0;
-	wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-
-	/* Reduce voltage if necessary */
-	if (can_scale_voltage && !dir) {
-		longhaul.bits.EnableSoftVID = 1;
-		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-		/* Change voltage */
-		if (!cx_address) {
-			ACPI_FLUSH_CPU_CACHE();
-			halt();
-		} else {
-			ACPI_FLUSH_CPU_CACHE();
-			/* Invoke C3 */
-			inb(cx_address);
-			/* Dummy op - must do something useless after P_LVL3
-			 * read */
-			t = inl(acpi_gbl_FADT.xpm_timer_block.address);
-		}
-		longhaul.bits.EnableSoftVID = 0;
-		wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-	}
-}
-
-/**
- * longhaul_set_cpu_frequency()
- * @mults_index : bitpattern of the new multiplier.
- *
- * Sets a new clock ratio.
- */
-
-static void longhaul_setstate(unsigned int table_index)
-{
-	unsigned int mults_index;
-	int speed, mult;
-	struct cpufreq_freqs freqs;
-	unsigned long flags;
-	unsigned int pic1_mask, pic2_mask;
-	u16 bm_status = 0;
-	u32 bm_timeout = 1000;
-	unsigned int dir = 0;
-
-	mults_index = longhaul_table[table_index].index;
-	/* Safety precautions */
-	mult = mults[mults_index & 0x1f];
-	if (mult == -1)
-		return;
-	speed = calc_speed(mult);
-	if ((speed > highest_speed) || (speed < lowest_speed))
-		return;
-	/* Voltage transition before frequency transition? */
-	if (can_scale_voltage && longhaul_index < table_index)
-		dir = 1;
-
-	freqs.old = calc_speed(longhaul_get_cpu_mult());
-	freqs.new = speed;
-	freqs.cpu = 0; /* longhaul.c is UP only driver */
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
-			fsb, mult/10, mult%10, print_speed(speed/1000));
-retry_loop:
-	preempt_disable();
-	local_irq_save(flags);
-
-	pic2_mask = inb(0xA1);
-	pic1_mask = inb(0x21);	/* works on C3. save mask. */
-	outb(0xFF, 0xA1);	/* Overkill */
-	outb(0xFE, 0x21);	/* TMR0 only */
-
-	/* Wait while PCI bus is busy. */
-	if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
-	    || ((pr != NULL) && pr->flags.bm_control))) {
-		bm_status = inw(acpi_regs_addr);
-		bm_status &= 1 << 4;
-		while (bm_status && bm_timeout) {
-			outw(1 << 4, acpi_regs_addr);
-			bm_timeout--;
-			bm_status = inw(acpi_regs_addr);
-			bm_status &= 1 << 4;
-		}
-	}
-
-	if (longhaul_flags & USE_NORTHBRIDGE) {
-		/* Disable AGP and PCI arbiters */
-		outb(3, 0x22);
-	} else if ((pr != NULL) && pr->flags.bm_control) {
-		/* Disable bus master arbitration */
-		acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
-	}
-	switch (longhaul_version) {
-
-	/*
-	 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
-	 * Software controlled multipliers only.
-	 */
-	case TYPE_LONGHAUL_V1:
-		do_longhaul1(mults_index);
-		break;
-
-	/*
-	 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
-	 *
-	 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
-	 * Nehemiah can do FSB scaling too, but this has never been proven
-	 * to work in practice.
-	 */
-	case TYPE_LONGHAUL_V2:
-	case TYPE_POWERSAVER:
-		if (longhaul_flags & USE_ACPI_C3) {
-			/* Don't allow wakeup */
-			acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
-			do_powersaver(cx->address, mults_index, dir);
-		} else {
-			do_powersaver(0, mults_index, dir);
-		}
-		break;
-	}
-
-	if (longhaul_flags & USE_NORTHBRIDGE) {
-		/* Enable arbiters */
-		outb(0, 0x22);
-	} else if ((pr != NULL) && pr->flags.bm_control) {
-		/* Enable bus master arbitration */
-		acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
-	}
-	outb(pic2_mask, 0xA1);	/* restore mask */
-	outb(pic1_mask, 0x21);
-
-	local_irq_restore(flags);
-	preempt_enable();
-
-	freqs.new = calc_speed(longhaul_get_cpu_mult());
-	/* Check if requested frequency is set. */
-	if (unlikely(freqs.new != speed)) {
-		printk(KERN_INFO PFX "Failed to set requested frequency!\n");
-		/* Revision ID = 1 but processor is expecting revision key
-		 * equal to 0. Jumpers at the bottom of processor will change
-		 * multiplier and FSB, but will not change bits in Longhaul
-		 * MSR nor enable voltage scaling. */
-		if (!revid_errata) {
-			printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
-						"option.\n");
-			revid_errata = 1;
-			msleep(200);
-			goto retry_loop;
-		}
-		/* Why ACPI C3 sometimes doesn't work is a mystery for me.
-		 * But it does happen. Processor is entering ACPI C3 state,
-		 * but it doesn't change frequency. I tried poking various
-		 * bits in northbridge registers, but without success. */
-		if (longhaul_flags & USE_ACPI_C3) {
-			printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
-			longhaul_flags &= ~USE_ACPI_C3;
-			if (revid_errata) {
-				printk(KERN_INFO PFX "Disabling \"Ignore "
-						"Revision ID\" option.\n");
-				revid_errata = 0;
-			}
-			msleep(200);
-			goto retry_loop;
-		}
-		/* This shouldn't happen. Longhaul ver. 2 was reported not
-		 * working on processors without voltage scaling, but with
-		 * RevID = 1. RevID errata will make things right. Just
-		 * to be 100% sure. */
-		if (longhaul_version == TYPE_LONGHAUL_V2) {
-			printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
-			longhaul_version = TYPE_LONGHAUL_V1;
-			msleep(200);
-			goto retry_loop;
-		}
-	}
-	/* Report true CPU frequency */
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
-	if (!bm_timeout)
-		printk(KERN_INFO PFX "Warning: Timeout while waiting for "
-				"idle PCI bus.\n");
-}
-
-/*
- * Centaur decided to make life a little more tricky.
- * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
- * Samuel2 and above have to try and guess what the FSB is.
- * We do this by assuming we booted at maximum multiplier, and interpolate
- * between that value multiplied by possible FSBs and cpu_mhz which
- * was calculated at boot time. Really ugly, but no other way to do this.
- */
-
-#define ROUNDING	0xf
-
-static int guess_fsb(int mult)
-{
-	int speed = cpu_khz / 1000;
-	int i;
-	int speeds[] = { 666, 1000, 1333, 2000 };
-	int f_max, f_min;
-
-	for (i = 0; i < 4; i++) {
-		f_max = ((speeds[i] * mult) + 50) / 100;
-		f_max += (ROUNDING / 2);
-		f_min = f_max - ROUNDING;
-		if ((speed <= f_max) && (speed >= f_min))
-			return speeds[i] / 10;
-	}
-	return 0;
-}
-
-
-static int __cpuinit longhaul_get_ranges(void)
-{
-	unsigned int i, j, k = 0;
-	unsigned int ratio;
-	int mult;
-
-	/* Get current frequency */
-	mult = longhaul_get_cpu_mult();
-	if (mult == -1) {
-		printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
-		return -EINVAL;
-	}
-	fsb = guess_fsb(mult);
-	if (fsb == 0) {
-		printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
-		return -EINVAL;
-	}
-	/* Get max multiplier - as we always did.
-	 * Longhaul MSR is useful only when voltage scaling is enabled.
-	 * C3 is booting at max anyway. */
-	maxmult = mult;
-	/* Get min multiplier */
-	switch (cpu_model) {
-	case CPU_NEHEMIAH:
-		minmult = 50;
-		break;
-	case CPU_NEHEMIAH_C:
-		minmult = 40;
-		break;
-	default:
-		minmult = 30;
-		break;
-	}
-
-	dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
-		 minmult/10, minmult%10, maxmult/10, maxmult%10);
-
-	highest_speed = calc_speed(maxmult);
-	lowest_speed = calc_speed(minmult);
-	dprintk("FSB:%dMHz  Lowest speed: %s   Highest speed:%s\n", fsb,
-		 print_speed(lowest_speed/1000),
-		 print_speed(highest_speed/1000));
-
-	if (lowest_speed == highest_speed) {
-		printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
-		return -EINVAL;
-	}
-	if (lowest_speed > highest_speed) {
-		printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
-			lowest_speed, highest_speed);
-		return -EINVAL;
-	}
-
-	longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
-			GFP_KERNEL);
-	if (!longhaul_table)
-		return -ENOMEM;
-
-	for (j = 0; j < numscales; j++) {
-		ratio = mults[j];
-		if (ratio == -1)
-			continue;
-		if (ratio > maxmult || ratio < minmult)
-			continue;
-		longhaul_table[k].frequency = calc_speed(ratio);
-		longhaul_table[k].index	= j;
-		k++;
-	}
-	if (k <= 1) {
-		kfree(longhaul_table);
-		return -ENODEV;
-	}
-	/* Sort */
-	for (j = 0; j < k - 1; j++) {
-		unsigned int min_f, min_i;
-		min_f = longhaul_table[j].frequency;
-		min_i = j;
-		for (i = j + 1; i < k; i++) {
-			if (longhaul_table[i].frequency < min_f) {
-				min_f = longhaul_table[i].frequency;
-				min_i = i;
-			}
-		}
-		if (min_i != j) {
-			swap(longhaul_table[j].frequency,
-			     longhaul_table[min_i].frequency);
-			swap(longhaul_table[j].index,
-			     longhaul_table[min_i].index);
-		}
-	}
-
-	longhaul_table[k].frequency = CPUFREQ_TABLE_END;
-
-	/* Find index we are running on */
-	for (j = 0; j < k; j++) {
-		if (mults[longhaul_table[j].index & 0x1f] == mult) {
-			longhaul_index = j;
-			break;
-		}
-	}
-	return 0;
-}
-
-
-static void __cpuinit longhaul_setup_voltagescaling(void)
-{
-	union msr_longhaul longhaul;
-	struct mV_pos minvid, maxvid, vid;
-	unsigned int j, speed, pos, kHz_step, numvscales;
-	int min_vid_speed;
-
-	rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
-	if (!(longhaul.bits.RevisionID & 1)) {
-		printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
-		return;
-	}
-
-	if (!longhaul.bits.VRMRev) {
-		printk(KERN_INFO PFX "VRM 8.5\n");
-		vrm_mV_table = &vrm85_mV[0];
-		mV_vrm_table = &mV_vrm85[0];
-	} else {
-		printk(KERN_INFO PFX "Mobile VRM\n");
-		if (cpu_model < CPU_NEHEMIAH)
-			return;
-		vrm_mV_table = &mobilevrm_mV[0];
-		mV_vrm_table = &mV_mobilevrm[0];
-	}
-
-	minvid = vrm_mV_table[longhaul.bits.MinimumVID];
-	maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
-
-	if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
-		printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
-					"Voltage scaling disabled.\n",
-					minvid.mV/1000, minvid.mV%1000,
-					maxvid.mV/1000, maxvid.mV%1000);
-		return;
-	}
-
-	if (minvid.mV == maxvid.mV) {
-		printk(KERN_INFO PFX "Claims to support voltage scaling but "
-				"min & max are both %d.%03d. "
-				"Voltage scaling disabled\n",
-				maxvid.mV/1000, maxvid.mV%1000);
-		return;
-	}
-
-	/* How many voltage steps*/
-	numvscales = maxvid.pos - minvid.pos + 1;
-	printk(KERN_INFO PFX
-		"Max VID=%d.%03d  "
-		"Min VID=%d.%03d, "
-		"%d possible voltage scales\n",
-		maxvid.mV/1000, maxvid.mV%1000,
-		minvid.mV/1000, minvid.mV%1000,
-		numvscales);
-
-	/* Calculate max frequency at min voltage */
-	j = longhaul.bits.MinMHzBR;
-	if (longhaul.bits.MinMHzBR4)
-		j += 16;
-	min_vid_speed = eblcr[j];
-	if (min_vid_speed == -1)
-		return;
-	switch (longhaul.bits.MinMHzFSB) {
-	case 0:
-		min_vid_speed *= 13333;
-		break;
-	case 1:
-		min_vid_speed *= 10000;
-		break;
-	case 3:
-		min_vid_speed *= 6666;
-		break;
-	default:
-		return;
-		break;
-	}
-	if (min_vid_speed >= highest_speed)
-		return;
-	/* Calculate kHz for one voltage step */
-	kHz_step = (highest_speed - min_vid_speed) / numvscales;
-
-	j = 0;
-	while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
-		speed = longhaul_table[j].frequency;
-		if (speed > min_vid_speed)
-			pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
-		else
-			pos = minvid.pos;
-		longhaul_table[j].index |= mV_vrm_table[pos] << 8;
-		vid = vrm_mV_table[mV_vrm_table[pos]];
-		printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
-				speed, j, vid.mV);
-		j++;
-	}
-
-	can_scale_voltage = 1;
-	printk(KERN_INFO PFX "Voltage scaling enabled.\n");
-}
-
-
-static int longhaul_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, longhaul_table);
-}
-
-
-static int longhaul_target(struct cpufreq_policy *policy,
-			    unsigned int target_freq, unsigned int relation)
-{
-	unsigned int table_index = 0;
-	unsigned int i;
-	unsigned int dir = 0;
-	u8 vid, current_vid;
-
-	if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
-				relation, &table_index))
-		return -EINVAL;
-
-	/* Don't set same frequency again */
-	if (longhaul_index == table_index)
-		return 0;
-
-	if (!can_scale_voltage)
-		longhaul_setstate(table_index);
-	else {
-		/* On test system voltage transitions exceeding single
-		 * step up or down were turning motherboard off. Both
-		 * "ondemand" and "userspace" are unsafe. C7 is doing
-		 * this in hardware, C3 is old and we need to do this
-		 * in software. */
-		i = longhaul_index;
-		current_vid = (longhaul_table[longhaul_index].index >> 8);
-		current_vid &= 0x1f;
-		if (table_index > longhaul_index)
-			dir = 1;
-		while (i != table_index) {
-			vid = (longhaul_table[i].index >> 8) & 0x1f;
-			if (vid != current_vid) {
-				longhaul_setstate(i);
-				current_vid = vid;
-				msleep(200);
-			}
-			if (dir)
-				i++;
-			else
-				i--;
-		}
-		longhaul_setstate(table_index);
-	}
-	longhaul_index = table_index;
-	return 0;
-}
-
-
-static unsigned int longhaul_get(unsigned int cpu)
-{
-	if (cpu)
-		return 0;
-	return calc_speed(longhaul_get_cpu_mult());
-}
-
-static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
-					  u32 nesting_level,
-					  void *context, void **return_value)
-{
-	struct acpi_device *d;
-
-	if (acpi_bus_get_device(obj_handle, &d))
-		return 0;
-
-	*return_value = acpi_driver_data(d);
-	return 1;
-}
-
-/* VIA don't support PM2 reg, but have something similar */
-static int enable_arbiter_disable(void)
-{
-	struct pci_dev *dev;
-	int status = 1;
-	int reg;
-	u8 pci_cmd;
-
-	/* Find PLE133 host bridge */
-	reg = 0x78;
-	dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
-			     NULL);
-	/* Find PM133/VT8605 host bridge */
-	if (dev == NULL)
-		dev = pci_get_device(PCI_VENDOR_ID_VIA,
-				     PCI_DEVICE_ID_VIA_8605_0, NULL);
-	/* Find CLE266 host bridge */
-	if (dev == NULL) {
-		reg = 0x76;
-		dev = pci_get_device(PCI_VENDOR_ID_VIA,
-				     PCI_DEVICE_ID_VIA_862X_0, NULL);
-		/* Find CN400 V-Link host bridge */
-		if (dev == NULL)
-			dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
-	}
-	if (dev != NULL) {
-		/* Enable access to port 0x22 */
-		pci_read_config_byte(dev, reg, &pci_cmd);
-		if (!(pci_cmd & 1<<7)) {
-			pci_cmd |= 1<<7;
-			pci_write_config_byte(dev, reg, pci_cmd);
-			pci_read_config_byte(dev, reg, &pci_cmd);
-			if (!(pci_cmd & 1<<7)) {
-				printk(KERN_ERR PFX
-					"Can't enable access to port 0x22.\n");
-				status = 0;
-			}
-		}
-		pci_dev_put(dev);
-		return status;
-	}
-	return 0;
-}
-
-static int longhaul_setup_southbridge(void)
-{
-	struct pci_dev *dev;
-	u8 pci_cmd;
-
-	/* Find VT8235 southbridge */
-	dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
-	if (dev == NULL)
-		/* Find VT8237 southbridge */
-		dev = pci_get_device(PCI_VENDOR_ID_VIA,
-				     PCI_DEVICE_ID_VIA_8237, NULL);
-	if (dev != NULL) {
-		/* Set transition time to max */
-		pci_read_config_byte(dev, 0xec, &pci_cmd);
-		pci_cmd &= ~(1 << 2);
-		pci_write_config_byte(dev, 0xec, pci_cmd);
-		pci_read_config_byte(dev, 0xe4, &pci_cmd);
-		pci_cmd &= ~(1 << 7);
-		pci_write_config_byte(dev, 0xe4, pci_cmd);
-		pci_read_config_byte(dev, 0xe5, &pci_cmd);
-		pci_cmd |= 1 << 7;
-		pci_write_config_byte(dev, 0xe5, pci_cmd);
-		/* Get address of ACPI registers block*/
-		pci_read_config_byte(dev, 0x81, &pci_cmd);
-		if (pci_cmd & 1 << 7) {
-			pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
-			acpi_regs_addr &= 0xff00;
-			printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
-					acpi_regs_addr);
-		}
-
-		pci_dev_put(dev);
-		return 1;
-	}
-	return 0;
-}
-
-static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	char *cpuname = NULL;
-	int ret;
-	u32 lo, hi;
-
-	/* Check what we have on this motherboard */
-	switch (c->x86_model) {
-	case 6:
-		cpu_model = CPU_SAMUEL;
-		cpuname = "C3 'Samuel' [C5A]";
-		longhaul_version = TYPE_LONGHAUL_V1;
-		memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
-		memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
-		break;
-
-	case 7:
-		switch (c->x86_mask) {
-		case 0:
-			longhaul_version = TYPE_LONGHAUL_V1;
-			cpu_model = CPU_SAMUEL2;
-			cpuname = "C3 'Samuel 2' [C5B]";
-			/* Note, this is not a typo, early Samuel2's had
-			 * Samuel1 ratios. */
-			memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
-			memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
-			break;
-		case 1 ... 15:
-			longhaul_version = TYPE_LONGHAUL_V2;
-			if (c->x86_mask < 8) {
-				cpu_model = CPU_SAMUEL2;
-				cpuname = "C3 'Samuel 2' [C5B]";
-			} else {
-				cpu_model = CPU_EZRA;
-				cpuname = "C3 'Ezra' [C5C]";
-			}
-			memcpy(mults, ezra_mults, sizeof(ezra_mults));
-			memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
-			break;
-		}
-		break;
-
-	case 8:
-		cpu_model = CPU_EZRA_T;
-		cpuname = "C3 'Ezra-T' [C5M]";
-		longhaul_version = TYPE_POWERSAVER;
-		numscales = 32;
-		memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
-		memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
-		break;
-
-	case 9:
-		longhaul_version = TYPE_POWERSAVER;
-		numscales = 32;
-		memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
-		memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
-		switch (c->x86_mask) {
-		case 0 ... 1:
-			cpu_model = CPU_NEHEMIAH;
-			cpuname = "C3 'Nehemiah A' [C5XLOE]";
-			break;
-		case 2 ... 4:
-			cpu_model = CPU_NEHEMIAH;
-			cpuname = "C3 'Nehemiah B' [C5XLOH]";
-			break;
-		case 5 ... 15:
-			cpu_model = CPU_NEHEMIAH_C;
-			cpuname = "C3 'Nehemiah C' [C5P]";
-			break;
-		}
-		break;
-
-	default:
-		cpuname = "Unknown";
-		break;
-	}
-	/* Check Longhaul ver. 2 */
-	if (longhaul_version == TYPE_LONGHAUL_V2) {
-		rdmsr(MSR_VIA_LONGHAUL, lo, hi);
-		if (lo == 0 && hi == 0)
-			/* Looks like MSR isn't present */
-			longhaul_version = TYPE_LONGHAUL_V1;
-	}
-
-	printk(KERN_INFO PFX "VIA %s CPU detected.  ", cpuname);
-	switch (longhaul_version) {
-	case TYPE_LONGHAUL_V1:
-	case TYPE_LONGHAUL_V2:
-		printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
-		break;
-	case TYPE_POWERSAVER:
-		printk(KERN_CONT "Powersaver supported.\n");
-		break;
-	};
-
-	/* Doesn't hurt */
-	longhaul_setup_southbridge();
-
-	/* Find ACPI data for processor */
-	acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
-				ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
-				NULL, (void *)&pr);
-
-	/* Check ACPI support for C3 state */
-	if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
-		cx = &pr->power.states[ACPI_STATE_C3];
-		if (cx->address > 0 && cx->latency <= 1000)
-			longhaul_flags |= USE_ACPI_C3;
-	}
-	/* Disable if it isn't working */
-	if (disable_acpi_c3)
-		longhaul_flags &= ~USE_ACPI_C3;
-	/* Check if northbridge is friendly */
-	if (enable_arbiter_disable())
-		longhaul_flags |= USE_NORTHBRIDGE;
-
-	/* Check ACPI support for bus master arbiter disable */
-	if (!(longhaul_flags & USE_ACPI_C3
-	     || longhaul_flags & USE_NORTHBRIDGE)
-	    && ((pr == NULL) || !(pr->flags.bm_control))) {
-		printk(KERN_ERR PFX
-			"No ACPI support. Unsupported northbridge.\n");
-		return -ENODEV;
-	}
-
-	if (longhaul_flags & USE_NORTHBRIDGE)
-		printk(KERN_INFO PFX "Using northbridge support.\n");
-	if (longhaul_flags & USE_ACPI_C3)
-		printk(KERN_INFO PFX "Using ACPI support.\n");
-
-	ret = longhaul_get_ranges();
-	if (ret != 0)
-		return ret;
-
-	if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
-		longhaul_setup_voltagescaling();
-
-	policy->cpuinfo.transition_latency = 200000;	/* nsec */
-	policy->cur = calc_speed(longhaul_get_cpu_mult());
-
-	ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
-	if (ret)
-		return ret;
-
-	cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
-
-	return 0;
-}
-
-static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-static struct freq_attr *longhaul_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver longhaul_driver = {
-	.verify	= longhaul_verify,
-	.target	= longhaul_target,
-	.get	= longhaul_get,
-	.init	= longhaul_cpu_init,
-	.exit	= __devexit_p(longhaul_cpu_exit),
-	.name	= "longhaul",
-	.owner	= THIS_MODULE,
-	.attr	= longhaul_attr,
-};
-
-
-static int __init longhaul_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
-		return -ENODEV;
-
-#ifdef CONFIG_SMP
-	if (num_online_cpus() > 1) {
-		printk(KERN_ERR PFX "More than 1 CPU detected, "
-				"longhaul disabled.\n");
-		return -ENODEV;
-	}
-#endif
-#ifdef CONFIG_X86_IO_APIC
-	if (cpu_has_apic) {
-		printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
-				"broken in this configuration.\n");
-		return -ENODEV;
-	}
-#endif
-	switch (c->x86_model) {
-	case 6 ... 9:
-		return cpufreq_register_driver(&longhaul_driver);
-	case 10:
-		printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
-	default:
-		;
-	}
-
-	return -ENODEV;
-}
-
-
-static void __exit longhaul_exit(void)
-{
-	int i;
-
-	for (i = 0; i < numscales; i++) {
-		if (mults[i] == maxmult) {
-			longhaul_setstate(i);
-			break;
-		}
-	}
-
-	cpufreq_unregister_driver(&longhaul_driver);
-	kfree(longhaul_table);
-}
-
-/* Even if BIOS is exporting ACPI C3 state, and it is used
- * with success when CPU is idle, this state doesn't
- * trigger frequency transition in some cases. */
-module_param(disable_acpi_c3, int, 0644);
-MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
-/* Change CPU voltage with frequency. Very useful to save
- * power, but most VIA C3 processors aren't supporting it. */
-module_param(scale_voltage, int, 0644);
-MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
-/* Force revision key to 0 for processors which doesn't
- * support voltage scaling, but are introducing itself as
- * such. */
-module_param(revid_errata, int, 0644);
-MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(longhaul_init);
-module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca88..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- *  longhaul.h
- *  (C) 2003 Dave Jones.
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  VIA-specific information
- */
-
-union msr_bcr2 {
-	struct {
-		unsigned Reseved:19,	// 18:0
-		ESOFTBF:1,		// 19
-		Reserved2:3,		// 22:20
-		CLOCKMUL:4,		// 26:23
-		Reserved3:5;		// 31:27
-	} bits;
-	unsigned long val;
-};
-
-union msr_longhaul {
-	struct {
-		unsigned RevisionID:4,	// 3:0
-		RevisionKey:4,		// 7:4
-		EnableSoftBusRatio:1,	// 8
-		EnableSoftVID:1,	// 9
-		EnableSoftBSEL:1,	// 10
-		Reserved:3,		// 11:13
-		SoftBusRatio4:1,	// 14
-		VRMRev:1,		// 15
-		SoftBusRatio:4,		// 19:16
-		SoftVID:5,		// 24:20
-		Reserved2:3,		// 27:25
-		SoftBSEL:2,		// 29:28
-		Reserved3:2,		// 31:30
-		MaxMHzBR:4,		// 35:32
-		MaximumVID:5,		// 40:36
-		MaxMHzFSB:2,		// 42:41
-		MaxMHzBR4:1,		// 43
-		Reserved4:4,		// 47:44
-		MinMHzBR:4,		// 51:48
-		MinimumVID:5,		// 56:52
-		MinMHzFSB:2,		// 58:57
-		MinMHzBR4:1,		// 59
-		Reserved5:4;		// 63:60
-	} bits;
-	unsigned long long val;
-};
-
-/*
- * Clock ratio tables. Div/Mod by 10 to get ratio.
- * The eblcr values specify the ratio read from the CPU.
- * The mults values specify what to write to the CPU.
- */
-
-/*
- * VIA C3 Samuel 1  & Samuel 2 (stepping 0)
- */
-static const int __cpuinitdata samuel1_mults[16] = {
-	-1, /* 0000 -> RESERVED */
-	30, /* 0001 ->  3.0x */
-	40, /* 0010 ->  4.0x */
-	-1, /* 0011 -> RESERVED */
-	-1, /* 0100 -> RESERVED */
-	35, /* 0101 ->  3.5x */
-	45, /* 0110 ->  4.5x */
-	55, /* 0111 ->  5.5x */
-	60, /* 1000 ->  6.0x */
-	70, /* 1001 ->  7.0x */
-	80, /* 1010 ->  8.0x */
-	50, /* 1011 ->  5.0x */
-	65, /* 1100 ->  6.5x */
-	75, /* 1101 ->  7.5x */
-	-1, /* 1110 -> RESERVED */
-	-1, /* 1111 -> RESERVED */
-};
-
-static const int __cpuinitdata samuel1_eblcr[16] = {
-	50, /* 0000 -> RESERVED */
-	30, /* 0001 ->  3.0x */
-	40, /* 0010 ->  4.0x */
-	-1, /* 0011 -> RESERVED */
-	55, /* 0100 ->  5.5x */
-	35, /* 0101 ->  3.5x */
-	45, /* 0110 ->  4.5x */
-	-1, /* 0111 -> RESERVED */
-	-1, /* 1000 -> RESERVED */
-	70, /* 1001 ->  7.0x */
-	80, /* 1010 ->  8.0x */
-	60, /* 1011 ->  6.0x */
-	-1, /* 1100 -> RESERVED */
-	75, /* 1101 ->  7.5x */
-	-1, /* 1110 -> RESERVED */
-	65, /* 1111 ->  6.5x */
-};
-
-/*
- * VIA C3 Samuel2 Stepping 1->15
- */
-static const int __cpuinitdata samuel2_eblcr[16] = {
-	50,  /* 0000 ->  5.0x */
-	30,  /* 0001 ->  3.0x */
-	40,  /* 0010 ->  4.0x */
-	100, /* 0011 -> 10.0x */
-	55,  /* 0100 ->  5.5x */
-	35,  /* 0101 ->  3.5x */
-	45,  /* 0110 ->  4.5x */
-	110, /* 0111 -> 11.0x */
-	90,  /* 1000 ->  9.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	60,  /* 1011 ->  6.0x */
-	120, /* 1100 -> 12.0x */
-	75,  /* 1101 ->  7.5x */
-	130, /* 1110 -> 13.0x */
-	65,  /* 1111 ->  6.5x */
-};
-
-/*
- * VIA C3 Ezra
- */
-static const int __cpuinitdata ezra_mults[16] = {
-	100, /* 0000 -> 10.0x */
-	30,  /* 0001 ->  3.0x */
-	40,  /* 0010 ->  4.0x */
-	90,  /* 0011 ->  9.0x */
-	95,  /* 0100 ->  9.5x */
-	35,  /* 0101 ->  3.5x */
-	45,  /* 0110 ->  4.5x */
-	55,  /* 0111 ->  5.5x */
-	60,  /* 1000 ->  6.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	50,  /* 1011 ->  5.0x */
-	65,  /* 1100 ->  6.5x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	120, /* 1111 -> 12.0x */
-};
-
-static const int __cpuinitdata ezra_eblcr[16] = {
-	50,  /* 0000 ->  5.0x */
-	30,  /* 0001 ->  3.0x */
-	40,  /* 0010 ->  4.0x */
-	100, /* 0011 -> 10.0x */
-	55,  /* 0100 ->  5.5x */
-	35,  /* 0101 ->  3.5x */
-	45,  /* 0110 ->  4.5x */
-	95,  /* 0111 ->  9.5x */
-	90,  /* 1000 ->  9.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	60,  /* 1011 ->  6.0x */
-	120, /* 1100 -> 12.0x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	65,  /* 1111 ->  6.5x */
-};
-
-/*
- * VIA C3 (Ezra-T) [C5M].
- */
-static const int __cpuinitdata ezrat_mults[32] = {
-	100, /* 0000 -> 10.0x */
-	30,  /* 0001 ->  3.0x */
-	40,  /* 0010 ->  4.0x */
-	90,  /* 0011 ->  9.0x */
-	95,  /* 0100 ->  9.5x */
-	35,  /* 0101 ->  3.5x */
-	45,  /* 0110 ->  4.5x */
-	55,  /* 0111 ->  5.5x */
-	60,  /* 1000 ->  6.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	50,  /* 1011 ->  5.0x */
-	65,  /* 1100 ->  6.5x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	120, /* 1111 ->  12.0x */
-
-	-1,  /* 0000 -> RESERVED (10.0x) */
-	110, /* 0001 -> 11.0x */
-	-1, /* 0010 -> 12.0x */
-	-1,  /* 0011 -> RESERVED (9.0x)*/
-	105, /* 0100 -> 10.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	135, /* 0111 -> 13.5x */
-	140, /* 1000 -> 14.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	130, /* 1011 -> 13.0x */
-	145, /* 1100 -> 14.5x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	-1,  /* 1111 -> RESERVED (12.0x) */
-};
-
-static const int __cpuinitdata ezrat_eblcr[32] = {
-	50,  /* 0000 ->  5.0x */
-	30,  /* 0001 ->  3.0x */
-	40,  /* 0010 ->  4.0x */
-	100, /* 0011 -> 10.0x */
-	55,  /* 0100 ->  5.5x */
-	35,  /* 0101 ->  3.5x */
-	45,  /* 0110 ->  4.5x */
-	95,  /* 0111 ->  9.5x */
-	90,  /* 1000 ->  9.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	60,  /* 1011 ->  6.0x */
-	120, /* 1100 -> 12.0x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	65,  /* 1111 ->  6.5x */
-
-	-1,  /* 0000 -> RESERVED (9.0x) */
-	110, /* 0001 -> 11.0x */
-	120, /* 0010 -> 12.0x */
-	-1,  /* 0011 -> RESERVED (10.0x)*/
-	135, /* 0100 -> 13.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	105, /* 0111 -> 10.5x */
-	130, /* 1000 -> 13.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	140, /* 1011 -> 14.0x */
-	-1,  /* 1100 -> RESERVED (12.0x) */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	145, /* 1111 -> 14.5x */
-};
-
-/*
- * VIA C3 Nehemiah */
-
-static const int __cpuinitdata nehemiah_mults[32] = {
-	100, /* 0000 -> 10.0x */
-	-1, /* 0001 -> 16.0x */
-	40,  /* 0010 ->  4.0x */
-	90,  /* 0011 ->  9.0x */
-	95,  /* 0100 ->  9.5x */
-	-1,  /* 0101 ->  RESERVED */
-	45,  /* 0110 ->  4.5x */
-	55,  /* 0111 ->  5.5x */
-	60,  /* 1000 ->  6.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	50,  /* 1011 ->  5.0x */
-	65,  /* 1100 ->  6.5x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	120, /* 1111 -> 12.0x */
-	-1, /* 0000 -> 10.0x */
-	110, /* 0001 -> 11.0x */
-	-1, /* 0010 -> 12.0x */
-	-1,  /* 0011 ->  9.0x */
-	105, /* 0100 -> 10.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	135, /* 0111 -> 13.5x */
-	140, /* 1000 -> 14.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	130, /* 1011 -> 13.0x */
-	145, /* 1100 -> 14.5x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	-1, /* 1111 -> 12.0x */
-};
-
-static const int __cpuinitdata nehemiah_eblcr[32] = {
-	50,  /* 0000 ->  5.0x */
-	160, /* 0001 -> 16.0x */
-	40,  /* 0010 ->  4.0x */
-	100, /* 0011 -> 10.0x */
-	55,  /* 0100 ->  5.5x */
-	-1,  /* 0101 ->  RESERVED */
-	45,  /* 0110 ->  4.5x */
-	95,  /* 0111 ->  9.5x */
-	90,  /* 1000 ->  9.0x */
-	70,  /* 1001 ->  7.0x */
-	80,  /* 1010 ->  8.0x */
-	60,  /* 1011 ->  6.0x */
-	120, /* 1100 -> 12.0x */
-	75,  /* 1101 ->  7.5x */
-	85,  /* 1110 ->  8.5x */
-	65,  /* 1111 ->  6.5x */
-	90,  /* 0000 ->  9.0x */
-	110, /* 0001 -> 11.0x */
-	120, /* 0010 -> 12.0x */
-	100, /* 0011 -> 10.0x */
-	135, /* 0100 -> 13.5x */
-	115, /* 0101 -> 11.5x */
-	125, /* 0110 -> 12.5x */
-	105, /* 0111 -> 10.5x */
-	130, /* 1000 -> 13.0x */
-	150, /* 1001 -> 15.0x */
-	160, /* 1010 -> 16.0x */
-	140, /* 1011 -> 14.0x */
-	120, /* 1100 -> 12.0x */
-	155, /* 1101 -> 15.5x */
-	-1,  /* 1110 -> RESERVED (13.0x) */
-	145 /* 1111 -> 14.5x */
-};
-
-/*
- * Voltage scales. Div/Mod by 1000 to get actual voltage.
- * Which scale to use depends on the VRM type in use.
- */
-
-struct mV_pos {
-	unsigned short mV;
-	unsigned short pos;
-};
-
-static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
-	{1250, 8},	{1200, 6},	{1150, 4},	{1100, 2},
-	{1050, 0},	{1800, 30},	{1750, 28},	{1700, 26},
-	{1650, 24},	{1600, 22},	{1550, 20},	{1500, 18},
-	{1450, 16},	{1400, 14},	{1350, 12},	{1300, 10},
-	{1275, 9},	{1225, 7},	{1175, 5},	{1125, 3},
-	{1075, 1},	{1825, 31},	{1775, 29},	{1725, 27},
-	{1675, 25},	{1625, 23},	{1575, 21},	{1525, 19},
-	{1475, 17},	{1425, 15},	{1375, 13},	{1325, 11}
-};
-
-static const unsigned char __cpuinitdata mV_vrm85[32] = {
-	0x04,	0x14,	0x03,	0x13,	0x02,	0x12,	0x01,	0x11,
-	0x00,	0x10,	0x0f,	0x1f,	0x0e,	0x1e,	0x0d,	0x1d,
-	0x0c,	0x1c,	0x0b,	0x1b,	0x0a,	0x1a,	0x09,	0x19,
-	0x08,	0x18,	0x07,	0x17,	0x06,	0x16,	0x05,	0x15
-};
-
-static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
-	{1750, 31},	{1700, 30},	{1650, 29},	{1600, 28},
-	{1550, 27},	{1500, 26},	{1450, 25},	{1400, 24},
-	{1350, 23},	{1300, 22},	{1250, 21},	{1200, 20},
-	{1150, 19},	{1100, 18},	{1050, 17},	{1000, 16},
-	{975, 15},	{950, 14},	{925, 13},	{900, 12},
-	{875, 11},	{850, 10},	{825, 9},	{800, 8},
-	{775, 7},	{750, 6},	{725, 5},	{700, 4},
-	{675, 3},	{650, 2},	{625, 1},	{600, 0}
-};
-
-static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
-	0x1f,	0x1e,	0x1d,	0x1c,	0x1b,	0x1a,	0x19,	0x18,
-	0x17,	0x16,	0x15,	0x14,	0x13,	0x12,	0x11,	0x10,
-	0x0f,	0x0e,	0x0d,	0x0c,	0x0b,	0x0a,	0x09,	0x08,
-	0x07,	0x06,	0x05,	0x04,	0x03,	0x02,	0x01,	0x00
-};
-
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index d9f51367666..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * (C) 2002 - 2003  Dominik Brodowski <linux@brodo.de>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"longrun", msg)
-
-static struct cpufreq_driver	longrun_driver;
-
-/**
- * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
- * values into per cent values. In TMTA microcode, the following is valid:
- * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
- */
-static unsigned int longrun_low_freq, longrun_high_freq;
-
-
-/**
- * longrun_get_policy - get the current LongRun policy
- * @policy: struct cpufreq_policy where current policy is written into
- *
- * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
- * and MSR_TMTA_LONGRUN_CTRL
- */
-static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
-{
-	u32 msr_lo, msr_hi;
-
-	rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-	dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
-	if (msr_lo & 0x01)
-		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
-	else
-		policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
-	rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-	dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
-	msr_lo &= 0x0000007F;
-	msr_hi &= 0x0000007F;
-
-	if (longrun_high_freq <= longrun_low_freq) {
-		/* Assume degenerate Longrun table */
-		policy->min = policy->max = longrun_high_freq;
-	} else {
-		policy->min = longrun_low_freq + msr_lo *
-			((longrun_high_freq - longrun_low_freq) / 100);
-		policy->max = longrun_low_freq + msr_hi *
-			((longrun_high_freq - longrun_low_freq) / 100);
-	}
-	policy->cpu = 0;
-}
-
-
-/**
- * longrun_set_policy - sets a new CPUFreq policy
- * @policy: new policy
- *
- * Sets a new CPUFreq policy on LongRun-capable processors. This function
- * has to be called with cpufreq_driver locked.
- */
-static int longrun_set_policy(struct cpufreq_policy *policy)
-{
-	u32 msr_lo, msr_hi;
-	u32 pctg_lo, pctg_hi;
-
-	if (!policy)
-		return -EINVAL;
-
-	if (longrun_high_freq <= longrun_low_freq) {
-		/* Assume degenerate Longrun table */
-		pctg_lo = pctg_hi = 100;
-	} else {
-		pctg_lo = (policy->min - longrun_low_freq) /
-			((longrun_high_freq - longrun_low_freq) / 100);
-		pctg_hi = (policy->max - longrun_low_freq) /
-			((longrun_high_freq - longrun_low_freq) / 100);
-	}
-
-	if (pctg_hi > 100)
-		pctg_hi = 100;
-	if (pctg_lo > pctg_hi)
-		pctg_lo = pctg_hi;
-
-	/* performance or economy mode */
-	rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-	msr_lo &= 0xFFFFFFFE;
-	switch (policy->policy) {
-	case CPUFREQ_POLICY_PERFORMANCE:
-		msr_lo |= 0x00000001;
-		break;
-	case CPUFREQ_POLICY_POWERSAVE:
-		break;
-	}
-	wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
-
-	/* lower and upper boundary */
-	rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-	msr_lo &= 0xFFFFFF80;
-	msr_hi &= 0xFFFFFF80;
-	msr_lo |= pctg_lo;
-	msr_hi |= pctg_hi;
-	wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
-	return 0;
-}
-
-
-/**
- * longrun_verify_poliy - verifies a new CPUFreq policy
- * @policy: the policy to verify
- *
- * Validates a new CPUFreq policy. This function has to be called with
- * cpufreq_driver locked.
- */
-static int longrun_verify_policy(struct cpufreq_policy *policy)
-{
-	if (!policy)
-		return -EINVAL;
-
-	policy->cpu = 0;
-	cpufreq_verify_within_limits(policy,
-		policy->cpuinfo.min_freq,
-		policy->cpuinfo.max_freq);
-
-	if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
-	    (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
-		return -EINVAL;
-
-	return 0;
-}
-
-static unsigned int longrun_get(unsigned int cpu)
-{
-	u32 eax, ebx, ecx, edx;
-
-	if (cpu)
-		return 0;
-
-	cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-	dprintk("cpuid eax is %u\n", eax);
-
-	return eax * 1000;
-}
-
-/**
- * longrun_determine_freqs - determines the lowest and highest possible core frequency
- * @low_freq: an int to put the lowest frequency into
- * @high_freq: an int to put the highest frequency into
- *
- * Determines the lowest and highest possible core frequencies on this CPU.
- * This is necessary to calculate the performance percentage according to
- * TMTA rules:
- * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
- */
-static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
-						      unsigned int *high_freq)
-{
-	u32 msr_lo, msr_hi;
-	u32 save_lo, save_hi;
-	u32 eax, ebx, ecx, edx;
-	u32 try_hi;
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	if (!low_freq || !high_freq)
-		return -EINVAL;
-
-	if (cpu_has(c, X86_FEATURE_LRTI)) {
-		/* if the LongRun Table Interface is present, the
-		 * detection is a bit easier:
-		 * For minimum frequency, read out the maximum
-		 * level (msr_hi), write that into "currently
-		 * selected level", and read out the frequency.
-		 * For maximum frequency, read out level zero.
-		 */
-		/* minimum */
-		rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
-		wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
-		rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
-		*low_freq = msr_lo * 1000; /* to kHz */
-
-		/* maximum */
-		wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
-		rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
-		*high_freq = msr_lo * 1000; /* to kHz */
-
-		dprintk("longrun table interface told %u - %u kHz\n",
-				*low_freq, *high_freq);
-
-		if (*low_freq > *high_freq)
-			*low_freq = *high_freq;
-		return 0;
-	}
-
-	/* set the upper border to the value determined during TSC init */
-	*high_freq = (cpu_khz / 1000);
-	*high_freq = *high_freq * 1000;
-	dprintk("high frequency is %u kHz\n", *high_freq);
-
-	/* get current borders */
-	rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-	save_lo = msr_lo & 0x0000007F;
-	save_hi = msr_hi & 0x0000007F;
-
-	/* if current perf_pctg is larger than 90%, we need to decrease the
-	 * upper limit to make the calculation more accurate.
-	 */
-	cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-	/* try decreasing in 10% steps, some processors react only
-	 * on some barrier values */
-	for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
-		/* set to 0 to try_hi perf_pctg */
-		msr_lo &= 0xFFFFFF80;
-		msr_hi &= 0xFFFFFF80;
-		msr_hi |= try_hi;
-		wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
-
-		/* read out current core MHz and current perf_pctg */
-		cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
-
-		/* restore values */
-		wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
-	}
-	dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
-
-	/* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
-	 * eqals
-	 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
-	 *
-	 * high_freq * perf_pctg is stored tempoarily into "ebx".
-	 */
-	ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
-
-	if ((ecx > 95) || (ecx == 0) || (eax < ebx))
-		return -EIO;
-
-	edx = ((eax - ebx) * 100) / (100 - ecx);
-	*low_freq = edx * 1000; /* back to kHz */
-
-	dprintk("low frequency is %u kHz\n", *low_freq);
-
-	if (*low_freq > *high_freq)
-		*low_freq = *high_freq;
-
-	return 0;
-}
-
-
-static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
-{
-	int result = 0;
-
-	/* capability check */
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	/* detect low and high frequency */
-	result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
-	if (result)
-		return result;
-
-	/* cpuinfo and default policy values */
-	policy->cpuinfo.min_freq = longrun_low_freq;
-	policy->cpuinfo.max_freq = longrun_high_freq;
-	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-	longrun_get_policy(policy);
-
-	return 0;
-}
-
-
-static struct cpufreq_driver longrun_driver = {
-	.flags		= CPUFREQ_CONST_LOOPS,
-	.verify		= longrun_verify_policy,
-	.setpolicy	= longrun_set_policy,
-	.get		= longrun_get,
-	.init		= longrun_cpu_init,
-	.name		= "longrun",
-	.owner		= THIS_MODULE,
-};
-
-
-/**
- * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
- *
- * Initializes the LongRun support.
- */
-static int __init longrun_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
-	    !cpu_has(c, X86_FEATURE_LONGRUN))
-		return -ENODEV;
-
-	return cpufreq_register_driver(&longrun_driver);
-}
-
-
-/**
- * longrun_exit - unregisters LongRun support
- */
-static void __exit longrun_exit(void)
-{
-	cpufreq_unregister_driver(&longrun_driver);
-}
-
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
-		"Efficeon processors.");
-MODULE_LICENSE("GPL");
-
-module_init(longrun_init);
-module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018a..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-
-#include "mperf.h"
-
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
-	struct aperfmperf *am = _cur;
-
-	get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
-					unsigned int cpu)
-{
-	struct aperfmperf perf;
-	unsigned long ratio;
-	unsigned int retval;
-
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
-		return 0;
-
-	ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
-	per_cpu(acfreq_old_perf, cpu) = perf;
-
-	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
-	return retval;
-}
-EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc2..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
-/*
- *  (c) 2010 Advanced Micro Devices, Inc.
- *  Your use of this code is subject to the terms and conditions of the
- *  GNU general public license version 2. See "COPYING" or
- *  http://www.gnu.org/licenses/gpl.html
- */
-
-unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
-					unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 52c93648e49..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- *	Pentium 4/Xeon CPU on demand clock modulation/speed scaling
- *	(C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *	(C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
- *	(C) 2002 Arjan van de Ven <arjanv@redhat.com>
- *	(C) 2002 Tora T. Engstad
- *	All Rights Reserved
- *
- *	This program is free software; you can redistribute it and/or
- *      modify it under the terms of the GNU General Public License
- *      as published by the Free Software Foundation; either version
- *      2 of the License, or (at your option) any later version.
- *
- *      The author(s) of this software shall not be held liable for damages
- *      of any nature resulting due to the use of this software. This
- *      software is provided AS-IS with no warranties.
- *
- *	Date		Errata			Description
- *	20020525	N44, O17	12.5% or 25% DC causes lockup
- *
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/cpufreq.h>
-#include <linux/cpumask.h>
-#include <linux/timex.h>
-
-#include <asm/processor.h>
-#include <asm/msr.h>
-#include <asm/timer.h>
-
-#include "speedstep-lib.h"
-
-#define PFX	"p4-clockmod: "
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"p4-clockmod", msg)
-
-/*
- * Duty Cycle (3bits), note DC_DISABLE is not specified in
- * intel docs i just use it to mean disable
- */
-enum {
-	DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
-	DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
-};
-
-#define DC_ENTRIES	8
-
-
-static int has_N44_O17_errata[NR_CPUS];
-static unsigned int stock_freq;
-static struct cpufreq_driver p4clockmod_driver;
-static unsigned int cpufreq_p4_get(unsigned int cpu);
-
-static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
-{
-	u32 l, h;
-
-	if (!cpu_online(cpu) ||
-	    (newstate > DC_DISABLE) || (newstate == DC_RESV))
-		return -EINVAL;
-
-	rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
-
-	if (l & 0x01)
-		dprintk("CPU#%d currently thermal throttled\n", cpu);
-
-	if (has_N44_O17_errata[cpu] &&
-	    (newstate == DC_25PT || newstate == DC_DFLT))
-		newstate = DC_38PT;
-
-	rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
-	if (newstate == DC_DISABLE) {
-		dprintk("CPU#%d disabling modulation\n", cpu);
-		wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
-	} else {
-		dprintk("CPU#%d setting duty cycle to %d%%\n",
-			cpu, ((125 * newstate) / 10));
-		/* bits 63 - 5	: reserved
-		 * bit  4	: enable/disable
-		 * bits 3-1	: duty cycle
-		 * bit  0	: reserved
-		 */
-		l = (l & ~14);
-		l = l | (1<<4) | ((newstate & 0x7)<<1);
-		wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
-	}
-
-	return 0;
-}
-
-
-static struct cpufreq_frequency_table p4clockmod_table[] = {
-	{DC_RESV, CPUFREQ_ENTRY_INVALID},
-	{DC_DFLT, 0},
-	{DC_25PT, 0},
-	{DC_38PT, 0},
-	{DC_50PT, 0},
-	{DC_64PT, 0},
-	{DC_75PT, 0},
-	{DC_88PT, 0},
-	{DC_DISABLE, 0},
-	{DC_RESV, CPUFREQ_TABLE_END},
-};
-
-
-static int cpufreq_p4_target(struct cpufreq_policy *policy,
-			     unsigned int target_freq,
-			     unsigned int relation)
-{
-	unsigned int    newstate = DC_RESV;
-	struct cpufreq_freqs freqs;
-	int i;
-
-	if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
-				target_freq, relation, &newstate))
-		return -EINVAL;
-
-	freqs.old = cpufreq_p4_get(policy->cpu);
-	freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
-
-	if (freqs.new == freqs.old)
-		return 0;
-
-	/* notifiers */
-	for_each_cpu(i, policy->cpus) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	}
-
-	/* run on each logical CPU,
-	 * see section 13.15.3 of IA32 Intel Architecture Software
-	 * Developer's Manual, Volume 3
-	 */
-	for_each_cpu(i, policy->cpus)
-		cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
-
-	/* notifiers */
-	for_each_cpu(i, policy->cpus) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	}
-
-	return 0;
-}
-
-
-static int cpufreq_p4_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
-}
-
-
-static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
-{
-	if (c->x86 == 0x06) {
-		if (cpu_has(c, X86_FEATURE_EST))
-			printk_once(KERN_WARNING PFX "Warning: EST-capable "
-			       "CPU detected. The acpi-cpufreq module offers "
-			       "voltage scaling in addition to frequency "
-			       "scaling. You should use that instead of "
-			       "p4-clockmod, if possible.\n");
-		switch (c->x86_model) {
-		case 0x0E: /* Core */
-		case 0x0F: /* Core Duo */
-		case 0x16: /* Celeron Core */
-		case 0x1C: /* Atom */
-			p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-			return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
-		case 0x0D: /* Pentium M (Dothan) */
-			p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-			/* fall through */
-		case 0x09: /* Pentium M (Banias) */
-			return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
-		}
-	}
-
-	if (c->x86 != 0xF)
-		return 0;
-
-	/* on P-4s, the TSC runs with constant frequency independent whether
-	 * throttling is active or not. */
-	p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
-
-	if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
-		printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
-		       "The speedstep-ich or acpi cpufreq modules offer "
-		       "voltage scaling in addition of frequency scaling. "
-		       "You should use either one instead of p4-clockmod, "
-		       "if possible.\n");
-		return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
-	}
-
-	return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
-}
-
-
-
-static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
-{
-	struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
-	int cpuid = 0;
-	unsigned int i;
-
-#ifdef CONFIG_SMP
-	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
-
-	/* Errata workaround */
-	cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
-	switch (cpuid) {
-	case 0x0f07:
-	case 0x0f0a:
-	case 0x0f11:
-	case 0x0f12:
-		has_N44_O17_errata[policy->cpu] = 1;
-		dprintk("has errata -- disabling low frequencies\n");
-	}
-
-	if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
-	    c->x86_model < 2) {
-		/* switch to maximum frequency and measure result */
-		cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
-		recalibrate_cpu_khz();
-	}
-	/* get max frequency */
-	stock_freq = cpufreq_p4_get_frequency(c);
-	if (!stock_freq)
-		return -EINVAL;
-
-	/* table init */
-	for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
-		if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
-			p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
-		else
-			p4clockmod_table[i].frequency = (stock_freq * i)/8;
-	}
-	cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
-
-	/* cpuinfo and default policy values */
-
-	/* the transition latency is set to be 1 higher than the maximum
-	 * transition latency of the ondemand governor */
-	policy->cpuinfo.transition_latency = 10000001;
-	policy->cur = stock_freq;
-
-	return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
-}
-
-
-static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-static unsigned int cpufreq_p4_get(unsigned int cpu)
-{
-	u32 l, h;
-
-	rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
-
-	if (l & 0x10) {
-		l = l >> 1;
-		l &= 0x7;
-	} else
-		l = DC_DISABLE;
-
-	if (l != DC_DISABLE)
-		return stock_freq * l / 8;
-
-	return stock_freq;
-}
-
-static struct freq_attr *p4clockmod_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver p4clockmod_driver = {
-	.verify		= cpufreq_p4_verify,
-	.target		= cpufreq_p4_target,
-	.init		= cpufreq_p4_cpu_init,
-	.exit		= cpufreq_p4_cpu_exit,
-	.get		= cpufreq_p4_get,
-	.name		= "p4-clockmod",
-	.owner		= THIS_MODULE,
-	.attr		= p4clockmod_attr,
-};
-
-
-static int __init cpufreq_p4_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	int ret;
-
-	/*
-	 * THERM_CONTROL is architectural for IA32 now, so
-	 * we can rely on the capability checks
-	 */
-	if (c->x86_vendor != X86_VENDOR_INTEL)
-		return -ENODEV;
-
-	if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
-				!test_cpu_cap(c, X86_FEATURE_ACC))
-		return -ENODEV;
-
-	ret = cpufreq_register_driver(&p4clockmod_driver);
-	if (!ret)
-		printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
-				"Modulation available\n");
-
-	return ret;
-}
-
-
-static void __exit cpufreq_p4_exit(void)
-{
-	cpufreq_unregister_driver(&p4clockmod_driver);
-}
-
-
-MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
-MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
-MODULE_LICENSE("GPL");
-
-late_initcall(cpufreq_p4_init);
-module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 755a31e0f5b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,624 +0,0 @@
-/*
- *  pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
- *
- *  Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
- *  Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
- *	Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; version 2 of the License.
- *
- *  This program is distributed in the hope that it will be useful, but
- *  WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
- *  INFRINGEMENT. See the GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <linux/sched.h>
-#include <linux/cpufreq.h>
-#include <linux/compiler.h>
-#include <linux/slab.h>
-
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/spinlock.h>
-#include <linux/uaccess.h>
-
-#include <acpi/processor.h>
-
-#define PCC_VERSION 	"1.00.00"
-#define POLL_LOOPS 	300
-
-#define CMD_COMPLETE 	0x1
-#define CMD_GET_FREQ 	0x0
-#define CMD_SET_FREQ 	0x1
-
-#define BUF_SZ		4
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER,	\
-					     "pcc-cpufreq", msg)
-
-struct pcc_register_resource {
-	u8 descriptor;
-	u16 length;
-	u8 space_id;
-	u8 bit_width;
-	u8 bit_offset;
-	u8 access_size;
-	u64 address;
-} __attribute__ ((packed));
-
-struct pcc_memory_resource {
-	u8 descriptor;
-	u16 length;
-	u8 space_id;
-	u8 resource_usage;
-	u8 type_specific;
-	u64 granularity;
-	u64 minimum;
-	u64 maximum;
-	u64 translation_offset;
-	u64 address_length;
-} __attribute__ ((packed));
-
-static struct cpufreq_driver pcc_cpufreq_driver;
-
-struct pcc_header {
-	u32 signature;
-	u16 length;
-	u8 major;
-	u8 minor;
-	u32 features;
-	u16 command;
-	u16 status;
-	u32 latency;
-	u32 minimum_time;
-	u32 maximum_time;
-	u32 nominal;
-	u32 throttled_frequency;
-	u32 minimum_frequency;
-};
-
-static void __iomem *pcch_virt_addr;
-static struct pcc_header __iomem *pcch_hdr;
-
-static DEFINE_SPINLOCK(pcc_lock);
-
-static struct acpi_generic_address doorbell;
-
-static u64 doorbell_preserve;
-static u64 doorbell_write;
-
-static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
-			  0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
-
-struct pcc_cpu {
-	u32 input_offset;
-	u32 output_offset;
-};
-
-static struct pcc_cpu __percpu *pcc_cpu_info;
-
-static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
-{
-	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
-				     policy->cpuinfo.max_freq);
-	return 0;
-}
-
-static inline void pcc_cmd(void)
-{
-	u64 doorbell_value;
-	int i;
-
-	acpi_read(&doorbell_value, &doorbell);
-	acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
-		   &doorbell);
-
-	for (i = 0; i < POLL_LOOPS; i++) {
-		if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
-			break;
-	}
-}
-
-static inline void pcc_clear_mapping(void)
-{
-	if (pcch_virt_addr)
-		iounmap(pcch_virt_addr);
-	pcch_virt_addr = NULL;
-}
-
-static unsigned int pcc_get_freq(unsigned int cpu)
-{
-	struct pcc_cpu *pcc_cpu_data;
-	unsigned int curr_freq;
-	unsigned int freq_limit;
-	u16 status;
-	u32 input_buffer;
-	u32 output_buffer;
-
-	spin_lock(&pcc_lock);
-
-	dprintk("get: get_freq for CPU %d\n", cpu);
-	pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
-	input_buffer = 0x1;
-	iowrite32(input_buffer,
-			(pcch_virt_addr + pcc_cpu_data->input_offset));
-	iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
-
-	pcc_cmd();
-
-	output_buffer =
-		ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
-
-	/* Clear the input buffer - we are done with the current command */
-	memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
-	status = ioread16(&pcch_hdr->status);
-	if (status != CMD_COMPLETE) {
-		dprintk("get: FAILED: for CPU %d, status is %d\n",
-			cpu, status);
-		goto cmd_incomplete;
-	}
-	iowrite16(0, &pcch_hdr->status);
-	curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
-			/ 100) * 1000);
-
-	dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
-		"0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
-		cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
-		output_buffer, curr_freq);
-
-	freq_limit = (output_buffer >> 8) & 0xff;
-	if (freq_limit != 0xff) {
-		dprintk("get: frequency for cpu %d is being temporarily"
-			" capped at %d\n", cpu, curr_freq);
-	}
-
-	spin_unlock(&pcc_lock);
-	return curr_freq;
-
-cmd_incomplete:
-	iowrite16(0, &pcch_hdr->status);
-	spin_unlock(&pcc_lock);
-	return 0;
-}
-
-static int pcc_cpufreq_target(struct cpufreq_policy *policy,
-			      unsigned int target_freq,
-			      unsigned int relation)
-{
-	struct pcc_cpu *pcc_cpu_data;
-	struct cpufreq_freqs freqs;
-	u16 status;
-	u32 input_buffer;
-	int cpu;
-
-	spin_lock(&pcc_lock);
-	cpu = policy->cpu;
-	pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
-	dprintk("target: CPU %d should go to target freq: %d "
-		"(virtual) input_offset is 0x%x\n",
-		cpu, target_freq,
-		(pcch_virt_addr + pcc_cpu_data->input_offset));
-
-	freqs.new = target_freq;
-	freqs.cpu = cpu;
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	input_buffer = 0x1 | (((target_freq * 100)
-			       / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
-	iowrite32(input_buffer,
-			(pcch_virt_addr + pcc_cpu_data->input_offset));
-	iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
-
-	pcc_cmd();
-
-	/* Clear the input buffer - we are done with the current command */
-	memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-
-	status = ioread16(&pcch_hdr->status);
-	if (status != CMD_COMPLETE) {
-		dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
-			cpu, status);
-		goto cmd_incomplete;
-	}
-	iowrite16(0, &pcch_hdr->status);
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
-	spin_unlock(&pcc_lock);
-
-	return 0;
-
-cmd_incomplete:
-	iowrite16(0, &pcch_hdr->status);
-	spin_unlock(&pcc_lock);
-	return -EINVAL;
-}
-
-static int pcc_get_offset(int cpu)
-{
-	acpi_status status;
-	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
-	union acpi_object *pccp, *offset;
-	struct pcc_cpu *pcc_cpu_data;
-	struct acpi_processor *pr;
-	int ret = 0;
-
-	pr = per_cpu(processors, cpu);
-	pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
-
-	status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	pccp = buffer.pointer;
-	if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
-		ret = -ENODEV;
-		goto out_free;
-	};
-
-	offset = &(pccp->package.elements[0]);
-	if (!offset || offset->type != ACPI_TYPE_INTEGER) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	pcc_cpu_data->input_offset = offset->integer.value;
-
-	offset = &(pccp->package.elements[1]);
-	if (!offset || offset->type != ACPI_TYPE_INTEGER) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	pcc_cpu_data->output_offset = offset->integer.value;
-
-	memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
-	memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
-
-	dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
-		"input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
-		cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
-out_free:
-	kfree(buffer.pointer);
-	return ret;
-}
-
-static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
-{
-	acpi_status status;
-	struct acpi_object_list input;
-	struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
-	union acpi_object in_params[4];
-	union acpi_object *out_obj;
-	u32 capabilities[2];
-	u32 errors;
-	u32 supported;
-	int ret = 0;
-
-	input.count = 4;
-	input.pointer = in_params;
-	in_params[0].type               = ACPI_TYPE_BUFFER;
-	in_params[0].buffer.length      = 16;
-	in_params[0].buffer.pointer     = OSC_UUID;
-	in_params[1].type               = ACPI_TYPE_INTEGER;
-	in_params[1].integer.value      = 1;
-	in_params[2].type               = ACPI_TYPE_INTEGER;
-	in_params[2].integer.value      = 2;
-	in_params[3].type               = ACPI_TYPE_BUFFER;
-	in_params[3].buffer.length      = 8;
-	in_params[3].buffer.pointer     = (u8 *)&capabilities;
-
-	capabilities[0] = OSC_QUERY_ENABLE;
-	capabilities[1] = 0x1;
-
-	status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	if (!output.length)
-		return -ENODEV;
-
-	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-	if (errors) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	supported = *((u32 *)(out_obj->buffer.pointer + 4));
-	if (!(supported & 0x1)) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	kfree(output.pointer);
-	capabilities[0] = 0x0;
-	capabilities[1] = 0x1;
-
-	status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	if (!output.length)
-		return -ENODEV;
-
-	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_BUFFER) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
-	if (errors) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	supported = *((u32 *)(out_obj->buffer.pointer + 4));
-	if (!(supported & 0x1)) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-out_free:
-	kfree(output.pointer);
-	return ret;
-}
-
-static int __init pcc_cpufreq_probe(void)
-{
-	acpi_status status;
-	struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
-	struct pcc_memory_resource *mem_resource;
-	struct pcc_register_resource *reg_resource;
-	union acpi_object *out_obj, *member;
-	acpi_handle handle, osc_handle, pcch_handle;
-	int ret = 0;
-
-	status = acpi_get_handle(NULL, "\\_SB", &handle);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	status = acpi_get_handle(handle, "PCCH", &pcch_handle);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	status = acpi_get_handle(handle, "_OSC", &osc_handle);
-	if (ACPI_SUCCESS(status)) {
-		ret = pcc_cpufreq_do_osc(&osc_handle);
-		if (ret)
-			dprintk("probe: _OSC evaluation did not succeed\n");
-		/* Firmware's use of _OSC is optional */
-		ret = 0;
-	}
-
-	status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
-	if (ACPI_FAILURE(status))
-		return -ENODEV;
-
-	out_obj = output.pointer;
-	if (out_obj->type != ACPI_TYPE_PACKAGE) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	member = &out_obj->package.elements[0];
-	if (member->type != ACPI_TYPE_BUFFER) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
-
-	dprintk("probe: mem_resource descriptor: 0x%x,"
-		" length: %d, space_id: %d, resource_usage: %d,"
-		" type_specific: %d, granularity: 0x%llx,"
-		" minimum: 0x%llx, maximum: 0x%llx,"
-		" translation_offset: 0x%llx, address_length: 0x%llx\n",
-		mem_resource->descriptor, mem_resource->length,
-		mem_resource->space_id, mem_resource->resource_usage,
-		mem_resource->type_specific, mem_resource->granularity,
-		mem_resource->minimum, mem_resource->maximum,
-		mem_resource->translation_offset,
-		mem_resource->address_length);
-
-	if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
-		ret = -ENODEV;
-		goto out_free;
-	}
-
-	pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
-					mem_resource->address_length);
-	if (pcch_virt_addr == NULL) {
-		dprintk("probe: could not map shared mem region\n");
-		goto out_free;
-	}
-	pcch_hdr = pcch_virt_addr;
-
-	dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
-	dprintk("probe: PCCH header is at physical address: 0x%llx,"
-		" signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
-		" supported features: 0x%x, command field: 0x%x,"
-		" status field: 0x%x, nominal latency: %d us\n",
-		mem_resource->minimum, ioread32(&pcch_hdr->signature),
-		ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
-		ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
-		ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
-		ioread32(&pcch_hdr->latency));
-
-	dprintk("probe: min time between commands: %d us,"
-		" max time between commands: %d us,"
-		" nominal CPU frequency: %d MHz,"
-		" minimum CPU frequency: %d MHz,"
-		" minimum CPU frequency without throttling: %d MHz\n",
-		ioread32(&pcch_hdr->minimum_time),
-		ioread32(&pcch_hdr->maximum_time),
-		ioread32(&pcch_hdr->nominal),
-		ioread32(&pcch_hdr->throttled_frequency),
-		ioread32(&pcch_hdr->minimum_frequency));
-
-	member = &out_obj->package.elements[1];
-	if (member->type != ACPI_TYPE_BUFFER) {
-		ret = -ENODEV;
-		goto pcch_free;
-	}
-
-	reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
-
-	doorbell.space_id = reg_resource->space_id;
-	doorbell.bit_width = reg_resource->bit_width;
-	doorbell.bit_offset = reg_resource->bit_offset;
-	doorbell.access_width = 64;
-	doorbell.address = reg_resource->address;
-
-	dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
-		"bit_offset is %d, access_width is %d, address is 0x%llx\n",
-		doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
-		doorbell.access_width, reg_resource->address);
-
-	member = &out_obj->package.elements[2];
-	if (member->type != ACPI_TYPE_INTEGER) {
-		ret = -ENODEV;
-		goto pcch_free;
-	}
-
-	doorbell_preserve = member->integer.value;
-
-	member = &out_obj->package.elements[3];
-	if (member->type != ACPI_TYPE_INTEGER) {
-		ret = -ENODEV;
-		goto pcch_free;
-	}
-
-	doorbell_write = member->integer.value;
-
-	dprintk("probe: doorbell_preserve: 0x%llx,"
-		" doorbell_write: 0x%llx\n",
-		doorbell_preserve, doorbell_write);
-
-	pcc_cpu_info = alloc_percpu(struct pcc_cpu);
-	if (!pcc_cpu_info) {
-		ret = -ENOMEM;
-		goto pcch_free;
-	}
-
-	printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
-	       " limits: %d MHz, %d MHz\n", PCC_VERSION,
-	       ioread32(&pcch_hdr->minimum_frequency),
-	       ioread32(&pcch_hdr->nominal));
-	kfree(output.pointer);
-	return ret;
-pcch_free:
-	pcc_clear_mapping();
-out_free:
-	kfree(output.pointer);
-	return ret;
-}
-
-static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
-{
-	unsigned int cpu = policy->cpu;
-	unsigned int result = 0;
-
-	if (!pcch_virt_addr) {
-		result = -1;
-		goto out;
-	}
-
-	result = pcc_get_offset(cpu);
-	if (result) {
-		dprintk("init: PCCP evaluation failed\n");
-		goto out;
-	}
-
-	policy->max = policy->cpuinfo.max_freq =
-		ioread32(&pcch_hdr->nominal) * 1000;
-	policy->min = policy->cpuinfo.min_freq =
-		ioread32(&pcch_hdr->minimum_frequency) * 1000;
-	policy->cur = pcc_get_freq(cpu);
-
-	if (!policy->cur) {
-		dprintk("init: Unable to get current CPU frequency\n");
-		result = -EINVAL;
-		goto out;
-	}
-
-	dprintk("init: policy->max is %d, policy->min is %d\n",
-		policy->max, policy->min);
-out:
-	return result;
-}
-
-static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
-{
-	return 0;
-}
-
-static struct cpufreq_driver pcc_cpufreq_driver = {
-	.flags = CPUFREQ_CONST_LOOPS,
-	.get = pcc_get_freq,
-	.verify = pcc_cpufreq_verify,
-	.target = pcc_cpufreq_target,
-	.init = pcc_cpufreq_cpu_init,
-	.exit = pcc_cpufreq_cpu_exit,
-	.name = "pcc-cpufreq",
-	.owner = THIS_MODULE,
-};
-
-static int __init pcc_cpufreq_init(void)
-{
-	int ret;
-
-	if (acpi_disabled)
-		return 0;
-
-	ret = pcc_cpufreq_probe();
-	if (ret) {
-		dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
-		return ret;
-	}
-
-	ret = cpufreq_register_driver(&pcc_cpufreq_driver);
-
-	return ret;
-}
-
-static void __exit pcc_cpufreq_exit(void)
-{
-	cpufreq_unregister_driver(&pcc_cpufreq_driver);
-
-	pcc_clear_mapping();
-
-	free_percpu(pcc_cpu_info);
-}
-
-MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
-MODULE_VERSION(PCC_VERSION);
-MODULE_DESCRIPTION("Processor Clocking Control interface driver");
-MODULE_LICENSE("GPL");
-
-late_initcall(pcc_cpufreq_init);
-module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c5..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- *  This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
- *  (C) 2000-2003  Dave Jones, Arjan van de Ven, Janne Pänkälä,
- *                 Dominik Brodowski.
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/ioport.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define POWERNOW_IOPORT 0xfff0          /* it doesn't matter where, as long
-					   as it is unused */
-
-#define PFX "powernow-k6: "
-static unsigned int                     busfreq;   /* FSB, in 10 kHz */
-static unsigned int                     max_multiplier;
-
-
-/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
-static struct cpufreq_frequency_table clock_ratio[] = {
-	{45,  /* 000 -> 4.5x */ 0},
-	{50,  /* 001 -> 5.0x */ 0},
-	{40,  /* 010 -> 4.0x */ 0},
-	{55,  /* 011 -> 5.5x */ 0},
-	{20,  /* 100 -> 2.0x */ 0},
-	{30,  /* 101 -> 3.0x */ 0},
-	{60,  /* 110 -> 6.0x */ 0},
-	{35,  /* 111 -> 3.5x */ 0},
-	{0, CPUFREQ_TABLE_END}
-};
-
-
-/**
- * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
- *
- *   Returns the current setting of the frequency multiplier. Core clock
- * speed is frequency of the Front-Side Bus multiplied with this value.
- */
-static int powernow_k6_get_cpu_multiplier(void)
-{
-	u64 invalue = 0;
-	u32 msrval;
-
-	msrval = POWERNOW_IOPORT + 0x1;
-	wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
-	invalue = inl(POWERNOW_IOPORT + 0x8);
-	msrval = POWERNOW_IOPORT + 0x0;
-	wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
-	return clock_ratio[(invalue >> 5)&7].index;
-}
-
-
-/**
- * powernow_k6_set_state - set the PowerNow! multiplier
- * @best_i: clock_ratio[best_i] is the target multiplier
- *
- *   Tries to change the PowerNow! multiplier
- */
-static void powernow_k6_set_state(unsigned int best_i)
-{
-	unsigned long outvalue = 0, invalue = 0;
-	unsigned long msrval;
-	struct cpufreq_freqs freqs;
-
-	if (clock_ratio[best_i].index > max_multiplier) {
-		printk(KERN_ERR PFX "invalid target frequency\n");
-		return;
-	}
-
-	freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
-	freqs.new = busfreq * clock_ratio[best_i].index;
-	freqs.cpu = 0; /* powernow-k6.c is UP only driver */
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	/* we now need to transform best_i to the BVC format, see AMD#23446 */
-
-	outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
-
-	msrval = POWERNOW_IOPORT + 0x1;
-	wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
-	invalue = inl(POWERNOW_IOPORT + 0x8);
-	invalue = invalue & 0xf;
-	outvalue = outvalue | invalue;
-	outl(outvalue , (POWERNOW_IOPORT + 0x8));
-	msrval = POWERNOW_IOPORT + 0x0;
-	wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
-	return;
-}
-
-
-/**
- * powernow_k6_verify - verifies a new CPUfreq policy
- * @policy: new policy
- *
- * Policy must be within lowest and highest possible CPU Frequency,
- * and at least one possible state must be within min and max.
- */
-static int powernow_k6_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
-}
-
-
-/**
- * powernow_k6_setpolicy - sets a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- *  (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * sets a new CPUFreq policy
- */
-static int powernow_k6_target(struct cpufreq_policy *policy,
-			       unsigned int target_freq,
-			       unsigned int relation)
-{
-	unsigned int newstate = 0;
-
-	if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
-				target_freq, relation, &newstate))
-		return -EINVAL;
-
-	powernow_k6_set_state(newstate);
-
-	return 0;
-}
-
-
-static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
-{
-	unsigned int i, f;
-	int result;
-
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	/* get frequencies */
-	max_multiplier = powernow_k6_get_cpu_multiplier();
-	busfreq = cpu_khz / max_multiplier;
-
-	/* table init */
-	for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
-		f = clock_ratio[i].index;
-		if (f > max_multiplier)
-			clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
-		else
-			clock_ratio[i].frequency = busfreq * f;
-	}
-
-	/* cpuinfo and default policy values */
-	policy->cpuinfo.transition_latency = 200000;
-	policy->cur = busfreq * max_multiplier;
-
-	result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
-	if (result)
-		return result;
-
-	cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
-
-	return 0;
-}
-
-
-static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
-{
-	unsigned int i;
-	for (i = 0; i < 8; i++) {
-		if (i == max_multiplier)
-			powernow_k6_set_state(i);
-	}
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-static unsigned int powernow_k6_get(unsigned int cpu)
-{
-	unsigned int ret;
-	ret = (busfreq * powernow_k6_get_cpu_multiplier());
-	return ret;
-}
-
-static struct freq_attr *powernow_k6_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver powernow_k6_driver = {
-	.verify		= powernow_k6_verify,
-	.target		= powernow_k6_target,
-	.init		= powernow_k6_cpu_init,
-	.exit		= powernow_k6_cpu_exit,
-	.get		= powernow_k6_get,
-	.name		= "powernow-k6",
-	.owner		= THIS_MODULE,
-	.attr		= powernow_k6_attr,
-};
-
-
-/**
- * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
- *
- *   Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
- * devices, -EINVAL or -ENOMEM on problems during initiatization, and zero
- * on success.
- */
-static int __init powernow_k6_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-
-	if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
-		((c->x86_model != 12) && (c->x86_model != 13)))
-		return -ENODEV;
-
-	if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
-		printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
-		return -EIO;
-	}
-
-	if (cpufreq_register_driver(&powernow_k6_driver)) {
-		release_region(POWERNOW_IOPORT, 16);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-
-/**
- * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
- *
- *   Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
- */
-static void __exit powernow_k6_exit(void)
-{
-	cpufreq_unregister_driver(&powernow_k6_driver);
-	release_region(POWERNOW_IOPORT, 16);
-}
-
-
-MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
-		"Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE("GPL");
-
-module_init(powernow_k6_init);
-module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
-/*
- *  AMD K7 Powernow driver.
- *  (C) 2003 Dave Jones on behalf of SuSE Labs.
- *  (C) 2003-2004 Dave Jones <davej@redhat.com>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *  Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- * Errata 5:
- *  CPU may fail to execute a FID/VID change in presence of interrupt.
- *  - We cli/sti on stepping A0 CPUs around the FID/VID transition.
- * Errata 15:
- *  CPU with half frequency multipliers may hang upon wakeup from disconnect.
- *  - We disable half multipliers if ACPI is used on A0 stepping CPUs.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/dmi.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/timer.h>		/* Needed for recalibrate_cpu_khz() */
-#include <asm/msr.h>
-#include <asm/system.h>
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-#include <linux/acpi.h>
-#include <acpi/processor.h>
-#endif
-
-#include "powernow-k7.h"
-
-#define PFX "powernow: "
-
-
-struct psb_s {
-	u8 signature[10];
-	u8 tableversion;
-	u8 flags;
-	u16 settlingtime;
-	u8 reserved1;
-	u8 numpst;
-};
-
-struct pst_s {
-	u32 cpuid;
-	u8 fsbspeed;
-	u8 maxfid;
-	u8 startvid;
-	u8 numpstates;
-};
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-union powernow_acpi_control_t {
-	struct {
-		unsigned long fid:5,
-			vid:5,
-			sgtc:20,
-			res1:2;
-	} bits;
-	unsigned long val;
-};
-#endif
-
-#ifdef CONFIG_CPU_FREQ_DEBUG
-/* divide by 1000 to get VCore voltage in V. */
-static const int mobile_vid_table[32] = {
-    2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
-    1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
-    1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
-    1075, 1050, 1025, 1000, 975, 950, 925, 0,
-};
-#endif
-
-/* divide by 10 to get FID. */
-static const int fid_codes[32] = {
-    110, 115, 120, 125, 50, 55, 60, 65,
-    70, 75, 80, 85, 90, 95, 100, 105,
-    30, 190, 40, 200, 130, 135, 140, 210,
-    150, 225, 160, 165, 170, 180, -1, -1,
-};
-
-/* This parameter is used in order to force ACPI instead of legacy method for
- * configuration purpose.
- */
-
-static int acpi_force;
-
-static struct cpufreq_frequency_table *powernow_table;
-
-static unsigned int can_scale_bus;
-static unsigned int can_scale_vid;
-static unsigned int minimum_speed = -1;
-static unsigned int maximum_speed;
-static unsigned int number_scales;
-static unsigned int fsb;
-static unsigned int latency;
-static char have_a0;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"powernow-k7", msg)
-
-static int check_fsb(unsigned int fsbspeed)
-{
-	int delta;
-	unsigned int f = fsb / 1000;
-
-	delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
-	return delta < 5;
-}
-
-static int check_powernow(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	unsigned int maxei, eax, ebx, ecx, edx;
-
-	if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
-#ifdef MODULE
-		printk(KERN_INFO PFX "This module only works with "
-				"AMD K7 CPUs\n");
-#endif
-		return 0;
-	}
-
-	/* Get maximum capabilities */
-	maxei = cpuid_eax(0x80000000);
-	if (maxei < 0x80000007) {	/* Any powernow info ? */
-#ifdef MODULE
-		printk(KERN_INFO PFX "No powernow capabilities detected\n");
-#endif
-		return 0;
-	}
-
-	if ((c->x86_model == 6) && (c->x86_mask == 0)) {
-		printk(KERN_INFO PFX "K7 660[A0] core detected, "
-				"enabling errata workarounds\n");
-		have_a0 = 1;
-	}
-
-	cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-
-	/* Check we can actually do something before we say anything.*/
-	if (!(edx & (1 << 1 | 1 << 2)))
-		return 0;
-
-	printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
-
-	if (edx & 1 << 1) {
-		printk("frequency");
-		can_scale_bus = 1;
-	}
-
-	if ((edx & (1 << 1 | 1 << 2)) == 0x6)
-		printk(" and ");
-
-	if (edx & 1 << 2) {
-		printk("voltage");
-		can_scale_vid = 1;
-	}
-
-	printk(".\n");
-	return 1;
-}
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-static void invalidate_entry(unsigned int entry)
-{
-	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-#endif
-
-static int get_ranges(unsigned char *pst)
-{
-	unsigned int j;
-	unsigned int speed;
-	u8 fid, vid;
-
-	powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
-				(number_scales + 1)), GFP_KERNEL);
-	if (!powernow_table)
-		return -ENOMEM;
-
-	for (j = 0 ; j < number_scales; j++) {
-		fid = *pst++;
-
-		powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
-		powernow_table[j].index = fid; /* lower 8 bits */
-
-		speed = powernow_table[j].frequency;
-
-		if ((fid_codes[fid] % 10) == 5) {
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-			if (have_a0 == 1)
-				invalidate_entry(j);
-#endif
-		}
-
-		if (speed < minimum_speed)
-			minimum_speed = speed;
-		if (speed > maximum_speed)
-			maximum_speed = speed;
-
-		vid = *pst++;
-		powernow_table[j].index |= (vid << 8); /* upper 8 bits */
-
-		dprintk("   FID: 0x%x (%d.%dx [%dMHz])  "
-			 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
-			 fid_codes[fid] % 10, speed/1000, vid,
-			 mobile_vid_table[vid]/1000,
-			 mobile_vid_table[vid]%1000);
-	}
-	powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
-	powernow_table[number_scales].index = 0;
-
-	return 0;
-}
-
-
-static void change_FID(int fid)
-{
-	union msr_fidvidctl fidvidctl;
-
-	rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
-	if (fidvidctl.bits.FID != fid) {
-		fidvidctl.bits.SGTC = latency;
-		fidvidctl.bits.FID = fid;
-		fidvidctl.bits.VIDC = 0;
-		fidvidctl.bits.FIDC = 1;
-		wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
-	}
-}
-
-
-static void change_VID(int vid)
-{
-	union msr_fidvidctl fidvidctl;
-
-	rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
-	if (fidvidctl.bits.VID != vid) {
-		fidvidctl.bits.SGTC = latency;
-		fidvidctl.bits.VID = vid;
-		fidvidctl.bits.FIDC = 0;
-		fidvidctl.bits.VIDC = 1;
-		wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
-	}
-}
-
-
-static void change_speed(unsigned int index)
-{
-	u8 fid, vid;
-	struct cpufreq_freqs freqs;
-	union msr_fidvidstatus fidvidstatus;
-	int cfid;
-
-	/* fid are the lower 8 bits of the index we stored into
-	 * the cpufreq frequency table in powernow_decode_bios,
-	 * vid are the upper 8 bits.
-	 */
-
-	fid = powernow_table[index].index & 0xFF;
-	vid = (powernow_table[index].index & 0xFF00) >> 8;
-
-	freqs.cpu = 0;
-
-	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-	cfid = fidvidstatus.bits.CFID;
-	freqs.old = fsb * fid_codes[cfid] / 10;
-
-	freqs.new = powernow_table[index].frequency;
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	/* Now do the magic poking into the MSRs.  */
-
-	if (have_a0 == 1)	/* A0 errata 5 */
-		local_irq_disable();
-
-	if (freqs.old > freqs.new) {
-		/* Going down, so change FID first */
-		change_FID(fid);
-		change_VID(vid);
-	} else {
-		/* Going up, so change VID first */
-		change_VID(vid);
-		change_FID(fid);
-	}
-
-
-	if (have_a0 == 1)
-		local_irq_enable();
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-}
-
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-
-static struct acpi_processor_performance *acpi_processor_perf;
-
-static int powernow_acpi_init(void)
-{
-	int i;
-	int retval = 0;
-	union powernow_acpi_control_t pc;
-
-	if (acpi_processor_perf != NULL && powernow_table != NULL) {
-		retval = -EINVAL;
-		goto err0;
-	}
-
-	acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
-				      GFP_KERNEL);
-	if (!acpi_processor_perf) {
-		retval = -ENOMEM;
-		goto err0;
-	}
-
-	if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
-								GFP_KERNEL)) {
-		retval = -ENOMEM;
-		goto err05;
-	}
-
-	if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
-		retval = -EIO;
-		goto err1;
-	}
-
-	if (acpi_processor_perf->control_register.space_id !=
-			ACPI_ADR_SPACE_FIXED_HARDWARE) {
-		retval = -ENODEV;
-		goto err2;
-	}
-
-	if (acpi_processor_perf->status_register.space_id !=
-			ACPI_ADR_SPACE_FIXED_HARDWARE) {
-		retval = -ENODEV;
-		goto err2;
-	}
-
-	number_scales = acpi_processor_perf->state_count;
-
-	if (number_scales < 2) {
-		retval = -ENODEV;
-		goto err2;
-	}
-
-	powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
-				(number_scales + 1)), GFP_KERNEL);
-	if (!powernow_table) {
-		retval = -ENOMEM;
-		goto err2;
-	}
-
-	pc.val = (unsigned long) acpi_processor_perf->states[0].control;
-	for (i = 0; i < number_scales; i++) {
-		u8 fid, vid;
-		struct acpi_processor_px *state =
-			&acpi_processor_perf->states[i];
-		unsigned int speed, speed_mhz;
-
-		pc.val = (unsigned long) state->control;
-		dprintk("acpi:  P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
-			 i,
-			 (u32) state->core_frequency,
-			 (u32) state->power,
-			 (u32) state->transition_latency,
-			 (u32) state->control,
-			 pc.bits.sgtc);
-
-		vid = pc.bits.vid;
-		fid = pc.bits.fid;
-
-		powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
-		powernow_table[i].index = fid; /* lower 8 bits */
-		powernow_table[i].index |= (vid << 8); /* upper 8 bits */
-
-		speed = powernow_table[i].frequency;
-		speed_mhz = speed / 1000;
-
-		/* processor_perflib will multiply the MHz value by 1000 to
-		 * get a KHz value (e.g. 1266000). However, powernow-k7 works
-		 * with true KHz values (e.g. 1266768). To ensure that all
-		 * powernow frequencies are available, we must ensure that
-		 * ACPI doesn't restrict them, so we round up the MHz value
-		 * to ensure that perflib's computed KHz value is greater than
-		 * or equal to powernow's KHz value.
-		 */
-		if (speed % 1000 > 0)
-			speed_mhz++;
-
-		if ((fid_codes[fid] % 10) == 5) {
-			if (have_a0 == 1)
-				invalidate_entry(i);
-		}
-
-		dprintk("   FID: 0x%x (%d.%dx [%dMHz])  "
-			 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
-			 fid_codes[fid] % 10, speed_mhz, vid,
-			 mobile_vid_table[vid]/1000,
-			 mobile_vid_table[vid]%1000);
-
-		if (state->core_frequency != speed_mhz) {
-			state->core_frequency = speed_mhz;
-			dprintk("   Corrected ACPI frequency to %d\n",
-				speed_mhz);
-		}
-
-		if (latency < pc.bits.sgtc)
-			latency = pc.bits.sgtc;
-
-		if (speed < minimum_speed)
-			minimum_speed = speed;
-		if (speed > maximum_speed)
-			maximum_speed = speed;
-	}
-
-	powernow_table[i].frequency = CPUFREQ_TABLE_END;
-	powernow_table[i].index = 0;
-
-	/* notify BIOS that we exist */
-	acpi_processor_notify_smm(THIS_MODULE);
-
-	return 0;
-
-err2:
-	acpi_processor_unregister_performance(acpi_processor_perf, 0);
-err1:
-	free_cpumask_var(acpi_processor_perf->shared_cpu_map);
-err05:
-	kfree(acpi_processor_perf);
-err0:
-	printk(KERN_WARNING PFX "ACPI perflib can not be used on "
-			"this platform\n");
-	acpi_processor_perf = NULL;
-	return retval;
-}
-#else
-static int powernow_acpi_init(void)
-{
-	printk(KERN_INFO PFX "no support for ACPI processor found."
-	       "  Please recompile your kernel with ACPI processor\n");
-	return -EINVAL;
-}
-#endif
-
-static void print_pst_entry(struct pst_s *pst, unsigned int j)
-{
-	dprintk("PST:%d (@%p)\n", j, pst);
-	dprintk(" cpuid: 0x%x  fsb: %d  maxFID: 0x%x  startvid: 0x%x\n",
-		pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
-}
-
-static int powernow_decode_bios(int maxfid, int startvid)
-{
-	struct psb_s *psb;
-	struct pst_s *pst;
-	unsigned int i, j;
-	unsigned char *p;
-	unsigned int etuple;
-	unsigned int ret;
-
-	etuple = cpuid_eax(0x80000001);
-
-	for (i = 0xC0000; i < 0xffff0 ; i += 16) {
-
-		p = phys_to_virt(i);
-
-		if (memcmp(p, "AMDK7PNOW!",  10) == 0) {
-			dprintk("Found PSB header at %p\n", p);
-			psb = (struct psb_s *) p;
-			dprintk("Table version: 0x%x\n", psb->tableversion);
-			if (psb->tableversion != 0x12) {
-				printk(KERN_INFO PFX "Sorry, only v1.2 tables"
-						" supported right now\n");
-				return -ENODEV;
-			}
-
-			dprintk("Flags: 0x%x\n", psb->flags);
-			if ((psb->flags & 1) == 0)
-				dprintk("Mobile voltage regulator\n");
-			else
-				dprintk("Desktop voltage regulator\n");
-
-			latency = psb->settlingtime;
-			if (latency < 100) {
-				printk(KERN_INFO PFX "BIOS set settling time "
-						"to %d microseconds. "
-						"Should be at least 100. "
-						"Correcting.\n", latency);
-				latency = 100;
-			}
-			dprintk("Settling Time: %d microseconds.\n",
-					psb->settlingtime);
-			dprintk("Has %d PST tables. (Only dumping ones "
-					"relevant to this CPU).\n",
-					psb->numpst);
-
-			p += sizeof(struct psb_s);
-
-			pst = (struct pst_s *) p;
-
-			for (j = 0; j < psb->numpst; j++) {
-				pst = (struct pst_s *) p;
-				number_scales = pst->numpstates;
-
-				if ((etuple == pst->cpuid) &&
-				    check_fsb(pst->fsbspeed) &&
-				    (maxfid == pst->maxfid) &&
-				    (startvid == pst->startvid)) {
-					print_pst_entry(pst, j);
-					p = (char *)pst + sizeof(struct pst_s);
-					ret = get_ranges(p);
-					return ret;
-				} else {
-					unsigned int k;
-					p = (char *)pst + sizeof(struct pst_s);
-					for (k = 0; k < number_scales; k++)
-						p += 2;
-				}
-			}
-			printk(KERN_INFO PFX "No PST tables match this cpuid "
-					"(0x%x)\n", etuple);
-			printk(KERN_INFO PFX "This is indicative of a broken "
-					"BIOS.\n");
-
-			return -EINVAL;
-		}
-		p++;
-	}
-
-	return -ENODEV;
-}
-
-
-static int powernow_target(struct cpufreq_policy *policy,
-			    unsigned int target_freq,
-			    unsigned int relation)
-{
-	unsigned int newstate;
-
-	if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
-				relation, &newstate))
-		return -EINVAL;
-
-	change_speed(newstate);
-
-	return 0;
-}
-
-
-static int powernow_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, powernow_table);
-}
-
-/*
- * We use the fact that the bus frequency is somehow
- * a multiple of 100000/3 khz, then we compute sgtc according
- * to this multiple.
- * That way, we match more how AMD thinks all of that work.
- * We will then get the same kind of behaviour already tested under
- * the "well-known" other OS.
- */
-static int __cpuinit fixup_sgtc(void)
-{
-	unsigned int sgtc;
-	unsigned int m;
-
-	m = fsb / 3333;
-	if ((m % 10) >= 5)
-		m += 5;
-
-	m /= 10;
-
-	sgtc = 100 * m * latency;
-	sgtc = sgtc / 3;
-	if (sgtc > 0xfffff) {
-		printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
-		sgtc = 0xfffff;
-	}
-	return sgtc;
-}
-
-static unsigned int powernow_get(unsigned int cpu)
-{
-	union msr_fidvidstatus fidvidstatus;
-	unsigned int cfid;
-
-	if (cpu)
-		return 0;
-	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-	cfid = fidvidstatus.bits.CFID;
-
-	return fsb * fid_codes[cfid] / 10;
-}
-
-
-static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
-{
-	printk(KERN_WARNING PFX
-		"%s laptop with broken PST tables in BIOS detected.\n",
-		d->ident);
-	printk(KERN_WARNING PFX
-		"You need to downgrade to 3A21 (09/09/2002), or try a newer "
-		"BIOS than 3A71 (01/20/2003)\n");
-	printk(KERN_WARNING PFX
-		"cpufreq scaling has been disabled as a result of this.\n");
-	return 0;
-}
-
-/*
- * Some Athlon laptops have really fucked PST tables.
- * A BIOS update is all that can save them.
- * Mention this, and disable cpufreq.
- */
-static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
-	{
-		.callback = acer_cpufreq_pst,
-		.ident = "Acer Aspire",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
-			DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
-		},
-	},
-	{ }
-};
-
-static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
-{
-	union msr_fidvidstatus fidvidstatus;
-	int result;
-
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
-
-	recalibrate_cpu_khz();
-
-	fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
-	if (!fsb) {
-		printk(KERN_WARNING PFX "can not determine bus frequency\n");
-		return -EINVAL;
-	}
-	dprintk("FSB: %3dMHz\n", fsb/1000);
-
-	if (dmi_check_system(powernow_dmi_table) || acpi_force) {
-		printk(KERN_INFO PFX "PSB/PST known to be broken.  "
-				"Trying ACPI instead\n");
-		result = powernow_acpi_init();
-	} else {
-		result = powernow_decode_bios(fidvidstatus.bits.MFID,
-				fidvidstatus.bits.SVID);
-		if (result) {
-			printk(KERN_INFO PFX "Trying ACPI perflib\n");
-			maximum_speed = 0;
-			minimum_speed = -1;
-			latency = 0;
-			result = powernow_acpi_init();
-			if (result) {
-				printk(KERN_INFO PFX
-					"ACPI and legacy methods failed\n");
-			}
-		} else {
-			/* SGTC use the bus clock as timer */
-			latency = fixup_sgtc();
-			printk(KERN_INFO PFX "SGTC: %d\n", latency);
-		}
-	}
-
-	if (result)
-		return result;
-
-	printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
-				minimum_speed/1000, maximum_speed/1000);
-
-	policy->cpuinfo.transition_latency =
-		cpufreq_scale(2000000UL, fsb, latency);
-
-	policy->cur = powernow_get(0);
-
-	cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
-
-	return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
-}
-
-static int powernow_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-	if (acpi_processor_perf) {
-		acpi_processor_unregister_performance(acpi_processor_perf, 0);
-		free_cpumask_var(acpi_processor_perf->shared_cpu_map);
-		kfree(acpi_processor_perf);
-	}
-#endif
-
-	kfree(powernow_table);
-	return 0;
-}
-
-static struct freq_attr *powernow_table_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver powernow_driver = {
-	.verify		= powernow_verify,
-	.target		= powernow_target,
-	.get		= powernow_get,
-#ifdef CONFIG_X86_POWERNOW_K7_ACPI
-	.bios_limit	= acpi_processor_get_bios_limit,
-#endif
-	.init		= powernow_cpu_init,
-	.exit		= powernow_cpu_exit,
-	.name		= "powernow-k7",
-	.owner		= THIS_MODULE,
-	.attr		= powernow_table_attr,
-};
-
-static int __init powernow_init(void)
-{
-	if (check_powernow() == 0)
-		return -ENODEV;
-	return cpufreq_register_driver(&powernow_driver);
-}
-
-
-static void __exit powernow_exit(void)
-{
-	cpufreq_unregister_driver(&powernow_driver);
-}
-
-module_param(acpi_force,  int, 0444);
-MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
-MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernow_init);
-module_exit(powernow_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *  (C) 2003 Dave Jones.
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  AMD-specific information
- *
- */
-
-union msr_fidvidctl {
-	struct {
-		unsigned FID:5,			// 4:0
-		reserved1:3,	// 7:5
-		VID:5,			// 12:8
-		reserved2:3,	// 15:13
-		FIDC:1,			// 16
-		VIDC:1,			// 17
-		reserved3:2,	// 19:18
-		FIDCHGRATIO:1,	// 20
-		reserved4:11,	// 31-21
-		SGTC:20,		// 32:51
-		reserved5:12;	// 63:52
-	} bits;
-	unsigned long long val;
-};
-
-union msr_fidvidstatus {
-	struct {
-		unsigned CFID:5,			// 4:0
-		reserved1:3,	// 7:5
-		SFID:5,			// 12:8
-		reserved2:3,	// 15:13
-		MFID:5,			// 20:16
-		reserved3:11,	// 31:21
-		CVID:5,			// 36:32
-		reserved4:3,	// 39:37
-		SVID:5,			// 44:40
-		reserved5:3,	// 47:45
-		MVID:5,			// 52:48
-		reserved6:11;	// 63:53
-	} bits;
-	unsigned long long val;
-};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 2368e38327b..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1607 +0,0 @@
-/*
- *   (c) 2003-2010 Advanced Micro Devices, Inc.
- *  Your use of this code is subject to the terms and conditions of the
- *  GNU general public license version 2. See "COPYING" or
- *  http://www.gnu.org/licenses/gpl.html
- *
- *  Support : mark.langsdorf@amd.com
- *
- *  Based on the powernow-k7.c module written by Dave Jones.
- *  (C) 2003 Dave Jones on behalf of SuSE Labs
- *  (C) 2004 Dominik Brodowski <linux@brodo.de>
- *  (C) 2004 Pavel Machek <pavel@ucw.cz>
- *  Licensed under the terms of the GNU GPL License version 2.
- *  Based upon datasheets & sample CPUs kindly provided by AMD.
- *
- *  Valuable input gratefully received from Dave Jones, Pavel Machek,
- *  Dominik Brodowski, Jacob Shin, and others.
- *  Originally developed by Paul Devriendt.
- *  Processor information obtained from Chapter 9 (Power and Thermal Management)
- *  of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
- *  Opteron Processors" available for download from www.amd.com
- *
- *  Tables for specific CPUs can be inferred from
- *     http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/cpumask.h>
-#include <linux/sched.h>	/* for current / set_cpus_allowed() */
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/msr.h>
-
-#include <linux/acpi.h>
-#include <linux/mutex.h>
-#include <acpi/processor.h>
-
-#define PFX "powernow-k8: "
-#define VERSION "version 2.20.00"
-#include "powernow-k8.h"
-#include "mperf.h"
-
-/* serialize freq changes  */
-static DEFINE_MUTEX(fidvid_mutex);
-
-static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
-
-static int cpu_family = CPU_OPTERON;
-
-/* core performance boost */
-static bool cpb_capable, cpb_enabled;
-static struct msr __percpu *msrs;
-
-static struct cpufreq_driver cpufreq_amd64_driver;
-
-#ifndef CONFIG_SMP
-static inline const struct cpumask *cpu_core_mask(int cpu)
-{
-	return cpumask_of(0);
-}
-#endif
-
-/* Return a frequency in MHz, given an input fid */
-static u32 find_freq_from_fid(u32 fid)
-{
-	return 800 + (fid * 100);
-}
-
-/* Return a frequency in KHz, given an input fid */
-static u32 find_khz_freq_from_fid(u32 fid)
-{
-	return 1000 * find_freq_from_fid(fid);
-}
-
-static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
-		u32 pstate)
-{
-	return data[pstate].frequency;
-}
-
-/* Return the vco fid for an input fid
- *
- * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
- * only from corresponding high fids. This returns "high" fid corresponding to
- * "low" one.
- */
-static u32 convert_fid_to_vco_fid(u32 fid)
-{
-	if (fid < HI_FID_TABLE_BOTTOM)
-		return 8 + (2 * fid);
-	else
-		return fid;
-}
-
-/*
- * Return 1 if the pending bit is set. Unless we just instructed the processor
- * to transition to a new state, seeing this bit set is really bad news.
- */
-static int pending_bit_stuck(void)
-{
-	u32 lo, hi;
-
-	if (cpu_family == CPU_HW_PSTATE)
-		return 0;
-
-	rdmsr(MSR_FIDVID_STATUS, lo, hi);
-	return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
-}
-
-/*
- * Update the global current fid / vid values from the status msr.
- * Returns 1 on error.
- */
-static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
-{
-	u32 lo, hi;
-	u32 i = 0;
-
-	if (cpu_family == CPU_HW_PSTATE) {
-		rdmsr(MSR_PSTATE_STATUS, lo, hi);
-		i = lo & HW_PSTATE_MASK;
-		data->currpstate = i;
-
-		/*
-		 * a workaround for family 11h erratum 311 might cause
-		 * an "out-of-range Pstate if the core is in Pstate-0
-		 */
-		if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
-			data->currpstate = HW_PSTATE_0;
-
-		return 0;
-	}
-	do {
-		if (i++ > 10000) {
-			dprintk("detected change pending stuck\n");
-			return 1;
-		}
-		rdmsr(MSR_FIDVID_STATUS, lo, hi);
-	} while (lo & MSR_S_LO_CHANGE_PENDING);
-
-	data->currvid = hi & MSR_S_HI_CURRENT_VID;
-	data->currfid = lo & MSR_S_LO_CURRENT_FID;
-
-	return 0;
-}
-
-/* the isochronous relief time */
-static void count_off_irt(struct powernow_k8_data *data)
-{
-	udelay((1 << data->irt) * 10);
-	return;
-}
-
-/* the voltage stabilization time */
-static void count_off_vst(struct powernow_k8_data *data)
-{
-	udelay(data->vstable * VST_UNITS_20US);
-	return;
-}
-
-/* need to init the control msr to a safe value (for each cpu) */
-static void fidvid_msr_init(void)
-{
-	u32 lo, hi;
-	u8 fid, vid;
-
-	rdmsr(MSR_FIDVID_STATUS, lo, hi);
-	vid = hi & MSR_S_HI_CURRENT_VID;
-	fid = lo & MSR_S_LO_CURRENT_FID;
-	lo = fid | (vid << MSR_C_LO_VID_SHIFT);
-	hi = MSR_C_HI_STP_GNT_BENIGN;
-	dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
-	wrmsr(MSR_FIDVID_CTL, lo, hi);
-}
-
-/* write the new fid value along with the other control fields to the msr */
-static int write_new_fid(struct powernow_k8_data *data, u32 fid)
-{
-	u32 lo;
-	u32 savevid = data->currvid;
-	u32 i = 0;
-
-	if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
-		printk(KERN_ERR PFX "internal error - overflow on fid write\n");
-		return 1;
-	}
-
-	lo = fid;
-	lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
-	lo |= MSR_C_LO_INIT_FID_VID;
-
-	dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
-		fid, lo, data->plllock * PLL_LOCK_CONVERSION);
-
-	do {
-		wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
-		if (i++ > 100) {
-			printk(KERN_ERR PFX
-				"Hardware error - pending bit very stuck - "
-				"no further pstate changes possible\n");
-			return 1;
-		}
-	} while (query_current_values_with_pending_wait(data));
-
-	count_off_irt(data);
-
-	if (savevid != data->currvid) {
-		printk(KERN_ERR PFX
-			"vid change on fid trans, old 0x%x, new 0x%x\n",
-			savevid, data->currvid);
-		return 1;
-	}
-
-	if (fid != data->currfid) {
-		printk(KERN_ERR PFX
-			"fid trans failed, fid 0x%x, curr 0x%x\n", fid,
-			data->currfid);
-		return 1;
-	}
-
-	return 0;
-}
-
-/* Write a new vid to the hardware */
-static int write_new_vid(struct powernow_k8_data *data, u32 vid)
-{
-	u32 lo;
-	u32 savefid = data->currfid;
-	int i = 0;
-
-	if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
-		printk(KERN_ERR PFX "internal error - overflow on vid write\n");
-		return 1;
-	}
-
-	lo = data->currfid;
-	lo |= (vid << MSR_C_LO_VID_SHIFT);
-	lo |= MSR_C_LO_INIT_FID_VID;
-
-	dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
-		vid, lo, STOP_GRANT_5NS);
-
-	do {
-		wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
-		if (i++ > 100) {
-			printk(KERN_ERR PFX "internal error - pending bit "
-					"very stuck - no further pstate "
-					"changes possible\n");
-			return 1;
-		}
-	} while (query_current_values_with_pending_wait(data));
-
-	if (savefid != data->currfid) {
-		printk(KERN_ERR PFX "fid changed on vid trans, old "
-			"0x%x new 0x%x\n",
-		       savefid, data->currfid);
-		return 1;
-	}
-
-	if (vid != data->currvid) {
-		printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
-				"curr 0x%x\n",
-				vid, data->currvid);
-		return 1;
-	}
-
-	return 0;
-}
-
-/*
- * Reduce the vid by the max of step or reqvid.
- * Decreasing vid codes represent increasing voltages:
- * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
- */
-static int decrease_vid_code_by_step(struct powernow_k8_data *data,
-		u32 reqvid, u32 step)
-{
-	if ((data->currvid - reqvid) > step)
-		reqvid = data->currvid - step;
-
-	if (write_new_vid(data, reqvid))
-		return 1;
-
-	count_off_vst(data);
-
-	return 0;
-}
-
-/* Change hardware pstate by single MSR write */
-static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
-{
-	wrmsr(MSR_PSTATE_CTRL, pstate, 0);
-	data->currpstate = pstate;
-	return 0;
-}
-
-/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
-static int transition_fid_vid(struct powernow_k8_data *data,
-		u32 reqfid, u32 reqvid)
-{
-	if (core_voltage_pre_transition(data, reqvid, reqfid))
-		return 1;
-
-	if (core_frequency_transition(data, reqfid))
-		return 1;
-
-	if (core_voltage_post_transition(data, reqvid))
-		return 1;
-
-	if (query_current_values_with_pending_wait(data))
-		return 1;
-
-	if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
-		printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
-				"curr 0x%x 0x%x\n",
-				smp_processor_id(),
-				reqfid, reqvid, data->currfid, data->currvid);
-		return 1;
-	}
-
-	dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
-		smp_processor_id(), data->currfid, data->currvid);
-
-	return 0;
-}
-
-/* Phase 1 - core voltage transition ... setup voltage */
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
-		u32 reqvid, u32 reqfid)
-{
-	u32 rvosteps = data->rvo;
-	u32 savefid = data->currfid;
-	u32 maxvid, lo, rvomult = 1;
-
-	dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
-		"reqvid 0x%x, rvo 0x%x\n",
-		smp_processor_id(),
-		data->currfid, data->currvid, reqvid, data->rvo);
-
-	if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
-		rvomult = 2;
-	rvosteps *= rvomult;
-	rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
-	maxvid = 0x1f & (maxvid >> 16);
-	dprintk("ph1 maxvid=0x%x\n", maxvid);
-	if (reqvid < maxvid) /* lower numbers are higher voltages */
-		reqvid = maxvid;
-
-	while (data->currvid > reqvid) {
-		dprintk("ph1: curr 0x%x, req vid 0x%x\n",
-			data->currvid, reqvid);
-		if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
-			return 1;
-	}
-
-	while ((rvosteps > 0) &&
-			((rvomult * data->rvo + data->currvid) > reqvid)) {
-		if (data->currvid == maxvid) {
-			rvosteps = 0;
-		} else {
-			dprintk("ph1: changing vid for rvo, req 0x%x\n",
-				data->currvid - 1);
-			if (decrease_vid_code_by_step(data, data->currvid-1, 1))
-				return 1;
-			rvosteps--;
-		}
-	}
-
-	if (query_current_values_with_pending_wait(data))
-		return 1;
-
-	if (savefid != data->currfid) {
-		printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
-				data->currfid);
-		return 1;
-	}
-
-	dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
-		data->currfid, data->currvid);
-
-	return 0;
-}
-
-/* Phase 2 - core frequency transition */
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
-{
-	u32 vcoreqfid, vcocurrfid, vcofiddiff;
-	u32 fid_interval, savevid = data->currvid;
-
-	if (data->currfid == reqfid) {
-		printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
-				data->currfid);
-		return 0;
-	}
-
-	dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
-		"reqfid 0x%x\n",
-		smp_processor_id(),
-		data->currfid, data->currvid, reqfid);
-
-	vcoreqfid = convert_fid_to_vco_fid(reqfid);
-	vcocurrfid = convert_fid_to_vco_fid(data->currfid);
-	vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
-	    : vcoreqfid - vcocurrfid;
-
-	if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
-		vcofiddiff = 0;
-
-	while (vcofiddiff > 2) {
-		(data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
-
-		if (reqfid > data->currfid) {
-			if (data->currfid > LO_FID_TABLE_TOP) {
-				if (write_new_fid(data,
-						data->currfid + fid_interval))
-					return 1;
-			} else {
-				if (write_new_fid
-				    (data,
-				     2 + convert_fid_to_vco_fid(data->currfid)))
-					return 1;
-			}
-		} else {
-			if (write_new_fid(data, data->currfid - fid_interval))
-				return 1;
-		}
-
-		vcocurrfid = convert_fid_to_vco_fid(data->currfid);
-		vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
-		    : vcoreqfid - vcocurrfid;
-	}
-
-	if (write_new_fid(data, reqfid))
-		return 1;
-
-	if (query_current_values_with_pending_wait(data))
-		return 1;
-
-	if (data->currfid != reqfid) {
-		printk(KERN_ERR PFX
-			"ph2: mismatch, failed fid transition, "
-			"curr 0x%x, req 0x%x\n",
-			data->currfid, reqfid);
-		return 1;
-	}
-
-	if (savevid != data->currvid) {
-		printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
-			savevid, data->currvid);
-		return 1;
-	}
-
-	dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
-		data->currfid, data->currvid);
-
-	return 0;
-}
-
-/* Phase 3 - core voltage transition flow ... jump to the final vid. */
-static int core_voltage_post_transition(struct powernow_k8_data *data,
-		u32 reqvid)
-{
-	u32 savefid = data->currfid;
-	u32 savereqvid = reqvid;
-
-	dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
-		smp_processor_id(),
-		data->currfid, data->currvid);
-
-	if (reqvid != data->currvid) {
-		if (write_new_vid(data, reqvid))
-			return 1;
-
-		if (savefid != data->currfid) {
-			printk(KERN_ERR PFX
-			       "ph3: bad fid change, save 0x%x, curr 0x%x\n",
-			       savefid, data->currfid);
-			return 1;
-		}
-
-		if (data->currvid != reqvid) {
-			printk(KERN_ERR PFX
-			       "ph3: failed vid transition\n, "
-			       "req 0x%x, curr 0x%x",
-			       reqvid, data->currvid);
-			return 1;
-		}
-	}
-
-	if (query_current_values_with_pending_wait(data))
-		return 1;
-
-	if (savereqvid != data->currvid) {
-		dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
-		return 1;
-	}
-
-	if (savefid != data->currfid) {
-		dprintk("ph3 failed, currfid changed 0x%x\n",
-			data->currfid);
-		return 1;
-	}
-
-	dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
-		data->currfid, data->currvid);
-
-	return 0;
-}
-
-static void check_supported_cpu(void *_rc)
-{
-	u32 eax, ebx, ecx, edx;
-	int *rc = _rc;
-
-	*rc = -ENODEV;
-
-	if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
-		return;
-
-	eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-	if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
-	    ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
-		return;
-
-	if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
-		if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
-		    ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
-			printk(KERN_INFO PFX
-				"Processor cpuid %x not supported\n", eax);
-			return;
-		}
-
-		eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
-		if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
-			printk(KERN_INFO PFX
-			       "No frequency change capabilities detected\n");
-			return;
-		}
-
-		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
-		if ((edx & P_STATE_TRANSITION_CAPABLE)
-			!= P_STATE_TRANSITION_CAPABLE) {
-			printk(KERN_INFO PFX
-				"Power state transitions not supported\n");
-			return;
-		}
-	} else { /* must be a HW Pstate capable processor */
-		cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
-		if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
-			cpu_family = CPU_HW_PSTATE;
-		else
-			return;
-	}
-
-	*rc = 0;
-}
-
-static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
-		u8 maxvid)
-{
-	unsigned int j;
-	u8 lastfid = 0xff;
-
-	for (j = 0; j < data->numps; j++) {
-		if (pst[j].vid > LEAST_VID) {
-			printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
-			       j, pst[j].vid);
-			return -EINVAL;
-		}
-		if (pst[j].vid < data->rvo) {
-			/* vid + rvo >= 0 */
-			printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
-			       " %d\n", j);
-			return -ENODEV;
-		}
-		if (pst[j].vid < maxvid + data->rvo) {
-			/* vid + rvo >= maxvid */
-			printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
-			       " %d\n", j);
-			return -ENODEV;
-		}
-		if (pst[j].fid > MAX_FID) {
-			printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
-			       " %d\n", j);
-			return -ENODEV;
-		}
-		if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
-			/* Only first fid is allowed to be in "low" range */
-			printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
-			       "0x%x\n", j, pst[j].fid);
-			return -EINVAL;
-		}
-		if (pst[j].fid < lastfid)
-			lastfid = pst[j].fid;
-	}
-	if (lastfid & 1) {
-		printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
-		return -EINVAL;
-	}
-	if (lastfid > LO_FID_TABLE_TOP)
-		printk(KERN_INFO FW_BUG PFX
-			"first fid not from lo freq table\n");
-
-	return 0;
-}
-
-static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
-		unsigned int entry)
-{
-	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
-}
-
-static void print_basics(struct powernow_k8_data *data)
-{
-	int j;
-	for (j = 0; j < data->numps; j++) {
-		if (data->powernow_table[j].frequency !=
-				CPUFREQ_ENTRY_INVALID) {
-			if (cpu_family == CPU_HW_PSTATE) {
-				printk(KERN_INFO PFX
-					"   %d : pstate %d (%d MHz)\n", j,
-					data->powernow_table[j].index,
-					data->powernow_table[j].frequency/1000);
-			} else {
-				printk(KERN_INFO PFX
-					"fid 0x%x (%d MHz), vid 0x%x\n",
-					data->powernow_table[j].index & 0xff,
-					data->powernow_table[j].frequency/1000,
-					data->powernow_table[j].index >> 8);
-			}
-		}
-	}
-	if (data->batps)
-		printk(KERN_INFO PFX "Only %d pstates on battery\n",
-				data->batps);
-}
-
-static u32 freq_from_fid_did(u32 fid, u32 did)
-{
-	u32 mhz = 0;
-
-	if (boot_cpu_data.x86 == 0x10)
-		mhz = (100 * (fid + 0x10)) >> did;
-	else if (boot_cpu_data.x86 == 0x11)
-		mhz = (100 * (fid + 8)) >> did;
-	else
-		BUG();
-
-	return mhz * 1000;
-}
-
-static int fill_powernow_table(struct powernow_k8_data *data,
-		struct pst_s *pst, u8 maxvid)
-{
-	struct cpufreq_frequency_table *powernow_table;
-	unsigned int j;
-
-	if (data->batps) {
-		/* use ACPI support to get full speed on mains power */
-		printk(KERN_WARNING PFX
-			"Only %d pstates usable (use ACPI driver for full "
-			"range\n", data->batps);
-		data->numps = data->batps;
-	}
-
-	for (j = 1; j < data->numps; j++) {
-		if (pst[j-1].fid >= pst[j].fid) {
-			printk(KERN_ERR PFX "PST out of sequence\n");
-			return -EINVAL;
-		}
-	}
-
-	if (data->numps < 2) {
-		printk(KERN_ERR PFX "no p states to transition\n");
-		return -ENODEV;
-	}
-
-	if (check_pst_table(data, pst, maxvid))
-		return -EINVAL;
-
-	powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
-		* (data->numps + 1)), GFP_KERNEL);
-	if (!powernow_table) {
-		printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
-		return -ENOMEM;
-	}
-
-	for (j = 0; j < data->numps; j++) {
-		int freq;
-		powernow_table[j].index = pst[j].fid; /* lower 8 bits */
-		powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
-		freq = find_khz_freq_from_fid(pst[j].fid);
-		powernow_table[j].frequency = freq;
-	}
-	powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
-	powernow_table[data->numps].index = 0;
-
-	if (query_current_values_with_pending_wait(data)) {
-		kfree(powernow_table);
-		return -EIO;
-	}
-
-	dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
-	data->powernow_table = powernow_table;
-	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
-		print_basics(data);
-
-	for (j = 0; j < data->numps; j++)
-		if ((pst[j].fid == data->currfid) &&
-		    (pst[j].vid == data->currvid))
-			return 0;
-
-	dprintk("currfid/vid do not match PST, ignoring\n");
-	return 0;
-}
-
-/* Find and validate the PSB/PST table in BIOS. */
-static int find_psb_table(struct powernow_k8_data *data)
-{
-	struct psb_s *psb;
-	unsigned int i;
-	u32 mvs;
-	u8 maxvid;
-	u32 cpst = 0;
-	u32 thiscpuid;
-
-	for (i = 0xc0000; i < 0xffff0; i += 0x10) {
-		/* Scan BIOS looking for the signature. */
-		/* It can not be at ffff0 - it is too big. */
-
-		psb = phys_to_virt(i);
-		if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
-			continue;
-
-		dprintk("found PSB header at 0x%p\n", psb);
-
-		dprintk("table vers: 0x%x\n", psb->tableversion);
-		if (psb->tableversion != PSB_VERSION_1_4) {
-			printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
-			return -ENODEV;
-		}
-
-		dprintk("flags: 0x%x\n", psb->flags1);
-		if (psb->flags1) {
-			printk(KERN_ERR FW_BUG PFX "unknown flags\n");
-			return -ENODEV;
-		}
-
-		data->vstable = psb->vstable;
-		dprintk("voltage stabilization time: %d(*20us)\n",
-				data->vstable);
-
-		dprintk("flags2: 0x%x\n", psb->flags2);
-		data->rvo = psb->flags2 & 3;
-		data->irt = ((psb->flags2) >> 2) & 3;
-		mvs = ((psb->flags2) >> 4) & 3;
-		data->vidmvs = 1 << mvs;
-		data->batps = ((psb->flags2) >> 6) & 3;
-
-		dprintk("ramp voltage offset: %d\n", data->rvo);
-		dprintk("isochronous relief time: %d\n", data->irt);
-		dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
-
-		dprintk("numpst: 0x%x\n", psb->num_tables);
-		cpst = psb->num_tables;
-		if ((psb->cpuid == 0x00000fc0) ||
-		    (psb->cpuid == 0x00000fe0)) {
-			thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
-			if ((thiscpuid == 0x00000fc0) ||
-			    (thiscpuid == 0x00000fe0))
-				cpst = 1;
-		}
-		if (cpst != 1) {
-			printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
-			return -ENODEV;
-		}
-
-		data->plllock = psb->plllocktime;
-		dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
-		dprintk("maxfid: 0x%x\n", psb->maxfid);
-		dprintk("maxvid: 0x%x\n", psb->maxvid);
-		maxvid = psb->maxvid;
-
-		data->numps = psb->numps;
-		dprintk("numpstates: 0x%x\n", data->numps);
-		return fill_powernow_table(data,
-				(struct pst_s *)(psb+1), maxvid);
-	}
-	/*
-	 * If you see this message, complain to BIOS manufacturer. If
-	 * he tells you "we do not support Linux" or some similar
-	 * nonsense, remember that Windows 2000 uses the same legacy
-	 * mechanism that the old Linux PSB driver uses. Tell them it
-	 * is broken with Windows 2000.
-	 *
-	 * The reference to the AMD documentation is chapter 9 in the
-	 * BIOS and Kernel Developer's Guide, which is available on
-	 * www.amd.com
-	 */
-	printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
-	printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
-		" and Cool'N'Quiet support is enabled in BIOS setup\n");
-	return -ENODEV;
-}
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
-		unsigned int index)
-{
-	u64 control;
-
-	if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
-		return;
-
-	control = data->acpi_data.states[index].control;
-	data->irt = (control >> IRT_SHIFT) & IRT_MASK;
-	data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
-	data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
-	data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
-	data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
-	data->vstable = (control >> VST_SHIFT) & VST_MASK;
-}
-
-static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
-{
-	struct cpufreq_frequency_table *powernow_table;
-	int ret_val = -ENODEV;
-	u64 control, status;
-
-	if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
-		dprintk("register performance failed: bad ACPI data\n");
-		return -EIO;
-	}
-
-	/* verify the data contained in the ACPI structures */
-	if (data->acpi_data.state_count <= 1) {
-		dprintk("No ACPI P-States\n");
-		goto err_out;
-	}
-
-	control = data->acpi_data.control_register.space_id;
-	status = data->acpi_data.status_register.space_id;
-
-	if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
-	    (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
-		dprintk("Invalid control/status registers (%x - %x)\n",
-			control, status);
-		goto err_out;
-	}
-
-	/* fill in data->powernow_table */
-	powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
-		* (data->acpi_data.state_count + 1)), GFP_KERNEL);
-	if (!powernow_table) {
-		dprintk("powernow_table memory alloc failure\n");
-		goto err_out;
-	}
-
-	/* fill in data */
-	data->numps = data->acpi_data.state_count;
-	powernow_k8_acpi_pst_values(data, 0);
-
-	if (cpu_family == CPU_HW_PSTATE)
-		ret_val = fill_powernow_table_pstate(data, powernow_table);
-	else
-		ret_val = fill_powernow_table_fidvid(data, powernow_table);
-	if (ret_val)
-		goto err_out_mem;
-
-	powernow_table[data->acpi_data.state_count].frequency =
-		CPUFREQ_TABLE_END;
-	powernow_table[data->acpi_data.state_count].index = 0;
-	data->powernow_table = powernow_table;
-
-	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
-		print_basics(data);
-
-	/* notify BIOS that we exist */
-	acpi_processor_notify_smm(THIS_MODULE);
-
-	if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
-		printk(KERN_ERR PFX
-				"unable to alloc powernow_k8_data cpumask\n");
-		ret_val = -ENOMEM;
-		goto err_out_mem;
-	}
-
-	return 0;
-
-err_out_mem:
-	kfree(powernow_table);
-
-err_out:
-	acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
-
-	/* data->acpi_data.state_count informs us at ->exit()
-	 * whether ACPI was used */
-	data->acpi_data.state_count = 0;
-
-	return ret_val;
-}
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data,
-		struct cpufreq_frequency_table *powernow_table)
-{
-	int i;
-	u32 hi = 0, lo = 0;
-	rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
-	data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
-
-	for (i = 0; i < data->acpi_data.state_count; i++) {
-		u32 index;
-
-		index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
-		if (index > data->max_hw_pstate) {
-			printk(KERN_ERR PFX "invalid pstate %d - "
-					"bad value %d.\n", i, index);
-			printk(KERN_ERR PFX "Please report to BIOS "
-					"manufacturer\n");
-			invalidate_entry(powernow_table, i);
-			continue;
-		}
-		rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
-		if (!(hi & HW_PSTATE_VALID_MASK)) {
-			dprintk("invalid pstate %d, ignoring\n", index);
-			invalidate_entry(powernow_table, i);
-			continue;
-		}
-
-		powernow_table[i].index = index;
-
-		/* Frequency may be rounded for these */
-		if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
-				 || boot_cpu_data.x86 == 0x11) {
-			powernow_table[i].frequency =
-				freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
-		} else
-			powernow_table[i].frequency =
-				data->acpi_data.states[i].core_frequency * 1000;
-	}
-	return 0;
-}
-
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
-		struct cpufreq_frequency_table *powernow_table)
-{
-	int i;
-
-	for (i = 0; i < data->acpi_data.state_count; i++) {
-		u32 fid;
-		u32 vid;
-		u32 freq, index;
-		u64 status, control;
-
-		if (data->exttype) {
-			status =  data->acpi_data.states[i].status;
-			fid = status & EXT_FID_MASK;
-			vid = (status >> VID_SHIFT) & EXT_VID_MASK;
-		} else {
-			control =  data->acpi_data.states[i].control;
-			fid = control & FID_MASK;
-			vid = (control >> VID_SHIFT) & VID_MASK;
-		}
-
-		dprintk("   %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
-
-		index = fid | (vid<<8);
-		powernow_table[i].index = index;
-
-		freq = find_khz_freq_from_fid(fid);
-		powernow_table[i].frequency = freq;
-
-		/* verify frequency is OK */
-		if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
-			dprintk("invalid freq %u kHz, ignoring\n", freq);
-			invalidate_entry(powernow_table, i);
-			continue;
-		}
-
-		/* verify voltage is OK -
-		 * BIOSs are using "off" to indicate invalid */
-		if (vid == VID_OFF) {
-			dprintk("invalid vid %u, ignoring\n", vid);
-			invalidate_entry(powernow_table, i);
-			continue;
-		}
-
-		if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
-			printk(KERN_INFO PFX "invalid freq entries "
-				"%u kHz vs. %u kHz\n", freq,
-				(unsigned int)
-				(data->acpi_data.states[i].core_frequency
-				 * 1000));
-			invalidate_entry(powernow_table, i);
-			continue;
-		}
-	}
-	return 0;
-}
-
-static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
-{
-	if (data->acpi_data.state_count)
-		acpi_processor_unregister_performance(&data->acpi_data,
-				data->cpu);
-	free_cpumask_var(data->acpi_data.shared_cpu_map);
-}
-
-static int get_transition_latency(struct powernow_k8_data *data)
-{
-	int max_latency = 0;
-	int i;
-	for (i = 0; i < data->acpi_data.state_count; i++) {
-		int cur_latency = data->acpi_data.states[i].transition_latency
-			+ data->acpi_data.states[i].bus_master_latency;
-		if (cur_latency > max_latency)
-			max_latency = cur_latency;
-	}
-	if (max_latency == 0) {
-		/*
-		 * Fam 11h and later may return 0 as transition latency. This
-		 * is intended and means "very fast". While cpufreq core and
-		 * governors currently can handle that gracefully, better set it
-		 * to 1 to avoid problems in the future.
-		 */
-		if (boot_cpu_data.x86 < 0x11)
-			printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
-				"latency\n");
-		max_latency = 1;
-	}
-	/* value in usecs, needs to be in nanoseconds */
-	return 1000 * max_latency;
-}
-
-/* Take a frequency, and issue the fid/vid transition command */
-static int transition_frequency_fidvid(struct powernow_k8_data *data,
-		unsigned int index)
-{
-	u32 fid = 0;
-	u32 vid = 0;
-	int res, i;
-	struct cpufreq_freqs freqs;
-
-	dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
-	/* fid/vid correctness check for k8 */
-	/* fid are the lower 8 bits of the index we stored into
-	 * the cpufreq frequency table in find_psb_table, vid
-	 * are the upper 8 bits.
-	 */
-	fid = data->powernow_table[index].index & 0xFF;
-	vid = (data->powernow_table[index].index & 0xFF00) >> 8;
-
-	dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
-
-	if (query_current_values_with_pending_wait(data))
-		return 1;
-
-	if ((data->currvid == vid) && (data->currfid == fid)) {
-		dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
-			fid, vid);
-		return 0;
-	}
-
-	dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
-		smp_processor_id(), fid, vid);
-	freqs.old = find_khz_freq_from_fid(data->currfid);
-	freqs.new = find_khz_freq_from_fid(fid);
-
-	for_each_cpu(i, data->available_cores) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	}
-
-	res = transition_fid_vid(data, fid, vid);
-	freqs.new = find_khz_freq_from_fid(data->currfid);
-
-	for_each_cpu(i, data->available_cores) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	}
-	return res;
-}
-
-/* Take a frequency, and issue the hardware pstate transition command */
-static int transition_frequency_pstate(struct powernow_k8_data *data,
-		unsigned int index)
-{
-	u32 pstate = 0;
-	int res, i;
-	struct cpufreq_freqs freqs;
-
-	dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
-
-	/* get MSR index for hardware pstate transition */
-	pstate = index & HW_PSTATE_MASK;
-	if (pstate > data->max_hw_pstate)
-		return 0;
-	freqs.old = find_khz_freq_from_pstate(data->powernow_table,
-			data->currpstate);
-	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
-	for_each_cpu(i, data->available_cores) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	}
-
-	res = transition_pstate(data, pstate);
-	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
-
-	for_each_cpu(i, data->available_cores) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	}
-	return res;
-}
-
-/* Driver entry point to switch to the target frequency */
-static int powernowk8_target(struct cpufreq_policy *pol,
-		unsigned targfreq, unsigned relation)
-{
-	cpumask_var_t oldmask;
-	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-	u32 checkfid;
-	u32 checkvid;
-	unsigned int newstate;
-	int ret = -EIO;
-
-	if (!data)
-		return -EINVAL;
-
-	checkfid = data->currfid;
-	checkvid = data->currvid;
-
-	/* only run on specific CPU from here on. */
-	/* This is poor form: use a workqueue or smp_call_function_single */
-	if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
-		return -ENOMEM;
-
-	cpumask_copy(oldmask, tsk_cpus_allowed(current));
-	set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
-
-	if (smp_processor_id() != pol->cpu) {
-		printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
-		goto err_out;
-	}
-
-	if (pending_bit_stuck()) {
-		printk(KERN_ERR PFX "failing targ, change pending bit set\n");
-		goto err_out;
-	}
-
-	dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
-		pol->cpu, targfreq, pol->min, pol->max, relation);
-
-	if (query_current_values_with_pending_wait(data))
-		goto err_out;
-
-	if (cpu_family != CPU_HW_PSTATE) {
-		dprintk("targ: curr fid 0x%x, vid 0x%x\n",
-		data->currfid, data->currvid);
-
-		if ((checkvid != data->currvid) ||
-		    (checkfid != data->currfid)) {
-			printk(KERN_INFO PFX
-				"error - out of sync, fix 0x%x 0x%x, "
-				"vid 0x%x 0x%x\n",
-				checkfid, data->currfid,
-				checkvid, data->currvid);
-		}
-	}
-
-	if (cpufreq_frequency_table_target(pol, data->powernow_table,
-				targfreq, relation, &newstate))
-		goto err_out;
-
-	mutex_lock(&fidvid_mutex);
-
-	powernow_k8_acpi_pst_values(data, newstate);
-
-	if (cpu_family == CPU_HW_PSTATE)
-		ret = transition_frequency_pstate(data, newstate);
-	else
-		ret = transition_frequency_fidvid(data, newstate);
-	if (ret) {
-		printk(KERN_ERR PFX "transition frequency failed\n");
-		ret = 1;
-		mutex_unlock(&fidvid_mutex);
-		goto err_out;
-	}
-	mutex_unlock(&fidvid_mutex);
-
-	if (cpu_family == CPU_HW_PSTATE)
-		pol->cur = find_khz_freq_from_pstate(data->powernow_table,
-				newstate);
-	else
-		pol->cur = find_khz_freq_from_fid(data->currfid);
-	ret = 0;
-
-err_out:
-	set_cpus_allowed_ptr(current, oldmask);
-	free_cpumask_var(oldmask);
-	return ret;
-}
-
-/* Driver entry point to verify the policy and range of frequencies */
-static int powernowk8_verify(struct cpufreq_policy *pol)
-{
-	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
-	if (!data)
-		return -EINVAL;
-
-	return cpufreq_frequency_table_verify(pol, data->powernow_table);
-}
-
-struct init_on_cpu {
-	struct powernow_k8_data *data;
-	int rc;
-};
-
-static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
-{
-	struct init_on_cpu *init_on_cpu = _init_on_cpu;
-
-	if (pending_bit_stuck()) {
-		printk(KERN_ERR PFX "failing init, change pending bit set\n");
-		init_on_cpu->rc = -ENODEV;
-		return;
-	}
-
-	if (query_current_values_with_pending_wait(init_on_cpu->data)) {
-		init_on_cpu->rc = -ENODEV;
-		return;
-	}
-
-	if (cpu_family == CPU_OPTERON)
-		fidvid_msr_init();
-
-	init_on_cpu->rc = 0;
-}
-
-/* per CPU init entry point to the driver */
-static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
-{
-	static const char ACPI_PSS_BIOS_BUG_MSG[] =
-		KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
-		FW_BUG PFX "Try again with latest BIOS.\n";
-	struct powernow_k8_data *data;
-	struct init_on_cpu init_on_cpu;
-	int rc;
-	struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
-
-	if (!cpu_online(pol->cpu))
-		return -ENODEV;
-
-	smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
-	if (rc)
-		return -ENODEV;
-
-	data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
-	if (!data) {
-		printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
-		return -ENOMEM;
-	}
-
-	data->cpu = pol->cpu;
-	data->currpstate = HW_PSTATE_INVALID;
-
-	if (powernow_k8_cpu_init_acpi(data)) {
-		/*
-		 * Use the PSB BIOS structure. This is only available on
-		 * an UP version, and is deprecated by AMD.
-		 */
-		if (num_online_cpus() != 1) {
-			printk_once(ACPI_PSS_BIOS_BUG_MSG);
-			goto err_out;
-		}
-		if (pol->cpu != 0) {
-			printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
-			       "CPU other than CPU0. Complain to your BIOS "
-			       "vendor.\n");
-			goto err_out;
-		}
-		rc = find_psb_table(data);
-		if (rc)
-			goto err_out;
-
-		/* Take a crude guess here.
-		 * That guess was in microseconds, so multiply with 1000 */
-		pol->cpuinfo.transition_latency = (
-			 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
-			 ((1 << data->irt) * 30)) * 1000;
-	} else /* ACPI _PSS objects available */
-		pol->cpuinfo.transition_latency = get_transition_latency(data);
-
-	/* only run on specific CPU from here on */
-	init_on_cpu.data = data;
-	smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
-				 &init_on_cpu, 1);
-	rc = init_on_cpu.rc;
-	if (rc != 0)
-		goto err_out_exit_acpi;
-
-	if (cpu_family == CPU_HW_PSTATE)
-		cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
-	else
-		cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
-	data->available_cores = pol->cpus;
-
-	if (cpu_family == CPU_HW_PSTATE)
-		pol->cur = find_khz_freq_from_pstate(data->powernow_table,
-				data->currpstate);
-	else
-		pol->cur = find_khz_freq_from_fid(data->currfid);
-	dprintk("policy current frequency %d kHz\n", pol->cur);
-
-	/* min/max the cpu is capable of */
-	if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
-		printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
-		powernow_k8_cpu_exit_acpi(data);
-		kfree(data->powernow_table);
-		kfree(data);
-		return -EINVAL;
-	}
-
-	/* Check for APERF/MPERF support in hardware */
-	if (cpu_has(c, X86_FEATURE_APERFMPERF))
-		cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
-
-	cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
-
-	if (cpu_family == CPU_HW_PSTATE)
-		dprintk("cpu_init done, current pstate 0x%x\n",
-				data->currpstate);
-	else
-		dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
-			data->currfid, data->currvid);
-
-	per_cpu(powernow_data, pol->cpu) = data;
-
-	return 0;
-
-err_out_exit_acpi:
-	powernow_k8_cpu_exit_acpi(data);
-
-err_out:
-	kfree(data);
-	return -ENODEV;
-}
-
-static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
-{
-	struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
-
-	if (!data)
-		return -EINVAL;
-
-	powernow_k8_cpu_exit_acpi(data);
-
-	cpufreq_frequency_table_put_attr(pol->cpu);
-
-	kfree(data->powernow_table);
-	kfree(data);
-	per_cpu(powernow_data, pol->cpu) = NULL;
-
-	return 0;
-}
-
-static void query_values_on_cpu(void *_err)
-{
-	int *err = _err;
-	struct powernow_k8_data *data = __this_cpu_read(powernow_data);
-
-	*err = query_current_values_with_pending_wait(data);
-}
-
-static unsigned int powernowk8_get(unsigned int cpu)
-{
-	struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
-	unsigned int khz = 0;
-	int err;
-
-	if (!data)
-		return 0;
-
-	smp_call_function_single(cpu, query_values_on_cpu, &err, true);
-	if (err)
-		goto out;
-
-	if (cpu_family == CPU_HW_PSTATE)
-		khz = find_khz_freq_from_pstate(data->powernow_table,
-						data->currpstate);
-	else
-		khz = find_khz_freq_from_fid(data->currfid);
-
-
-out:
-	return khz;
-}
-
-static void _cpb_toggle_msrs(bool t)
-{
-	int cpu;
-
-	get_online_cpus();
-
-	rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
-	for_each_cpu(cpu, cpu_online_mask) {
-		struct msr *reg = per_cpu_ptr(msrs, cpu);
-		if (t)
-			reg->l &= ~BIT(25);
-		else
-			reg->l |= BIT(25);
-	}
-	wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
-	put_online_cpus();
-}
-
-/*
- * Switch on/off core performance boosting.
- *
- * 0=disable
- * 1=enable.
- */
-static void cpb_toggle(bool t)
-{
-	if (!cpb_capable)
-		return;
-
-	if (t && !cpb_enabled) {
-		cpb_enabled = true;
-		_cpb_toggle_msrs(t);
-		printk(KERN_INFO PFX "Core Boosting enabled.\n");
-	} else if (!t && cpb_enabled) {
-		cpb_enabled = false;
-		_cpb_toggle_msrs(t);
-		printk(KERN_INFO PFX "Core Boosting disabled.\n");
-	}
-}
-
-static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
-				 size_t count)
-{
-	int ret = -EINVAL;
-	unsigned long val = 0;
-
-	ret = strict_strtoul(buf, 10, &val);
-	if (!ret && (val == 0 || val == 1) && cpb_capable)
-		cpb_toggle(val);
-	else
-		return -EINVAL;
-
-	return count;
-}
-
-static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
-{
-	return sprintf(buf, "%u\n", cpb_enabled);
-}
-
-#define define_one_rw(_name) \
-static struct freq_attr _name = \
-__ATTR(_name, 0644, show_##_name, store_##_name)
-
-define_one_rw(cpb);
-
-static struct freq_attr *powernow_k8_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	&cpb,
-	NULL,
-};
-
-static struct cpufreq_driver cpufreq_amd64_driver = {
-	.verify		= powernowk8_verify,
-	.target		= powernowk8_target,
-	.bios_limit	= acpi_processor_get_bios_limit,
-	.init		= powernowk8_cpu_init,
-	.exit		= __devexit_p(powernowk8_cpu_exit),
-	.get		= powernowk8_get,
-	.name		= "powernow-k8",
-	.owner		= THIS_MODULE,
-	.attr		= powernow_k8_attr,
-};
-
-/*
- * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
- * cannot block the remaining ones from boosting. On the CPU_UP path we
- * simply keep the boost-disable flag in sync with the current global
- * state.
- */
-static int cpb_notify(struct notifier_block *nb, unsigned long action,
-		      void *hcpu)
-{
-	unsigned cpu = (long)hcpu;
-	u32 lo, hi;
-
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-
-		if (!cpb_enabled) {
-			rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
-			lo |= BIT(25);
-			wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
-		}
-		break;
-
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
-		lo &= ~BIT(25);
-		wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
-		break;
-
-	default:
-		break;
-	}
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block cpb_nb = {
-	.notifier_call		= cpb_notify,
-};
-
-/* driver entry point for init */
-static int __cpuinit powernowk8_init(void)
-{
-	unsigned int i, supported_cpus = 0, cpu;
-	int rv;
-
-	for_each_online_cpu(i) {
-		int rc;
-		smp_call_function_single(i, check_supported_cpu, &rc, 1);
-		if (rc == 0)
-			supported_cpus++;
-	}
-
-	if (supported_cpus != num_online_cpus())
-		return -ENODEV;
-
-	printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
-		num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
-
-	if (boot_cpu_has(X86_FEATURE_CPB)) {
-
-		cpb_capable = true;
-
-		msrs = msrs_alloc();
-		if (!msrs) {
-			printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
-			return -ENOMEM;
-		}
-
-		register_cpu_notifier(&cpb_nb);
-
-		rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
-
-		for_each_cpu(cpu, cpu_online_mask) {
-			struct msr *reg = per_cpu_ptr(msrs, cpu);
-			cpb_enabled |= !(!!(reg->l & BIT(25)));
-		}
-
-		printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
-			(cpb_enabled ? "on" : "off"));
-	}
-
-	rv = cpufreq_register_driver(&cpufreq_amd64_driver);
-	if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
-		unregister_cpu_notifier(&cpb_nb);
-		msrs_free(msrs);
-		msrs = NULL;
-	}
-	return rv;
-}
-
-/* driver entry point for term */
-static void __exit powernowk8_exit(void)
-{
-	dprintk("exit\n");
-
-	if (boot_cpu_has(X86_FEATURE_CPB)) {
-		msrs_free(msrs);
-		msrs = NULL;
-
-		unregister_cpu_notifier(&cpb_nb);
-	}
-
-	cpufreq_unregister_driver(&cpufreq_amd64_driver);
-}
-
-MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
-		"Mark Langsdorf <mark.langsdorf@amd.com>");
-MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
-MODULE_LICENSE("GPL");
-
-late_initcall(powernowk8_init);
-module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- *  (c) 2003-2006 Advanced Micro Devices, Inc.
- *  Your use of this code is subject to the terms and conditions of the
- *  GNU general public license version 2. See "COPYING" or
- *  http://www.gnu.org/licenses/gpl.html
- */
-
-enum pstate {
-	HW_PSTATE_INVALID = 0xff,
-	HW_PSTATE_0 = 0,
-	HW_PSTATE_1 = 1,
-	HW_PSTATE_2 = 2,
-	HW_PSTATE_3 = 3,
-	HW_PSTATE_4 = 4,
-	HW_PSTATE_5 = 5,
-	HW_PSTATE_6 = 6,
-	HW_PSTATE_7 = 7,
-};
-
-struct powernow_k8_data {
-	unsigned int cpu;
-
-	u32 numps;  /* number of p-states */
-	u32 batps;  /* number of p-states supported on battery */
-	u32 max_hw_pstate; /* maximum legal hardware pstate */
-
-	/* these values are constant when the PSB is used to determine
-	 * vid/fid pairings, but are modified during the ->target() call
-	 * when ACPI is used */
-	u32 rvo;     /* ramp voltage offset */
-	u32 irt;     /* isochronous relief time */
-	u32 vidmvs;  /* usable value calculated from mvs */
-	u32 vstable; /* voltage stabilization time, units 20 us */
-	u32 plllock; /* pll lock time, units 1 us */
-        u32 exttype; /* extended interface = 1 */
-
-	/* keep track of the current fid / vid or pstate */
-	u32 currvid;
-	u32 currfid;
-	enum pstate currpstate;
-
-	/* the powernow_table includes all frequency and vid/fid pairings:
-	 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
-	 * frequency is in kHz */
-	struct cpufreq_frequency_table  *powernow_table;
-
-	/* the acpi table needs to be kept. it's only available if ACPI was
-	 * used to determine valid frequency/vid/fid states */
-	struct acpi_processor_performance acpi_data;
-
-	/* we need to keep track of associated cores, but let cpufreq
-	 * handle hotplug events - so just point at cpufreq pol->cpus
-	 * structure */
-	struct cpumask *available_cores;
-};
-
-/* processor's cpuid instruction support */
-#define CPUID_PROCESSOR_SIGNATURE	1	/* function 1 */
-#define CPUID_XFAM			0x0ff00000	/* extended family */
-#define CPUID_XFAM_K8			0
-#define CPUID_XMOD			0x000f0000	/* extended model */
-#define CPUID_XMOD_REV_MASK		0x000c0000
-#define CPUID_XFAM_10H			0x00100000	/* family 0x10 */
-#define CPUID_USE_XFAM_XMOD		0x00000f00
-#define CPUID_GET_MAX_CAPABILITIES	0x80000000
-#define CPUID_FREQ_VOLT_CAPABILITIES	0x80000007
-#define P_STATE_TRANSITION_CAPABLE	6
-
-/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For     */
-/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and   */
-/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
-/* the register number is placed in ecx, and the data is returned in edx:eax. */
-
-#define MSR_FIDVID_CTL      0xc0010041
-#define MSR_FIDVID_STATUS   0xc0010042
-
-/* Field definitions within the FID VID Low Control MSR : */
-#define MSR_C_LO_INIT_FID_VID     0x00010000
-#define MSR_C_LO_NEW_VID          0x00003f00
-#define MSR_C_LO_NEW_FID          0x0000003f
-#define MSR_C_LO_VID_SHIFT        8
-
-/* Field definitions within the FID VID High Control MSR : */
-#define MSR_C_HI_STP_GNT_TO	  0x000fffff
-
-/* Field definitions within the FID VID Low Status MSR : */
-#define MSR_S_LO_CHANGE_PENDING   0x80000000   /* cleared when completed */
-#define MSR_S_LO_MAX_RAMP_VID     0x3f000000
-#define MSR_S_LO_MAX_FID          0x003f0000
-#define MSR_S_LO_START_FID        0x00003f00
-#define MSR_S_LO_CURRENT_FID      0x0000003f
-
-/* Field definitions within the FID VID High Status MSR : */
-#define MSR_S_HI_MIN_WORKING_VID  0x3f000000
-#define MSR_S_HI_MAX_WORKING_VID  0x003f0000
-#define MSR_S_HI_START_VID        0x00003f00
-#define MSR_S_HI_CURRENT_VID      0x0000003f
-#define MSR_C_HI_STP_GNT_BENIGN	  0x00000001
-
-
-/* Hardware Pstate _PSS and MSR definitions */
-#define USE_HW_PSTATE		0x00000080
-#define HW_PSTATE_MASK 		0x00000007
-#define HW_PSTATE_VALID_MASK 	0x80000000
-#define HW_PSTATE_MAX_MASK	0x000000f0
-#define HW_PSTATE_MAX_SHIFT	4
-#define MSR_PSTATE_DEF_BASE 	0xc0010064 /* base of Pstate MSRs */
-#define MSR_PSTATE_STATUS 	0xc0010063 /* Pstate Status MSR */
-#define MSR_PSTATE_CTRL 	0xc0010062 /* Pstate control MSR */
-#define MSR_PSTATE_CUR_LIMIT	0xc0010061 /* pstate current limit MSR */
-
-/* define the two driver architectures */
-#define CPU_OPTERON 0
-#define CPU_HW_PSTATE 1
-
-
-/*
- * There are restrictions frequencies have to follow:
- * - only 1 entry in the low fid table ( <=1.4GHz )
- * - lowest entry in the high fid table must be >= 2 * the entry in the
- *   low fid table
- * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry
- *   in the low fid table
- * - the parts can only step at <= 200 MHz intervals, odd fid values are
- *   supported in revision G and later revisions.
- * - lowest frequency must be >= interprocessor hypertransport link speed
- *   (only applies to MP systems obviously)
- */
-
-/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
-#define LO_FID_TABLE_TOP     7	/* fid values marking the boundary    */
-#define HI_FID_TABLE_BOTTOM  8	/* between the low and high tables    */
-
-#define LO_VCOFREQ_TABLE_TOP    1400	/* corresponding vco frequency values */
-#define HI_VCOFREQ_TABLE_BOTTOM 1600
-
-#define MIN_FREQ_RESOLUTION  200 /* fids jump by 2 matching freq jumps by 200 */
-
-#define MAX_FID 0x2a	/* Spec only gives FID values as far as 5 GHz */
-#define LEAST_VID 0x3e	/* Lowest (numerically highest) useful vid value */
-
-#define MIN_FREQ 800	/* Min and max freqs, per spec */
-#define MAX_FREQ 5000
-
-#define INVALID_FID_MASK 0xffffffc0  /* not a valid fid if these bits are set */
-#define INVALID_VID_MASK 0xffffffc0  /* not a valid vid if these bits are set */
-
-#define VID_OFF 0x3f
-
-#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
-
-#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
-
-#define MAXIMUM_VID_STEPS 1  /* Current cpus only allow a single step of 25mV */
-#define VST_UNITS_20US 20   /* Voltage Stabilization Time is in units of 20us */
-
-/*
- * Most values of interest are encoded in a single field of the _PSS
- * entries: the "control" value.
- */
-
-#define IRT_SHIFT      30
-#define RVO_SHIFT      28
-#define EXT_TYPE_SHIFT 27
-#define PLL_L_SHIFT    20
-#define MVS_SHIFT      18
-#define VST_SHIFT      11
-#define VID_SHIFT       6
-#define IRT_MASK        3
-#define RVO_MASK        3
-#define EXT_TYPE_MASK   1
-#define PLL_L_MASK   0x7f
-#define MVS_MASK        3
-#define VST_MASK     0x7f
-#define VID_MASK     0x1f
-#define FID_MASK     0x1f
-#define EXT_VID_MASK 0x3f
-#define EXT_FID_MASK 0x3f
-
-
-/*
- * Version 1.4 of the PSB table. This table is constructed by BIOS and is
- * to tell the OS's power management driver which VIDs and FIDs are
- * supported by this particular processor.
- * If the data in the PSB / PST is wrong, then this driver will program the
- * wrong values into hardware, which is very likely to lead to a crash.
- */
-
-#define PSB_ID_STRING      "AMDK7PNOW!"
-#define PSB_ID_STRING_LEN  10
-
-#define PSB_VERSION_1_4  0x14
-
-struct psb_s {
-	u8 signature[10];
-	u8 tableversion;
-	u8 flags1;
-	u16 vstable;
-	u8 flags2;
-	u8 num_tables;
-	u32 cpuid;
-	u8 plllocktime;
-	u8 maxfid;
-	u8 maxvid;
-	u8 numps;
-};
-
-/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
-struct pst_s {
-	u8 fid;
-	u8 vid;
-};
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
-
-static int core_voltage_pre_transition(struct powernow_k8_data *data,
-	u32 reqvid, u32 regfid);
-static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
-static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
-
-static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
-
-static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
-static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- *	sc520_freq.c: cpufreq driver for the AMD Elan sc520
- *
- *	Copyright (C) 2005 Sean Young <sean@mess.org>
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	Based on elanfreq.c
- *
- *	2005-03-30: - initial revision
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-
-#include <linux/delay.h>
-#include <linux/cpufreq.h>
-#include <linux/timex.h>
-#include <linux/io.h>
-
-#include <asm/msr.h>
-
-#define MMCR_BASE	0xfffef000	/* The default base address */
-#define OFFS_CPUCTL	0x2   /* CPU Control Register */
-
-static __u8 __iomem *cpuctl;
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"sc520_freq", msg)
-#define PFX "sc520_freq: "
-
-static struct cpufreq_frequency_table sc520_freq_table[] = {
-	{0x01,	100000},
-	{0x02,	133000},
-	{0,	CPUFREQ_TABLE_END},
-};
-
-static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
-{
-	u8 clockspeed_reg = *cpuctl;
-
-	switch (clockspeed_reg & 0x03) {
-	default:
-		printk(KERN_ERR PFX "error: cpuctl register has unexpected "
-				"value %02x\n", clockspeed_reg);
-	case 0x01:
-		return 100000;
-	case 0x02:
-		return 133000;
-	}
-}
-
-static void sc520_freq_set_cpu_state(unsigned int state)
-{
-
-	struct cpufreq_freqs	freqs;
-	u8 clockspeed_reg;
-
-	freqs.old = sc520_freq_get_cpu_frequency(0);
-	freqs.new = sc520_freq_table[state].frequency;
-	freqs.cpu = 0; /* AMD Elan is UP */
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-
-	dprintk("attempting to set frequency to %i kHz\n",
-			sc520_freq_table[state].frequency);
-
-	local_irq_disable();
-
-	clockspeed_reg = *cpuctl & ~0x03;
-	*cpuctl = clockspeed_reg | sc520_freq_table[state].index;
-
-	local_irq_enable();
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-};
-
-static int sc520_freq_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
-}
-
-static int sc520_freq_target(struct cpufreq_policy *policy,
-			    unsigned int target_freq,
-			    unsigned int relation)
-{
-	unsigned int newstate = 0;
-
-	if (cpufreq_frequency_table_target(policy, sc520_freq_table,
-				target_freq, relation, &newstate))
-		return -EINVAL;
-
-	sc520_freq_set_cpu_state(newstate);
-
-	return 0;
-}
-
-
-/*
- *	Module init and exit code
- */
-
-static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	int result;
-
-	/* capability check */
-	if (c->x86_vendor != X86_VENDOR_AMD ||
-	    c->x86 != 4 || c->x86_model != 9)
-		return -ENODEV;
-
-	/* cpuinfo and default policy values */
-	policy->cpuinfo.transition_latency = 1000000; /* 1ms */
-	policy->cur = sc520_freq_get_cpu_frequency(0);
-
-	result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
-	if (result)
-		return result;
-
-	cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
-
-	return 0;
-}
-
-
-static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-
-static struct freq_attr *sc520_freq_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-
-static struct cpufreq_driver sc520_freq_driver = {
-	.get	= sc520_freq_get_cpu_frequency,
-	.verify	= sc520_freq_verify,
-	.target	= sc520_freq_target,
-	.init	= sc520_freq_cpu_init,
-	.exit	= sc520_freq_cpu_exit,
-	.name	= "sc520_freq",
-	.owner	= THIS_MODULE,
-	.attr	= sc520_freq_attr,
-};
-
-
-static int __init sc520_freq_init(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	int err;
-
-	/* Test if we have the right hardware */
-	if (c->x86_vendor != X86_VENDOR_AMD ||
-	    c->x86 != 4 || c->x86_model != 9) {
-		dprintk("no Elan SC520 processor found!\n");
-		return -ENODEV;
-	}
-	cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
-	if (!cpuctl) {
-		printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
-		return -ENOMEM;
-	}
-
-	err = cpufreq_register_driver(&sc520_freq_driver);
-	if (err)
-		iounmap(cpuctl);
-
-	return err;
-}
-
-
-static void __exit sc520_freq_exit(void)
-{
-	cpufreq_unregister_driver(&sc520_freq_driver);
-	iounmap(cpuctl);
-}
-
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Sean Young <sean@mess.org>");
-MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
-
-module_init(sc520_freq_init);
-module_exit(sc520_freq_exit);
-
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
-/*
- * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
- * M (part of the Centrino chipset).
- *
- * Since the original Pentium M, most new Intel CPUs support Enhanced
- * SpeedStep.
- *
- * Despite the "SpeedStep" in the name, this is almost entirely unlike
- * traditional SpeedStep.
- *
- * Modelled on speedstep.c
- *
- * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h>	/* current */
-#include <linux/delay.h>
-#include <linux/compiler.h>
-#include <linux/gfp.h>
-
-#include <asm/msr.h>
-#include <asm/processor.h>
-#include <asm/cpufeature.h>
-
-#define PFX		"speedstep-centrino: "
-#define MAINTAINER	"cpufreq@vger.kernel.org"
-
-#define dprintk(msg...) \
-	cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
-
-#define INTEL_MSR_RANGE	(0xffff)
-
-struct cpu_id
-{
-	__u8	x86;            /* CPU family */
-	__u8	x86_model;	/* model */
-	__u8	x86_mask;	/* stepping */
-};
-
-enum {
-	CPU_BANIAS,
-	CPU_DOTHAN_A1,
-	CPU_DOTHAN_A2,
-	CPU_DOTHAN_B0,
-	CPU_MP4HT_D0,
-	CPU_MP4HT_E0,
-};
-
-static const struct cpu_id cpu_ids[] = {
-	[CPU_BANIAS]	= { 6,  9, 5 },
-	[CPU_DOTHAN_A1]	= { 6, 13, 1 },
-	[CPU_DOTHAN_A2]	= { 6, 13, 2 },
-	[CPU_DOTHAN_B0]	= { 6, 13, 6 },
-	[CPU_MP4HT_D0]	= {15,  3, 4 },
-	[CPU_MP4HT_E0]	= {15,  4, 1 },
-};
-#define N_IDS	ARRAY_SIZE(cpu_ids)
-
-struct cpu_model
-{
-	const struct cpu_id *cpu_id;
-	const char	*model_name;
-	unsigned	max_freq; /* max clock in kHz */
-
-	struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
-};
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
-				  const struct cpu_id *x);
-
-/* Operating points for current CPU */
-static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
-static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
-
-static struct cpufreq_driver centrino_driver;
-
-#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
-
-/* Computes the correct form for IA32_PERF_CTL MSR for a particular
-   frequency/voltage operating point; frequency in MHz, volts in mV.
-   This is stored as "index" in the structure. */
-#define OP(mhz, mv)							\
-	{								\
-		.frequency = (mhz) * 1000,				\
-		.index = (((mhz)/100) << 8) | ((mv - 700) / 16)		\
-	}
-
-/*
- * These voltage tables were derived from the Intel Pentium M
- * datasheet, document 25261202.pdf, Table 5.  I have verified they
- * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
- * M.
- */
-
-/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
-static struct cpufreq_frequency_table banias_900[] =
-{
-	OP(600,  844),
-	OP(800,  988),
-	OP(900, 1004),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
-static struct cpufreq_frequency_table banias_1000[] =
-{
-	OP(600,   844),
-	OP(800,   972),
-	OP(900,   988),
-	OP(1000, 1004),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
-static struct cpufreq_frequency_table banias_1100[] =
-{
-	OP( 600,  956),
-	OP( 800, 1020),
-	OP( 900, 1100),
-	OP(1000, 1164),
-	OP(1100, 1180),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-
-/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
-static struct cpufreq_frequency_table banias_1200[] =
-{
-	OP( 600,  956),
-	OP( 800, 1004),
-	OP( 900, 1020),
-	OP(1000, 1100),
-	OP(1100, 1164),
-	OP(1200, 1180),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.30GHz (Banias) */
-static struct cpufreq_frequency_table banias_1300[] =
-{
-	OP( 600,  956),
-	OP( 800, 1260),
-	OP(1000, 1292),
-	OP(1200, 1356),
-	OP(1300, 1388),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.40GHz (Banias) */
-static struct cpufreq_frequency_table banias_1400[] =
-{
-	OP( 600,  956),
-	OP( 800, 1180),
-	OP(1000, 1308),
-	OP(1200, 1436),
-	OP(1400, 1484),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.50GHz (Banias) */
-static struct cpufreq_frequency_table banias_1500[] =
-{
-	OP( 600,  956),
-	OP( 800, 1116),
-	OP(1000, 1228),
-	OP(1200, 1356),
-	OP(1400, 1452),
-	OP(1500, 1484),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.60GHz (Banias) */
-static struct cpufreq_frequency_table banias_1600[] =
-{
-	OP( 600,  956),
-	OP( 800, 1036),
-	OP(1000, 1164),
-	OP(1200, 1276),
-	OP(1400, 1420),
-	OP(1600, 1484),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-
-/* Intel Pentium M processor 1.70GHz (Banias) */
-static struct cpufreq_frequency_table banias_1700[] =
-{
-	OP( 600,  956),
-	OP( 800, 1004),
-	OP(1000, 1116),
-	OP(1200, 1228),
-	OP(1400, 1308),
-	OP(1700, 1484),
-	{ .frequency = CPUFREQ_TABLE_END }
-};
-#undef OP
-
-#define _BANIAS(cpuid, max, name)	\
-{	.cpu_id		= cpuid,	\
-	.model_name	= "Intel(R) Pentium(R) M processor " name "MHz", \
-	.max_freq	= (max)*1000,	\
-	.op_points	= banias_##max,	\
-}
-#define BANIAS(max)	_BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
-
-/* CPU models, their operating frequency range, and freq/voltage
-   operating points */
-static struct cpu_model models[] =
-{
-	_BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
-	BANIAS(1000),
-	BANIAS(1100),
-	BANIAS(1200),
-	BANIAS(1300),
-	BANIAS(1400),
-	BANIAS(1500),
-	BANIAS(1600),
-	BANIAS(1700),
-
-	/* NULL model_name is a wildcard */
-	{ &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
-	{ &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
-	{ &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
-	{ &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
-	{ &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
-
-	{ NULL, }
-};
-#undef _BANIAS
-#undef BANIAS
-
-static int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
-	struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
-	struct cpu_model *model;
-
-	for(model = models; model->cpu_id != NULL; model++)
-		if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
-		    (model->model_name == NULL ||
-		     strcmp(cpu->x86_model_id, model->model_name) == 0))
-			break;
-
-	if (model->cpu_id == NULL) {
-		/* No match at all */
-		dprintk("no support for CPU model \"%s\": "
-		       "send /proc/cpuinfo to " MAINTAINER "\n",
-		       cpu->x86_model_id);
-		return -ENOENT;
-	}
-
-	if (model->op_points == NULL) {
-		/* Matched a non-match */
-		dprintk("no table support for CPU model \"%s\"\n",
-		       cpu->x86_model_id);
-		dprintk("try using the acpi-cpufreq driver\n");
-		return -ENOENT;
-	}
-
-	per_cpu(centrino_model, policy->cpu) = model;
-
-	dprintk("found \"%s\": max frequency: %dkHz\n",
-	       model->model_name, model->max_freq);
-
-	return 0;
-}
-
-#else
-static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
-{
-	return -ENODEV;
-}
-#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
-
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
-				  const struct cpu_id *x)
-{
-	if ((c->x86 == x->x86) &&
-	    (c->x86_model == x->x86_model) &&
-	    (c->x86_mask == x->x86_mask))
-		return 1;
-	return 0;
-}
-
-/* To be called only after centrino_model is initialized */
-static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
-{
-	int i;
-
-	/*
-	 * Extract clock in kHz from PERF_CTL value
-	 * for centrino, as some DSDTs are buggy.
-	 * Ideally, this can be done using the acpi_data structure.
-	 */
-	if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
-	    (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
-	    (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
-		msr = (msr >> 8) & 0xff;
-		return msr * 100000;
-	}
-
-	if ((!per_cpu(centrino_model, cpu)) ||
-	    (!per_cpu(centrino_model, cpu)->op_points))
-		return 0;
-
-	msr &= 0xffff;
-	for (i = 0;
-		per_cpu(centrino_model, cpu)->op_points[i].frequency
-							!= CPUFREQ_TABLE_END;
-	     i++) {
-		if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
-			return per_cpu(centrino_model, cpu)->
-							op_points[i].frequency;
-	}
-	if (failsafe)
-		return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
-	else
-		return 0;
-}
-
-/* Return the current CPU frequency in kHz */
-static unsigned int get_cur_freq(unsigned int cpu)
-{
-	unsigned l, h;
-	unsigned clock_freq;
-
-	rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
-	clock_freq = extract_clock(l, cpu, 0);
-
-	if (unlikely(clock_freq == 0)) {
-		/*
-		 * On some CPUs, we can see transient MSR values (which are
-		 * not present in _PSS), while CPU is doing some automatic
-		 * P-state transition (like TM2). Get the last freq set 
-		 * in PERF_CTL.
-		 */
-		rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
-		clock_freq = extract_clock(l, cpu, 1);
-	}
-	return clock_freq;
-}
-
-
-static int centrino_cpu_init(struct cpufreq_policy *policy)
-{
-	struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
-	unsigned freq;
-	unsigned l, h;
-	int ret;
-	int i;
-
-	/* Only Intel makes Enhanced Speedstep-capable CPUs */
-	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
-	    !cpu_has(cpu, X86_FEATURE_EST))
-		return -ENODEV;
-
-	if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
-		centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
-
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	for (i = 0; i < N_IDS; i++)
-		if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
-			break;
-
-	if (i != N_IDS)
-		per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
-
-	if (!per_cpu(centrino_cpu, policy->cpu)) {
-		dprintk("found unsupported CPU with "
-		"Enhanced SpeedStep: send /proc/cpuinfo to "
-		MAINTAINER "\n");
-		return -ENODEV;
-	}
-
-	if (centrino_cpu_init_table(policy)) {
-		return -ENODEV;
-	}
-
-	/* Check to see if Enhanced SpeedStep is enabled, and try to
-	   enable it if not. */
-	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-
-	if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
-		l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
-		dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
-		wrmsr(MSR_IA32_MISC_ENABLE, l, h);
-
-		/* check to see if it stuck */
-		rdmsr(MSR_IA32_MISC_ENABLE, l, h);
-		if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
-			printk(KERN_INFO PFX
-				"couldn't enable Enhanced SpeedStep\n");
-			return -ENODEV;
-		}
-	}
-
-	freq = get_cur_freq(policy->cpu);
-	policy->cpuinfo.transition_latency = 10000;
-						/* 10uS transition latency */
-	policy->cur = freq;
-
-	dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
-
-	ret = cpufreq_frequency_table_cpuinfo(policy,
-		per_cpu(centrino_model, policy->cpu)->op_points);
-	if (ret)
-		return (ret);
-
-	cpufreq_frequency_table_get_attr(
-		per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
-
-	return 0;
-}
-
-static int centrino_cpu_exit(struct cpufreq_policy *policy)
-{
-	unsigned int cpu = policy->cpu;
-
-	if (!per_cpu(centrino_model, cpu))
-		return -ENODEV;
-
-	cpufreq_frequency_table_put_attr(cpu);
-
-	per_cpu(centrino_model, cpu) = NULL;
-
-	return 0;
-}
-
-/**
- * centrino_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within this model's frequency range at least one
- * border included.
- */
-static int centrino_verify (struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy,
-			per_cpu(centrino_model, policy->cpu)->op_points);
-}
-
-/**
- * centrino_setpolicy - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int centrino_target (struct cpufreq_policy *policy,
-			    unsigned int target_freq,
-			    unsigned int relation)
-{
-	unsigned int    newstate = 0;
-	unsigned int	msr, oldmsr = 0, h = 0, cpu = policy->cpu;
-	struct cpufreq_freqs	freqs;
-	int			retval = 0;
-	unsigned int		j, k, first_cpu, tmp;
-	cpumask_var_t covered_cpus;
-
-	if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
-		return -ENOMEM;
-
-	if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
-		retval = -ENODEV;
-		goto out;
-	}
-
-	if (unlikely(cpufreq_frequency_table_target(policy,
-			per_cpu(centrino_model, cpu)->op_points,
-			target_freq,
-			relation,
-			&newstate))) {
-		retval = -EINVAL;
-		goto out;
-	}
-
-	first_cpu = 1;
-	for_each_cpu(j, policy->cpus) {
-		int good_cpu;
-
-		/* cpufreq holds the hotplug lock, so we are safe here */
-		if (!cpu_online(j))
-			continue;
-
-		/*
-		 * Support for SMP systems.
-		 * Make sure we are running on CPU that wants to change freq
-		 */
-		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
-			good_cpu = cpumask_any_and(policy->cpus,
-						   cpu_online_mask);
-		else
-			good_cpu = j;
-
-		if (good_cpu >= nr_cpu_ids) {
-			dprintk("couldn't limit to CPUs in this domain\n");
-			retval = -EAGAIN;
-			if (first_cpu) {
-				/* We haven't started the transition yet. */
-				goto out;
-			}
-			break;
-		}
-
-		msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
-
-		if (first_cpu) {
-			rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
-			if (msr == (oldmsr & 0xffff)) {
-				dprintk("no change needed - msr was and needs "
-					"to be %x\n", oldmsr);
-				retval = 0;
-				goto out;
-			}
-
-			freqs.old = extract_clock(oldmsr, cpu, 0);
-			freqs.new = extract_clock(msr, cpu, 0);
-
-			dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
-				target_freq, freqs.old, freqs.new, msr);
-
-			for_each_cpu(k, policy->cpus) {
-				if (!cpu_online(k))
-					continue;
-				freqs.cpu = k;
-				cpufreq_notify_transition(&freqs,
-					CPUFREQ_PRECHANGE);
-			}
-
-			first_cpu = 0;
-			/* all but 16 LSB are reserved, treat them with care */
-			oldmsr &= ~0xffff;
-			msr &= 0xffff;
-			oldmsr |= msr;
-		}
-
-		wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
-		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
-			break;
-
-		cpumask_set_cpu(j, covered_cpus);
-	}
-
-	for_each_cpu(k, policy->cpus) {
-		if (!cpu_online(k))
-			continue;
-		freqs.cpu = k;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	}
-
-	if (unlikely(retval)) {
-		/*
-		 * We have failed halfway through the frequency change.
-		 * We have sent callbacks to policy->cpus and
-		 * MSRs have already been written on coverd_cpus.
-		 * Best effort undo..
-		 */
-
-		for_each_cpu(j, covered_cpus)
-			wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
-
-		tmp = freqs.new;
-		freqs.new = freqs.old;
-		freqs.old = tmp;
-		for_each_cpu(j, policy->cpus) {
-			if (!cpu_online(j))
-				continue;
-			cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-			cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-		}
-	}
-	retval = 0;
-
-out:
-	free_cpumask_var(covered_cpus);
-	return retval;
-}
-
-static struct freq_attr* centrino_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver centrino_driver = {
-	.name		= "centrino", /* should be speedstep-centrino,
-					 but there's a 16 char limit */
-	.init		= centrino_cpu_init,
-	.exit		= centrino_cpu_exit,
-	.verify		= centrino_verify,
-	.target		= centrino_target,
-	.get		= get_cur_freq,
-	.attr           = centrino_attr,
-	.owner		= THIS_MODULE,
-};
-
-
-/**
- * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
- *
- * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
- * unsupported devices, -ENOENT if there's no voltage table for this
- * particular CPU model, -EINVAL on problems during initiatization,
- * and zero on success.
- *
- * This is quite picky.  Not only does the CPU have to advertise the
- * "est" flag in the cpuid capability flags, we look for a specific
- * CPU model and stepping, and we need to have the exact model name in
- * our voltage tables.  That is, be paranoid about not releasing
- * someone's valuable magic smoke.
- */
-static int __init centrino_init(void)
-{
-	struct cpuinfo_x86 *cpu = &cpu_data(0);
-
-	if (!cpu_has(cpu, X86_FEATURE_EST))
-		return -ENODEV;
-
-	return cpufreq_register_driver(&centrino_driver);
-}
-
-static void __exit centrino_exit(void)
-{
-	cpufreq_unregister_driver(&centrino_driver);
-}
-
-MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
-MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
-MODULE_LICENSE ("GPL");
-
-late_initcall(centrino_init);
-module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e9518..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
-/*
- * (C) 2001  Dave Jones, Arjan van de ven.
- * (C) 2002 - 2003  Dominik Brodowski <linux@brodo.de>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *  Based upon reverse engineered information, and on Intel documentation
- *  for chipsets ICH2-M and ICH3-M.
- *
- *  Many thanks to Ducrot Bruno for finding and fixing the last
- *  "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
- *  for extensive testing.
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-/*********************************************************************
- *                        SPEEDSTEP - DEFINITIONS                    *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/pci.h>
-#include <linux/sched.h>
-
-#include "speedstep-lib.h"
-
-
-/* speedstep_chipset:
- *   It is necessary to know which chipset is used. As accesses to
- * this device occur at various places in this module, we need a
- * static struct pci_dev * pointing to that device.
- */
-static struct pci_dev *speedstep_chipset_dev;
-
-
-/* speedstep_processor
- */
-static enum speedstep_processor speedstep_processor;
-
-static u32 pmbase;
-
-/*
- *   There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
-	{SPEEDSTEP_HIGH,	0},
-	{SPEEDSTEP_LOW,		0},
-	{0,			CPUFREQ_TABLE_END},
-};
-
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"speedstep-ich", msg)
-
-
-/**
- * speedstep_find_register - read the PMBASE address
- *
- * Returns: -ENODEV if no register could be found
- */
-static int speedstep_find_register(void)
-{
-	if (!speedstep_chipset_dev)
-		return -ENODEV;
-
-	/* get PMBASE */
-	pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
-	if (!(pmbase & 0x01)) {
-		printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
-		return -ENODEV;
-	}
-
-	pmbase &= 0xFFFFFFFE;
-	if (!pmbase) {
-		printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
-		return -ENODEV;
-	}
-
-	dprintk("pmbase is 0x%x\n", pmbase);
-	return 0;
-}
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- *   Tries to change the SpeedStep state.  Can be called from
- *   smp_call_function_single.
- */
-static void speedstep_set_state(unsigned int state)
-{
-	u8 pm2_blk;
-	u8 value;
-	unsigned long flags;
-
-	if (state > 0x1)
-		return;
-
-	/* Disable IRQs */
-	local_irq_save(flags);
-
-	/* read state */
-	value = inb(pmbase + 0x50);
-
-	dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
-	/* write new state */
-	value &= 0xFE;
-	value |= state;
-
-	dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
-
-	/* Disable bus master arbitration */
-	pm2_blk = inb(pmbase + 0x20);
-	pm2_blk |= 0x01;
-	outb(pm2_blk, (pmbase + 0x20));
-
-	/* Actual transition */
-	outb(value, (pmbase + 0x50));
-
-	/* Restore bus master arbitration */
-	pm2_blk &= 0xfe;
-	outb(pm2_blk, (pmbase + 0x20));
-
-	/* check if transition was successful */
-	value = inb(pmbase + 0x50);
-
-	/* Enable IRQs */
-	local_irq_restore(flags);
-
-	dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
-
-	if (state == (value & 0x1))
-		dprintk("change to %u MHz succeeded\n",
-			speedstep_get_frequency(speedstep_processor) / 1000);
-	else
-		printk(KERN_ERR "cpufreq: change failed - I/O error\n");
-
-	return;
-}
-
-/* Wrapper for smp_call_function_single. */
-static void _speedstep_set_state(void *_state)
-{
-	speedstep_set_state(*(unsigned int *)_state);
-}
-
-/**
- * speedstep_activate - activate SpeedStep control in the chipset
- *
- *   Tries to activate the SpeedStep status and control registers.
- * Returns -EINVAL on an unsupported chipset, and zero on success.
- */
-static int speedstep_activate(void)
-{
-	u16 value = 0;
-
-	if (!speedstep_chipset_dev)
-		return -EINVAL;
-
-	pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
-	if (!(value & 0x08)) {
-		value |= 0x08;
-		dprintk("activating SpeedStep (TM) registers\n");
-		pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
-	}
-
-	return 0;
-}
-
-
-/**
- * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
- *
- *   Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
- * the LPC bridge / PM module which contains all power-management
- * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
- * chipset, or zero on failure.
- */
-static unsigned int speedstep_detect_chipset(void)
-{
-	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
-			      PCI_DEVICE_ID_INTEL_82801DB_12,
-			      PCI_ANY_ID, PCI_ANY_ID,
-			      NULL);
-	if (speedstep_chipset_dev)
-		return 4; /* 4-M */
-
-	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
-			      PCI_DEVICE_ID_INTEL_82801CA_12,
-			      PCI_ANY_ID, PCI_ANY_ID,
-			      NULL);
-	if (speedstep_chipset_dev)
-		return 3; /* 3-M */
-
-
-	speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
-			      PCI_DEVICE_ID_INTEL_82801BA_10,
-			      PCI_ANY_ID, PCI_ANY_ID,
-			      NULL);
-	if (speedstep_chipset_dev) {
-		/* speedstep.c causes lockups on Dell Inspirons 8000 and
-		 * 8100 which use a pretty old revision of the 82815
-		 * host brige. Abort on these systems.
-		 */
-		static struct pci_dev *hostbridge;
-
-		hostbridge  = pci_get_subsys(PCI_VENDOR_ID_INTEL,
-			      PCI_DEVICE_ID_INTEL_82815_MC,
-			      PCI_ANY_ID, PCI_ANY_ID,
-			      NULL);
-
-		if (!hostbridge)
-			return 2; /* 2-M */
-
-		if (hostbridge->revision < 5) {
-			dprintk("hostbridge does not support speedstep\n");
-			speedstep_chipset_dev = NULL;
-			pci_dev_put(hostbridge);
-			return 0;
-		}
-
-		pci_dev_put(hostbridge);
-		return 2; /* 2-M */
-	}
-
-	return 0;
-}
-
-static void get_freq_data(void *_speed)
-{
-	unsigned int *speed = _speed;
-
-	*speed = speedstep_get_frequency(speedstep_processor);
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
-	unsigned int speed;
-
-	/* You're supposed to ensure CPU is online. */
-	if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
-		BUG();
-
-	dprintk("detected %u kHz as current frequency\n", speed);
-	return speed;
-}
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency
- *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
- *
- * Sets a new CPUFreq policy.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
-			     unsigned int target_freq,
-			     unsigned int relation)
-{
-	unsigned int newstate = 0, policy_cpu;
-	struct cpufreq_freqs freqs;
-	int i;
-
-	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
-				target_freq, relation, &newstate))
-		return -EINVAL;
-
-	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
-	freqs.old = speedstep_get(policy_cpu);
-	freqs.new = speedstep_freqs[newstate].frequency;
-	freqs.cpu = policy->cpu;
-
-	dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
-
-	/* no transition necessary */
-	if (freqs.old == freqs.new)
-		return 0;
-
-	for_each_cpu(i, policy->cpus) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	}
-
-	smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
-				 true);
-
-	for_each_cpu(i, policy->cpus) {
-		freqs.cpu = i;
-		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-	}
-
-	return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-struct get_freqs {
-	struct cpufreq_policy *policy;
-	int ret;
-};
-
-static void get_freqs_on_cpu(void *_get_freqs)
-{
-	struct get_freqs *get_freqs = _get_freqs;
-
-	get_freqs->ret =
-		speedstep_get_freqs(speedstep_processor,
-			    &speedstep_freqs[SPEEDSTEP_LOW].frequency,
-			    &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
-			    &get_freqs->policy->cpuinfo.transition_latency,
-			    &speedstep_set_state);
-}
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
-	int result;
-	unsigned int policy_cpu, speed;
-	struct get_freqs gf;
-
-	/* only run on CPU to be set, or on its sibling */
-#ifdef CONFIG_SMP
-	cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
-#endif
-	policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
-
-	/* detect low and high frequency and transition latency */
-	gf.policy = policy;
-	smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
-	if (gf.ret)
-		return gf.ret;
-
-	/* get current speed setting */
-	speed = speedstep_get(policy_cpu);
-	if (!speed)
-		return -EIO;
-
-	dprintk("currently at %s speed setting - %i MHz\n",
-		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
-		? "low" : "high",
-		(speed / 1000));
-
-	/* cpuinfo and default policy values */
-	policy->cur = speed;
-
-	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
-	if (result)
-		return result;
-
-	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
-	return 0;
-}
-
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-static struct freq_attr *speedstep_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-
-static struct cpufreq_driver speedstep_driver = {
-	.name	= "speedstep-ich",
-	.verify	= speedstep_verify,
-	.target	= speedstep_target,
-	.init	= speedstep_cpu_init,
-	.exit	= speedstep_cpu_exit,
-	.get	= speedstep_get,
-	.owner	= THIS_MODULE,
-	.attr	= speedstep_attr,
-};
-
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- *   Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * devices, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
-	/* detect processor */
-	speedstep_processor = speedstep_detect_processor();
-	if (!speedstep_processor) {
-		dprintk("Intel(R) SpeedStep(TM) capable processor "
-				"not found\n");
-		return -ENODEV;
-	}
-
-	/* detect chipset */
-	if (!speedstep_detect_chipset()) {
-		dprintk("Intel(R) SpeedStep(TM) for this chipset not "
-				"(yet) available.\n");
-		return -ENODEV;
-	}
-
-	/* activate speedstep support */
-	if (speedstep_activate()) {
-		pci_dev_put(speedstep_chipset_dev);
-		return -EINVAL;
-	}
-
-	if (speedstep_find_register())
-		return -ENODEV;
-
-	return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- *   Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
-	pci_dev_put(speedstep_chipset_dev);
-	cpufreq_unregister_driver(&speedstep_driver);
-}
-
-
-MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
-		"Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
-		"with ICH-M southbridges.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69f..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-
-#include <asm/msr.h>
-#include <asm/tsc.h>
-#include "speedstep-lib.h"
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"speedstep-lib", msg)
-
-#define PFX "speedstep-lib: "
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-static int relaxed_check;
-#else
-#define relaxed_check 0
-#endif
-
-/*********************************************************************
- *                   GET PROCESSOR CORE SPEED IN KHZ                 *
- *********************************************************************/
-
-static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
-{
-	/* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
-	struct {
-		unsigned int ratio;	/* Frequency Multiplier (x10) */
-		u8 bitmap;		/* power on configuration bits
-					[27, 25:22] (in MSR 0x2a) */
-	} msr_decode_mult[] = {
-		{ 30, 0x01 },
-		{ 35, 0x05 },
-		{ 40, 0x02 },
-		{ 45, 0x06 },
-		{ 50, 0x00 },
-		{ 55, 0x04 },
-		{ 60, 0x0b },
-		{ 65, 0x0f },
-		{ 70, 0x09 },
-		{ 75, 0x0d },
-		{ 80, 0x0a },
-		{ 85, 0x26 },
-		{ 90, 0x20 },
-		{ 100, 0x2b },
-		{ 0, 0xff }	/* error or unknown value */
-	};
-
-	/* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
-	struct {
-		unsigned int value;	/* Front Side Bus speed in MHz */
-		u8 bitmap;		/* power on configuration bits [18: 19]
-					(in MSR 0x2a) */
-	} msr_decode_fsb[] = {
-		{  66, 0x0 },
-		{ 100, 0x2 },
-		{ 133, 0x1 },
-		{   0, 0xff}
-	};
-
-	u32 msr_lo, msr_tmp;
-	int i = 0, j = 0;
-
-	/* read MSR 0x2a - we only need the low 32 bits */
-	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
-	dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
-	msr_tmp = msr_lo;
-
-	/* decode the FSB */
-	msr_tmp &= 0x00c0000;
-	msr_tmp >>= 18;
-	while (msr_tmp != msr_decode_fsb[i].bitmap) {
-		if (msr_decode_fsb[i].bitmap == 0xff)
-			return 0;
-		i++;
-	}
-
-	/* decode the multiplier */
-	if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
-		dprintk("workaround for early PIIIs\n");
-		msr_lo &= 0x03c00000;
-	} else
-		msr_lo &= 0x0bc00000;
-	msr_lo >>= 22;
-	while (msr_lo != msr_decode_mult[j].bitmap) {
-		if (msr_decode_mult[j].bitmap == 0xff)
-			return 0;
-		j++;
-	}
-
-	dprintk("speed is %u\n",
-		(msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
-
-	return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
-}
-
-
-static unsigned int pentiumM_get_frequency(void)
-{
-	u32 msr_lo, msr_tmp;
-
-	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
-	dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
-
-	/* see table B-2 of 24547212.pdf */
-	if (msr_lo & 0x00040000) {
-		printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
-				msr_lo, msr_tmp);
-		return 0;
-	}
-
-	msr_tmp = (msr_lo >> 22) & 0x1f;
-	dprintk("bits 22-26 are 0x%x, speed is %u\n",
-			msr_tmp, (msr_tmp * 100 * 1000));
-
-	return msr_tmp * 100 * 1000;
-}
-
-static unsigned int pentium_core_get_frequency(void)
-{
-	u32 fsb = 0;
-	u32 msr_lo, msr_tmp;
-	int ret;
-
-	rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
-	/* see table B-2 of 25366920.pdf */
-	switch (msr_lo & 0x07) {
-	case 5:
-		fsb = 100000;
-		break;
-	case 1:
-		fsb = 133333;
-		break;
-	case 3:
-		fsb = 166667;
-		break;
-	case 2:
-		fsb = 200000;
-		break;
-	case 0:
-		fsb = 266667;
-		break;
-	case 4:
-		fsb = 333333;
-		break;
-	default:
-		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
-	}
-
-	rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
-	dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
-			msr_lo, msr_tmp);
-
-	msr_tmp = (msr_lo >> 22) & 0x1f;
-	dprintk("bits 22-26 are 0x%x, speed is %u\n",
-			msr_tmp, (msr_tmp * fsb));
-
-	ret = (msr_tmp * fsb);
-	return ret;
-}
-
-
-static unsigned int pentium4_get_frequency(void)
-{
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-	u32 msr_lo, msr_hi, mult;
-	unsigned int fsb = 0;
-	unsigned int ret;
-	u8 fsb_code;
-
-	/* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
-	 * to System Bus Frequency Ratio Field in the Processor Frequency
-	 * Configuration Register of the MSR. Therefore the current
-	 * frequency cannot be calculated and has to be measured.
-	 */
-	if (c->x86_model < 2)
-		return cpu_khz;
-
-	rdmsr(0x2c, msr_lo, msr_hi);
-
-	dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
-
-	/* decode the FSB: see IA-32 Intel (C) Architecture Software
-	 * Developer's Manual, Volume 3: System Prgramming Guide,
-	 * revision #12 in Table B-1: MSRs in the Pentium 4 and
-	 * Intel Xeon Processors, on page B-4 and B-5.
-	 */
-	fsb_code = (msr_lo >> 16) & 0x7;
-	switch (fsb_code) {
-	case 0:
-		fsb = 100 * 1000;
-		break;
-	case 1:
-		fsb = 13333 * 10;
-		break;
-	case 2:
-		fsb = 200 * 1000;
-		break;
-	}
-
-	if (!fsb)
-		printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
-				"Please send an e-mail to <linux@brodo.de>\n");
-
-	/* Multiplier. */
-	mult = msr_lo >> 24;
-
-	dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
-			fsb, mult, (fsb * mult));
-
-	ret = (fsb * mult);
-	return ret;
-}
-
-
-/* Warning: may get called from smp_call_function_single. */
-unsigned int speedstep_get_frequency(enum speedstep_processor processor)
-{
-	switch (processor) {
-	case SPEEDSTEP_CPU_PCORE:
-		return pentium_core_get_frequency();
-	case SPEEDSTEP_CPU_PM:
-		return pentiumM_get_frequency();
-	case SPEEDSTEP_CPU_P4D:
-	case SPEEDSTEP_CPU_P4M:
-		return pentium4_get_frequency();
-	case SPEEDSTEP_CPU_PIII_T:
-	case SPEEDSTEP_CPU_PIII_C:
-	case SPEEDSTEP_CPU_PIII_C_EARLY:
-		return pentium3_get_frequency(processor);
-	default:
-		return 0;
-	};
-	return 0;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_frequency);
-
-
-/*********************************************************************
- *                 DETECT SPEEDSTEP-CAPABLE PROCESSOR                *
- *********************************************************************/
-
-unsigned int speedstep_detect_processor(void)
-{
-	struct cpuinfo_x86 *c = &cpu_data(0);
-	u32 ebx, msr_lo, msr_hi;
-
-	dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
-
-	if ((c->x86_vendor != X86_VENDOR_INTEL) ||
-	    ((c->x86 != 6) && (c->x86 != 0xF)))
-		return 0;
-
-	if (c->x86 == 0xF) {
-		/* Intel Mobile Pentium 4-M
-		 * or Intel Mobile Pentium 4 with 533 MHz FSB */
-		if (c->x86_model != 2)
-			return 0;
-
-		ebx = cpuid_ebx(0x00000001);
-		ebx &= 0x000000FF;
-
-		dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
-
-		switch (c->x86_mask) {
-		case 4:
-			/*
-			 * B-stepping [M-P4-M]
-			 * sample has ebx = 0x0f, production has 0x0e.
-			 */
-			if ((ebx == 0x0e) || (ebx == 0x0f))
-				return SPEEDSTEP_CPU_P4M;
-			break;
-		case 7:
-			/*
-			 * C-stepping [M-P4-M]
-			 * needs to have ebx=0x0e, else it's a celeron:
-			 * cf. 25130917.pdf / page 7, footnote 5 even
-			 * though 25072120.pdf / page 7 doesn't say
-			 * samples are only of B-stepping...
-			 */
-			if (ebx == 0x0e)
-				return SPEEDSTEP_CPU_P4M;
-			break;
-		case 9:
-			/*
-			 * D-stepping [M-P4-M or M-P4/533]
-			 *
-			 * this is totally strange: CPUID 0x0F29 is
-			 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
-			 * The latter need to be sorted out as they don't
-			 * support speedstep.
-			 * Celerons with CPUID 0x0F29 may have either
-			 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
-			 * specific.
-			 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
-			 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
-			 * also, M-P4M HTs have ebx=0x8, too
-			 * For now, they are distinguished by the model_id
-			 * string
-			 */
-			if ((ebx == 0x0e) ||
-				(strstr(c->x86_model_id,
-				    "Mobile Intel(R) Pentium(R) 4") != NULL))
-				return SPEEDSTEP_CPU_P4M;
-			break;
-		default:
-			break;
-		}
-		return 0;
-	}
-
-	switch (c->x86_model) {
-	case 0x0B: /* Intel PIII [Tualatin] */
-		/* cpuid_ebx(1) is 0x04 for desktop PIII,
-		 * 0x06 for mobile PIII-M */
-		ebx = cpuid_ebx(0x00000001);
-		dprintk("ebx is %x\n", ebx);
-
-		ebx &= 0x000000FF;
-
-		if (ebx != 0x06)
-			return 0;
-
-		/* So far all PIII-M processors support SpeedStep. See
-		 * Intel's 24540640.pdf of June 2003
-		 */
-		return SPEEDSTEP_CPU_PIII_T;
-
-	case 0x08: /* Intel PIII [Coppermine] */
-
-		/* all mobile PIII Coppermines have FSB 100 MHz
-		 * ==> sort out a few desktop PIIIs. */
-		rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
-		dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
-				msr_lo, msr_hi);
-		msr_lo &= 0x00c0000;
-		if (msr_lo != 0x0080000)
-			return 0;
-
-		/*
-		 * If the processor is a mobile version,
-		 * platform ID has bit 50 set
-		 * it has SpeedStep technology if either
-		 * bit 56 or 57 is set
-		 */
-		rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
-		dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
-				msr_lo, msr_hi);
-		if ((msr_hi & (1<<18)) &&
-		    (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
-			if (c->x86_mask == 0x01) {
-				dprintk("early PIII version\n");
-				return SPEEDSTEP_CPU_PIII_C_EARLY;
-			} else
-				return SPEEDSTEP_CPU_PIII_C;
-		}
-
-	default:
-		return 0;
-	}
-}
-EXPORT_SYMBOL_GPL(speedstep_detect_processor);
-
-
-/*********************************************************************
- *                     DETECT SPEEDSTEP SPEEDS                       *
- *********************************************************************/
-
-unsigned int speedstep_get_freqs(enum speedstep_processor processor,
-				  unsigned int *low_speed,
-				  unsigned int *high_speed,
-				  unsigned int *transition_latency,
-				  void (*set_state) (unsigned int state))
-{
-	unsigned int prev_speed;
-	unsigned int ret = 0;
-	unsigned long flags;
-	struct timeval tv1, tv2;
-
-	if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
-		return -EINVAL;
-
-	dprintk("trying to determine both speeds\n");
-
-	/* get current speed */
-	prev_speed = speedstep_get_frequency(processor);
-	if (!prev_speed)
-		return -EIO;
-
-	dprintk("previous speed is %u\n", prev_speed);
-
-	local_irq_save(flags);
-
-	/* switch to low state */
-	set_state(SPEEDSTEP_LOW);
-	*low_speed = speedstep_get_frequency(processor);
-	if (!*low_speed) {
-		ret = -EIO;
-		goto out;
-	}
-
-	dprintk("low speed is %u\n", *low_speed);
-
-	/* start latency measurement */
-	if (transition_latency)
-		do_gettimeofday(&tv1);
-
-	/* switch to high state */
-	set_state(SPEEDSTEP_HIGH);
-
-	/* end latency measurement */
-	if (transition_latency)
-		do_gettimeofday(&tv2);
-
-	*high_speed = speedstep_get_frequency(processor);
-	if (!*high_speed) {
-		ret = -EIO;
-		goto out;
-	}
-
-	dprintk("high speed is %u\n", *high_speed);
-
-	if (*low_speed == *high_speed) {
-		ret = -ENODEV;
-		goto out;
-	}
-
-	/* switch to previous state, if necessary */
-	if (*high_speed != prev_speed)
-		set_state(SPEEDSTEP_LOW);
-
-	if (transition_latency) {
-		*transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
-			tv2.tv_usec - tv1.tv_usec;
-		dprintk("transition latency is %u uSec\n", *transition_latency);
-
-		/* convert uSec to nSec and add 20% for safety reasons */
-		*transition_latency *= 1200;
-
-		/* check if the latency measurement is too high or too low
-		 * and set it to a safe value (500uSec) in that case
-		 */
-		if (*transition_latency > 10000000 ||
-		    *transition_latency < 50000) {
-			printk(KERN_WARNING PFX "frequency transition "
-					"measured seems out of range (%u "
-					"nSec), falling back to a safe one of"
-					"%u nSec.\n",
-					*transition_latency, 500000);
-			*transition_latency = 500000;
-		}
-	}
-
-out:
-	local_irq_restore(flags);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(speedstep_get_freqs);
-
-#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
-module_param(relaxed_check, int, 0444);
-MODULE_PARM_DESC(relaxed_check,
-		"Don't do all checks for speedstep capability.");
-#endif
-
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
-MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- *  Library for common functions for Intel SpeedStep v.1 and v.2 support
- *
- *  BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
- */
-
-
-
-/* processors */
-enum speedstep_processor {
-	SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001,  /* Coppermine core */
-	SPEEDSTEP_CPU_PIII_C	   = 0x00000002,  /* Coppermine core */
-	SPEEDSTEP_CPU_PIII_T	   = 0x00000003,  /* Tualatin core */
-	SPEEDSTEP_CPU_P4M	   = 0x00000004,  /* P4-M  */
-/* the following processors are not speedstep-capable and are not auto-detected
- * in speedstep_detect_processor(). However, their speed can be detected using
- * the speedstep_get_frequency() call. */
-	SPEEDSTEP_CPU_PM	   = 0xFFFFFF03,  /* Pentium M  */
-	SPEEDSTEP_CPU_P4D	   = 0xFFFFFF04,  /* desktop P4  */
-	SPEEDSTEP_CPU_PCORE	   = 0xFFFFFF05,  /* Core */
-};
-
-/* speedstep states -- only two of them */
-
-#define SPEEDSTEP_HIGH	0x00000000
-#define SPEEDSTEP_LOW	0x00000001
-
-
-/* detect a speedstep-capable processor */
-extern enum speedstep_processor speedstep_detect_processor(void);
-
-/* detect the current speed (in khz) of the processor */
-extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
-
-
-/* detect the low and high speeds of the processor. The callback
- * set_state"'s first argument is either SPEEDSTEP_HIGH or
- * SPEEDSTEP_LOW; the second argument is zero so that no
- * cpufreq_notify_transition calls are initiated.
- */
-extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
-	unsigned int *low_speed,
-	unsigned int *high_speed,
-	unsigned int *transition_latency,
-	void (*set_state) (unsigned int state));
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 91bc25b67bc..00000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
-/*
- * Intel SpeedStep SMI driver.
- *
- * (C) 2003  Hiroshi Miura <miura@da-cha.org>
- *
- *  Licensed under the terms of the GNU GPL License version 2.
- *
- */
-
-
-/*********************************************************************
- *                        SPEEDSTEP - DEFINITIONS                    *
- *********************************************************************/
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/cpufreq.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-#include <asm/ist.h>
-
-#include "speedstep-lib.h"
-
-/* speedstep system management interface port/command.
- *
- * These parameters are got from IST-SMI BIOS call.
- * If user gives it, these are used.
- *
- */
-static int smi_port;
-static int smi_cmd;
-static unsigned int smi_sig;
-
-/* info about the processor */
-static enum speedstep_processor speedstep_processor;
-
-/*
- * There are only two frequency states for each processor. Values
- * are in kHz for the time being.
- */
-static struct cpufreq_frequency_table speedstep_freqs[] = {
-	{SPEEDSTEP_HIGH,	0},
-	{SPEEDSTEP_LOW,		0},
-	{0,			CPUFREQ_TABLE_END},
-};
-
-#define GET_SPEEDSTEP_OWNER 0
-#define GET_SPEEDSTEP_STATE 1
-#define SET_SPEEDSTEP_STATE 2
-#define GET_SPEEDSTEP_FREQS 4
-
-/* how often shall the SMI call be tried if it failed, e.g. because
- * of DMA activity going on? */
-#define SMI_TRIES 5
-
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
-		"speedstep-smi", msg)
-
-/**
- * speedstep_smi_ownership
- */
-static int speedstep_smi_ownership(void)
-{
-	u32 command, result, magic, dummy;
-	u32 function = GET_SPEEDSTEP_OWNER;
-	unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
-
-	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-	magic = virt_to_phys(magic_data);
-
-	dprintk("trying to obtain ownership with command %x at port %x\n",
-			command, smi_port);
-
-	__asm__ __volatile__(
-		"push %%ebp\n"
-		"out %%al, (%%dx)\n"
-		"pop %%ebp\n"
-		: "=D" (result),
-		  "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
-		  "=S" (dummy)
-		: "a" (command), "b" (function), "c" (0), "d" (smi_port),
-		  "D" (0), "S" (magic)
-		: "memory"
-	);
-
-	dprintk("result is %x\n", result);
-
-	return result;
-}
-
-/**
- * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
- * @low: the low frequency value is placed here
- * @high: the high frequency value is placed here
- *
- * Only available on later SpeedStep-enabled systems, returns false results or
- * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
- * shows that the latter occurs if !(ist_info.event & 0xFFFF).
- */
-static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
-{
-	u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
-	u32 state = 0;
-	u32 function = GET_SPEEDSTEP_FREQS;
-
-	if (!(ist_info.event & 0xFFFF)) {
-		dprintk("bug #1422 -- can't read freqs from BIOS\n");
-		return -ENODEV;
-	}
-
-	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
-	dprintk("trying to determine frequencies with command %x at port %x\n",
-			command, smi_port);
-
-	__asm__ __volatile__(
-		"push %%ebp\n"
-		"out %%al, (%%dx)\n"
-		"pop %%ebp"
-		: "=a" (result),
-		  "=b" (high_mhz),
-		  "=c" (low_mhz),
-		  "=d" (state), "=D" (edi), "=S" (dummy)
-		: "a" (command),
-		  "b" (function),
-		  "c" (state),
-		  "d" (smi_port), "S" (0), "D" (0)
-	);
-
-	dprintk("result %x, low_freq %u, high_freq %u\n",
-			result, low_mhz, high_mhz);
-
-	/* abort if results are obviously incorrect... */
-	if ((high_mhz + low_mhz) < 600)
-		return -EINVAL;
-
-	*high = high_mhz * 1000;
-	*low  = low_mhz  * 1000;
-
-	return result;
-}
-
-/**
- * speedstep_get_state - set the SpeedStep state
- * @state: processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static int speedstep_get_state(void)
-{
-	u32 function = GET_SPEEDSTEP_STATE;
-	u32 result, state, edi, command, dummy;
-
-	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
-	dprintk("trying to determine current setting with command %x "
-		"at port %x\n", command, smi_port);
-
-	__asm__ __volatile__(
-		"push %%ebp\n"
-		"out %%al, (%%dx)\n"
-		"pop %%ebp\n"
-		: "=a" (result),
-		  "=b" (state), "=D" (edi),
-		  "=c" (dummy), "=d" (dummy), "=S" (dummy)
-		: "a" (command), "b" (function), "c" (0),
-		  "d" (smi_port), "S" (0), "D" (0)
-	);
-
-	dprintk("state is %x, result is %x\n", state, result);
-
-	return state & 1;
-}
-
-
-/**
- * speedstep_set_state - set the SpeedStep state
- * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
- *
- */
-static void speedstep_set_state(unsigned int state)
-{
-	unsigned int result = 0, command, new_state, dummy;
-	unsigned long flags;
-	unsigned int function = SET_SPEEDSTEP_STATE;
-	unsigned int retry = 0;
-
-	if (state > 0x1)
-		return;
-
-	/* Disable IRQs */
-	local_irq_save(flags);
-
-	command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
-
-	dprintk("trying to set frequency to state %u "
-		"with command %x at port %x\n",
-		state, command, smi_port);
-
-	do {
-		if (retry) {
-			dprintk("retry %u, previous result %u, waiting...\n",
-					retry, result);
-			mdelay(retry * 50);
-		}
-		retry++;
-		__asm__ __volatile__(
-			"push %%ebp\n"
-			"out %%al, (%%dx)\n"
-			"pop %%ebp"
-			: "=b" (new_state), "=D" (result),
-			  "=c" (dummy), "=a" (dummy),
-			  "=d" (dummy), "=S" (dummy)
-			: "a" (command), "b" (function), "c" (state),
-			  "d" (smi_port), "S" (0), "D" (0)
-			);
-	} while ((new_state != state) && (retry <= SMI_TRIES));
-
-	/* enable IRQs */
-	local_irq_restore(flags);
-
-	if (new_state == state)
-		dprintk("change to %u MHz succeeded after %u tries "
-			"with result %u\n",
-			(speedstep_freqs[new_state].frequency / 1000),
-			retry, result);
-	else
-		printk(KERN_ERR "cpufreq: change to state %u "
-			"failed with new_state %u and result %u\n",
-			state, new_state, result);
-
-	return;
-}
-
-
-/**
- * speedstep_target - set a new CPUFreq policy
- * @policy: new policy
- * @target_freq: new freq
- * @relation:
- *
- * Sets a new CPUFreq policy/freq.
- */
-static int speedstep_target(struct cpufreq_policy *policy,
-			unsigned int target_freq, unsigned int relation)
-{
-	unsigned int newstate = 0;
-	struct cpufreq_freqs freqs;
-
-	if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
-				target_freq, relation, &newstate))
-		return -EINVAL;
-
-	freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
-	freqs.new = speedstep_freqs[newstate].frequency;
-	freqs.cpu = 0; /* speedstep.c is UP only driver */
-
-	if (freqs.old == freqs.new)
-		return 0;
-
-	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	speedstep_set_state(newstate);
-	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
-
-	return 0;
-}
-
-
-/**
- * speedstep_verify - verifies a new CPUFreq policy
- * @policy: new policy
- *
- * Limit must be within speedstep_low_freq and speedstep_high_freq, with
- * at least one border included.
- */
-static int speedstep_verify(struct cpufreq_policy *policy)
-{
-	return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
-}
-
-
-static int speedstep_cpu_init(struct cpufreq_policy *policy)
-{
-	int result;
-	unsigned int speed, state;
-	unsigned int *low, *high;
-
-	/* capability check */
-	if (policy->cpu != 0)
-		return -ENODEV;
-
-	result = speedstep_smi_ownership();
-	if (result) {
-		dprintk("fails in acquiring ownership of a SMI interface.\n");
-		return -EINVAL;
-	}
-
-	/* detect low and high frequency */
-	low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
-	high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
-
-	result = speedstep_smi_get_freqs(low, high);
-	if (result) {
-		/* fall back to speedstep_lib.c dection mechanism:
-		 * try both states out */
-		dprintk("could not detect low and high frequencies "
-				"by SMI call.\n");
-		result = speedstep_get_freqs(speedstep_processor,
-				low, high,
-				NULL,
-				&speedstep_set_state);
-
-		if (result) {
-			dprintk("could not detect two different speeds"
-					" -- aborting.\n");
-			return result;
-		} else
-			dprintk("workaround worked.\n");
-	}
-
-	/* get current speed setting */
-	state = speedstep_get_state();
-	speed = speedstep_freqs[state].frequency;
-
-	dprintk("currently at %s speed setting - %i MHz\n",
-		(speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
-		? "low" : "high",
-		(speed / 1000));
-
-	/* cpuinfo and default policy values */
-	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
-	policy->cur = speed;
-
-	result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
-	if (result)
-		return result;
-
-	cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
-
-	return 0;
-}
-
-static int speedstep_cpu_exit(struct cpufreq_policy *policy)
-{
-	cpufreq_frequency_table_put_attr(policy->cpu);
-	return 0;
-}
-
-static unsigned int speedstep_get(unsigned int cpu)
-{
-	if (cpu)
-		return -ENODEV;
-	return speedstep_get_frequency(speedstep_processor);
-}
-
-
-static int speedstep_resume(struct cpufreq_policy *policy)
-{
-	int result = speedstep_smi_ownership();
-
-	if (result)
-		dprintk("fails in re-acquiring ownership of a SMI interface.\n");
-
-	return result;
-}
-
-static struct freq_attr *speedstep_attr[] = {
-	&cpufreq_freq_attr_scaling_available_freqs,
-	NULL,
-};
-
-static struct cpufreq_driver speedstep_driver = {
-	.name		= "speedstep-smi",
-	.verify		= speedstep_verify,
-	.target		= speedstep_target,
-	.init		= speedstep_cpu_init,
-	.exit		= speedstep_cpu_exit,
-	.get		= speedstep_get,
-	.resume		= speedstep_resume,
-	.owner		= THIS_MODULE,
-	.attr		= speedstep_attr,
-};
-
-/**
- * speedstep_init - initializes the SpeedStep CPUFreq driver
- *
- *   Initializes the SpeedStep support. Returns -ENODEV on unsupported
- * BIOS, -EINVAL on problems during initiatization, and zero on
- * success.
- */
-static int __init speedstep_init(void)
-{
-	speedstep_processor = speedstep_detect_processor();
-
-	switch (speedstep_processor) {
-	case SPEEDSTEP_CPU_PIII_T:
-	case SPEEDSTEP_CPU_PIII_C:
-	case SPEEDSTEP_CPU_PIII_C_EARLY:
-		break;
-	default:
-		speedstep_processor = 0;
-	}
-
-	if (!speedstep_processor) {
-		dprintk("No supported Intel CPU detected.\n");
-		return -ENODEV;
-	}
-
-	dprintk("signature:0x%.8lx, command:0x%.8lx, "
-		"event:0x%.8lx, perf_level:0x%.8lx.\n",
-		ist_info.signature, ist_info.command,
-		ist_info.event, ist_info.perf_level);
-
-	/* Error if no IST-SMI BIOS or no PARM
-		 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
-	if ((ist_info.signature !=  0x47534943) && (
-	    (smi_port == 0) || (smi_cmd == 0)))
-		return -ENODEV;
-
-	if (smi_sig == 1)
-		smi_sig = 0x47534943;
-	else
-		smi_sig = ist_info.signature;
-
-	/* setup smi_port from MODLULE_PARM or BIOS */
-	if ((smi_port > 0xff) || (smi_port < 0))
-		return -EINVAL;
-	else if (smi_port == 0)
-		smi_port = ist_info.command & 0xff;
-
-	if ((smi_cmd > 0xff) || (smi_cmd < 0))
-		return -EINVAL;
-	else if (smi_cmd == 0)
-		smi_cmd = (ist_info.command >> 16) & 0xff;
-
-	return cpufreq_register_driver(&speedstep_driver);
-}
-
-
-/**
- * speedstep_exit - unregisters SpeedStep support
- *
- *   Unregisters SpeedStep support.
- */
-static void __exit speedstep_exit(void)
-{
-	cpufreq_unregister_driver(&speedstep_driver);
-}
-
-module_param(smi_port, int, 0444);
-module_param(smi_cmd,  int, 0444);
-module_param(smi_sig, uint, 0444);
-
-MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
-		"-- Intel's default setting is 0xb2");
-MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
-		"-- Intel's default setting is 0x82");
-MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
-		"SMI interface.");
-
-MODULE_AUTHOR("Hiroshi Miura");
-MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
-MODULE_LICENSE("GPL");
-
-module_init(speedstep_init);
-module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
  */
 static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-	&x86_hyper_vmware,
-	&x86_hyper_ms_hyperv,
 #ifdef CONFIG_XEN_PVHVM
 	&x86_hyper_xen_hvm,
 #endif
+	&x86_hyper_vmware,
+	&x86_hyper_ms_hyperv,
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
+	u64 misc_enable;
+
 	/* Unmask CPUID levels if masked: */
 	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 	 * (model 2) with the same problem.
 	 */
 	if (c->x86 == 15) {
-		u64 misc_enable;
-
 		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
 
 		if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		}
 	}
 #endif
+
+	/*
+	 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
+	 * clear the fast string and enhanced fast string CPU capabilities.
+	 */
+	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+		if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+			printk(KERN_INFO "Disabled fast string operations\n");
+			setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
+			setup_clear_cpu_cap(X86_FEATURE_ERMS);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32
@@ -400,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
 		switch (c->x86_model) {
 		case 5:
-			if (c->x86_mask == 0) {
-				if (l2 == 0)
-					p = "Celeron (Covington)";
-				else if (l2 == 256)
-					p = "Mobile Pentium II (Dixon)";
-			}
+			if (l2 == 0)
+				p = "Celeron (Covington)";
+			else if (l2 == 256)
+				p = "Mobile Pentium II (Dixon)";
 			break;
 
 		case 6:
@@ -447,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
+
+	/*
+	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
+	 * x86_energy_perf_policy(8) is available to change it at run-time
+	 */
+	if (cpu_has(c, X86_FEATURE_EPB)) {
+		u64 epb;
+
+		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+		if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
+			printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
+				" Set to 'normal', was 'performance'\n"
+				"ENERGY_PERF_BIAS: View and update with"
+				" x86_energy_perf_policy(8)\n");
+			epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ce1af2899d..c105c533ed9 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
 	l3->subcaches[2] = sc2 = !(val & BIT(8))  + !(val & BIT(9));
 	l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
 
-	l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
@@ -454,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
 {
 	int ret = 0;
 
-#define SUBCACHE_MASK	(3UL << 20)
-#define SUBCACHE_INDEX	0xfff
-
-	/*
-	 * check whether this slot is already used or
-	 * the index is already disabled
-	 */
+	/* check if @slot is already used or the index is already disabled */
 	ret = amd_get_l3_disable_slot(l3, slot);
 	if (ret >= 0)
 		return -EINVAL;
 
-	/*
-	 * check whether the other slot has disabled the
-	 * same index already
-	 */
-	if (index == amd_get_l3_disable_slot(l3, !slot))
+	if (index > l3->indices)
 		return -EINVAL;
 
-	/* do not allow writes outside of allowed bits */
-	if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
-	    ((index & SUBCACHE_INDEX) > l3->indices))
+	/* check whether the other slot has disabled the same index already */
+	if (index == amd_get_l3_disable_slot(l3, !slot))
 		return -EINVAL;
 
 	amd_l3_disable_index(l3, cpu, slot, index);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
 	unsigned char covered;
 	char *msg;
 } severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
-	{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
-	{ .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define  KERNEL		.context = IN_KERNEL
+#define  USER		.context = IN_USER
+#define  SER		.ser = SER_REQUIRED
+#define  NOSER		.ser = NO_SER
+#define  BITCLR(x)	.mask = x, .result = 0
+#define  BITSET(x)	.mask = x, .result = x
+#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
+#define  MASK(x, y)	.mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #define MCACOD 0xffff
 
-	BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
-	BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
-	BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+	MCESEV(
+		NO, "Invalid",
+		BITCLR(MCI_STATUS_VAL)
+		),
+	MCESEV(
+		NO, "Not enabled",
+		BITCLR(MCI_STATUS_EN)
+		),
+	MCESEV(
+		PANIC, "Processor context corrupt",
+		BITSET(MCI_STATUS_PCC)
+		),
 	/* When MCIP is not set something is very confused */
-	MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+	MCESEV(
+		PANIC, "MCIP not set in MCA handler",
+		MCGMASK(MCG_STATUS_MCIP, 0)
+		),
 	/* Neither return not error IP -- no chance to recover -> PANIC */
-	MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
-		"Neither restart nor error IP"),
-	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
-		KERNEL),
-	BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
-	MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
-	     "Spurious not enabled", SER),
+	MCESEV(
+		PANIC, "Neither restart nor error IP",
+		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+		),
+	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
+		KEEP, "Corrected error",
+		NOSER, BITCLR(MCI_STATUS_UC)
+		),
 
 	/* ignore OVER for UCNA */
-	MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
-	     "Uncorrected no action required", SER),
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
-	     "Illegal combination (UCNA with AR=1)", SER),
-	MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+	MCESEV(
+		KEEP, "Uncorrected no action required",
+		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+		),
+	MCESEV(
+		PANIC, "Illegal combination (UCNA with AR=1)",
+		SER,
+		MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+		),
+	MCESEV(
+		KEEP, "Non signalled machine check",
+		SER, BITCLR(MCI_STATUS_S)
+		),
 
 	/* AR add known MCACODs here */
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
-	     "Action required with lost events", SER),
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
-	     "Action required; unknown MCACOD", SER),
+	MCESEV(
+		PANIC, "Action required with lost events",
+		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+		),
+	MCESEV(
+		PANIC, "Action required: unknown MCACOD",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+		),
 
 	/* known AO MCACODs: */
-	MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
-	     "Action optional: memory scrubbing error", SER),
-	MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
-	     "Action optional: last level cache writeback error", SER),
-
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
-	     "Action optional unknown MCACOD", SER),
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
-	     "Action optional with lost events", SER),
-	BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
-	BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
-	BITSET(0, SOME, "No match")	/* always matches. keep at end */
+	MCESEV(
+		AO, "Action optional: memory scrubbing error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+		),
+	MCESEV(
+		AO, "Action optional: last level cache writeback error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+		),
+	MCESEV(
+		SOME, "Action optional: unknown MCACOD",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+		),
+	MCESEV(
+		SOME, "Action optional with lost events",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+		),
+
+	MCESEV(
+		PANIC, "Overflowed uncorrected",
+		BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+		),
+	MCESEV(
+		UC, "Uncorrected",
+		BITSET(MCI_STATUS_UC)
+		),
+	MCESEV(
+		SOME, "No match",
+		BITSET(0)
+		)	/* always matches. keep at end */
 };
 
 /*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
 	return IN_KERNEL;
 }
 
-int mce_severity(struct mce *a, int tolerant, char **msg)
+int mce_severity(struct mce *m, int tolerant, char **msg)
 {
-	enum context ctx = error_context(a);
+	enum context ctx = error_context(m);
 	struct severity *s;
 
 	for (s = severities;; s++) {
-		if ((a->status & s->mask) != s->result)
+		if ((m->status & s->mask) != s->result)
 			continue;
-		if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+		if ((m->mcgstatus & s->mcgmask) != s->mcgres)
 			continue;
 		if (s->ser == SER_REQUIRED && !mce_ser)
 			continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
 
 static int __init severities_debugfs_init(void)
 {
-	struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+	struct dentry *dmce, *fsev;
 
 	dmce = mce_get_debugfs_dir();
-	if (dmce == NULL)
+	if (!dmce)
 		goto err_out;
-	fseverities_coverage = debugfs_create_file("severities-coverage",
-						   0444, dmce, NULL,
-						   &severities_coverage_fops);
-	if (fseverities_coverage == NULL)
+
+	fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+				   &severities_coverage_fops);
+	if (!fsev)
 		goto err_out;
 
 	return 0;
@@ -214,4 +258,4 @@ err_out:
 	return -ENOMEM;
 }
 late_initcall(severities_debugfs_init);
-#endif
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3385ea26f68..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
 #include <linux/thread_info.h>
 #include <linux/capability.h>
 #include <linux/miscdevice.h>
-#include <linux/interrupt.h>
 #include <linux/ratelimit.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
 #include <linux/mm.h>
 #include <linux/debugfs.h>
 #include <linux/edac_mce.h>
+#include <linux/irq_work.h>
 
 #include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
 #include "mce-internal.h"
 
-static DEFINE_MUTEX(mce_read_mutex);
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
 	rcu_dereference_index_check((p), \
 			      rcu_read_lock_sched_held() || \
-			      lockdep_is_held(&mce_read_mutex))
+			      lockdep_is_held(&mce_chrdev_read_mutex))
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long		mce_need_notify;
 static char			mce_helper[128];
 static char			*mce_helper_argv[2] = { mce_helper, NULL };
 
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
@@ -105,20 +102,6 @@ static int			cpu_missing;
 ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
 EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
 
-static int default_decode_mce(struct notifier_block *nb, unsigned long val,
-			       void *data)
-{
-	pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
-	pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
-
-	return NOTIFY_STOP;
-}
-
-static struct notifier_block mce_dec_nb = {
-	.notifier_call = default_decode_mce,
-	.priority      = -1,
-};
-
 /* MCA banks polled by the period polling timer for corrected events */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -212,6 +195,8 @@ void mce_log(struct mce *mce)
 
 static void print_mce(struct mce *m)
 {
+	int ret = 0;
+
 	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
 	       m->extcpu, m->mcgstatus, m->bank, m->status);
 
@@ -239,7 +224,11 @@ static void print_mce(struct mce *m)
 	 * Print out human-readable details about the MCE error,
 	 * (if the CPU has an implementation for that)
 	 */
-	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
+	if (ret == NOTIFY_STOP)
+		return;
+
+	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
 }
 
 #define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -381,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
 }
 
 /*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+	mce_setup(m);
+
+	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+	if (regs) {
+		/*
+		 * Get the address of the instruction at the time of
+		 * the machine check error.
+		 */
+		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+			m->ip = regs->ip;
+			m->cs = regs->cs;
+		}
+		/* Use accurate RIP reporting if available. */
+		if (rip_msr)
+			m->ip = mce_rdmsrl(rip_msr);
+	}
+}
+
+/*
  * Simple lockless ring to communicate PFNs from the exception handler with the
  * process context work function. This is vastly simplified because there's
  * only a single reader and a single writer.
@@ -451,40 +465,13 @@ static void mce_schedule_work(void)
 	}
 }
 
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
+DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 
-	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
-		m->ip = regs->ip;
-		m->cs = regs->cs;
-	} else {
-		m->ip = 0;
-		m->cs = 0;
-	}
-	if (rip_msr)
-		m->ip = mce_rdmsrl(rip_msr);
-}
-
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void mce_irq_work_cb(struct irq_work *entry)
 {
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
 	mce_notify_irq();
 	mce_schedule_work();
-	irq_exit();
 }
-#endif
 
 static void mce_report_event(struct pt_regs *regs)
 {
@@ -500,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
 		return;
 	}
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	/*
-	 * Without APIC do not notify. The event will be picked
-	 * up eventually.
-	 */
-	if (!cpu_has_apic)
-		return;
-
-	/*
-	 * When interrupts are disabled we cannot use
-	 * kernel services safely. Trigger an self interrupt
-	 * through the APIC to instead do the notification
-	 * after interrupts are reenabled again.
-	 */
-	apic->send_IPI_self(MCE_SELF_VECTOR);
-
-	/*
-	 * Wait for idle afterwards again so that we don't leave the
-	 * APIC in a non idle state because the normal APIC writes
-	 * cannot exclude us.
-	 */
-	apic_wait_icr_idle();
-#endif
+	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
 DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -549,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	percpu_inc(mce_poll_count);
 
-	mce_setup(&m);
+	mce_gather_info(&m, NULL);
 
-	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	for (i = 0; i < banks; i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
@@ -590,7 +554,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
 			mce_log(&m);
 			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
-			add_taint(TAINT_MACHINE_CHECK);
 		}
 
 		/*
@@ -888,9 +851,9 @@ static int mce_usable_address(struct mce *m)
 {
 	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 		return 0;
-	if ((m->misc & 0x3f) > PAGE_SHIFT)
+	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 		return 0;
-	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 		return 0;
 	return 1;
 }
@@ -951,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (!banks)
 		goto out;
 
-	mce_setup(&m);
+	mce_gather_info(&m, regs);
 
-	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	final = &__get_cpu_var(mces_seen);
 	*final = m;
 
@@ -1037,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 			mce_ring_add(m.addr >> PAGE_SHIFT);
 
-		mce_get_rip(&m, regs);
 		mce_log(&m);
 
 		if (severity > worst) {
@@ -1199,7 +1160,8 @@ int mce_notify_irq(void)
 	clear_thread_flag(TIF_MCE_NOTIFY);
 
 	if (test_and_clear_bit(0, &mce_need_notify)) {
-		wake_up_interruptible(&mce_wait);
+		/* wake processes polling /dev/mcelog */
+		wake_up_interruptible(&mce_chrdev_wait);
 
 		/*
 		 * There is no risk of missing notifications because
@@ -1372,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 	return 0;
 }
 
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 {
 	if (c->x86 != 5)
-		return;
+		return 0;
+
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		intel_p5_mcheck_init(c);
+		return 1;
 		break;
 	case X86_VENDOR_CENTAUR:
 		winchip_mcheck_init(c);
+		return 1;
 		break;
 	}
+
+	return 0;
 }
 
 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1437,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 	if (mce_disabled)
 		return;
 
-	__mcheck_cpu_ancient_init(c);
+	if (__mcheck_cpu_ancient_init(c))
+		return;
 
 	if (!mce_available(c))
 		return;
@@ -1453,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_timer();
 	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
+	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
 }
 
 /*
- * Character device to read and clear the MCE log.
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
  */
 
-static DEFINE_SPINLOCK(mce_state_lock);
-static int		open_count;		/* #times opened */
-static int		open_exclu;		/* already open exclusive? */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;	/* #times opened */
+static int mce_chrdev_open_exclu;	/* already open exclusive? */
 
-static int mce_open(struct inode *inode, struct file *file)
+static int mce_chrdev_open(struct inode *inode, struct file *file)
 {
-	spin_lock(&mce_state_lock);
+	spin_lock(&mce_chrdev_state_lock);
 
-	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
-		spin_unlock(&mce_state_lock);
+	if (mce_chrdev_open_exclu ||
+	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+		spin_unlock(&mce_chrdev_state_lock);
 
 		return -EBUSY;
 	}
 
 	if (file->f_flags & O_EXCL)
-		open_exclu = 1;
-	open_count++;
+		mce_chrdev_open_exclu = 1;
+	mce_chrdev_open_count++;
 
-	spin_unlock(&mce_state_lock);
+	spin_unlock(&mce_chrdev_state_lock);
 
 	return nonseekable_open(inode, file);
 }
 
-static int mce_release(struct inode *inode, struct file *file)
+static int mce_chrdev_release(struct inode *inode, struct file *file)
 {
-	spin_lock(&mce_state_lock);
+	spin_lock(&mce_chrdev_state_lock);
 
-	open_count--;
-	open_exclu = 0;
+	mce_chrdev_open_count--;
+	mce_chrdev_open_exclu = 0;
 
-	spin_unlock(&mce_state_lock);
+	spin_unlock(&mce_chrdev_state_lock);
 
 	return 0;
 }
@@ -1539,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
 	return 0;
 }
 
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
-			loff_t *off)
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
 {
 	char __user *buf = ubuf;
 	unsigned long *cpu_tsc;
@@ -1551,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	if (!cpu_tsc)
 		return -ENOMEM;
 
-	mutex_lock(&mce_read_mutex);
+	mutex_lock(&mce_chrdev_read_mutex);
 
 	if (!mce_apei_read_done) {
 		err = __mce_read_apei(&buf, usize);
@@ -1571,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	do {
 		for (i = prev; i < next; i++) {
 			unsigned long start = jiffies;
+			struct mce *m = &mcelog.entry[i];
 
-			while (!mcelog.entry[i].finished) {
+			while (!m->finished) {
 				if (time_after_eq(jiffies, start + 2)) {
-					memset(mcelog.entry + i, 0,
-					       sizeof(struct mce));
+					memset(m, 0, sizeof(*m));
 					goto timeout;
 				}
 				cpu_relax();
 			}
 			smp_rmb();
-			err |= copy_to_user(buf, mcelog.entry + i,
-					    sizeof(struct mce));
-			buf += sizeof(struct mce);
+			err |= copy_to_user(buf, m, sizeof(*m));
+			buf += sizeof(*m);
 timeout:
 			;
 		}
@@ -1603,13 +1571,13 @@ timeout:
 	on_each_cpu(collect_tscs, cpu_tsc, 1);
 
 	for (i = next; i < MCE_LOG_LEN; i++) {
-		if (mcelog.entry[i].finished &&
-		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
-			err |= copy_to_user(buf, mcelog.entry+i,
-					    sizeof(struct mce));
+		struct mce *m = &mcelog.entry[i];
+
+		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+			err |= copy_to_user(buf, m, sizeof(*m));
 			smp_rmb();
-			buf += sizeof(struct mce);
-			memset(&mcelog.entry[i], 0, sizeof(struct mce));
+			buf += sizeof(*m);
+			memset(m, 0, sizeof(*m));
 		}
 	}
 
@@ -1617,15 +1585,15 @@ timeout:
 		err = -EFAULT;
 
 out:
-	mutex_unlock(&mce_read_mutex);
+	mutex_unlock(&mce_chrdev_read_mutex);
 	kfree(cpu_tsc);
 
 	return err ? err : buf - ubuf;
 }
 
-static unsigned int mce_poll(struct file *file, poll_table *wait)
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, &mce_wait, wait);
+	poll_wait(file, &mce_chrdev_wait, wait);
 	if (rcu_access_index(mcelog.next))
 		return POLLIN | POLLRDNORM;
 	if (!mce_apei_read_done && apei_check_mce())
@@ -1633,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
 	return 0;
 }
 
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
 {
 	int __user *p = (int __user *)arg;
 
@@ -1661,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 
 /* Modified in mce-inject.c, so not static or const */
 struct file_operations mce_chrdev_ops = {
-	.open			= mce_open,
-	.release		= mce_release,
-	.read			= mce_read,
-	.poll			= mce_poll,
-	.unlocked_ioctl		= mce_ioctl,
-	.llseek		= no_llseek,
+	.open			= mce_chrdev_open,
+	.release		= mce_chrdev_release,
+	.read			= mce_chrdev_read,
+	.poll			= mce_chrdev_poll,
+	.unlocked_ioctl		= mce_chrdev_ioctl,
+	.llseek			= no_llseek,
 };
 EXPORT_SYMBOL_GPL(mce_chrdev_ops);
 
-static struct miscdevice mce_log_device = {
+static struct miscdevice mce_chrdev_device = {
 	MISC_MCELOG_MINOR,
 	"mcelog",
 	&mce_chrdev_ops,
@@ -1722,15 +1691,13 @@ __setup("mce", mcheck_enable);
 
 int __init mcheck_init(void)
 {
-	atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
-
 	mcheck_intel_therm_init();
 
 	return 0;
 }
 
 /*
- * Sysfs support
+ * mce_syscore: PM support
  */
 
 /*
@@ -1750,12 +1717,12 @@ static int mce_disable_error_reporting(void)
 	return 0;
 }
 
-static int mce_suspend(void)
+static int mce_syscore_suspend(void)
 {
 	return mce_disable_error_reporting();
 }
 
-static void mce_shutdown(void)
+static void mce_syscore_shutdown(void)
 {
 	mce_disable_error_reporting();
 }
@@ -1765,18 +1732,22 @@ static void mce_shutdown(void)
  * Only one CPU is active at this time, the others get re-added later using
  * CPU hotplug:
  */
-static void mce_resume(void)
+static void mce_syscore_resume(void)
 {
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
 }
 
 static struct syscore_ops mce_syscore_ops = {
-	.suspend	= mce_suspend,
-	.shutdown	= mce_shutdown,
-	.resume		= mce_resume,
+	.suspend	= mce_syscore_suspend,
+	.shutdown	= mce_syscore_shutdown,
+	.resume		= mce_syscore_resume,
 };
 
+/*
+ * mce_sysdev: Sysfs support
+ */
+
 static void mce_cpu_restart(void *data)
 {
 	del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1812,11 +1783,11 @@ static void mce_enable_ce(void *all)
 		__mcheck_cpu_init_timer();
 }
 
-static struct sysdev_class mce_sysclass = {
+static struct sysdev_class mce_sysdev_class = {
 	.name		= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct sys_device, mce_dev);
+DEFINE_PER_CPU(struct sys_device, mce_sysdev);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1945,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
 	&mce_cmci_disabled
 };
 
-static struct sysdev_attribute *mce_attrs[] = {
+static struct sysdev_attribute *mce_sysdev_attrs[] = {
 	&attr_tolerant.attr,
 	&attr_check_interval.attr,
 	&attr_trigger,
@@ -1956,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
 	NULL
 };
 
-static cpumask_var_t mce_dev_initialized;
+static cpumask_var_t mce_sysdev_initialized;
 
 /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
+static __cpuinit int mce_sysdev_create(unsigned int cpu)
 {
+	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
-	per_cpu(mce_dev, cpu).id	= cpu;
-	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
+	memset(&sysdev->kobj, 0, sizeof(struct kobject));
+	sysdev->id  = cpu;
+	sysdev->cls = &mce_sysdev_class;
 
-	err = sysdev_register(&per_cpu(mce_dev, cpu));
+	err = sysdev_register(sysdev);
 	if (err)
 		return err;
 
-	for (i = 0; mce_attrs[i]; i++) {
-		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+	for (i = 0; mce_sysdev_attrs[i]; i++) {
+		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
 		if (err)
 			goto error;
 	}
 	for (j = 0; j < banks; j++) {
-		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
-					&mce_banks[j].attr);
+		err = sysdev_create_file(sysdev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
 	}
-	cpumask_set_cpu(cpu, mce_dev_initialized);
+	cpumask_set_cpu(cpu, mce_sysdev_initialized);
 
 	return 0;
 error2:
 	while (--j >= 0)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
+		sysdev_remove_file(sysdev, &mce_banks[j].attr);
 error:
 	while (--i >= 0)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
 
-	sysdev_unregister(&per_cpu(mce_dev, cpu));
+	sysdev_unregister(sysdev);
 
 	return err;
 }
 
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_sysdev_remove(unsigned int cpu)
 {
+	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
 	int i;
 
-	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
 		return;
 
-	for (i = 0; mce_attrs[i]; i++)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+	for (i = 0; mce_sysdev_attrs[i]; i++)
+		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
 
 	for (i = 0; i < banks; i++)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+		sysdev_remove_file(sysdev, &mce_banks[i].attr);
 
-	sysdev_unregister(&per_cpu(mce_dev, cpu));
-	cpumask_clear_cpu(cpu, mce_dev_initialized);
+	sysdev_unregister(sysdev);
+	cpumask_clear_cpu(cpu, mce_sysdev_initialized);
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
@@ -2065,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		mce_create_device(cpu);
+		mce_sysdev_create(cpu);
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		break;
@@ -2073,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD_FROZEN:
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
-		mce_remove_device(cpu);
+		mce_sysdev_remove(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -2127,27 +2099,28 @@ static __init int mcheck_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
 
 	mce_init_banks();
 
-	err = sysdev_class_register(&mce_sysclass);
+	err = sysdev_class_register(&mce_sysdev_class);
 	if (err)
 		return err;
 
 	for_each_online_cpu(i) {
-		err = mce_create_device(i);
+		err = mce_sysdev_create(i);
 		if (err)
 			return err;
 	}
 
 	register_syscore_ops(&mce_syscore_ops);
 	register_hotcpu_notifier(&mce_cpu_notifier);
-	misc_register(&mce_log_device);
+
+	/* register character device /dev/mcelog */
+	misc_register(&mce_chrdev_device);
 
 	return err;
 }
-
 device_initcall(mcheck_init_device);
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 167f97b5596..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -509,6 +509,7 @@ recurse:
 out_free:
 	if (b) {
 		kobject_put(&b->kobj);
+		list_del(&b->miscj);
 		kfree(b);
 	}
 	return err;
@@ -547,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -570,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -590,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -668,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -680,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 6f8c5e9da97..27c625178bf 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
 				this_cpu,
 				level == CORE_LEVEL ? "Core" : "Package",
 				state->count);
-
-		add_taint(TAINT_MACHINE_CHECK);
 		return 1;
 	}
 	if (old_event) {
@@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
 static void intel_thermal_interrupt(void)
 {
 	__u64 msr_val;
-	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 
@@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void)
 				CORE_LEVEL) != 0)
 		mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
 
-	if (cpu_has(c, X86_FEATURE_PLN))
+	if (this_cpu_has(X86_FEATURE_PLN))
 		if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
 					POWER_LIMIT_EVENT,
 					CORE_LEVEL) != 0)
 			mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
 
-	if (cpu_has(c, X86_FEATURE_PTS)) {
+	if (this_cpu_has(X86_FEATURE_PTS)) {
 		rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
 		if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
 					THERMAL_THROTTLING_EVENT,
 					PACKAGE_LEVEL) != 0)
 			mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
-		if (cpu_has(c, X86_FEATURE_PLN))
+		if (this_cpu_has(X86_FEATURE_PLN))
 			if (therm_throt_process(msr_val &
 					PACKAGE_THERM_STATUS_POWER_LIMIT,
 					POWER_LIMIT_EVENT,
@@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void)
 {
 	printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
 			smp_processor_id());
-	add_taint(TAINT_MACHINE_CHECK);
 }
 
 static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -446,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 	 */
 	rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 
+	h = lvtthmr_init;
 	/*
 	 * The initial value of thermal LVT entries on all APs always reads
 	 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
 	 * sequence to them and LVT registers are reset to 0s except for
 	 * the mask bits which are set to 1s when APs receive INIT IPI.
-	 * Always restore the value that BIOS has programmed on AP based on
-	 * BSP's info we saved since BIOS is always setting the same value
-	 * for all threads/cores
+	 * If BIOS takes over the thermal interrupt and sets its interrupt
+	 * delivery mode to SMI (not fixed), restore on the AP the value that
+	 * BIOS programmed, using the BSP's value we saved, since BIOS always
+	 * sets the same value for all threads/cores.
 	 */
-	apic_write(APIC_LVTTHMR, lvtthmr_init);
+	if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
+		apic_write(APIC_LVTTHMR, lvtthmr_init);
 
-	h = lvtthmr_init;
 
 	if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
 		printk(KERN_DEBUG
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..08119a37e53 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
 static int have_wrcomb(void)
 {
 	struct pci_dev *dev;
-	u8 rev;
 
 	dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
 	if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
 		 * chipsets to be tagged
 		 */
 		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
-		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
-			pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
-			if (rev <= 5) {
-				pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
-				pci_dev_put(dev);
-				return 0;
-			}
+		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+		    dev->revision <= 5) {
+			pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+			pci_dev_put(dev);
+			return 0;
 		}
 		/*
 		 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
 }
 
 struct set_mtrr_data {
-	atomic_t	count;
-	atomic_t	gate;
 	unsigned long	smp_base;
 	unsigned long	smp_size;
 	unsigned int	smp_reg;
 	mtrr_type	smp_type;
 };
 
-static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
-
 /**
- * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
  * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
-static int mtrr_work_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
 {
 #ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
-	unsigned long flags;
-
-	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
-		cpu_relax();
-
-	local_irq_save(flags);
-
-	atomic_dec(&data->count);
-	while (atomic_read(&data->gate))
-		cpu_relax();
 
-	/*  The master has cleared me to execute  */
+	/*
+	 * We use this same function to initialize the mtrrs during boot,
+	 * resume, runtime cpu online and on an explicit request to set a
+	 * specific MTRR.
+	 *
+	 * During boot or suspend, the state of the boot cpu's mtrrs has been
+	 * saved, and we want to replicate that across all the cpus that come
+	 * online (either at the end of boot or resume or during a runtime cpu
+	 * online). If we're doing that, data->smp_reg is set to ~0U and on
+	 * all the cpus we do mtrr_if->set_all() (on the logical cpu that
+	 * started the boot/resume sequence, this might be a duplicate
+	 * set_all()).
+	 */
 	if (data->smp_reg != ~0U) {
 		mtrr_if->set(data->smp_reg, data->smp_base,
 			     data->smp_size, data->smp_type);
-	} else if (mtrr_aps_delayed_init) {
-		/*
-		 * Initialize the MTRRs inaddition to the synchronisation.
-		 */
+	} else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
 		mtrr_if->set_all();
 	}
-
-	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
-		cpu_relax();
-
-	atomic_dec(&data->count);
-	local_irq_restore(flags);
 #endif
 	return 0;
 }
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
  * 14. Wait for buddies to catch up
  * 15. Enable interrupts.
  *
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU announces that it started the rendezvous handler by
- * decrementing the count, We reset data.count and set the data.gate flag
- * allowing all the cpu's to proceed with the work. As each cpu disables
- * interrupts, it'll decrement data.count once. We wait until it hits 0 and
- * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
- * are waiting for that flag to be cleared. Once it's cleared, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be set.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRRs (the
+ * CPU vendors may each do it differently, so we call the mtrr_if->set()
+ * callback and let them take care of it) and enabling interrupts.
  *
  * Note that the mechanism is the same for UP systems, too; all the SMP stuff
  * becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
 static void
 set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
 {
-	struct set_mtrr_data data;
-	unsigned long flags;
-	int cpu;
-
-	preempt_disable();
-
-	data.smp_reg = reg;
-	data.smp_base = base;
-	data.smp_size = size;
-	data.smp_type = type;
-	atomic_set(&data.count, num_booting_cpus() - 1);
-
-	/* Make sure data.count is visible before unleashing other CPUs */
-	smp_wmb();
-	atomic_set(&data.gate, 0);
-
-	/* Start the ball rolling on other CPUs */
-	for_each_online_cpu(cpu) {
-		struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
-
-		if (cpu == smp_processor_id())
-			continue;
-
-		stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
-	}
-
-
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	/* Ok, reset count and toggle gate */
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 1);
-
-	local_irq_save(flags);
-
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	/* Ok, reset count and toggle gate */
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 0);
-
-	/* Do our MTRR business */
-
-	/*
-	 * HACK!
-	 *
-	 * We use this same function to initialize the mtrrs during boot,
-	 * resume, runtime cpu online and on an explicit request to set a
-	 * specific MTRR.
-	 *
-	 * During boot or suspend, the state of the boot cpu's mtrrs has been
-	 * saved, and we want to replicate that across all the cpus that come
-	 * online (either at the end of boot or resume or during a runtime cpu
-	 * online). If we're doing that, @reg is set to something special and on
-	 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
-	 * is unnecessary if at this point we are still on the cpu that started
-	 * the boot/resume sequence. But there is no guarantee that we are still
-	 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
-	 * sure that we are in sync with everyone else.
-	 */
-	if (reg != ~0U)
-		mtrr_if->set(reg, base, size, type);
-	else
-		mtrr_if->set_all();
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
 
-	/* Wait for the others */
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 1);
-
-	/*
-	 * Wait here for everyone to have seen the gate change
-	 * So we're the last ones to touch 'data'
-	 */
-	while (atomic_read(&data.count))
-		cpu_relax();
+	stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
 
-	local_irq_restore(flags);
-	preempt_enable();
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+				      unsigned long size, mtrr_type type)
+{
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
+
+	stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+				       cpu_callout_mask);
 }
 
 /**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
 	 *   2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
 	 *      lock to prevent mtrr entry changes
 	 */
-	set_mtrr(~0U, 0, 0, 0);
+	set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index eed3673a865..4ee3abf20ed 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
-#include <linux/highmem.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
 
@@ -31,6 +30,7 @@
 #include <asm/nmi.h>
 #include <asm/compat.h>
 #include <asm/smp.h>
+#include <asm/alternative.h>
 
 #if 0
 #undef wrmsrl
@@ -44,38 +44,27 @@ do {								\
 #endif
 
 /*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
  */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-	unsigned long offset, addr = (unsigned long)from;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
+enum extra_reg_type {
+	EXTRA_REG_NONE  = -1,	/* not used */
 
-		map = kmap_atomic(page);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map);
-		put_page(page);
+	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
+	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
 
-		len  += size;
-		to   += size;
-		addr += size;
-
-	} while (len < n);
-
-	return len;
-}
+	EXTRA_REG_MAX		/* number of entries needed */
+};
 
 struct event_constraint {
 	union {
@@ -131,11 +120,10 @@ struct cpu_hw_events {
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 
 	/*
-	 * Intel percore register state.
-	 * Coordinate shared resources between HT threads.
+	 * manage shared (per-core, per-cpu) registers
+	 * used on Intel NHM/WSM/SNB
 	 */
-	int				percore_used; /* Used by this CPU? */
-	struct intel_percore		*per_core;
+	struct intel_shared_regs	*shared_regs;
 
 	/*
 	 * AMD specific bits
@@ -186,26 +174,45 @@ struct cpu_hw_events {
 	for ((e) = (c); (e)->weight; (e)++)
 
 /*
+ * Per register state.
+ */
+struct er_account {
+	raw_spinlock_t		lock;	/* per-core: protect structure */
+	u64			config;	/* extra MSR config */
+	u64			reg;	/* extra MSR number */
+	atomic_t		ref;	/* reference count */
+};
+
+/*
  * Extra registers for specific events.
+ *
  * Some events need large masks and require external MSRs.
- * Define a mapping to these extra registers.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMUs of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_events.
  */
 struct extra_reg {
 	unsigned int		event;
 	unsigned int		msr;
 	u64			config_mask;
 	u64			valid_mask;
+	int			idx;  /* per_xxx->regs[] reg index */
 };
 
-#define EVENT_EXTRA_REG(e, ms, m, vm) {	\
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\
 	.event = (e),		\
 	.msr = (ms),		\
 	.config_mask = (m),	\
 	.valid_mask = (vm),	\
+	.idx = EXTRA_REG_##i	\
 	}
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm)	\
-	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
 
 union perf_capabilities {
 	struct {
@@ -251,7 +258,6 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
-	struct event_constraint *percore_constraints;
 	void		(*quirks)(void);
 	int		perfctr_second_write;
 
@@ -285,8 +291,12 @@ struct x86_pmu {
 	 * Extra registers for events
 	 */
 	struct extra_reg *extra_regs;
+	unsigned int er_flags;
 };
 
+#define ERF_NO_HT_SHARING	1
+#define ERF_HAS_RSP_1		2
+
 static struct x86_pmu x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -363,12 +373,18 @@ again:
 	return new_raw_count;
 }
 
-/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
 static inline int x86_pmu_addr_offset(int index)
 {
-	if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
-		return index << 1;
-	return index;
+	int offset;
+
+	/* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
+	alternative_io(ASM_NOP2,
+		       "shll $1, %%eax",
+		       X86_FEATURE_PERFCTR_CORE,
+		       "=a" (offset),
+		       "a"  (index));
+
+	return offset;
 }
 
 static inline unsigned int x86_pmu_config_addr(int index)
@@ -386,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
  */
 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 {
+	struct hw_perf_event_extra *reg;
 	struct extra_reg *er;
 
-	event->hw.extra_reg = 0;
-	event->hw.extra_config = 0;
+	reg = &event->hw.extra_reg;
 
 	if (!x86_pmu.extra_regs)
 		return 0;
@@ -399,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 			continue;
 		if (event->attr.config1 & ~er->valid_mask)
 			return -EINVAL;
-		event->hw.extra_reg = er->msr;
-		event->hw.extra_config = event->attr.config1;
+
+		reg->idx = er->idx;
+		reg->config = event->attr.config1;
+		reg->reg = er->msr;
 		break;
 	}
 	return 0;
@@ -586,8 +604,12 @@ static int x86_setup_perfctr(struct perf_event *event)
 			return -EOPNOTSUPP;
 	}
 
+	/*
+	 * Do not allow config1 (extended registers) to propagate,
+	 * there's no sane user-space generalization yet:
+	 */
 	if (attr->type == PERF_TYPE_RAW)
-		return x86_pmu_extra_regs(event->attr.config, event);
+		return 0;
 
 	if (attr->type == PERF_TYPE_HW_CACHE)
 		return set_ext_hw_attr(hwc, event);
@@ -609,8 +631,8 @@ static int x86_setup_perfctr(struct perf_event *event)
 	/*
 	 * Branch tracing:
 	 */
-	if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
-	    (hwc->sample_period == 1)) {
+	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+	    !attr->freq && hwc->sample_period == 1) {
 		/* BTS is not supported by this architecture. */
 		if (!x86_pmu.bts_active)
 			return -EOPNOTSUPP;
@@ -695,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
 
+	/* mark unused */
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+
 	return x86_pmu.hw_config(event);
 }
 
@@ -736,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
-	if (hwc->extra_reg)
-		wrmsrl(hwc->extra_reg, hwc->extra_config);
+	if (hwc->extra_reg.reg)
+		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
 	wrmsrl(hwc->config_base, hwc->config | enable_mask);
 }
 
@@ -1284,6 +1309,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
+	/*
+	 * Some chipsets need to unmask the LVTPC in a particular spot
+	 * inside the nmi handler.  As a result, the unmasking was pushed
+	 * into all the nmi handlers.
+	 *
+	 * This generic handler doesn't seem to have any issues where the
+	 * unmasking occurs so it was left at the top.
+	 */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		if (!test_bit(idx, cpuc->active_mask)) {
 			/*
@@ -1311,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_event_set_period(event))
 			continue;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
@@ -1370,8 +1405,6 @@ perf_event_nmi_handler(struct notifier_block *self,
 		return NOTIFY_DONE;
 	}
 
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-
 	handled = x86_pmu.handle_irq(args->regs);
 	if (!handled)
 		return NOTIFY_DONE;
@@ -1618,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
 	perf_pmu_enable(pmu);
 	return 0;
 }
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+	kfree(cpuc->shared_regs);
+	kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+	struct cpu_hw_events *cpuc;
+	int cpu = raw_smp_processor_id();
+
+	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+	if (!cpuc)
+		return ERR_PTR(-ENOMEM);
+
+	/* only needed, if we have extra_regs */
+	if (x86_pmu.extra_regs) {
+		cpuc->shared_regs = allocate_shared_regs(cpu);
+		if (!cpuc->shared_regs)
+			goto error;
+	}
+	return cpuc;
+error:
+	free_fake_cpuc(cpuc);
+	return ERR_PTR(-ENOMEM);
+}
 
 /*
  * validate that we can schedule this event
@@ -1628,9 +1695,9 @@ static int validate_event(struct perf_event *event)
 	struct event_constraint *c;
 	int ret = 0;
 
-	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
-	if (!fake_cpuc)
-		return -ENOMEM;
+	fake_cpuc = allocate_fake_cpuc();
+	if (IS_ERR(fake_cpuc))
+		return PTR_ERR(fake_cpuc);
 
 	c = x86_pmu.get_event_constraints(fake_cpuc, event);
 
@@ -1640,7 +1707,7 @@ static int validate_event(struct perf_event *event)
 	if (x86_pmu.put_event_constraints)
 		x86_pmu.put_event_constraints(fake_cpuc, event);
 
-	kfree(fake_cpuc);
+	free_fake_cpuc(fake_cpuc);
 
 	return ret;
 }
@@ -1660,36 +1727,32 @@ static int validate_group(struct perf_event *event)
 {
 	struct perf_event *leader = event->group_leader;
 	struct cpu_hw_events *fake_cpuc;
-	int ret, n;
-
-	ret = -ENOMEM;
-	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
-	if (!fake_cpuc)
-		goto out;
+	int ret = -ENOSPC, n;
 
+	fake_cpuc = allocate_fake_cpuc();
+	if (IS_ERR(fake_cpuc))
+		return PTR_ERR(fake_cpuc);
 	/*
 	 * the event is not yet connected with its
 	 * siblings therefore we must first collect
 	 * existing siblings, then add the new event
 	 * before we can simulate the scheduling
 	 */
-	ret = -ENOSPC;
 	n = collect_events(fake_cpuc, leader, true);
 	if (n < 0)
-		goto out_free;
+		goto out;
 
 	fake_cpuc->n_events = n;
 	n = collect_events(fake_cpuc, event, false);
 	if (n < 0)
-		goto out_free;
+		goto out;
 
 	fake_cpuc->n_events = n;
 
 	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 
-out_free:
-	kfree(fake_cpuc);
 out:
+	free_fake_cpuc(fake_cpuc);
 	return ret;
 }
 
@@ -1754,17 +1817,6 @@ static struct pmu pmu = {
  * callchain support
  */
 
-static void
-backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	/* Ignore warnings */
-}
-
-static void backtrace_warning(void *data, char *msg)
-{
-	/* Ignore warnings */
-}
-
 static int backtrace_stack(void *data, char *name)
 {
 	return 0;
@@ -1778,8 +1830,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 }
 
 static const struct stacktrace_ops backtrace_ops = {
-	.warning		= backtrace_warning,
-	.warning_symbol		= backtrace_warning_symbol,
 	.stack			= backtrace_stack,
 	.address		= backtrace_address,
 	.walk_stack		= print_context_stack_bp,
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 461f62bbd77..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -8,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
  [ C(L1D) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses        */
-		[ C(RESULT_MISS)   ] = 0x0041, /* Data Cache Misses          */
+		[ C(RESULT_MISS)   ] = 0x0141, /* Data Cache Misses          */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+		[ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 /*
@@ -96,12 +110,14 @@ static __initconst const u64 amd_hw_cache_event_ids
  */
 static const u64 amd_perfmon_event_map[] =
 {
-  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
-  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
-  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
-  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c2,
-  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c3,
+  [PERF_COUNT_HW_CPU_CYCLES]			= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]			= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]		= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]			= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]		= 0x00c2,
+  [PERF_COUNT_HW_BRANCH_MISSES]			= 0x00c3,
+  [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= 0x00d0, /* "Decoder empty" event */
+  [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= 0x00d1, /* "Dispatch stalls" event */
 };
 
 static u64 amd_pmu_event_map(int hw_event)
@@ -427,7 +443,9 @@ static __initconst const struct x86_pmu amd_pmu = {
  *
  * Exceptions:
  *
+ * 0x000	FP	PERF_CTL[3], PERF_CTL[5:3] (*)
  * 0x003	FP	PERF_CTL[3]
+ * 0x004	FP	PERF_CTL[3], PERF_CTL[5:3] (*)
  * 0x00B	FP	PERF_CTL[3]
  * 0x00D	FP	PERF_CTL[3]
  * 0x023	DE	PERF_CTL[2:0]
@@ -448,6 +466,8 @@ static __initconst const struct x86_pmu amd_pmu = {
  * 0x0DF	LS	PERF_CTL[5:0]
  * 0x1D6	EX	PERF_CTL[5:0]
  * 0x1D8	EX	PERF_CTL[5:0]
+ *
+ * (*) depending on the umask all FPU counters may be used
  */
 
 static struct event_constraint amd_f15_PMC0  = EVENT_CONSTRAINT(0, 0x01, 0);
@@ -460,18 +480,28 @@ static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
 static struct event_constraint *
 amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
 {
-	unsigned int event_code = amd_get_event_code(&event->hw);
+	struct hw_perf_event *hwc = &event->hw;
+	unsigned int event_code = amd_get_event_code(hwc);
 
 	switch (event_code & AMD_EVENT_TYPE_MASK) {
 	case AMD_EVENT_FP:
 		switch (event_code) {
+		case 0x000:
+			if (!(hwc->config & 0x0000F000ULL))
+				break;
+			if (!(hwc->config & 0x00000F00ULL))
+				break;
+			return &amd_f15_PMC3;
+		case 0x004:
+			if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
+				break;
+			return &amd_f15_PMC3;
 		case 0x003:
 		case 0x00B:
 		case 0x00D:
 			return &amd_f15_PMC3;
-		default:
-			return &amd_f15_PMC53;
 		}
+		return &amd_f15_PMC53;
 	case AMD_EVENT_LS:
 	case AMD_EVENT_DC:
 	case AMD_EVENT_EX_LS:
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8fc2b2cee1d..45fbb8f7f54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,31 +1,21 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
-#define MAX_EXTRA_REGS 2
-
-/*
- * Per register state.
- */
-struct er_account {
-	int			ref;		/* reference count */
-	unsigned int		extra_reg;	/* extra MSR number */
-	u64			extra_config;	/* extra MSR config */
-};
-
 /*
- * Per core state
- * This used to coordinate shared registers for HT threads.
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
  */
-struct intel_percore {
-	raw_spinlock_t		lock;		/* protect structure */
-	struct er_account	regs[MAX_EXTRA_REGS];
-	int			refcnt;		/* number of threads */
-	unsigned		core_id;
+struct intel_shared_regs {
+	struct er_account       regs[EXTRA_REG_MAX];
+	int                     refcnt;		/* per-core: #HT threads */
+	unsigned                core_id;	/* per-core: core id */
 };
 
 /*
  * Intel PerfMon, used on Core and later.
  */
-static const u64 intel_perfmon_event_map[] =
+static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
 {
   [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
@@ -36,7 +26,7 @@ static const u64 intel_perfmon_event_map[] =
   [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
 };
 
-static struct event_constraint intel_core_event_constraints[] =
+static struct event_constraint intel_core_event_constraints[] __read_mostly =
 {
 	INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
 	INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -47,7 +37,7 @@ static struct event_constraint intel_core_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_core2_event_constraints[] =
+static struct event_constraint intel_core2_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -70,7 +60,7 @@ static struct event_constraint intel_core2_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_nehalem_event_constraints[] =
+static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -86,19 +76,13 @@ static struct event_constraint intel_nehalem_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
-static struct extra_reg intel_nehalem_extra_regs[] =
+static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_nehalem_percore_constraints[] =
-{
-	INTEL_EVENT_CONSTRAINT(0xb7, 0),
-	EVENT_CONSTRAINT_END
-};
-
-static struct event_constraint intel_westmere_event_constraints[] =
+static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -110,34 +94,30 @@ static struct event_constraint intel_westmere_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_snb_event_constraints[] =
+static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
 	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
 	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
-	INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
-	INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 	EVENT_CONSTRAINT_END
 };
 
-static struct extra_reg intel_westmere_extra_regs[] =
+static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
-	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_westmere_percore_constraints[] =
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
 {
-	INTEL_EVENT_CONSTRAINT(0xb7, 0),
-	INTEL_EVENT_CONSTRAINT(0xbb, 0),
 	EVENT_CONSTRAINT_END
 };
 
-static struct event_constraint intel_gen_event_constraints[] =
+static struct event_constraint intel_gen_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+	EVENT_EXTRA_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -184,26 +170,23 @@ static __initconst const u64 snb_hw_cache_event_ids
 	},
  },
  [ C(LL  ) ] = {
-	/*
-	 * TBD: Need Off-core Response Performance Monitoring support
-	 */
 	[ C(OP_READ) ] = {
-		/* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+		/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
 		[ C(RESULT_ACCESS) ] = 0x01b7,
-		/* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
-		[ C(RESULT_MISS)   ] = 0x01bb,
+		/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_WRITE) ] = {
-		/* OFFCORE_RESPONSE_0.ANY_RFO.LOCAL_CACHE */
+		/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
 		[ C(RESULT_ACCESS) ] = 0x01b7,
-		/* OFFCORE_RESPONSE_1.ANY_RFO.ANY_LLC_MISS */
-		[ C(RESULT_MISS)   ] = 0x01bb,
+		/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		/* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+		/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
 		[ C(RESULT_ACCESS) ] = 0x01b7,
-		/* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
-		[ C(RESULT_MISS)   ] = 0x01bb,
+		/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
  },
  [ C(DTLB) ] = {
@@ -248,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+
 };
 
 static __initconst const u64 westmere_hw_cache_event_ids
@@ -285,26 +283,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
  },
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		/* OFFCORE_RESPONSE_0.ANY_DATA.LOCAL_CACHE */
+		/* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
 		[ C(RESULT_ACCESS) ] = 0x01b7,
-		/* OFFCORE_RESPONSE_1.ANY_DATA.ANY_LLC_MISS */
-		[ C(RESULT_MISS)   ] = 0x01bb,
+		/* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	/*
 	 * Use RFO, not WRITEBACK, because a write miss would typically occur
 	 * on RFO.
 	 */
 	[ C(OP_WRITE) ] = {
-		/* OFFCORE_RESPONSE_1.ANY_RFO.LOCAL_CACHE */
-		[ C(RESULT_ACCESS) ] = 0x01bb,
-		/* OFFCORE_RESPONSE_0.ANY_RFO.ANY_LLC_MISS */
+		/* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		/* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
 		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
 	[ C(OP_PREFETCH) ] = {
-		/* OFFCORE_RESPONSE_0.PREFETCH.LOCAL_CACHE */
+		/* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
 		[ C(RESULT_ACCESS) ] = 0x01b7,
-		/* OFFCORE_RESPONSE_1.PREFETCH.ANY_LLC_MISS */
-		[ C(RESULT_MISS)   ] = 0x01bb,
+		/* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
+		[ C(RESULT_MISS)   ] = 0x01b7,
 	},
  },
  [ C(DTLB) ] = {
@@ -349,19 +347,53 @@ static __initconst const u64 westmere_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+ },
 };
 
 /*
- * OFFCORE_RESPONSE MSR bits (subset), See IA32 SDM Vol 3 30.6.1.3
+ * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
+ * See IA32 SDM Vol 3B 30.6.1.3
  */
 
-#define DMND_DATA_RD     (1 << 0)
-#define DMND_RFO         (1 << 1)
-#define DMND_WB          (1 << 3)
-#define PF_DATA_RD       (1 << 4)
-#define PF_DATA_RFO      (1 << 5)
-#define RESP_UNCORE_HIT  (1 << 8)
-#define RESP_MISS        (0xf600) /* non uncore hit */
+#define NHM_DMND_DATA_RD	(1 << 0)
+#define NHM_DMND_RFO		(1 << 1)
+#define NHM_DMND_IFETCH		(1 << 2)
+#define NHM_DMND_WB		(1 << 3)
+#define NHM_PF_DATA_RD		(1 << 4)
+#define NHM_PF_DATA_RFO		(1 << 5)
+#define NHM_PF_IFETCH		(1 << 6)
+#define NHM_OFFCORE_OTHER	(1 << 7)
+#define NHM_UNCORE_HIT		(1 << 8)
+#define NHM_OTHER_CORE_HIT_SNP	(1 << 9)
+#define NHM_OTHER_CORE_HITM	(1 << 10)
+        			/* reserved */
+#define NHM_REMOTE_CACHE_FWD	(1 << 12)
+#define NHM_REMOTE_DRAM		(1 << 13)
+#define NHM_LOCAL_DRAM		(1 << 14)
+#define NHM_NON_DRAM		(1 << 15)
+
+#define NHM_ALL_DRAM		(NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
+
+#define NHM_DMND_READ		(NHM_DMND_DATA_RD)
+#define NHM_DMND_WRITE		(NHM_DMND_RFO|NHM_DMND_WB)
+#define NHM_DMND_PREFETCH	(NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
+
+#define NHM_L3_HIT	(NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
+#define NHM_L3_MISS	(NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
+#define NHM_L3_ACCESS	(NHM_L3_HIT|NHM_L3_MISS)
 
 static __initconst const u64 nehalem_hw_cache_extra_regs
 				[PERF_COUNT_HW_CACHE_MAX]
@@ -370,18 +402,32 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
 {
  [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = DMND_DATA_RD|RESP_UNCORE_HIT,
-		[ C(RESULT_MISS)   ] = DMND_DATA_RD|RESP_MISS,
+		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
+		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_L3_MISS,
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = DMND_RFO|DMND_WB|RESP_UNCORE_HIT,
-		[ C(RESULT_MISS)   ] = DMND_RFO|DMND_WB|RESP_MISS,
+		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
+		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_L3_MISS,
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = PF_DATA_RD|PF_DATA_RFO|RESP_UNCORE_HIT,
-		[ C(RESULT_MISS)   ] = PF_DATA_RD|PF_DATA_RFO|RESP_MISS,
+		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
+		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
 	},
- }
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+	},
+ },
 };
 
 static __initconst const u64 nehalem_hw_cache_event_ids
@@ -391,12 +437,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
 {
  [ C(L1D) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
-		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+		[ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS       */
+		[ C(RESULT_MISS)   ] = 0x0151, /* L1D.REPL                     */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
-		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+		[ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES      */
+		[ C(RESULT_MISS)   ] = 0x0251, /* L1D.M_REPL                   */
 	},
 	[ C(OP_PREFETCH) ] = {
 		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
@@ -483,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+ },
 };
 
 static __initconst const u64 core2_hw_cache_event_ids
@@ -933,6 +993,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
+	/*
+	 * Some chipsets need to unmask the LVTPC in a particular spot
+	 * inside the nmi handler.  As a result, the unmasking was pushed
+	 * into all the nmi handlers.
+	 *
+	 * This handler doesn't seem to have any issues with the unmasking
+	 * so it was left at the top.
+	 */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+
 	intel_pmu_disable_all();
 	handled = intel_pmu_drain_bts_buffer();
 	status = intel_pmu_get_status();
@@ -976,7 +1046,7 @@ again:
 
 		data.period = event->hw.last_period;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
@@ -998,6 +1068,9 @@ intel_bts_constraints(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned int hw_event, bts_event;
 
+	if (event->attr.freq)
+		return NULL;
+
 	hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
 	bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
 
@@ -1007,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }
 
+static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+{
+	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+		return false;
+
+	if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
+		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+		event->hw.config |= 0x01bb;
+		event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
+		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+	} else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+		event->hw.config |= 0x01b7;
+		event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
+		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+	}
+
+	if (event->hw.extra_reg.idx == orig_idx)
+		return false;
+
+	return true;
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
 static struct event_constraint *
-intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+				   struct perf_event *event)
 {
-	struct hw_perf_event *hwc = &event->hw;
-	unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
-	struct event_constraint *c;
-	struct intel_percore *pc;
+	struct event_constraint *c = &emptyconstraint;
+	struct hw_perf_event_extra *reg = &event->hw.extra_reg;
 	struct er_account *era;
-	int i;
-	int free_slot;
-	int found;
+	unsigned long flags;
+	int orig_idx = reg->idx;
 
-	if (!x86_pmu.percore_constraints || hwc->extra_alloc)
-		return NULL;
+	/* already allocated shared msr */
+	if (reg->alloc)
+		return &unconstrained;
 
-	for (c = x86_pmu.percore_constraints; c->cmask; c++) {
-		if (e != c->code)
-			continue;
+again:
+	era = &cpuc->shared_regs->regs[reg->idx];
+	/*
+	 * irqsave avoids lockdep issues when passing a fake cpuc; every
+	 * unlock path below must restore the irq flags saved here
+	 */
+	raw_spin_lock_irqsave(&era->lock, flags);
+
+	if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+		/* lock in msr value */
+		era->config = reg->config;
+		era->reg = reg->reg;
+
+		/* one more user */
+		atomic_inc(&era->ref);
+
+		/* no need to reallocate during incremental event scheduling */
+		reg->alloc = 1;
 
 		/*
-		 * Allocate resource per core.
+		 * All events using extra_reg are unconstrained.
+		 * Avoids calling x86_get_event_constraints()
+		 *
+		 * Must revisit if extra_reg controlling events
+		 * ever have constraints. Worst case we go through
+		 * the regular event constraint table.
 		 */
-		pc = cpuc->per_core;
-		if (!pc)
-			break;
-		c = &emptyconstraint;
-		raw_spin_lock(&pc->lock);
-		free_slot = -1;
-		found = 0;
-		for (i = 0; i < MAX_EXTRA_REGS; i++) {
-			era = &pc->regs[i];
-			if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
-				/* Allow sharing same config */
-				if (hwc->extra_config == era->extra_config) {
-					era->ref++;
-					cpuc->percore_used = 1;
-					hwc->extra_alloc = 1;
-					c = NULL;
-				}
-				/* else conflict */
-				found = 1;
-				break;
-			} else if (era->ref == 0 && free_slot == -1)
-				free_slot = i;
-		}
-		if (!found && free_slot != -1) {
-			era = &pc->regs[free_slot];
-			era->ref = 1;
-			era->extra_reg = hwc->extra_reg;
-			era->extra_config = hwc->extra_config;
-			cpuc->percore_used = 1;
-			hwc->extra_alloc = 1;
-			c = NULL;
-		}
-		raw_spin_unlock(&pc->lock);
-		return c;
+		c = &unconstrained;
+	} else if (intel_try_alt_er(event, orig_idx)) {
+		raw_spin_unlock_irqrestore(&era->lock, flags);
+		goto again;
 	}
+	raw_spin_unlock_irqrestore(&era->lock, flags);
 
-	return NULL;
+	return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+				   struct hw_perf_event_extra *reg)
+{
+	struct er_account *era;
+
+	/*
+	 * Only drop a reference if the extra reg was actually
+	 * allocated. This also takes care of events which do
+	 * not use a shared extra reg.
+	 */
+	if (!reg->alloc)
+		return;
+
+	era = &cpuc->shared_regs->regs[reg->idx];
+
+	/* one fewer user of this shared register */
+	atomic_dec(&era->ref);
+
+	/* clear the flag so the next get_constraints call allocates again */
+	reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+			      struct perf_event *event)
+{
+	struct event_constraint *c = NULL;
+
+	if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
+		c = __intel_shared_reg_get_constraints(cpuc, event);
+
+	return c;
 }
 
 static struct event_constraint *
@@ -1081,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	c = intel_percore_constraints(cpuc, event);
+	c = intel_shared_regs_constraints(cpuc, event);
 	if (c)
 		return c;
 
 	return x86_get_event_constraints(cpuc, event);
 }
 
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
-	struct extra_reg *er;
-	struct intel_percore *pc;
-	struct er_account *era;
-	struct hw_perf_event *hwc = &event->hw;
-	int i, allref;
-
-	if (!cpuc->percore_used)
-		return;
+	struct hw_perf_event_extra *reg;
 
-	for (er = x86_pmu.extra_regs; er->msr; er++) {
-		if (er->event != (hwc->config & er->config_mask))
-			continue;
+	reg = &event->hw.extra_reg;
+	if (reg->idx != EXTRA_REG_NONE)
+		__intel_shared_reg_put_constraints(cpuc, reg);
+}
 
-		pc = cpuc->per_core;
-		raw_spin_lock(&pc->lock);
-		for (i = 0; i < MAX_EXTRA_REGS; i++) {
-			era = &pc->regs[i];
-			if (era->ref > 0 &&
-			    era->extra_config == hwc->extra_config &&
-			    era->extra_reg == er->msr) {
-				era->ref--;
-				hwc->extra_alloc = 0;
-				break;
-			}
-		}
-		allref = 0;
-		for (i = 0; i < MAX_EXTRA_REGS; i++)
-			allref += pc->regs[i].ref;
-		if (allref == 0)
-			cpuc->percore_used = 0;
-		raw_spin_unlock(&pc->lock);
-		break;
-	}
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	intel_put_shared_regs_event_constraints(cpuc, event);
 }
 
 static int intel_pmu_hw_config(struct perf_event *event)
@@ -1201,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	struct intel_shared_regs *regs;
+	int i;
+
+	regs = kzalloc_node(sizeof(struct intel_shared_regs),
+			    GFP_KERNEL, cpu_to_node(cpu));
+	if (regs) {
+		/*
+		 * initialize the locks to keep lockdep happy
+		 */
+		for (i = 0; i < EXTRA_REG_MAX; i++)
+			raw_spin_lock_init(&regs->regs[i].lock);
+
+		regs->core_id = -1;
+	}
+	return regs;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!cpu_has_ht_siblings())
+	if (!x86_pmu.extra_regs)
 		return NOTIFY_OK;
 
-	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
-				      GFP_KERNEL, cpu_to_node(cpu));
-	if (!cpuc->per_core)
+	cpuc->shared_regs = allocate_shared_regs(cpu);
+	if (!cpuc->shared_regs)
 		return NOTIFY_BAD;
 
-	raw_spin_lock_init(&cpuc->per_core->lock);
-	cpuc->per_core->core_id = -1;
 	return NOTIFY_OK;
 }
 
@@ -1230,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
 	 */
 	intel_pmu_lbr_reset();
 
-	if (!cpu_has_ht_siblings())
+	if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
 		return;
 
 	for_each_cpu(i, topology_thread_cpumask(cpu)) {
-		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+		struct intel_shared_regs *pc;
 
+		pc = per_cpu(cpu_hw_events, i).shared_regs;
 		if (pc && pc->core_id == core_id) {
-			kfree(cpuc->per_core);
-			cpuc->per_core = pc;
+			kfree(cpuc->shared_regs);
+			cpuc->shared_regs = pc;
 			break;
 		}
 	}
 
-	cpuc->per_core->core_id = core_id;
-	cpuc->per_core->refcnt++;
+	cpuc->shared_regs->core_id = core_id;
+	cpuc->shared_regs->refcnt++;
 }
 
 static void intel_pmu_cpu_dying(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	struct intel_percore *pc = cpuc->per_core;
+	struct intel_shared_regs *pc;
 
+	pc = cpuc->shared_regs;
 	if (pc) {
 		if (pc->core_id == -1 || --pc->refcnt == 0)
 			kfree(pc);
-		cpuc->per_core = NULL;
+		cpuc->shared_regs = NULL;
 	}
 
 	fini_debug_store_on_cpu(cpu);
@@ -1305,7 +1431,7 @@ static void intel_clovertown_quirks(void)
 	 * AJ106 could possibly be worked around by not allowing LBR
 	 *       usage from PEBS, including the fixup.
 	 * AJ68  could possibly be worked around by always programming
-	 * 	 a pebs_event_reset[0] value and coping with the lost events.
+	 *	 a pebs_event_reset[0] value and coping with the lost events.
 	 *
 	 * But taken together it might just make sense to not enable PEBS on
 	 * these chips.
@@ -1406,9 +1532,25 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
-		x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.extra_regs = intel_nehalem_extra_regs;
+
+		/* UOPS_ISSUED.STALLED_CYCLES */
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+
+		if (ebx & 0x40) {
+			/*
+			 * Erratum AAJ80 detected, we work it around by using
+			 * the BR_MISP_EXEC.ANY event. This will over-count
+			 * branch-misses, but it's still much better than the
+			 * architectural event which is often completely bogus:
+			 */
+			intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
+
+			pr_cont("erratum AAJ80 worked around, ");
+		}
 		pr_cont("Nehalem events, ");
 		break;
 
@@ -1425,6 +1567,7 @@ static __init int intel_pmu_init(void)
 
 	case 37: /* 32 nm nehalem, "Clarkdale" */
 	case 44: /* 32 nm nehalem, "Gulftown" */
+	case 47: /* 32 nm Xeon E7 */
 		memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
@@ -1433,10 +1576,16 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
-		x86_pmu.percore_constraints = intel_westmere_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_westmere_extra_regs;
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+
+		/* UOPS_ISSUED.STALLED_CYCLES */
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+		/* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
+
 		pr_cont("Westmere events, ");
 		break;
 
@@ -1448,15 +1597,33 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_snb_event_constraints;
 		x86_pmu.pebs_constraints = intel_snb_pebs_events;
+		x86_pmu.extra_regs = intel_snb_extra_regs;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
+		/* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles */
+		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
+
 		pr_cont("SandyBridge events, ");
 		break;
 
 	default:
-		/*
-		 * default constraints for v2 and up
-		 */
-		x86_pmu.event_constraints = intel_gen_event_constraints;
-		pr_cont("generic architected perfmon, ");
+		switch (x86_pmu.version) {
+		case 1:
+			x86_pmu.event_constraints = intel_v1_event_constraints;
+			pr_cont("generic architected perfmon v1, ");
+			break;
+		default:
+			/*
+			 * default constraints for v2 and up
+			 */
+			x86_pmu.event_constraints = intel_gen_event_constraints;
+			pr_cont("generic architected perfmon, ");
+			break;
+		}
 	}
 	return 0;
 }
@@ -1468,4 +1635,8 @@ static int intel_pmu_init(void)
 	return 0;
 }
 
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	return NULL;
+}
 #endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..1b1ef3addcf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
 	 */
 	perf_prepare_sample(&header, &data, event, &regs);
 
-	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+	if (perf_output_begin(&handle, event, header.size * (top - at)))
 		return 1;
 
 	for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
-	if (perf_event_overflow(event, 1, &data, &regs))
+	if (perf_event_overflow(event, &data, &regs))
 		x86_pmu_stop(event, 0);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index c2520e178d3..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
 		.opcode		= P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
 		.escr_msr	= { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
 		.escr_emask	=
-		P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
+			P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
 		.cntr		= { {12, 13, 16}, {14, 15, 17} },
 	},
 	[P4_EVENT_X87_ASSIST] = {
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+/*
+ * Because Netburst is quite restricted in how many identical
+ * events may run simultaneously, we introduce event aliases,
+ * i.e. different events which have the same functionality but
+ * utilize non-intersecting resources (ESCR/CCCR/counter registers).
+ *
+ * This allows us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+static struct p4_event_alias {
+	u64 original;		/* config of the event as normally requested */
+	u64 alternative;	/* equivalent config using other ESCR/CCCR/counter resources */
+} p4_event_aliases[] = {
+	{
+		/*
+		 * Non-halted cycles can be substituted with non-sleeping cycles (see
+		 * Intel SDM Vol3b for details). We need this alias to be able
+		 * to run nmi-watchdog and 'perf top' (or any other user space tool
+		 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+		 * simultaneously.
+		 */
+	.original	=
+		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+	.alternative	=
+		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)		|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)	|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)	|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)	|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+		p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT		|
+				    P4_CCCR_COMPARE),
+	},
 };
 
+static u64 p4_get_alias_event(u64 config)
+{
+	u64 config_match;
+	int i;
+
+	/*
+	 * Only events carrying the special mark are allowed;
+	 * we must be sure the config didn't come in as a
+	 * malformed RAW event.
+	 */
+	if (!(config & P4_CONFIG_ALIASABLE))
+		return 0;
+
+	config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+	for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+		if (config_match == p4_event_aliases[i].original) {
+			config_match = p4_event_aliases[i].alternative;
+			break;
+		} else if (config_match == p4_event_aliases[i].alternative) {
+			config_match = p4_event_aliases[i].original;
+			break;
+		}
+	}
+
+	if (i >= ARRAY_SIZE(p4_event_aliases))
+		return 0;
+
+	return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
 static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
   /* non-halted CPU clocks */
   [PERF_COUNT_HW_CPU_CYCLES] =
 	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
-		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))	|
+		P4_CONFIG_ALIASABLE,
 
   /*
    * retired instructions
@@ -912,8 +1001,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.addr = 0;
-	data.raw = NULL;
+	perf_sample_data_init(&data, 0);
 
 	cpuc = &__get_cpu_var(cpu_hw_events);
 
@@ -946,15 +1034,24 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 		if (!x86_perf_event_set_period(event))
 			continue;
-		if (perf_event_overflow(event, 1, &data, regs))
-			p4_pmu_disable_event(event);
+		if (perf_event_overflow(event, &data, regs))
+			x86_pmu_stop(event, 0);
 	}
 
-	if (handled) {
-		/* p4 quirk: unmask it again */
-		apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+	if (handled)
 		inc_irq_stat(apic_perf_irqs);
-	}
+
+	/*
+	 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
+	 * been observed that the OVF bit flag has to be cleared first _before_
+	 * the LVTPC can be unmasked.
+	 *
+	 * The reason is the NMI line will continue to be asserted while the OVF
+	 * bit is set.  This causes a second NMI to generate if the LVTPC is
+	 * unmasked before the OVF bit is cleared, leading to unknown NMI
+	 * messages.
+	 */
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
 
 	return handled;
 }
@@ -1112,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 	struct p4_event_bind *bind;
 	unsigned int i, thread, num;
 	int cntr_idx, escr_idx;
+	u64 config_alias;
+	int pass;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 	bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1120,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 
 		hwc = &cpuc->event_list[i]->hw;
 		thread = p4_ht_thread(cpu);
+		pass = 0;
+
+again:
+		/*
+		 * It's possible to hit a circular lock
+		 * between original and alternative events
+		 * if both are scheduled already.
+		 */
+		if (pass > 2)
+			goto done;
+
 		bind = p4_config_get_bind(hwc->config);
 		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
 		if (unlikely(escr_idx == -1))
@@ -1133,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 		}
 
 		cntr_idx = p4_next_cntr(thread, used_mask, bind);
-		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
-			goto done;
+		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+			/*
+			 * Check whether an event alias is still available.
+			 */
+			config_alias = p4_get_alias_event(hwc->config);
+			if (!config_alias)
+				goto done;
+			hwc->config = config_alias;
+			pass++;
+			goto again;
+		}
 
 		p4_pmu_swap_config_ts(hwc, cpu);
 		if (assign)
@@ -1188,7 +1307,7 @@ static __init int p4_pmu_init(void)
 {
 	unsigned int low, high;
 
-	/* If we get stripped -- indexig fails */
+	/* If we get stripped -- indexing fails */
 	BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
 
 	rdmsr(MSR_IA32_MISC_ENABLE, low, high);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 706a9fb46a5..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/of_pci.h>
+#include <linux/initrd.h>
 
 #include <asm/hpet.h>
 #include <asm/irq_controller.h>
@@ -98,6 +99,16 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 	return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
 }
 
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init early_init_dt_setup_initrd_arch(unsigned long start,
+					    unsigned long end)
+{
+	initrd_start = (unsigned long)__va(start);
+	initrd_end = (unsigned long)__va(end);
+	initrd_below_start_ok = 1;
+}
+#endif
+
 void __init add_dtb(u64 data)
 {
 	initial_dtb = data + offsetof(struct setup_data, data);
@@ -123,6 +134,24 @@ static int __init add_bus_probe(void)
 module_init(add_bus_probe);
 
 #ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	struct device_node *np;
+
+	for_each_node_by_type(np, "pci") {
+		const void *prop;
+		unsigned int bus_min;
+
+		prop = of_get_property(np, "bus-range", NULL);
+		if (!prop)
+			continue;
+		bus_min = be32_to_cpup(prop);
+		if (bus->number == bus_min)
+			return np;
+	}
+	return NULL;
+}
+
 static int x86_of_pci_irq_enable(struct pci_dev *dev)
 {
 	struct of_irq oirq;
@@ -154,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
 
 void __cpuinit x86_of_pci_init(void)
 {
-	struct device_node *np;
-
 	pcibios_enable_irq = x86_of_pci_irq_enable;
 	pcibios_disable_irq = x86_of_pci_irq_disable;
-
-	for_each_node_by_type(np, "pci") {
-		const void *prop;
-		struct pci_bus *bus;
-		unsigned int bus_min;
-		struct device_node *child;
-
-		prop = of_get_property(np, "bus-range", NULL);
-		if (!prop)
-			continue;
-		bus_min = be32_to_cpup(prop);
-
-		bus = pci_find_bus(0, bus_min);
-		if (!bus) {
-			printk(KERN_ERR "Can't find a node for bus %s.\n",
-					np->full_name);
-			continue;
-		}
-
-		if (bus->self)
-			bus->self->dev.of_node = np;
-		else
-			bus->dev.of_node = np;
-
-		for_each_child_of_node(np, child) {
-			struct pci_dev *dev;
-			u32 devfn;
-
-			prop = of_get_property(child, "reg", NULL);
-			if (!prop)
-				continue;
-
-			devfn = (be32_to_cpup(prop) >> 8) & 0xff;
-			dev = pci_get_slot(bus, devfn);
-			if (!dev)
-				continue;
-			dev->dev.of_node = child;
-			pci_dev_put(dev);
-		}
-	}
 }
 #endif
 
@@ -369,6 +356,7 @@ static struct of_ioapic_type of_ioapic_type[] =
 static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
 			u32 *out_hwirq, u32 *out_type)
 {
+	struct mp_ioapic_gsi *gsi_cfg;
 	struct io_apic_irq_attr attr;
 	struct of_ioapic_type *it;
 	u32 line, idx, type;
@@ -378,7 +366,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
 
 	line = *intspec;
 	idx = (u32) id->priv;
-	*out_hwirq = line + mp_gsi_routing[idx].gsi_base;
+	gsi_cfg = mp_ioapic_gsi_routing(idx);
+	*out_hwirq = line + gsi_cfg->gsi_base;
 
 	intspec++;
 	type = *intspec;
@@ -391,7 +380,7 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
 
 	set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
 
-	return io_apic_setup_irq_pin(*out_hwirq, cpu_to_node(0), &attr);
+	return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
 }
 
 static void __init ioapic_add_ofnode(struct device_node *np)
@@ -407,7 +396,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
 	}
 
 	for (i = 0; i < nr_ioapics; i++) {
-		if (r.start == mp_ioapics[i].apicaddr) {
+		if (r.start == mpc_ioapic_addr(i)) {
 			struct irq_domain *id;
 
 			id = kzalloc(sizeof(*id), GFP_KERNEL);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e2a3f0606da..1aae78f775f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
 }
 EXPORT_SYMBOL_GPL(print_context_stack_bp);
 
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	printk(data);
-	print_symbol(msg, symbol);
-	printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-	printk("%s%s\n", (char *)data, msg);
-}
-
 static int print_trace_stack(void *data, char *name)
 {
 	printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
 }
 
 static const struct stacktrace_ops print_trace_ops = {
-	.warning		= print_trace_warning,
-	.warning_symbol		= print_trace_warning_symbol,
 	.stack			= print_trace_stack,
 	.address		= print_trace_address,
 	.walk_stack		= print_context_stack,
@@ -279,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	printk("DEBUG_PAGEALLOC");
 #endif
 	printk("\n");
-	sysfs_printk_last_file();
 	if (notify_die(DIE_OOPS, str, regs, err,
 			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
 		return 1;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
 }
 
 /*
- * We are returning from the irq stack and go to the previous one.
- * If the previous stack is also in the irq stack, then bp in the first
- * frame of the irq stack points to the previous, interrupted one.
- * Otherwise we have another level of indirection: We first save
- * the bp of the previous stack, then we switch the stack to the irq one
- * and save a new bp that links to the previous one.
- * (See save_args())
- */
-static inline unsigned long
-fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
-		  unsigned long *irq_stack, unsigned long *irq_stack_end)
-{
-#ifdef CONFIG_FRAME_POINTER
-	struct stack_frame *frame = (struct stack_frame *)bp;
-	unsigned long next;
-
-	if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
-		if (!probe_kernel_address(&frame->next_frame, next))
-			return next;
-		else
-			WARN_ONCE(1, "Perf: bad frame pointer = %p in "
-				  "callchain\n", &frame->next_frame);
-	}
-#endif
-	return bp;
-}
-
-/*
  * x86-64 can have up to three kernel stacks:
  * process stack
  * interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		task = current;
 
 	if (!stack) {
-		stack = &dummy;
-		if (task && task != current)
+		if (regs)
+			stack = (unsigned long *)regs->sp;
+		else if (task && task != current)
 			stack = (unsigned long *)task->thread.sp;
+		else
+			stack = &dummy;
 	}
 
 	if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				 * pointer (index -1 to end) in the IRQ stack:
 				 */
 				stack = (unsigned long *) (irq_stack_end[-1]);
-				bp = fixup_bp_irq_link(bp, stack, irq_stack,
-						       irq_stack_end);
 				irq_stack_end = NULL;
 				ops->stack(data, "EOI");
 				continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989..e13329d800c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
 /*
  * entry.S contains the system-call and fault low-level handling routines.
  *
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
  * NOTE: This code handles signal-recognition, which happens every time
  * after an interrupt and after each system call.
  *
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
 	.endm
 
 /* save partial stack frame */
-	.pushsection .kprobes.text, "ax"
-ENTRY(save_args)
-	XCPT_FRAME
+	.macro SAVE_ARGS_IRQ
 	cld
-	/*
-	 * start from rbp in pt_regs and jump over
-	 * return address.
-	 */
-	movq_cfi rdi, RDI+8-RBP
-	movq_cfi rsi, RSI+8-RBP
-	movq_cfi rdx, RDX+8-RBP
-	movq_cfi rcx, RCX+8-RBP
-	movq_cfi rax, RAX+8-RBP
-	movq_cfi  r8,  R8+8-RBP
-	movq_cfi  r9,  R9+8-RBP
-	movq_cfi r10, R10+8-RBP
-	movq_cfi r11, R11+8-RBP
-
-	leaq -RBP+8(%rsp),%rdi	/* arg1 for handler */
-	movq_cfi rbp, 8		/* push %rbp */
-	leaq 8(%rsp), %rbp		/* mov %rsp, %ebp */
+	/* start from the rbp slot in pt_regs; offsets are relative to it */
+	movq_cfi rdi, RDI-RBP
+	movq_cfi rsi, RSI-RBP
+	movq_cfi rdx, RDX-RBP
+	movq_cfi rcx, RCX-RBP
+	movq_cfi rax, RAX-RBP
+	movq_cfi  r8,  R8-RBP
+	movq_cfi  r9,  R9-RBP
+	movq_cfi r10, R10-RBP
+	movq_cfi r11, R11-RBP
+
+	/* Save rbp so that we can unwind from get_irq_regs() */
+	movq_cfi rbp, 0
+
+	/* Save previous stack value */
+	movq %rsp, %rsi
+
+	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
 	testl $3, CS(%rdi)
 	je 1f
 	SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
 	 */
 1:	incl PER_CPU_VAR(irq_count)
 	jne 2f
-	popq_cfi %rax			/* move return address... */
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
 	EMPTY_FRAME 0
-	pushq_cfi %rbp			/* backlink for unwinder */
-	pushq_cfi %rax			/* ... to the new stack */
-	/*
-	 * We entered an interrupt context - irqs are off:
-	 */
-2:	TRACE_IRQS_OFF
-	ret
-	CFI_ENDPROC
-END(save_args)
-	.popsection
+
+2:	/* Store previous stack value */
+	pushq %rsi
+	/* We entered an interrupt context - irqs are off: */
+	TRACE_IRQS_OFF
+	.endm
 
 ENTRY(save_rest)
 	PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,1
+	SAVE_ARGS 8,0
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
 	TRACE_IRQS_ON
 	movq RIP-ARGOFFSET(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
-	RESTORE_ARGS 0,-ARG_SKIP,1
+	RESTORE_ARGS 1,-ARG_SKIP,0
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
 	USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
 	/* reserve pt_regs for scratch regs and rbp */
 	subq $ORIG_RAX-RBP, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
-	call save_args
+	SAVE_ARGS_IRQ
 	PARTIAL_FRAME 0
 	call \func
 	.endm
@@ -814,15 +810,14 @@ ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	decl PER_CPU_VAR(irq_count)
-	leaveq
 
-	CFI_RESTORE		rbp
+	/* Restore saved previous stack */
+	popq %rsi
+	leaq 16(%rsi), %rsp
+
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	-8
+	CFI_ADJUST_CFA_OFFSET	-16
 
-	/* we did not save rbx, restore only from ARGOFFSET */
-	addq $8, %rsp
-	CFI_ADJUST_CFA_OFFSET	-8
 exit_intr:
 	GET_THREAD_INFO(%rcx)
 	testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args:	/* return to kernel space */
 	 */
 	TRACE_IRQS_IRETQ
 restore_args:
-	RESTORE_ARGS 0,8,0
+	RESTORE_ARGS 1,8,1
 
 irq_return:
 	INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
-	mce_self_interrupt smp_mce_self_interrupt
-#endif
-
 #ifdef CONFIG_SMP
 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
 	call_function_single_interrupt smp_call_function_single_interrupt
@@ -1121,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
 zeroentry coprocessor_error do_coprocessor_error
 errorentry alignment_check do_alignment_check
 zeroentry simd_coprocessor_error do_simd_coprocessor_error
+zeroentry emulate_vsyscall do_emulate_vsyscall
+
 
 	/* Reload gs selector with exception handling */
 	/* edi:  new selector */
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index a93742a5746..c9a281f272f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -123,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
 static atomic_t nmi_running = ATOMIC_INIT(0);
 static int mod_code_status;		/* holds return value of text write */
 static void *mod_code_ip;		/* holds the IP to write to */
-static void *mod_code_newcode;		/* holds the text to write to the IP */
+static const void *mod_code_newcode;	/* holds the text to write to the IP */
 
 static unsigned nmi_wait_count;
 static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -225,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
 }
 
 static int
-do_ftrace_mod_code(unsigned long ip, void *new_code)
+do_ftrace_mod_code(unsigned long ip, const void *new_code)
 {
 	/*
 	 * On x86_64, kernel text mappings are mapped read-only with
@@ -260,14 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
 	return mod_code_status;
 }
 
-static unsigned char *ftrace_nop_replace(void)
+static const unsigned char *ftrace_nop_replace(void)
 {
-	return ideal_nop5;
+	return ideal_nops[NOP_ATOMIC5];
 }
 
 static int
-ftrace_modify_code(unsigned long ip, unsigned char *old_code,
-		   unsigned char *new_code)
+ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
+		   unsigned const char *new_code)
 {
 	unsigned char replaced[MCOUNT_INSN_SIZE];
 
@@ -301,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 int ftrace_make_nop(struct module *mod,
 		    struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned char *new, *old;
+	unsigned const char *new, *old;
 	unsigned long ip = rec->ip;
 
 	old = ftrace_call_replace(ip, addr);
@@ -312,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
 
 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
 {
-	unsigned char *new, *old;
+	unsigned const char *new, *old;
 	unsigned long ip = rec->ip;
 
 	old = ftrace_nop_replace();
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6d6bb36193..3bb08509a7a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
 static void __init i386_default_early_setup(void)
 {
 	/* Initialize 32bit specific setup functions */
-	x86_init.resources.probe_roms = probe_roms;
 	x86_init.resources.reserve_resources = i386_reserve_resources;
 	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index bfe8f729e08..4aecc54236a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
 #include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
+#include <linux/i8253.h>
 #include <linux/slab.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
@@ -12,8 +13,8 @@
 #include <linux/io.h>
 
 #include <asm/fixmap.h>
-#include <asm/i8253.h>
 #include <asm/hpet.h>
+#include <asm/time.h>
 
 #define HPET_MASK			CLOCKSOURCE_MASK(32)
 
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
 #ifdef CONFIG_X86_64
-	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
 #endif
 }
 
@@ -217,7 +218,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
 /*
  * Common hpet info
  */
-static unsigned long hpet_period;
+static unsigned long hpet_freq;
 
 static void hpet_legacy_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt);
@@ -232,7 +233,6 @@ static struct clock_event_device hpet_clockevent = {
 	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
 	.set_mode	= hpet_legacy_set_mode,
 	.set_next_event = hpet_legacy_next_event,
-	.shift		= 32,
 	.irq		= 0,
 	.rating		= 50,
 };
@@ -290,28 +290,12 @@ static void hpet_legacy_clockevent_register(void)
 	hpet_enable_legacy_int();
 
 	/*
-	 * The mult factor is defined as (include/linux/clockchips.h)
-	 *  mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
-	 * hpet_period is in units of femtoseconds (per cycle), so
-	 *  mult/2^shift = cyc/ns = 10^6/hpet_period
-	 *  mult = (10^6 * 2^shift)/hpet_period
-	 *  mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
-	 */
-	hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
-				      hpet_period, hpet_clockevent.shift);
-	/* Calculate the min / max delta */
-	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
-							   &hpet_clockevent);
-	/* Setup minimum reprogramming delta. */
-	hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
-							   &hpet_clockevent);
-
-	/*
 	 * Start hpet with the boot cpu mask and make it
 	 * global after the IO_APIC has been initialized.
 	 */
 	hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
-	clockevents_register_device(&hpet_clockevent);
+	clockevents_config_and_register(&hpet_clockevent, hpet_freq,
+					HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
 	global_clock_event = &hpet_clockevent;
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
@@ -549,7 +533,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
 static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 {
 	struct clock_event_device *evt = &hdev->evt;
-	uint64_t hpet_freq;
 
 	WARN_ON(cpu != smp_processor_id());
 	if (!(hdev->flags & HPET_DEV_VALID))
@@ -571,24 +554,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 
 	evt->set_mode = hpet_msi_set_mode;
 	evt->set_next_event = hpet_msi_next_event;
-	evt->shift = 32;
-
-	/*
-	 * The period is a femto seconds value. We need to calculate the
-	 * scaled math multiplication factor for nanosecond to hpet tick
-	 * conversion.
-	 */
-	hpet_freq = FSEC_PER_SEC;
-	do_div(hpet_freq, hpet_period);
-	evt->mult = div_sc((unsigned long) hpet_freq,
-				      NSEC_PER_SEC, evt->shift);
-	/* Calculate the max delta */
-	evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
-	/* 5 usec minimum reprogramming delta. */
-	evt->min_delta_ns = 5000;
-
 	evt->cpumask = cpumask_of(hdev->cpu);
-	clockevents_register_device(evt);
+
+	clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
+					0x7FFFFFFF);
 }
 
 #ifdef CONFIG_HPET
@@ -770,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
 
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
-{
-	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
-}
-#endif
-
 static struct clocksource clocksource_hpet = {
 	.name		= "hpet",
 	.rating		= 250,
@@ -785,14 +747,13 @@ static struct clocksource clocksource_hpet = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
 #ifdef CONFIG_X86_64
-	.vread		= vread_hpet,
+	.archdata	= { .vclock_mode = VCLOCK_HPET },
 #endif
 };
 
 static int hpet_clocksource_register(void)
 {
 	u64 start, now;
-	u64 hpet_freq;
 	cycle_t t1;
 
 	/* Start the counter */
@@ -819,24 +780,7 @@ static int hpet_clocksource_register(void)
 		return -ENODEV;
 	}
 
-	/*
-	 * The definition of mult is (include/linux/clocksource.h)
-	 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
-	 * so we first need to convert hpet_period to ns/cyc units:
-	 *  mult/2^shift = ns/cyc = hpet_period/10^6
-	 *  mult = (hpet_period * 2^shift)/10^6
-	 *  mult = (hpet_period << shift)/FSEC_PER_NSEC
-	 */
-
-	/* Need to convert hpet_period (fsec/cyc) to cyc/sec:
-	 *
-	 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
-	 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
-	 */
-	hpet_freq = FSEC_PER_SEC;
-	do_div(hpet_freq, hpet_period);
 	clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
-
 	return 0;
 }
 
@@ -845,7 +789,9 @@ static int hpet_clocksource_register(void)
  */
 int __init hpet_enable(void)
 {
+	unsigned long hpet_period;
 	unsigned int id;
+	u64 freq;
 	int i;
 
 	if (!is_hpet_capable())
@@ -884,6 +830,14 @@ int __init hpet_enable(void)
 		goto out_nohpet;
 
 	/*
+	 * The period is a femtosecond value. Convert it to a
+	 * frequency.
+	 */
+	freq = FSEC_PER_SEC;
+	do_div(freq, hpet_period);
+	hpet_freq = freq;
+
+	/*
 	 * Read the HPET ID register to retrieve the IRQ routing
 	 * information and the number of channels
 	 */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 12aff253768..739d8598f78 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
 	return tmp;
 }
 
-#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16);
+#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
 #define FP_EXP_TAG_VALID	0
 #define FP_EXP_TAG_ZERO		1
 #define FP_EXP_TAG_SPECIAL	2
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd3159744..f2b96de3c7c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,195 +3,27 @@
  *
  */
 #include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/timex.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/i8253.h>
 
-#include <asm/i8253.h>
 #include <asm/hpet.h>
+#include <asm/time.h>
 #include <asm/smp.h>
 
-DEFINE_RAW_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
 /*
  * HPET replaces the PIT, when enabled. So we need to know, which of
  * the two timers is used
  */
 struct clock_event_device *global_clock_event;
 
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
-			   struct clock_event_device *evt)
-{
-	raw_spin_lock(&i8253_lock);
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		/* binary, mode 2, LSB/MSB, ch 0 */
-		outb_pit(0x34, PIT_MODE);
-		outb_pit(LATCH & 0xff , PIT_CH0);	/* LSB */
-		outb_pit(LATCH >> 8 , PIT_CH0);		/* MSB */
-		break;
-
-	case CLOCK_EVT_MODE_SHUTDOWN:
-	case CLOCK_EVT_MODE_UNUSED:
-		if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
-		    evt->mode == CLOCK_EVT_MODE_ONESHOT) {
-			outb_pit(0x30, PIT_MODE);
-			outb_pit(0, PIT_CH0);
-			outb_pit(0, PIT_CH0);
-		}
-		break;
-
-	case CLOCK_EVT_MODE_ONESHOT:
-		/* One shot setup */
-		outb_pit(0x38, PIT_MODE);
-		break;
-
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
-	raw_spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
-	raw_spin_lock(&i8253_lock);
-	outb_pit(delta & 0xff , PIT_CH0);	/* LSB */
-	outb_pit(delta >> 8 , PIT_CH0);		/* MSB */
-	raw_spin_unlock(&i8253_lock);
-
-	return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
-	.name		= "pit",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= init_pit_timer,
-	.set_next_event = pit_next_event,
-	.shift		= 32,
-	.irq		= 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
 void __init setup_pit_timer(void)
 {
-	/*
-	 * Start pit with the boot cpu mask and make it global after the
-	 * IO_APIC has been initialized.
-	 */
-	pit_ce.cpumask = cpumask_of(smp_processor_id());
-	pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
-	pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
-	pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
-
-	clockevents_register_device(&pit_ce);
-	global_clock_event = &pit_ce;
+	clockevent_i8253_init(true);
+	global_clock_event = &i8253_clockevent;
 }
 
 #ifndef CONFIG_X86_64
-/*
- * Since the PIT overflows every tick, its not very useful
- * to just read by itself. So use jiffies to emulate a free
- * running counter:
- */
-static cycle_t pit_read(struct clocksource *cs)
-{
-	static int old_count;
-	static u32 old_jifs;
-	unsigned long flags;
-	int count;
-	u32 jifs;
-
-	raw_spin_lock_irqsave(&i8253_lock, flags);
-	/*
-	 * Although our caller may have the read side of xtime_lock,
-	 * this is now a seqlock, and we are cheating in this routine
-	 * by having side effects on state that we cannot undo if
-	 * there is a collision on the seqlock and our caller has to
-	 * retry.  (Namely, old_jifs and old_count.)  So we must treat
-	 * jiffies as volatile despite the lock.  We read jiffies
-	 * before latching the timer count to guarantee that although
-	 * the jiffies value might be older than the count (that is,
-	 * the counter may underflow between the last point where
-	 * jiffies was incremented and the point where we latch the
-	 * count), it cannot be newer.
-	 */
-	jifs = jiffies;
-	outb_pit(0x00, PIT_MODE);	/* latch the count ASAP */
-	count = inb_pit(PIT_CH0);	/* read the latched count */
-	count |= inb_pit(PIT_CH0) << 8;
-
-	/* VIA686a test code... reset the latch if count > max + 1 */
-	if (count > LATCH) {
-		outb_pit(0x34, PIT_MODE);
-		outb_pit(LATCH & 0xff, PIT_CH0);
-		outb_pit(LATCH >> 8, PIT_CH0);
-		count = LATCH - 1;
-	}
-
-	/*
-	 * It's possible for count to appear to go the wrong way for a
-	 * couple of reasons:
-	 *
-	 *  1. The timer counter underflows, but we haven't handled the
-	 *     resulting interrupt and incremented jiffies yet.
-	 *  2. Hardware problem with the timer, not giving us continuous time,
-	 *     the counter does small "jumps" upwards on some Pentium systems,
-	 *     (see c't 95/10 page 335 for Neptun bug.)
-	 *
-	 * Previous attempts to handle these cases intelligently were
-	 * buggy, so we just do the simple thing now.
-	 */
-	if (count > old_count && jifs == old_jifs)
-		count = old_count;
-
-	old_count = count;
-	old_jifs = jifs;
-
-	raw_spin_unlock_irqrestore(&i8253_lock, flags);
-
-	count = (LATCH - 1) - count;
-
-	return (cycle_t)(jifs * LATCH) + count;
-}
-
-static struct clocksource pit_cs = {
-	.name		= "pit",
-	.rating		= 110,
-	.read		= pit_read,
-	.mask		= CLOCKSOURCE_MASK(32),
-	.mult		= 0,
-	.shift		= 20,
-};
-
 static int __init init_pit_clocksource(void)
 {
 	 /*
@@ -202,13 +34,10 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+	    i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
 		return 0;
 
-	pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift);
-
-	return clocksource_register(&pit_cs);
+	return clocksource_i8253_init();
 }
 arch_initcall(init_pit_clocksource);
-
 #endif /* !CONFIG_X86_64 */
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 65b8f5c2eeb..610485223bd 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -14,7 +14,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1cb0b9fc78d..6c0802eb2f7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -249,7 +249,7 @@ void fixup_irqs(void)
 
 		data = irq_desc_get_irq_data(desc);
 		affinity = data->affinity;
-		if (!irq_has_action(irq) ||
+		if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
 		    cpumask_subset(affinity, cpu_online_mask)) {
 			raw_spin_unlock(&desc->lock);
 			continue;
@@ -276,7 +276,8 @@ void fixup_irqs(void)
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (!irqd_can_move_in_process_context(data) && chip->irq_unmask)
+		if (!irqd_can_move_in_process_context(data) &&
+		    !irqd_irq_disabled(data) && chip->irq_unmask)
 			chip->irq_unmask(data);
 
 		raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993..b3300e6bace 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -15,7 +15,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
-	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 961b6b30ba9..3fee346ef54 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
 		code.offset = entry->target -
 				(entry->code + JUMP_LABEL_NOP_SIZE);
 	} else
-		memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+		memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
 	get_online_cpus();
 	mutex_lock(&text_mutex);
 	text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
@@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry,
 
 void arch_jump_label_text_poke_early(jump_label_t addr)
 {
-	text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE);
+	text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
+			JUMP_LABEL_NOP_SIZE);
 }
 
 #endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
 	return register_die_notifier(&kgdb_notifier);
 }
 
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
 		struct perf_sample_data *data, struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
 	for (i = 0; i < HBP_NUM; i++) {
 		if (breakinfo[i].pev)
 			continue;
-		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
 		if (IS_ERR((void * __force)breakinfo[i].pev)) {
 			printk(KERN_ERR "kgdb: Could not allocate hw"
 			       "breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index c969fd9d156..f1a6244d7d9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1183,12 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
 					 struct pt_regs *regs)
 {
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	unsigned long flags;
 
 	/* This is possible if op is under delayed unoptimizing */
 	if (kprobe_disabled(&op->kp))
 		return;
 
-	preempt_disable();
+	local_irq_save(flags);
 	if (kprobe_running()) {
 		kprobes_inc_nmissed_count(&op->kp);
 	} else {
@@ -1207,7 +1208,7 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
 		opt_pre_handler(&op->kp, regs);
 		__this_cpu_write(current_kprobe, NULL);
 	}
-	preempt_enable_no_resched();
+	local_irq_restore(flags);
 }
 
 static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122..a9c2116001d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
 
 early_param("no-kvmapf", parse_no_kvmapf);
 
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+        steal_acc = 0;
+        return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
 struct kvm_para_state {
 	u8 mmu_queue[MMU_QUEUE_SIZE];
 	int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+static void kvm_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+	if (!has_steal_clock)
+		return;
+
+	memset(st, 0, sizeof(*st));
+
+	wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+	printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+		cpu, __pa(st));
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
 		       smp_processor_id());
 	}
+
+	if (has_steal_clock)
+		kvm_register_steal_time();
 }
 
 static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
 	.notifier_call = kvm_pv_reboot_notify,
 };
 
+static u64 kvm_steal_clock(int cpu)
+{
+	u64 steal;
+	struct kvm_steal_time *src;
+	int version;
+
+	src = &per_cpu(steal_time, cpu);
+	do {
+		version = src->version;
+		rmb();
+		steal = src->steal;
+		rmb();
+	} while ((version & 1) || (version != src->version));
+
+	return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+	if (!has_steal_clock)
+		return;
+
+	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 
 static void kvm_guest_cpu_offline(void *dummy)
 {
+	kvm_disable_steal_time();
 	kvm_pv_disable_apf(NULL);
 	apf_task_wake_all();
 }
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
 		x86_init.irqs.trap_init = kvm_apf_trap_init;
 
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		has_steal_clock = 1;
+		pv_time_ops.steal_clock = kvm_steal_clock;
+	}
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
 	kvm_guest_cpu_init();
 #endif
 }
+
+static __init int activate_jump_labels(void)
+{
+	if (has_steal_clock) {
+		jump_label_inc(&paravirt_steal_enabled);
+		if (steal_acc)
+			jump_label_inc(&paravirt_steal_rq_enabled);
+	}
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07..c1a0188e29a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
 
-#define KVM_SCALE 22
-
 static int kvmclock = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
 	.read = kvm_clock_get_cycles,
 	.rating = 400,
 	.mask = CLOCKSOURCE_MASK(64),
-	.mult = 1 << KVM_SCALE,
-	.shift = KVM_SCALE,
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -164,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_crash_shutdown(regs);
 }
 #endif
@@ -171,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
 static void kvm_shutdown(void)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_shutdown();
 }
 
@@ -203,7 +201,7 @@ void __init kvmclock_init(void)
 	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
 	kvm_get_preset_lpj();
-	clocksource_register(&kvm_clock);
+	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
 	pv_info.paravirt_enabled = 1;
 	pv_info.name = "KVM";
 
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab1..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
 	unsigned int			mpb[0];
 };
 
-#define UCODE_CONTAINER_SECTION_HDR	8
-#define UCODE_CONTAINER_HEADER_SIZE	12
+#define SECTION_HDR_SIZE	8
+#define CONTAINER_HDR_SZ	12
 
 static struct equiv_cpu_entry *equiv_cpu_table;
 
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
 static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	unsigned int max_size, actual_size;
+	u32 max_size, actual_size;
 
 #define F1XH_MPB_MAX_SIZE 2048
 #define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
 		break;
 	}
 
-	actual_size = buf[4] + (buf[5] << 8);
+	actual_size = *(u32 *)(buf + 4);
 
-	if (actual_size > size || actual_size > max_size) {
+	if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
 		pr_err("section size mismatch\n");
 		return 0;
 	}
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
 	struct microcode_header_amd *mc = NULL;
 	unsigned int actual_size = 0;
 
-	if (buf[0] != UCODE_UCODE_TYPE) {
+	if (*(u32 *)buf != UCODE_UCODE_TYPE) {
 		pr_err("invalid type field in container file section header\n");
 		goto out;
 	}
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
 	if (!mc)
 		goto out;
 
-	get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
-	*mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
+	get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
+	*mc_size = actual_size + SECTION_HDR_SIZE;
 
 out:
 	return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
 		return -ENOMEM;
 	}
 
-	get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
+	get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
 
-	return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+	/* add header length */
+	return size + CONTAINER_HDR_SZ;
 }
 
 static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index ab23f1ad4bf..925179f871d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
+#include <linux/jump_label.h>
 
 #include <asm/system.h>
 #include <asm/page.h>
@@ -44,21 +45,6 @@ void *module_alloc(unsigned long size)
 				-1, __builtin_return_address(0));
 }
 
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
-	vfree(module_region);
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
-			      Elf_Shdr *sechdrs,
-			      char *secstrings,
-			      struct module *mod)
-{
-	return 0;
-}
-
 #ifdef CONFIG_X86_32
 int apply_relocate(Elf32_Shdr *sechdrs,
 		   const char *strtab,
@@ -99,17 +85,6 @@ int apply_relocate(Elf32_Shdr *sechdrs,
 	}
 	return 0;
 }
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
-		       const char *strtab,
-		       unsigned int symindex,
-		       unsigned int relsec,
-		       struct module *me)
-{
-	printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
-	       me->name);
-	return -ENOEXEC;
-}
 #else /*X86_64*/
 int apply_relocate_add(Elf64_Shdr *sechdrs,
 		   const char *strtab,
@@ -180,17 +155,6 @@ overflow:
 	       me->name);
 	return -ENOEXEC;
 }
-
-int apply_relocate(Elf_Shdr *sechdrs,
-		   const char *strtab,
-		   unsigned int symindex,
-		   unsigned int relsec,
-		   struct module *me)
-{
-	printk(KERN_ERR "non add relocation not supported\n");
-	return -ENOSYS;
-}
-
 #endif
 
 int module_finalize(const Elf_Ehdr *hdr,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce646b..9103b89c145 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 	intsrc.type = MP_INTSRC;
 	intsrc.irqflag = 0;	/* conforming */
 	intsrc.srcbus = 0;
-	intsrc.dstapic = mp_ioapics[0].apicid;
+	intsrc.dstapic = mpc_ioapic_id(0);
 
 	intsrc.irqtype = mp_INT;
 
@@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
 	}
 }
 
-static int
+static int __init
 check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 {
-	int ret = 0;
-
 	if (!mpc_new_phys || count <= mpc_new_length) {
 		WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
 		return -1;
 	}
 
-	return ret;
+	return 0;
 }
 #else /* CONFIG_X86_IO_APIC */
 static
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71..613a7931ecc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
 	__native_flush_tlb_single(addr);
 }
 
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+	return 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
 
 struct pv_time_ops pv_time_ops = {
 	.sched_clock = native_sched_clock,
+	.steal_clock = native_steal_clock,
 };
 
 struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e8c33a30200..726494b5834 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
 			continue;
 
 		/* cover the whole region */
-		npages = (r->end - r->start) >> PAGE_SHIFT;
+		npages = resource_size(r) >> PAGE_SHIFT;
 		npages++;
 
 		iommu_range_reserve(tbl, r->start, npages);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9ea999a4dcc..b49d00da2ae 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_set_mask);
 
-#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
-static __initdata void *dma32_bootmem_ptr;
-static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
-
-static int __init parse_dma32_size_opt(char *p)
-{
-	if (!p)
-		return -EINVAL;
-	dma32_bootmem_size = memparse(p, &p);
-	return 0;
-}
-early_param("dma32_size", parse_dma32_size_opt);
-
-void __init dma32_reserve_bootmem(void)
-{
-	unsigned long size, align;
-	if (max_pfn <= MAX_DMA32_PFN)
-		return;
-
-	/*
-	 * check aperture_64.c allocate_aperture() for reason about
-	 * using 512M as goal
-	 */
-	align = 64ULL<<20;
-	size = roundup(dma32_bootmem_size, align);
-	dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
-				 512ULL<<20);
-	/*
-	 * Kmemleak should not scan this block as it may not be mapped via the
-	 * kernel direct mapping.
-	 */
-	kmemleak_ignore(dma32_bootmem_ptr);
-	if (dma32_bootmem_ptr)
-		dma32_bootmem_size = size;
-	else
-		dma32_bootmem_size = 0;
-}
-static void __init dma32_free_bootmem(void)
-{
-
-	if (max_pfn <= MAX_DMA32_PFN)
-		return;
-
-	if (!dma32_bootmem_ptr)
-		return;
-
-	free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
-
-	dma32_bootmem_ptr = NULL;
-	dma32_bootmem_size = 0;
-}
-#else
-void __init dma32_reserve_bootmem(void)
-{
-}
-static void __init dma32_free_bootmem(void)
-{
-}
-
-#endif
-
 void __init pci_iommu_alloc(void)
 {
 	struct iommu_table_entry *p;
 
-	/* free the range so iommu could get some range less than 4G */
-	dma32_free_bootmem();
-
 	sort_iommu_table(__iommu_table, __iommu_table_end);
 	check_iommu_entries(__iommu_table, __iommu_table_end);
 
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec118..35ccf75696e 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
 				struct iommu_table_entry *finish)
 {
 	struct iommu_table_entry *p, *q, *x;
-	char sym_p[KSYM_SYMBOL_LEN];
-	char sym_q[KSYM_SYMBOL_LEN];
 
 	/* Simple cyclic dependency checker. */
 	for (p = start; p < finish; p++) {
 		q = find_dependents_of(start, finish, p);
 		x = find_dependents_of(start, finish, q);
 		if (p == x) {
-			sprint_symbol(sym_p, (unsigned long)p->detect);
-			sprint_symbol(sym_q, (unsigned long)q->detect);
-
-			printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
-					" on %s and vice-versa. BREAKING IT.\n",
-					sym_p, sym_q);
+			printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
+			       p->detect, q->detect);
 			/* Heavy handed way..*/
 			x->depend = 0;
 		}
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
 	for (p = start; p < finish; p++) {
 		q = find_dependents_of(p, finish, p);
 		if (q && q > p) {
-			sprint_symbol(sym_p, (unsigned long)p->detect);
-			sprint_symbol(sym_q, (unsigned long)q->detect);
-
-			printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
-					"should be called before %s!\n",
-					sym_p, sym_q);
+			printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
+			       p->detect, q->detect);
 		}
 	}
 }
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e..63228035f9d 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
 	.flags	= IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
 };
 
+/* does this oprom support the given pci device, or any of the devices
+ * that the driver supports?
+ */
+static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
+{
+	struct pci_driver *drv = pdev->driver;
+	const struct pci_device_id *id;
+
+	if (pdev->vendor == vendor && pdev->device == device)
+		return true;
+
+	for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
+		if (id->vendor == vendor && id->device == device)
+			break;
+
+	return id && id->vendor;
+}
+
+static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
+		       const unsigned char *rom_list)
+{
+	unsigned short device;
+
+	do {
+		if (probe_kernel_address(rom_list, device) != 0)
+			device = 0;
+
+		if (device && match_id(pdev, vendor, device))
+			break;
+
+		rom_list += 2;
+	} while (device);
+
+	return !!device;
+}
+
+static struct resource *find_oprom(struct pci_dev *pdev)
+{
+	struct resource *oprom = NULL;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
+		struct resource *res = &adapter_rom_resources[i];
+		unsigned short offset, vendor, device, list, rev;
+		const unsigned char *rom;
+
+		if (res->end == 0)
+			break;
+
+		rom = isa_bus_to_virt(res->start);
+		if (probe_kernel_address(rom + 0x18, offset) != 0)
+			continue;
+
+		if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
+			continue;
+
+		if (probe_kernel_address(rom + offset + 0x6, device) != 0)
+			continue;
+
+		if (match_id(pdev, vendor, device)) {
+			oprom = res;
+			break;
+		}
+
+		if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
+		    probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
+		    rev >= 3 && list &&
+		    probe_list(pdev, vendor, rom + offset + list)) {
+			oprom = res;
+			break;
+		}
+	}
+
+	return oprom;
+}
+
+void *pci_map_biosrom(struct pci_dev *pdev)
+{
+	struct resource *oprom = find_oprom(pdev);
+
+	if (!oprom)
+		return NULL;
+
+	return ioremap(oprom->start, resource_size(oprom));
+}
+EXPORT_SYMBOL(pci_map_biosrom);
+
+void pci_unmap_biosrom(void __iomem *image)
+{
+	iounmap(image);
+}
+EXPORT_SYMBOL(pci_unmap_biosrom);
+
+size_t pci_biosrom_size(struct pci_dev *pdev)
+{
+	struct resource *oprom = find_oprom(pdev);
+
+	return oprom ? resource_size(oprom) : 0;
+}
+EXPORT_SYMBOL(pci_biosrom_size);
+
 #define ROMSIGNATURE 0xaa55
 
 static int __init romsignature(const unsigned char *rom)
@@ -133,7 +234,7 @@ void __init probe_roms(void)
 	/* check for extension rom (ignore length byte!) */
 	rom = isa_bus_to_virt(extension_rom_resource.start);
 	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		length = resource_size(&extension_rom_resource);
 		if (romchecksum(rom, length)) {
 			request_resource(&iomem_resource, &extension_rom_resource);
 			upper = extension_rom_resource.start;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7a..e1ba8cb24e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -337,7 +337,9 @@ EXPORT_SYMBOL(boot_option_idle_override);
  * Powermanagement idle function, if any..
  */
 void (*pm_idle)(void);
+#ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(pm_idle);
+#endif
 
 #ifdef CONFIG_X86_32
 /*
@@ -449,7 +451,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
 	if (!need_resched()) {
-		if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +467,7 @@ static void mwait_idle(void)
 	if (!need_resched()) {
 		trace_power_start(POWER_CSTATE, 1, smp_processor_id());
 		trace_cpu_idle(1, smp_processor_id());
-		if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
+		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -535,45 +537,45 @@ int mwait_usable(const struct cpuinfo_x86 *c)
 	return (edx & MWAIT_EDX_C1);
 }
 
-bool c1e_detected;
-EXPORT_SYMBOL(c1e_detected);
+bool amd_e400_c1e_detected;
+EXPORT_SYMBOL(amd_e400_c1e_detected);
 
-static cpumask_var_t c1e_mask;
+static cpumask_var_t amd_e400_c1e_mask;
 
-void c1e_remove_cpu(int cpu)
+void amd_e400_remove_cpu(int cpu)
 {
-	if (c1e_mask != NULL)
-		cpumask_clear_cpu(cpu, c1e_mask);
+	if (amd_e400_c1e_mask != NULL)
+		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
 }
 
 /*
- * C1E aware idle routine. We check for C1E active in the interrupt
+ * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
  * pending message MSR. If we detect C1E, then we handle it the same
  * way as C3 power states (local apic timer and TSC stop)
  */
-static void c1e_idle(void)
+static void amd_e400_idle(void)
 {
 	if (need_resched())
 		return;
 
-	if (!c1e_detected) {
+	if (!amd_e400_c1e_detected) {
 		u32 lo, hi;
 
 		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
 
 		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
-			c1e_detected = true;
+			amd_e400_c1e_detected = true;
 			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 				mark_tsc_unstable("TSC halt in AMD C1E");
 			printk(KERN_INFO "System has AMD C1E enabled\n");
 		}
 	}
 
-	if (c1e_detected) {
+	if (amd_e400_c1e_detected) {
 		int cpu = smp_processor_id();
 
-		if (!cpumask_test_cpu(cpu, c1e_mask)) {
-			cpumask_set_cpu(cpu, c1e_mask);
+		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
+			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
 			/*
 			 * Force broadcast so ACPI can not interfere.
 			 */
@@ -616,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 		pm_idle = mwait_idle;
 	} else if (cpu_has_amd_erratum(amd_erratum_400)) {
 		/* E400: APIC timer interrupt does not wake up CPU from C1e */
-		printk(KERN_INFO "using C1E aware idle routine\n");
-		pm_idle = c1e_idle;
+		printk(KERN_INFO "using AMD E400 aware idle routine\n");
+		pm_idle = amd_e400_idle;
 	} else
 		pm_idle = default_idle;
 }
 
-void __init init_c1e_mask(void)
+void __init init_amd_e400_c1e_mask(void)
 {
-	/* If we're using c1e_idle, we need to allocate c1e_mask. */
-	if (pm_idle == c1e_idle)
-		zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
+	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
+	if (pm_idle == amd_e400_idle)
+		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
 }
 
 static int __init idle_setup(char *str)
@@ -640,6 +642,7 @@ static int __init idle_setup(char *str)
 		boot_option_idle_override = IDLE_POLL;
 	} else if (!strcmp(str, "mwait")) {
 		boot_option_idle_override = IDLE_FORCE_MWAIT;
+		WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
 	} else if (!strcmp(str, "halt")) {
 		/*
 		 * When the boot option of idle=halt is added, halt is
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..a3d0dc59067 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 {
 	set_user_gs(regs, 0);
 	regs->fs		= 0;
-	set_fs(USER_DS);
 	regs->ds		= __USER_DS;
 	regs->es		= __USER_DS;
 	regs->ss		= __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6c9dd922ac0..ca6f7ab8df3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -338,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
-	set_fs(USER_DS);
 	/*
 	 * Free the old FP and other extended state
 	 */
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 45892dc4b72..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
 			     struct perf_sample_data *data,
 			     struct pt_regs *regs)
 {
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
 	unsigned len, type;
 	struct perf_event *bp;
 
+	if (ptrace_get_breakpoints(tsk) < 0)
+		return -ESRCH;
+
 	data &= ~DR_CONTROL_RESERVED;
 	old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
 restore:
@@ -655,6 +658,9 @@ restore:
 		}
 		goto restore;
 	}
+
+	ptrace_put_breakpoints(tsk);
+
 	return ((orig_ret < 0) ? orig_ret : rc);
 }
 
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
 
 	if (n < HBP_NUM) {
 		struct perf_event *bp;
+
+		if (ptrace_get_breakpoints(tsk) < 0)
+			return -ESRCH;
+
 		bp = thread->ptrace_bps[n];
 		if (!bp)
-			return 0;
-		val = bp->hw.info.address;
+			val = 0;
+		else
+			val = bp->hw.info.address;
+
+		ptrace_put_breakpoints(tsk);
 	} else if (n == 6) {
 		val = thread->debugreg6;
 	 } else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 	struct perf_event *bp;
 	struct thread_struct *t = &tsk->thread;
 	struct perf_event_attr attr;
+	int err = 0;
+
+	if (ptrace_get_breakpoints(tsk) < 0)
+		return -ESRCH;
 
 	if (!t->ptrace_bps[nr]) {
 		ptrace_breakpoint_init(&attr);
@@ -698,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		attr.bp_type = HW_BREAKPOINT_W;
 		attr.disabled = 1;
 
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
 
 		/*
 		 * CHECKME: the previous code returned -EIO if the addr wasn't
@@ -709,24 +727,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		 * writing for the user. And anyway this is the previous
 		 * behaviour.
 		 */
-		if (IS_ERR(bp))
-			return PTR_ERR(bp);
+		if (IS_ERR(bp)) {
+			err = PTR_ERR(bp);
+			goto put;
+		}
 
 		t->ptrace_bps[nr] = bp;
 	} else {
-		int err;
-
 		bp = t->ptrace_bps[nr];
 
 		attr = bp->attr;
 		attr.bp_addr = addr;
 		err = modify_user_hw_breakpoint(bp, &attr);
-		if (err)
-			return err;
 	}
 
-
-	return 0;
+put:
+	ptrace_put_breakpoints(tsk);
+	return err;
 }
 
 /*
@@ -1347,7 +1364,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
  * We must return the syscall number to actually look up in the table.
  * This can be -1L to skip running any syscall at all.
  */
-asmregparm long syscall_trace_enter(struct pt_regs *regs)
+long syscall_trace_enter(struct pt_regs *regs)
 {
 	long ret = 0;
 
@@ -1392,7 +1409,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
 	return ret ?: regs->orig_ax;
 }
 
-asmregparm void syscall_trace_leave(struct pt_regs *regs)
+void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
 
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8bbe8c56916..b78643d0f9a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,7 +10,7 @@
 
 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
-	u8 config, rev;
+	u8 config;
 	u16 word;
 
 	/* BIOS may enable hardware IRQ balancing for
@@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	 * based platforms.
 	 * Disable SW irqbalance/affinity on those platforms.
 	 */
-	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
-	if (rev > 0x9)
+	if (dev->revision > 0x9)
 		return;
 
 	/* enable access to config space*/
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 08c44b08bf5..9242436e993 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
 
 static const struct desc_ptr no_idt = {};
 static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
 #if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -294,6 +294,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
 		},
 	},
+	{ /* Handle reboot issue on Acer Aspire One A110 */
+		.callback = set_bios_reboot,
+		.ident = "Acer Aspire One A110",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
+		},
+	},
 	{ }
 };
 
@@ -411,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
 		},
 	},
+	{	/* Handle problems with rebooting on the Latitude E6320. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E6320",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E5420. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E5420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E6420. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E6420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+		},
+	},
 	{ }
 };
 
@@ -478,9 +510,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
 {
 }
 
+/*
+ * Windows compatible x86 hardware expects the following on reboot:
+ *
+ * 1) If the FADT has the ACPI reboot register flag set, try it
+ * 2) If still alive, write to the keyboard controller
+ * 3) If still alive, write to the ACPI reboot register again
+ * 4) If still alive, write to the keyboard controller again
+ *
+ * If the machine is still alive at this stage, it gives up. We default to
+ * following the same pattern, except that if we're still alive after (4) we'll
+ * try to force a triple fault and then cycle between hitting the keyboard
+ * controller and forcing a triple fault.
+ */
 static void native_machine_emergency_restart(void)
 {
 	int i;
+	int attempt = 0;
+	int orig_reboot_type = reboot_type;
 
 	if (reboot_emergency)
 		emergency_vmx_disable_all();
@@ -502,6 +549,13 @@ static void native_machine_emergency_restart(void)
 				outb(0xfe, 0x64); /* pulse reset low */
 				udelay(50);
 			}
+			if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
+				attempt = 1;
+				reboot_type = BOOT_ACPI;
+			} else {
+				reboot_type = BOOT_TRIPLE;
+			}
+			break;
 
 		case BOOT_TRIPLE:
 			load_idt(&no_idt);
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
index 29092b38d81..1d5c46df0d7 100644
--- a/arch/x86/kernel/reboot_32.S
+++ b/arch/x86/kernel/reboot_32.S
@@ -21,26 +21,26 @@ r_base = .
 	/* Get our own relocated address */
 	call	1f
 1:	popl	%ebx
-	subl	$1b, %ebx
+	subl	$(1b - r_base), %ebx
 
 	/* Compute the equivalent real-mode segment */
 	movl	%ebx, %ecx
 	shrl	$4, %ecx
 	
 	/* Patch post-real-mode segment jump */
-	movw	dispatch_table(%ebx,%eax,2),%ax
-	movw	%ax, 101f(%ebx)
-	movw	%cx, 102f(%ebx)
+	movw	(dispatch_table - r_base)(%ebx,%eax,2),%ax
+	movw	%ax, (101f - r_base)(%ebx)
+	movw	%cx, (102f - r_base)(%ebx)
 
 	/* Set up the IDT for real mode. */
-	lidtl	machine_real_restart_idt(%ebx)
+	lidtl	(machine_real_restart_idt - r_base)(%ebx)
 
 	/*
 	 * Set up a GDT from which we can load segment descriptors for real
 	 * mode.  The GDT is not used in real mode; it is just needed here to
 	 * prepare the descriptors.
 	 */
-	lgdtl	machine_real_restart_gdt(%ebx)
+	lgdtl	(machine_real_restart_gdt - r_base)(%ebx)
 
 	/*
 	 * Load the data segment registers with 16-bit compatible values
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11..36818f8ec2b 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -97,6 +97,8 @@ relocate_kernel:
 	ret
 
 identity_mapped:
+	/* set return address to 0 if not preserving context */
+	pushl	$0
 	/* store the start address on the stack */
 	pushl   %edx
 
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d47..7a6f3b3be3c 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -100,6 +100,8 @@ relocate_kernel:
 	ret
 
 identity_mapped:
+	/* set return address to 0 if not preserving context */
+	pushq	$0
 	/* store the start address on the stack */
 	pushq   %rdx
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4be9b398470..afaf38447ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow);
 
 void __init setup_arch(char **cmdline_p)
 {
-	unsigned long flags;
-
 #ifdef CONFIG_X86_32
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	visws_early_detect();
@@ -912,6 +910,13 @@ void __init setup_arch(char **cmdline_p)
 	memblock.current_limit = get_max_mapped();
 	memblock_x86_fill();
 
+	/*
+	 * The EFI specification says that boot service code won't be called
+	 * after ExitBootServices(). This is, in fact, a lie.
+	 */
+	if (efi_enabled)
+		efi_reserve_boot_services();
+
 	/* preallocate 4k for mptable mpc */
 	early_reserve_e820_mpc_new();
 
@@ -948,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
 	if (init_ohci1394_dma_early)
 		init_ohci1394_dma_on_all_controllers();
 #endif
+	/* Allocate bigger log buffer */
+	setup_log_buf(1);
 
 	reserve_initrd();
 
@@ -966,7 +973,6 @@ void __init setup_arch(char **cmdline_p)
 
 	initmem_init();
 	memblock_find_dma_reserve();
-	dma32_reserve_bootmem();
 
 #ifdef CONFIG_KVM_CLOCK
 	kvmclock_init();
@@ -1041,9 +1047,7 @@ void __init setup_arch(char **cmdline_p)
 
 	mcheck_init();
 
-	local_irq_save(flags);
-	arch_init_ideal_nop5();
-	local_irq_restore(flags);
+	arch_init_ideal_nops();
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e5..54ddaeb221c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 asmlinkage int
 sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
+	sigset_t blocked;
+
 	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+
+	mask &= _BLOCKABLE;
+	siginitset(&blocked, mask);
+	set_current_blocked(&blocked);
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
-	set_restore_sigmask();
 
+	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
 
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
 		goto badframe;
 
 	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc, &ax))
 		goto badframe;
@@ -601,10 +599,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
 		goto badframe;
 
 	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
 		goto badframe;
@@ -656,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 
 static int
 setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-	       sigset_t *set, struct pt_regs *regs)
+		struct pt_regs *regs)
 {
 	int usig = signr_convert(sig);
+	sigset_t *set = &current->blocked;
 	int ret;
 
+	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+		set = &current->saved_sigmask;
+
 	/* Set up the stack frame */
 	if (is_ia32) {
 		if (ka->sa.sa_flags & SA_SIGINFO)
@@ -675,13 +674,15 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		return -EFAULT;
 	}
 
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 	return ret;
 }
 
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-	      sigset_t *oldset, struct pt_regs *regs)
+		struct pt_regs *regs)
 {
+	sigset_t blocked;
 	int ret;
 
 	/* Are we from a system call? */
@@ -714,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, regs);
 
 	if (ret)
 		return ret;
 
-#ifdef CONFIG_X86_64
-	/*
-	 * This has nothing to do with segment registers,
-	 * despite the name.  This magic affects uaccess.h
-	 * macros' behavior.  Reset it to the normal setting.
-	 */
-	set_fs(USER_DS);
-#endif
-
 	/*
 	 * Clear the direction flag as per the ABI for function entry.
 	 */
@@ -741,12 +733,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
 
-	spin_lock_irq(&current->sighand->siglock);
-	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+	sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
 	if (!(ka->sa.sa_flags & SA_NODEFER))
-		sigaddset(&current->blocked, sig);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+		sigaddset(&blocked, sig);
+	set_current_blocked(&blocked);
 
 	tracehook_signal_handler(sig, info, ka, regs,
 				 test_thread_flag(TIF_SINGLESTEP));
@@ -771,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
 	struct k_sigaction ka;
 	siginfo_t info;
 	int signr;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which is why we may in certain
@@ -783,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		}
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
 
@@ -827,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
 	 */
 	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
 		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+		set_current_blocked(&current->saved_sigmask);
 	}
 }
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228..013e7eba83b 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
 }
 
 /*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back.
  */
 void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
 	inc_irq_stat(irq_resched_count);
+	scheduler_ipi();
 	/*
 	 * KVM uses this interrupt to force a cpu out of guest mode
 	 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8ed8908cc9f..9f548cb4a95 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -285,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	x86_platform.nmi_init();
 
+	/*
+	 * Wait until the cpu which brought this one up has marked it
+	 * active before enabling interrupts. If we don't, we can end
+	 * up waking the softirq thread before this cpu has reached the
+	 * active state, which confuses the scheduler and makes it
+	 * schedule the softirq thread on the wrong cpu. This is only
+	 * observable with forced threaded interrupts, but in theory
+	 * it could also happen without them. It's just way harder
+	 * to achieve.
+	 */
+	while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
+		cpu_relax();
+
 	/* enable local interrupts */
 	local_irq_enable();
 
@@ -312,26 +325,6 @@ void __cpuinit smp_store_cpu_info(int id)
 		identify_secondary_cpu(c);
 }
 
-static void __cpuinit check_cpu_siblings_on_same_node(int cpu1, int cpu2)
-{
-	int node1 = early_cpu_to_node(cpu1);
-	int node2 = early_cpu_to_node(cpu2);
-
-	/*
-	 * Our CPU scheduler assumes all logical cpus in the same physical cpu
-	 * share the same node. But, buggy ACPI or NUMA emulation might assign
-	 * them to different node. Fix it.
-	 */
-	if (node1 != node2) {
-		pr_warning("CPU %d in node %d and CPU %d in node %d are in the same physical CPU. forcing same node %d\n",
-			   cpu1, node1, cpu2, node2, node2);
-
-		numa_remove_cpu(cpu1);
-		numa_set_node(cpu1, node2);
-		numa_add_cpu(cpu1);
-	}
-}
-
 static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 {
 	cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
@@ -340,7 +333,6 @@ static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
 	cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
 	cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
 	cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
-	check_cpu_siblings_on_same_node(cpu1, cpu2);
 }
 
 
@@ -382,12 +374,10 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
 			cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
 			cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
-			check_cpu_siblings_on_same_node(cpu, i);
 		}
 		if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
 			cpumask_set_cpu(i, cpu_core_mask(cpu));
 			cpumask_set_cpu(cpu, cpu_core_mask(i));
-			check_cpu_siblings_on_same_node(cpu, i);
 			/*
 			 *  Does this new cpu bringup a new core?
 			 */
@@ -448,7 +438,7 @@ static void impress_friends(void)
 void __inquire_remote_apic(int apicid)
 {
 	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
-	char *names[] = { "ID", "VERSION", "SPIV" };
+	const char * const names[] = { "ID", "VERSION", "SPIV" };
 	int timeout;
 	u32 status;
 
@@ -1330,7 +1320,7 @@ void play_dead_common(void)
 {
 	idle_task_exit();
 	reset_lazy_tlbstate();
-	c1e_remove_cpu(raw_smp_processor_id());
+	amd_e400_remove_cpu(raw_smp_processor_id());
 
 	mb();
 	/* Ack it */
@@ -1355,9 +1345,9 @@ static inline void mwait_play_dead(void)
 	void *mwait_ptr;
 	struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
 
-	if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
+	if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
 		return;
-	if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
+	if (!this_cpu_has(X86_FEATURE_CLFLSH))
 		return;
 	if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
 		return;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6515733a289..fdd0c6430e5 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
 #include <linux/uaccess.h>
 #include <asm/stacktrace.h>
 
-static void save_stack_warning(void *data, char *msg)
-{
-}
-
-static void
-save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-}
-
 static int save_stack_stack(void *data, char *name)
 {
 	return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
 }
 
 static const struct stacktrace_ops save_stack_ops = {
-	.warning	= save_stack_warning,
-	.warning_symbol	= save_stack_warning_symbol,
 	.stack		= save_stack_stack,
 	.address	= save_stack_address,
 	.walk_stack	= print_context_stack,
 };
 
 static const struct stacktrace_ops save_stack_ops_nosched = {
-	.warning	= save_stack_warning,
-	.warning_symbol	= save_stack_warning_symbol,
 	.stack		= save_stack_stack,
 	.address	= save_stack_address_nosched,
 	.walk_stack	= print_context_stack,
@@ -79,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
-void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 {
 	dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
 	if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d5c79..fbb0a045a1a 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,5 @@ ENTRY(sys_call_table)
 	.long sys_open_by_handle_at
 	.long sys_clock_adjtime
 	.long sys_syncfs
+	.long sys_sendmmsg		/* 345 */
+	.long sys_setns
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 998e972f3b1..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
 #include <asm/bootparam.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
 #include <asm/fixmap.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
@@ -110,7 +111,6 @@ static struct mm_struct tboot_mm = {
 	.mmap_sem       = __RWSEM_INITIALIZER(init_mm.mmap_sem),
 	.page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
-	.cpu_vm_mask    = CPU_MASK_ALL,
 };
 
 static inline void switch_to_tboot_pt(void)
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd..3f92ce07e52 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
 	}
 
 #endif
-	return 0;
+	return ret;
 }
 
 static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a24593..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,19 +11,19 @@
 
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
+#include <linux/i8253.h>
 #include <linux/time.h>
 #include <linux/mca.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
 #include <asm/i8259.h>
-#include <asm/i8253.h>
 #include <asm/timer.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
 
 #ifdef CONFIG_X86_64
-volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
 #endif
 
 unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9d..9682ec50180 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -49,7 +49,7 @@
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
@@ -872,6 +872,12 @@ void __init trap_init(void)
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
 
+#ifdef CONFIG_X86_64
+	BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
+	set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
+	set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
+#endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9335bf7dd2e..db483369f10 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
 #include <linux/timer.h>
 #include <linux/acpi_pmtmr.h>
 #include <linux/cpufreq.h>
-#include <linux/dmi.h>
 #include <linux/delay.h>
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
@@ -763,25 +762,6 @@ static cycle_t read_tsc(struct clocksource *cs)
 		ret : clocksource_tsc.cycle_last;
 }
 
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_tsc(void)
-{
-	cycle_t ret;
-
-	/*
-	 * Surround the RDTSC by barriers, to make sure it's not
-	 * speculated to outside the seqlock critical section and
-	 * does not cause time warps:
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)vget_cycles();
-	rdtsc_barrier();
-
-	return ret >= __vsyscall_gtod_data.clock.cycle_last ?
-		ret : __vsyscall_gtod_data.clock.cycle_last;
-}
-#endif
-
 static void resume_tsc(struct clocksource *cs)
 {
 	clocksource_tsc.cycle_last = 0;
@@ -796,7 +776,7 @@ static struct clocksource clocksource_tsc = {
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
 #ifdef CONFIG_X86_64
-	.vread                  = vread_tsc,
+	.archdata               = { .vclock_mode = VCLOCK_TSC },
 #endif
 };
 
@@ -819,27 +799,6 @@ void mark_tsc_unstable(char *reason)
 
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
-{
-	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
-			d->ident);
-	tsc_unstable = 1;
-	return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
-	{
-		.callback = dmi_mark_tsc_unstable,
-		.ident = "IBM Thinkpad 380XD",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-			DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
-		},
-	},
-	{}
-};
-
 static void __init check_system_tsc_reliable(void)
 {
 #ifdef CONFIG_MGEODE_LX
@@ -1029,8 +988,6 @@ void __init tsc_init(void)
 	lpj_fine = lpj;
 
 	use_tsc_delay();
-	/* Check and install the TSC clocksource */
-	dmi_check_system(bad_tsc_dmi_table);
 
 	if (unsynchronized_tsc())
 		mark_tsc_unstable("TSCs unsynchronized");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 624a2016198..4aa9c54a9b7 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -166,50 +166,18 @@ SECTIONS
 	__vsyscall_0 = .;
 
 	. = VSYSCALL_ADDR;
-	.vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
+	.vsyscall : AT(VLOAD(.vsyscall)) {
 		*(.vsyscall_0)
-	} :user
 
-	. = ALIGN(L1_CACHE_BYTES);
-	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
-		*(.vsyscall_fn)
-	}
-
-	. = ALIGN(L1_CACHE_BYTES);
-	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
-		*(.vsyscall_gtod_data)
-	}
-
-	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
-		*(.vsyscall_clock)
-	}
-	vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
-	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+		. = 1024;
 		*(.vsyscall_1)
-	}
-	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
-		*(.vsyscall_2)
-	}
 
-	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
-		*(.vgetcpu_mode)
-	}
-	vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
-	. = ALIGN(L1_CACHE_BYTES);
-	.jiffies : AT(VLOAD(.jiffies)) {
-		*(.jiffies)
-	}
-	jiffies = VVIRT(.jiffies);
-
-	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
-		*(.vsyscall_3)
-	}
+		. = 2048;
+		*(.vsyscall_2)
 
-	. = __vsyscall_0 + PAGE_SIZE;
+		. = 4096;  /* Pad the whole page. */
+	} :user =0xcc
+	. = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
 
 #undef VSYSCALL_ADDR
 #undef VLOAD_OFFSET
@@ -217,6 +185,23 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
+	__vvar_page = .;
+
+	.vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+
+	      /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) 		\
+		. = offset;		\
+		*(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
+#undef EMIT_VVAR
+
+	} :data
+
+       . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
 #endif /* CONFIG_X86_64 */
 
 	/* Init code and data - will be freed after init */
@@ -306,6 +291,13 @@ SECTIONS
 	}
 
 	. = ALIGN(8);
+	.apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
+		__apicdrivers = .;
+		*(.apicdrivers);
+		__apicdrivers_end = .;
+	}
+
+	. = ALIGN(8);
 	/*
 	 * .exit.text is discard at runtime, not link time, to deal with
 	 *  references from .altinstructions and .eh_frame
@@ -319,7 +311,7 @@ SECTIONS
 	}
 
 #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
-	PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE)
+	PERCPU_SECTION(INTERNODE_CACHE_BYTES)
 #endif
 
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..dda7dff9cef 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
  *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright 2003 Andi Kleen, SuSE Labs.
  *
+ *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
  *  Thanks to hpa@transmeta.com for some useful hint.
  *  Special thanks to Ingo Molnar for his early experience with
  *  a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
  *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
  *  jumping out of line if necessary. We cannot add more with this
  *  mechanism because older kernels won't return -ENOSYS.
- *  If we want more than four we need a vDSO.
  *
- *  Note: the concept clashes with user mode linux. If you use UML and
- *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ *  Note: the concept clashes with user mode linux.  UML users should
+ *  use the vDSO.
  */
 
 /* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
 
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
+#include <asm/compat.h>
 #include <asm/page.h>
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
@@ -44,23 +48,12 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 #include <asm/vgtod.h>
+#include <asm/traps.h>
 
-#define __vsyscall(nr) \
-		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
-
-/*
- * vsyscall_gtod_data contains data that is :
- * - readonly from vsyscalls
- * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
- * Try to keep this structure as small as possible to avoid cache line ping pongs
- */
-int __vgetcpu_mode __section_vgetcpu_mode;
-
-struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
+DEFINE_VVAR(int, vgetcpu_mode);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
 {
-	.lock = SEQLOCK_UNLOCKED,
-	.sysctl_enabled = 1,
+	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
 };
 
 void update_vsyscall_tz(void)
@@ -79,178 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	unsigned long flags;
 
 	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
 	/* copy vsyscall data */
-	vsyscall_gtod_data.clock.vread = clock->vread;
-	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
-	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = mult;
-	vsyscall_gtod_data.clock.shift = clock->shift;
-	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
-	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic = *wtm;
-	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+	vsyscall_gtod_data.clock.vclock_mode	= clock->archdata.vclock_mode;
+	vsyscall_gtod_data.clock.cycle_last	= clock->cycle_last;
+	vsyscall_gtod_data.clock.mask		= clock->mask;
+	vsyscall_gtod_data.clock.mult		= mult;
+	vsyscall_gtod_data.clock.shift		= clock->shift;
+	vsyscall_gtod_data.wall_time_sec	= wall_time->tv_sec;
+	vsyscall_gtod_data.wall_time_nsec	= wall_time->tv_nsec;
+	vsyscall_gtod_data.wall_to_monotonic	= *wtm;
+	vsyscall_gtod_data.wall_time_coarse	= __current_kernel_time();
+
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+			      const char *message)
 {
-	*tz = __vsyscall_gtod_data.sys_tz;
-}
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+	struct task_struct *tsk;
 
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-	int ret;
-	asm volatile("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
-		: __syscall_clobber );
-	return ret;
-}
+	if (!show_unhandled_signals || !__ratelimit(&rs))
+		return;
 
-static __always_inline long time_syscall(long *t)
-{
-	long secs;
-	asm volatile("syscall"
-		: "=a" (secs)
-		: "0" (__NR_time),"D" (t) : __syscall_clobber);
-	return secs;
-}
+	tsk = current;
 
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
-	cycle_t now, base, mask, cycle_delta;
-	unsigned seq;
-	unsigned long mult, shift, nsec;
-	cycle_t (*vread)(void);
-	do {
-		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
-
-		vread = __vsyscall_gtod_data.clock.vread;
-		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
-			gettimeofday(tv,NULL);
-			return;
-		}
-
-		now = vread();
-		base = __vsyscall_gtod_data.clock.cycle_last;
-		mask = __vsyscall_gtod_data.clock.mask;
-		mult = __vsyscall_gtod_data.clock.mult;
-		shift = __vsyscall_gtod_data.clock.shift;
-
-		tv->tv_sec = __vsyscall_gtod_data.wall_time_sec;
-		nsec = __vsyscall_gtod_data.wall_time_nsec;
-	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
-
-	/* calculate interval: */
-	cycle_delta = (now - base) & mask;
-	/* convert to nsecs: */
-	nsec += (cycle_delta * mult) >> shift;
-
-	while (nsec >= NSEC_PER_SEC) {
-		tv->tv_sec += 1;
-		nsec -= NSEC_PER_SEC;
-	}
-	tv->tv_usec = nsec / NSEC_PER_USEC;
+	printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	       level, tsk->comm, task_pid_nr(tsk),
+	       message, regs->ip - 2, regs->cs,
+	       regs->sp, regs->ax, regs->si, regs->di);
 }
 
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+static int addr_to_vsyscall_nr(unsigned long addr)
 {
-	if (tv)
-		do_vgettimeofday(tv);
-	if (tz)
-		do_get_tz(tz);
-	return 0;
-}
+	int nr;
 
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
-	unsigned seq;
-	time_t result;
-	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
-		return time_syscall(t);
+	if ((addr & ~0xC00UL) != VSYSCALL_START)
+		return -EINVAL;
 
-	do {
-		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+	nr = (addr & 0xC00UL) >> 10;
+	if (nr >= 3)
+		return -EINVAL;
 
-		result = __vsyscall_gtod_data.wall_time_sec;
+	return nr;
+}
 
-	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
+{
+	struct task_struct *tsk;
+	unsigned long caller;
+	int vsyscall_nr;
+	long ret;
+
+	local_irq_enable();
+
+	/*
+	 * Real 64-bit user mode code has cs == __USER_CS.  Anything else
+	 * is bogus.
+	 */
+	if (regs->cs != __USER_CS) {
+		/*
+		 * If we trapped from kernel mode, we might as well OOPS now
+		 * instead of returning to some random address and OOPSing
+		 * then.
+		 */
+		BUG_ON(!user_mode(regs));
+
+		/* Compat mode and non-compat 32-bit CS should both segfault. */
+		warn_bad_vsyscall(KERN_WARNING, regs,
+				  "illegal int 0xcc from 32-bit mode");
+		goto sigsegv;
+	}
 
-	if (t)
-		*t = result;
-	return result;
-}
+	/*
+	 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
+	 * and int 0xcc is two bytes long.
+	 */
+	vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
+	if (vsyscall_nr < 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs,
+				  "illegal int 0xcc (exploit attempt?)");
+		goto sigsegv;
+	}
 
-/* Fast way to get current CPU and node.
-   This helps to do per node and per CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
+	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+		goto sigsegv;
+	}
 
-   tcache must point to a two element sized long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
-	unsigned int p;
-	unsigned long j = 0;
-
-	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly rdtscp/cpuid otherwise.
-	   This works because the scheduler usually keeps the process
-	   on the same CPU and this syscall doesn't guarantee its
-	   results anyways.
-	   We do this here because otherwise user space would do it on
-	   its own in a likely inferior way (no access to jiffies).
-	   If you don't like it pass NULL. */
-	if (tcache && tcache->blob[0] == (j = __jiffies)) {
-		p = tcache->blob[1];
-	} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	tsk = current;
+	if (seccomp_mode(&tsk->seccomp))
+		do_exit(SIGKILL);
+
+	switch (vsyscall_nr) {
+	case 0:
+		ret = sys_gettimeofday(
+			(struct timeval __user *)regs->di,
+			(struct timezone __user *)regs->si);
+		break;
+
+	case 1:
+		ret = sys_time((time_t __user *)regs->di);
+		break;
+
+	case 2:
+		ret = sys_getcpu((unsigned __user *)regs->di,
+				 (unsigned __user *)regs->si,
+				 0);
+		break;
 	}
-	if (tcache) {
-		tcache->blob[0] = j;
-		tcache->blob[1] = p;
+
+	if (ret == -EFAULT) {
+		/*
+		 * Bad news -- userspace fed a bad pointer to a vsyscall.
+		 *
+		 * With a real vsyscall, that would have caused SIGSEGV.
+		 * To make writing reliable exploits using the emulated
+		 * vsyscalls harder, generate SIGSEGV here as well.
+		 */
+		warn_bad_vsyscall(KERN_INFO, regs,
+				  "vsyscall fault (exploit attempt?)");
+		goto sigsegv;
 	}
-	if (cpu)
-		*cpu = p & 0xfff;
-	if (node)
-		*node = p >> 12;
-	return 0;
-}
 
-static long __vsyscall(3) venosys_1(void)
-{
-	return -ENOSYS;
-}
+	regs->ax = ret;
 
-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
-	{ .procname = "vsyscall64",
-	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
-	  .mode = 0644,
-	  .proc_handler = proc_dointvec },
-	{}
-};
+	/* Emulate a ret instruction. */
+	regs->ip = caller;
+	regs->sp += 8;
 
-static ctl_table kernel_root_table2[] = {
-	{ .procname = "kernel", .mode = 0555,
-	  .child = kernel_table2 },
-	{}
-};
-#endif
+	local_irq_disable();
+	return;
 
-/* Assume __initcall executes before all user space. Hopefully kmod
-   doesn't violate that. We'll find out if it does. */
+sigsegv:
+	regs->ip -= 2;  /* The faulting instruction should be the int 0xcc. */
+	force_sig(SIGSEGV, current);
+	local_irq_disable();
+}
+
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
 static void __cpuinit vsyscall_set_cpu(int cpu)
 {
 	unsigned long d;
@@ -261,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
 	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
 		write_rdtscp_aux((node << 12) | cpu);
 
-	/* Store cpu number in limit so that it can be loaded quickly
-	   in user space in vgetcpu.
-	   12 bits for the CPU and 8 bits for the node. */
+	/*
+	 * Store the CPU number (12 bits) and the node (8 bits) in the segment
+	 * limit so that user space can load them quickly in vgetcpu.
+	 */
 	d = 0x0f40000000000ULL;
 	d |= cpu;
 	d |= (node & 0xf) << 12;
 	d |= (node >> 4) << 48;
+
 	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }
 
@@ -281,8 +247,10 @@ static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
 	long cpu = (long)arg;
+
 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
 	return NOTIFY_DONE;
 }
 
@@ -290,25 +258,23 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_0;
 	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+	extern char __vvar_page;
+	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
 	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
 {
-	BUG_ON(((unsigned long) &vgettimeofday !=
-			VSYSCALL_ADDR(__NR_vgettimeofday)));
-	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
-	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
-	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
-	register_sysctl_table(kernel_root_table2);
-#endif
+	BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
 	/* notifier priority > KVM */
 	hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
 	return 0;
 }
-
 __initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 00000000000..ffa845eae5c
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ *
+ * Subject to the GNU General Public License, version 2
+ */
+
+#include <linux/linkage.h>
+#include <asm/irq_vectors.h>
+
+/* The unused parts of the page are filled with 0xcc by the linker script. */
+
+.section .vsyscall_0, "a"
+ENTRY(vsyscall_0)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_0)
+
+.section .vsyscall_1, "a"
+ENTRY(vsyscall_1)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_1)
+
+.section .vsyscall_2, "a"
+ENTRY(vsyscall_2)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_2)
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index c11514e9128..6f164bd5e14 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { }
 struct x86_init_ops x86_init __initdata = {
 
 	.resources = {
-		.probe_roms		= x86_init_noop,
+		.probe_roms		= probe_roms,
 		.reserve_resources	= reserve_standard_io_resources,
 		.memory_setup		= default_machine_specific_memory_setup,
 	},
@@ -61,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
 		.banner			= default_banner,
 	},
 
+	.mapping = {
+		.pagetable_reserve		= native_pagetable_reserve,
+	},
+
 	.paging = {
 		.pagetable_setup_start	= native_pagetable_setup_start,
 		.pagetable_setup_done	= native_pagetable_setup_done,