75 files changed, 2778 insertions, 1085 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 469f3450bf8..87717f3687d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -138,6 +138,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
+config HAVE_DYNAMIC_PER_CPU_AREA
+	def_bool y
+
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
@@ -780,6 +783,11 @@ config X86_MCE_AMD
 	   Additional support for AMD specific MCE features such as
 	   the DRAM Error Threshold.
 
+config X86_MCE_THRESHOLD
+	depends on X86_MCE_AMD || X86_MCE_INTEL
+	bool
+	default y
+
 config X86_MCE_NONFATAL
 	tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel Pentium 4"
 	depends on X86_32 && X86_MCE
@@ -1125,7 +1133,7 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accomodate various tables.
 
-config HAVE_ARCH_BOOTMEM_NODE
+config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA
 
@@ -1423,7 +1431,7 @@ config CRASH_DUMP
 config KEXEC_JUMP
 	bool "kexec jump (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
-	depends on KEXEC && HIBERNATION && X86_32
+	depends on KEXEC && HIBERNATION
 	---help---
 	  Jump between original kernel and kexeced kernel and invoke
 	  code in physical address mode via KEXEC
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4ef949c1972..394d177d721 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -379,6 +379,7 @@ static inline u32 safe_apic_wait_icr_idle(void)
 
 static inline void ack_APIC_irq(void)
 {
+#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * ack_APIC_irq() actually gets compiled as a single instruction
 	 * ... yummie.
@@ -386,6 +387,7 @@ static inline void ack_APIC_irq(void)
 
 	/* Docs say use 0 for future compatibility */
 	apic_write(APIC_EOI, 0);
+#endif
 }
 
 static inline unsigned default_get_apic_id(unsigned long x)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b..bc9514fb3b1 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -53,6 +53,7 @@
 #define		APIC_ESR_SENDILL	0x00020
 #define		APIC_ESR_RECVILL	0x00040
 #define		APIC_ESR_ILLREGA	0x00080
+#define 	APIC_LVTCMCI	0x2f0
 #define	APIC_ICR	0x300
 #define		APIC_DEST_SELF		0x40000
 #define		APIC_DEST_ALLINC	0x80000
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 2f8466540fb..5b301b7ff5f 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -5,24 +5,43 @@
 #include <linux/mm.h>
 
 /* Caches aren't brain-dead on the intel. */
-#define flush_cache_all()			do { } while (0)
-#define flush_cache_mm(mm)			do { } while (0)
-#define flush_cache_dup_mm(mm)			do { } while (0)
-#define flush_cache_range(vma, start, end)	do { } while (0)
-#define flush_cache_page(vma, vmaddr, pfn)	do { } while (0)
-#define flush_dcache_page(page)			do { } while (0)
-#define flush_dcache_mmap_lock(mapping)		do { } while (0)
-#define flush_dcache_mmap_unlock(mapping)	do { } while (0)
-#define flush_icache_range(start, end)		do { } while (0)
-#define flush_icache_page(vma, pg)		do { } while (0)
-#define flush_icache_user_range(vma, pg, adr, len)	do { } while (0)
-#define flush_cache_vmap(start, end)		do { } while (0)
-#define flush_cache_vunmap(start, end)		do { } while (0)
+static inline void flush_cache_all(void) { }
+static inline void flush_cache_mm(struct mm_struct *mm) { }
+static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
+static inline void flush_cache_range(struct vm_area_struct *vma,
+				     unsigned long start, unsigned long end) { }
+static inline void flush_cache_page(struct vm_area_struct *vma,
+				    unsigned long vmaddr, unsigned long pfn) { }
+static inline void flush_dcache_page(struct page *page) { }
+static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
+static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
+static inline void flush_icache_range(unsigned long start,
+				      unsigned long end) { }
+static inline void flush_icache_page(struct vm_area_struct *vma,
+				     struct page *page) { }
+static inline void flush_icache_user_range(struct vm_area_struct *vma,
+					   struct page *page,
+					   unsigned long addr,
+					   unsigned long len) { }
+static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
+static inline void flush_cache_vunmap(unsigned long start,
+				      unsigned long end) { }
 
-#define copy_to_user_page(vma, page, vaddr, dst, src, len)	\
-	memcpy((dst), (src), (len))
-#define copy_from_user_page(vma, page, vaddr, dst, src, len)	\
-	memcpy((dst), (src), (len))
+static inline void copy_to_user_page(struct vm_area_struct *vma,
+				     struct page *page, unsigned long vaddr,
+				     void *dst, const void *src,
+				     unsigned long len)
+{
+	memcpy(dst, src, len);
+}
+
+static inline void copy_from_user_page(struct vm_area_struct *vma,
+				       struct page *page, unsigned long vaddr,
+				       void *dst, const void *src,
+				       unsigned long len)
+{
+	memcpy(dst, src, len);
+}
 
 #define PG_non_WB				PG_arch_1
 PAGEFLAG(NonWB, non_WB)
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index ca5ffb2856b..edc90f23e70 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -37,8 +37,6 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...);
 
 #else /* !CONFIG_X86_32 */
 
-#define MAX_EFI_IO_PAGES	100
-
 extern u64 efi_call0(void *fp);
 extern u64 efi_call1(void *fp, u64 arg1);
 extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 854d538ae85..c2e6bedaf25 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,6 +33,8 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
 		 smp_invalidate_interrupt)
 #endif
 
+BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR)
+
 /*
  * every pentium local APIC has two 'local interrupts', with a
  * soft-definable vector attached to both interrupts, one of
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dca8f03da5b..63a79c77d22 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -24,9 +24,6 @@
 #include <asm/kmap_types.h>
 #else
 #include <asm/vsyscall.h>
-#ifdef CONFIG_EFI
-#include <asm/efi.h>
-#endif
 #endif
 
 /*
@@ -92,13 +89,6 @@ enum fixed_addresses {
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
 #endif
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_EFI
-	FIX_EFI_IO_MAP_LAST_PAGE,
-	FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
-				  + MAX_EFI_IO_PAGES - 1,
-#endif
-#endif
 #ifdef CONFIG_X86_VISWS_APIC
 	FIX_CO_CPU,	/* Cobalt timer */
 	FIX_CO_APIC,	/* Cobalt APIC Redirection Table */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 176f058e715..039db6aa8e0 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
 	unsigned int apic_timer_irqs;	/* arch dependent */
 	unsigned int irq_spurious_count;
 #endif
+	unsigned int generic_irqs;	/* arch dependent */
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 370e1c83bb4..b762ea49bd7 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,6 +27,7 @@
 
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
+extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 48f0004db8c..71c9e518398 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -172,7 +172,13 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 
 #else  /* CONFIG_X86_32 */
 
-extern void finit(void);
+#ifdef CONFIG_MATH_EMULATION
+extern void finit_task(struct task_struct *tsk);
+#else
+static inline void finit_task(struct task_struct *tsk)
+{
+}
+#endif
 
 static inline void tolerant_fwait(void)
 {
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
new file mode 100644
index 00000000000..36fb1a6a510
--- /dev/null
+++ b/arch/x86/include/asm/init.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_INIT_32_H
+#define _ASM_X86_INIT_32_H
+
+#ifdef CONFIG_X86_32
+extern void __init early_ioremap_page_table_range_init(void);
+#endif
+
+extern unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask);
+
+
+extern unsigned long __initdata e820_table_start;
+extern unsigned long __meminitdata e820_table_end;
+extern unsigned long __meminitdata e820_table_top;
+
+#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 683d0b4c00f..e5383e3d2f8 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -172,8 +172,6 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
 
 extern void iounmap(volatile void __iomem *addr);
 
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
-
 
 #ifdef CONFIG_X86_32
 # include "io_32.h"
@@ -198,7 +196,6 @@ extern void early_ioremap_reset(void);
 extern void __iomem *early_ioremap(unsigned long offset, unsigned long size);
 extern void __iomem *early_memremap(unsigned long offset, unsigned long size);
 extern void early_iounmap(void __iomem *addr, unsigned long size);
-extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
 
 #define IO_SPACE_LIMIT 0xffff
 
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 107eb219669..f38481bcd45 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -36,6 +36,7 @@ static inline int irq_canonicalize(int irq)
 extern void fixup_irqs(void);
 #endif
 
+extern void (*generic_interrupt_extension)(void);
 extern void init_IRQ(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 8a285f356f8..3cbd79bbb47 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -112,6 +112,11 @@
 #define LOCAL_PERF_VECTOR		0xee
 
 /*
+ * Generic system vector for platform specific use
+ */
+#define GENERIC_INTERRUPT_VECTOR	0xed
+
+/*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
  * levels. (0x80 is the syscall vector)
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0ceb6d19ed3..317ff1703d0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -9,13 +9,13 @@
 # define PAGES_NR		4
 #else
 # define PA_CONTROL_PAGE	0
-# define PA_TABLE_PAGE		1
-# define PAGES_NR		2
+# define VA_CONTROL_PAGE	1
+# define PA_TABLE_PAGE		2
+# define PA_SWAP_PAGE		3
+# define PAGES_NR		4
 #endif
 
-#ifdef CONFIG_X86_32
 # define KEXEC_CONTROL_CODE_MAX_SIZE	2048
-#endif
 
 #ifndef __ASSEMBLY__
 
@@ -136,10 +136,11 @@ relocate_kernel(unsigned long indirection_page,
 		unsigned int has_pae,
 		unsigned int preserve_context);
 #else
-NORET_TYPE void
+unsigned long
 relocate_kernel(unsigned long indirection_page,
 		unsigned long page_list,
-		unsigned long start_address) ATTRIB_NORET;
+		unsigned long start_address,
+		unsigned int preserve_context);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 9320e2a8a26..a0d70b46c27 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -4,11 +4,6 @@
 #undef notrace
 #define notrace __attribute__((no_instrument_function))
 
-#ifdef CONFIG_X86_64
-#define __ALIGN .p2align 4,,15
-#define __ALIGN_STR ".p2align 4,,15"
-#endif
-
 #ifdef CONFIG_X86_32
 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
 /*
@@ -50,16 +45,25 @@
 	__asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
 			      "g" (arg4), "g" (arg5), "g" (arg6))
 
-#endif
+#endif /* CONFIG_X86_32 */
+
+#ifdef __ASSEMBLY__
 
 #define GLOBAL(name)	\
 	.globl name;	\
 	name:
 
+#ifdef CONFIG_X86_64
+#define __ALIGN .p2align 4,,15
+#define __ALIGN_STR ".p2align 4,,15"
+#endif
+
 #ifdef CONFIG_X86_ALIGNMENT_16
 #define __ALIGN .align 16,0x90
 #define __ALIGN_STR ".align 16,0x90"
 #endif
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* _ASM_X86_LINKAGE_H */
 
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 32c6e17b960..563933e06a3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -11,6 +11,8 @@
  */
 
 #define MCG_CTL_P	 (1UL<<8)   /* MCG_CAP register available */
+#define MCG_EXT_P	 (1ULL<<9)   /* Extended registers available */
+#define MCG_CMCI_P	 (1ULL<<10)  /* CMCI supported */
 
 #define MCG_STATUS_RIPV  (1UL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1UL<<1)   /* ip points to correct instruction */
@@ -90,14 +92,29 @@ extern int mce_disabled;
 
 #include <asm/atomic.h>
 
+void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
 DECLARE_PER_CPU(struct sys_device, device_mce);
 extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
 
+/*
+ * To support more than 128 would need to escape the predefined
+ * Linux defined extended banks first.
+ */
+#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)
+
 #ifdef CONFIG_X86_MCE_INTEL
 void mce_intel_feature_init(struct cpuinfo_x86 *c);
+void cmci_clear(void);
+void cmci_reenable(void);
+void cmci_rediscover(int dying);
+void cmci_recheck(void);
 #else
 static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
+static inline void cmci_clear(void) {}
+static inline void cmci_reenable(void) {}
+static inline void cmci_rediscover(int dying) {}
+static inline void cmci_recheck(void) {}
 #endif
 
 #ifdef CONFIG_X86_MCE_AMD
@@ -106,11 +123,23 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c);
 static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
 #endif
 
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
+extern int mce_available(struct cpuinfo_x86 *c);
+
+void mce_log_therm_throt_event(__u64 status);
 
 extern atomic_t mce_entry;
 
 extern void do_machine_check(struct pt_regs *, long);
+
+typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
+DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
+
+enum mcp_flags {
+	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
+	MCP_UC = (1 << 1),		/* log uncorrected errors */
+};
+extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+
 extern int mce_notify_user(void);
 
 #endif /* !CONFIG_X86_32 */
@@ -120,8 +149,8 @@ extern void mcheck_init(struct cpuinfo_x86 *c);
 #else
 #define mcheck_init(c) do { } while (0)
 #endif
-extern void stop_mce(void);
-extern void restart_mce(void);
+
+extern void (*mce_threshold_vector)(void);
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 105fb90a063..ede6998bd92 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -91,46 +91,9 @@ static inline int pfn_valid(int pfn)
 #endif /* CONFIG_DISCONTIGMEM */
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
-
-/*
- * Following are macros that are specific to this numa platform.
- */
-#define reserve_bootmem(addr, size, flags) \
-	reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
-#define alloc_bootmem(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
-#define alloc_bootmem_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_pages_nopanic(x) \
-	__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
-				__pa(MAX_DMA_ADDRESS))
-#define alloc_bootmem_low_pages(x) \
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
-#define alloc_bootmem_node(pgdat, x)					\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES,	\
-						__pa(MAX_DMA_ADDRESS));	\
-})
-#define alloc_bootmem_pages_node(pgdat, x)				\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE,		\
-						__pa(MAX_DMA_ADDRESS));	\
-})
-#define alloc_bootmem_low_pages_node(pgdat, x)				\
-({									\
-	struct pglist_data  __maybe_unused			\
-				*__alloc_bootmem_node__pgdat = (pgdat);	\
-	__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0);		\
-})
+/* always use node 0 for bootmem on this numa platform */
+#define bootmem_arch_preferred_node(__bdata, size, align, goal, limit)	\
+	(NODE_DATA(0)->bdata)
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 #endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 358acc59ae0..2dbd2314139 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -77,6 +77,11 @@
 #define MSR_IA32_MC0_ADDR		0x00000402
 #define MSR_IA32_MC0_MISC		0x00000403
 
+/* These are consecutive and not in the normal 4er MCE bank block */
+#define MSR_IA32_MC0_CTL2		0x00000280
+#define CMCI_EN			(1ULL << 30)
+#define CMCI_THRESHOLD_MASK		0xffffULL
+
 #define MSR_P6_PERFCTR0			0x000000c1
 #define MSR_P6_PERFCTR1			0x000000c2
 #define MSR_P6_EVNTSEL0			0x00000186
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 2d625da6603..826ad37006a 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,14 +40,8 @@
 
 #ifndef __ASSEMBLY__
 
-struct pgprot;
-
 extern int page_is_ram(unsigned long pagenr);
 extern int devmem_is_allowed(unsigned long pagenr);
-extern void map_devmem(unsigned long pfn, unsigned long size,
-		       struct pgprot vma_prot);
-extern void unmap_devmem(unsigned long pfn, unsigned long size,
-			 struct pgprot vma_prot);
 
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index b0e70056838..2cd07b9422f 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_PAT_H
 
 #include <linux/types.h>
+#include <asm/pgtable_types.h>
 
 #ifdef CONFIG_X86_PAT
 extern int pat_enabled;
@@ -17,5 +18,9 @@ extern int free_memtype(u64 start, u64 end);
 
 extern int kernel_map_sync_memtype(u64 base, unsigned long size,
 		unsigned long flag);
+extern void map_devmem(unsigned long pfn, unsigned long size,
+		       struct pgprot vma_prot);
+extern void unmap_devmem(unsigned long pfn, unsigned long size,
+			 struct pgprot vma_prot);
 
 #endif /* _ASM_X86_PAT_H */
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index aee103b26d0..8f1d2fbec1d 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -43,6 +43,14 @@
 #else /* ...!ASSEMBLY */
 
 #include <linux/stringify.h>
+#include <asm/sections.h>
+
+#define __addr_to_pcpu_ptr(addr)					\
+	(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr	\
+		 + (unsigned long)__per_cpu_start)
+#define __pcpu_ptr_to_addr(ptr)						\
+	(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr	\
+		 - (unsigned long)__per_cpu_start)
 
 #ifdef CONFIG_SMP
 #define __percpu_arg(x)		"%%"__stringify(__percpu_seg)":%P" #x
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1c097a3a666..d0812e155f1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
 	return 1;
 }
 
+pmd_t *populate_extra_pmd(unsigned long vaddr);
+pte_t *populate_extra_pte(unsigned long vaddr);
 #endif	/* __ASSEMBLY__ */
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h
index bd8df3b2fe0..2733fad45f9 100644
--- a/arch/x86/include/asm/pgtable_32_types.h
+++ b/arch/x86/include/asm/pgtable_32_types.h
@@ -25,6 +25,11 @@
  * area for the same reason. ;)
  */
 #define VMALLOC_OFFSET	(8 * 1024 * 1024)
+
+#ifndef __ASSEMBLER__
+extern bool __vmalloc_start_set; /* set once high_memory is set */
+#endif
+
 #define VMALLOC_START	((unsigned long)high_memory + VMALLOC_OFFSET)
 #ifdef CONFIG_X86_PAE
 #define LAST_PKMAP 512
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 4d258ad76a0..b8238dc8786 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,6 +273,7 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
+extern void set_nx(void);
 
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 777327ef05c..9f4dfba33b2 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,10 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define SCIR_CPU_ACTIVITY	0x02	/* not idle */
 #define SCIR_CPU_HB_INTERVAL	(HZ)	/* once per second */
 
+/* Loop through all installed blades */
+#define for_each_possible_blade(bid)		\
+	for ((bid) = 0; (bid) < uv_num_possible_blades(); (bid)++)
+
 /*
  * Macros for converting between kernel virtual addresses, socket local physical
  * addresses, and UV global physical addresses.
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43d..1a918dde46b 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
 
 
 xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 95f216bbfaf..339ce35648e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -111,7 +111,7 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o
+	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
 	obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
 	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 6907b8e85d5..4c80f155743 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -414,9 +414,17 @@ void __init alternative_instructions(void)
 	   that might execute the to be patched code.
 	   Other CPUs are not running. */
 	stop_nmi();
-#ifdef CONFIG_X86_MCE
-	stop_mce();
-#endif
+
+	/*
+	 * Don't stop machine check exceptions while patching.
+	 * MCEs only happen when something got corrupted and in this
+	 * case we must do something about the corruption.
+	 * Ignoring it is worse than a unlikely patching race.
+	 * Also machine checks tend to be broadcast and if one CPU
+	 * goes into machine check the others follow quickly, so we don't
+	 * expect a machine check to cause undue problems during to code
+	 * patching.
+	 */
 
 	apply_alternatives(__alt_instructions, __alt_instructions_end);
 
@@ -456,9 +464,6 @@ void __init alternative_instructions(void)
 				(unsigned long)__smp_locks_end);
 
 	restart_nmi();
-#ifdef CONFIG_X86_MCE
-	restart_mce();
-#endif
 }
 
 /**
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index f9cecdfd05c..30909a258d0 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -46,6 +46,7 @@
 #include <asm/idle.h>
 #include <asm/mtrr.h>
 #include <asm/smp.h>
+#include <asm/mce.h>
 
 unsigned int num_processors;
 
@@ -842,6 +843,14 @@ void clear_local_APIC(void)
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
 	}
 #endif
+#ifdef CONFIG_X86_MCE_INTEL
+	if (maxlvt >= 6) {
+		v = apic_read(APIC_LVTCMCI);
+		if (!(v & APIC_LVT_MASKED))
+			apic_write(APIC_LVTCMCI, v | APIC_LVT_MASKED);
+	}
+#endif
+
 	/*
 	 * Clean APIC state for other OSs:
 	 */
@@ -1241,6 +1250,12 @@ void __cpuinit setup_local_APIC(void)
 	apic_write(APIC_LVT1, value);
 
 	preempt_enable();
+
+#ifdef CONFIG_X86_MCE_INTEL
+	/* Recheck CMCI information after local APIC is up on CPU #0 */
+	if (smp_processor_id() == 0)
+		cmci_recheck();
+#endif
 }
 
 void __cpuinit end_local_APIC_setup(void)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80e..f47df59016c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -141,6 +142,55 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 	}
 }
 
+static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Certain Athlons might work (for various values of 'work') in SMP
+	 * but they are not certified as MP capable.
+	 */
+	/* Athlon 660/661 is valid. */
+	if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+	    (c->x86_mask == 1)))
+		goto valid_k7;
+
+	/* Duron 670 is valid */
+	if ((c->x86_model == 7) && (c->x86_mask == 0))
+		goto valid_k7;
+
+	/*
+	 * Athlon 662, Duron 671, and Athlon >model 7 have capability
+	 * bit. It's worth noting that the A5 stepping (662) of some
+	 * Athlon XP's have the MP bit set.
+	 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+	 * more.
+	 */
+	if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+	    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+	     (c->x86_model > 7))
+		if (cpu_has_mp)
+			goto valid_k7;
+
+	/* If we get here, not a certified SMP capable AMD system. */
+
+	/*
+	 * Don't taint if we are running SMP kernel on a single non-MP
+	 * approved Athlon
+	 */
+	WARN_ONCE(1, "WARNING: This combination of AMD"
+		"processors is not suitable for SMP.\n");
+	if (!test_taint(TAINT_UNSAFE_SMP))
+		add_taint(TAINT_UNSAFE_SMP);
+
+valid_k7:
+	;
+#endif
+}
+
 static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
@@ -175,6 +225,8 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
 	}
 
 	set_cpu_cap(c, X86_FEATURE_K7);
+
+	amd_k7_smp_check(c);
 }
 #endif
 
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c..22590cf688a 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
 
-	data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(drv_data, cpu) = data;
 
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index b585e04cbc9..3178c3acd97 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -277,7 +277,6 @@ static struct cpufreq_driver p4clockmod_driver = {
 	.name		= "p4-clockmod",
 	.owner		= THIS_MODULE,
 	.attr		= p4clockmod_attr,
-	.hide_interface	= 1,
 };
 
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 25c559ba8d5..191117f1ad5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -13,6 +13,7 @@
 #include <asm/uaccess.h>
 #include <asm/ds.h>
 #include <asm/bugs.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/topology.h>
@@ -110,6 +111,28 @@ static void __cpuinit trap_init_f00f_bug(void)
 }
 #endif
 
+static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	/* calling is from identify_secondary_cpu() ? */
+	if (c->cpu_index == boot_cpu_id)
+		return;
+
+	/*
+	 * Mask B, Pentium, but not Pentium MMX
+	 */
+	if (c->x86 == 5 &&
+	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
+	    c->x86_model <= 3) {
+		/*
+		 * Remember we have B step Pentia with bugs
+		 */
+		WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
+				    "with B stepping processors.\n");
+	}
+#endif
+}
+
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
 	unsigned long lo, hi;
@@ -186,6 +209,8 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 #ifdef CONFIG_X86_NUMAQ
 	numaq_tsc_disable();
 #endif
+
+	intel_smp_check(c);
 }
 #else
 static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index d7d2323bbb6..b2f89829bbe 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_32)		+= k7.o p4.o p5.o p6.o winchip.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel_64.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd_64.o
 obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
+obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index dfaebce3633..3552119b091 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -60,20 +60,6 @@ void mcheck_init(struct cpuinfo_x86 *c)
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 static int __init mcheck_disable(char *str)
 {
 	mce_disabled = 1;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index fe79985ce0f..ca14604611e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -3,6 +3,8 @@
  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
  * Rest from unknown author(s).
  * 2004 Andi Kleen. Rewrote most of it.
+ * Copyright 2008 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -24,6 +26,9 @@
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/kdebug.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/ratelimit.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
@@ -32,7 +37,6 @@
 #include <asm/idle.h>
 
 #define MISC_MCELOG_MINOR 227
-#define NR_SYSFS_BANKS 6
 
 atomic_t mce_entry;
 
@@ -47,7 +51,7 @@ static int mce_dont_init;
  */
 static int tolerant = 1;
 static int banks;
-static unsigned long bank[NR_SYSFS_BANKS] = { [0 ... NR_SYSFS_BANKS-1] = ~0UL };
+static u64 *bank;
 static unsigned long notify_user;
 static int rip_msr;
 static int mce_bootlog = -1;
@@ -58,6 +62,19 @@ static char *trigger_argv[2] = { trigger, NULL };
 
 static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
 
+/* MCA banks polled by the period polling timer for corrected events */
+DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
+	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
+};
+
+/* Do initial initialization of a struct mce */
+void mce_setup(struct mce *m)
+{
+	memset(m, 0, sizeof(struct mce));
+	m->cpu = smp_processor_id();
+	rdtscll(m->tsc);
+}
+
 /*
  * Lockless MCE logging infrastructure.
  * This avoids deadlocks on printk locks without having to break locks. Also
@@ -119,11 +136,11 @@ static void print_mce(struct mce *m)
 			print_symbol("{%s}", m->ip);
 		printk("\n");
 	}
-	printk(KERN_EMERG "TSC %Lx ", m->tsc);
+	printk(KERN_EMERG "TSC %llx ", m->tsc);
 	if (m->addr)
-		printk("ADDR %Lx ", m->addr);
+		printk("ADDR %llx ", m->addr);
 	if (m->misc)
-		printk("MISC %Lx ", m->misc);
+		printk("MISC %llx ", m->misc);
 	printk("\n");
 	printk(KERN_EMERG "This is not a software problem!\n");
 	printk(KERN_EMERG "Run through mcelog --ascii to decode "
@@ -149,8 +166,10 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start)
 	panic(msg);
 }
 
-static int mce_available(struct cpuinfo_x86 *c)
+int mce_available(struct cpuinfo_x86 *c)
 {
+	if (mce_dont_init)
+		return 0;
 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
 }
 
@@ -172,7 +191,77 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
 }
 
 /*
- * The actual machine check handler
+ * Poll for corrected events or events that happened before reset.
+ * Those are just logged through /dev/mcelog.
+ *
+ * This is executed in standard interrupt context.
+ */
+void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+{
+	struct mce m;
+	int i;
+
+	mce_setup(&m);
+
+	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
+	for (i = 0; i < banks; i++) {
+		if (!bank[i] || !test_bit(i, *b))
+			continue;
+
+		m.misc = 0;
+		m.addr = 0;
+		m.bank = i;
+		m.tsc = 0;
+
+		barrier();
+		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
+		if (!(m.status & MCI_STATUS_VAL))
+			continue;
+
+		/*
+		 * Uncorrected events are handled by the exception handler
+		 * when it is enabled. But when the exception is disabled log
+		 * everything.
+		 *
+		 * TBD do the same check for MCI_STATUS_EN here?
+		 */
+		if ((m.status & MCI_STATUS_UC) && !(flags & MCP_UC))
+			continue;
+
+		if (m.status & MCI_STATUS_MISCV)
+			rdmsrl(MSR_IA32_MC0_MISC + i*4, m.misc);
+		if (m.status & MCI_STATUS_ADDRV)
+			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
+
+		if (!(flags & MCP_TIMESTAMP))
+			m.tsc = 0;
+		/*
+		 * Don't get the IP here because it's unlikely to
+		 * have anything to do with the actual error location.
+		 */
+
+		mce_log(&m);
+		add_taint(TAINT_MACHINE_CHECK);
+
+		/*
+		 * Clear state for this bank.
+		 */
+		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
+
+	/*
+	 * Don't clear MCG_STATUS here because it's only defined for
+	 * exceptions.
+	 */
+}
+
+/*
+ * The actual machine check handler. This only handles real
+ * exceptions when something got corrupted coming in through int 18.
+ *
+ * This is executed in NMI context not subject to normal locking rules. This
+ * implies that most kernel services cannot be safely used. Don't even
+ * think about putting a printk in there!
  */
 void do_machine_check(struct pt_regs * regs, long error_code)
 {
@@ -190,17 +279,18 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * error.
 	 */
 	int kill_it = 0;
+	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
 
 	atomic_inc(&mce_entry);
 
-	if ((regs
-	     && notify_die(DIE_NMI, "machine check", regs, error_code,
+	if (notify_die(DIE_NMI, "machine check", regs, error_code,
 			   18, SIGKILL) == NOTIFY_STOP)
-	    || !banks)
+		goto out2;
+	if (!banks)
 		goto out2;
 
-	memset(&m, 0, sizeof(struct mce));
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
+
 	rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus);
 	/* if the restart IP is not valid, we're done for */
 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
@@ -210,18 +300,32 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	barrier();
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS && !bank[i])
+		__clear_bit(i, toclear);
+		if (!bank[i])
 			continue;
 
 		m.misc = 0;
 		m.addr = 0;
 		m.bank = i;
-		m.tsc = 0;
 
 		rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
 		if ((m.status & MCI_STATUS_VAL) == 0)
 			continue;
 
+		/*
+		 * Non uncorrected errors are handled by machine_check_poll
+		 * Leave them alone.
+		 */
+		if ((m.status & MCI_STATUS_UC) == 0)
+			continue;
+
+		/*
+		 * Set taint even when machine check was not enabled.
+		 */
+		add_taint(TAINT_MACHINE_CHECK);
+
+		__set_bit(i, toclear);
+
 		if (m.status & MCI_STATUS_EN) {
 			/* if PCC was set, there's no way out */
 			no_way_out |= !!(m.status & MCI_STATUS_PCC);
@@ -235,6 +339,12 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 					no_way_out = 1;
 				kill_it = 1;
 			}
+		} else {
+			/*
+			 * Machine check event was not enabled. Clear, but
+			 * ignore.
+			 */
+			continue;
 		}
 
 		if (m.status & MCI_STATUS_MISCV)
@@ -243,10 +353,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			rdmsrl(MSR_IA32_MC0_ADDR + i*4, m.addr);
 
 		mce_get_rip(&m, regs);
-		if (error_code >= 0)
-			rdtscll(m.tsc);
-		if (error_code != -2)
-			mce_log(&m);
+		mce_log(&m);
 
 		/* Did this bank cause the exception? */
 		/* Assume that the bank with uncorrectable errors did it,
@@ -255,14 +362,8 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 			panicm = m;
 			panicm_found = 1;
 		}
-
-		add_taint(TAINT_MACHINE_CHECK);
 	}
 
-	/* Never do anything final in the polling timer */
-	if (!regs)
-		goto out;
-
 	/* If we didn't find an uncorrectable error, pick
 	   the last one (shouldn't happen, just being safe). */
 	if (!panicm_found)
@@ -309,10 +410,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	/* notify userspace ASAP */
 	set_thread_flag(TIF_MCE_NOTIFY);
 
- out:
 	/* the last thing we do is clear state */
-	for (i = 0; i < banks; i++)
-		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	for (i = 0; i < banks; i++) {
+		if (test_bit(i, toclear))
+			wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
+	}
 	wrmsrl(MSR_IA32_MCG_STATUS, 0);
  out2:
 	atomic_dec(&mce_entry);
@@ -332,15 +434,13 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  * and historically has been the register value of the
  * MSR_IA32_THERMAL_STATUS (Intel) msr.
  */
-void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
+void mce_log_therm_throt_event(__u64 status)
 {
 	struct mce m;
 
-	memset(&m, 0, sizeof(m));
-	m.cpu = cpu;
+	mce_setup(&m);
 	m.bank = MCE_THERMAL_BANK;
 	m.status = status;
-	rdtscll(m.tsc);
 	mce_log(&m);
 }
 #endif /* CONFIG_X86_MCE_INTEL */
@@ -353,18 +453,18 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status)
 
 static int check_interval = 5 * 60; /* 5 minutes */
 static int next_interval; /* in jiffies */
-static void mcheck_timer(struct work_struct *work);
-static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer);
+static void mcheck_timer(unsigned long);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static void mcheck_check_cpu(void *info)
+static void mcheck_timer(unsigned long data)
 {
-	if (mce_available(&current_cpu_data))
-		do_machine_check(NULL, 0);
-}
+	struct timer_list *t = &per_cpu(mce_timer, data);
 
-static void mcheck_timer(struct work_struct *work)
-{
-	on_each_cpu(mcheck_check_cpu, NULL, 1);
+	WARN_ON(smp_processor_id() != data);
+
+	if (mce_available(&current_cpu_data))
+		machine_check_poll(MCP_TIMESTAMP,
+				&__get_cpu_var(mce_poll_banks));
 
 	/*
 	 * Alert userspace if needed.  If we logged an MCE, reduce the
@@ -377,31 +477,41 @@ static void mcheck_timer(struct work_struct *work)
 				(int)round_jiffies_relative(check_interval*HZ));
 	}
 
-	schedule_delayed_work(&mcheck_work, next_interval);
+	t->expires = jiffies + next_interval;
+	add_timer(t);
+}
+
+static void mce_do_trigger(struct work_struct *work)
+{
+	call_usermodehelper(trigger, trigger_argv, NULL, UMH_NO_WAIT);
 }
 
+static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
+
 /*
- * This is only called from process context.  This is where we do
- * anything we need to alert userspace about new MCEs.  This is called
- * directly from the poller and also from entry.S and idle, thanks to
- * TIF_MCE_NOTIFY.
+ * Notify the user(s) about new machine check events.
+ * Can be called from interrupt context, but not from machine check/NMI
+ * context.
  */
 int mce_notify_user(void)
 {
+	/* Not more than two messages every minute */
+	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
+
 	clear_thread_flag(TIF_MCE_NOTIFY);
 	if (test_and_clear_bit(0, &notify_user)) {
-		static unsigned long last_print;
-		unsigned long now = jiffies;
-
 		wake_up_interruptible(&mce_wait);
-		if (trigger[0])
-			call_usermodehelper(trigger, trigger_argv, NULL,
-						UMH_NO_WAIT);
 
-		if (time_after_eq(now, last_print + (check_interval*HZ))) {
-			last_print = now;
+		/*
+		 * There is no risk of missing notifications because
+		 * work_pending is always cleared before the function is
+		 * executed.
+		 */
+		if (trigger[0] && !work_pending(&mce_trigger_work))
+			schedule_work(&mce_trigger_work);
+
+		if (__ratelimit(&ratelimit))
 			printk(KERN_INFO "Machine check events logged\n");
-		}
 
 		return 1;
 	}
@@ -425,63 +535,78 @@ static struct notifier_block mce_idle_notifier = {
 
 static __init int periodic_mcheck_init(void)
 {
-	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
-	idle_notifier_register(&mce_idle_notifier);
-	return 0;
+       idle_notifier_register(&mce_idle_notifier);
+       return 0;
 }
 __initcall(periodic_mcheck_init);
 
-
 /*
  * Initialize Machine Checks for a CPU.
  */
-static void mce_init(void *dummy)
+static int mce_cap_init(void)
 {
 	u64 cap;
-	int i;
+	unsigned b;
 
 	rdmsrl(MSR_IA32_MCG_CAP, cap);
-	banks = cap & 0xff;
-	if (banks > MCE_EXTENDED_BANK) {
-		banks = MCE_EXTENDED_BANK;
-		printk(KERN_INFO "MCE: warning: using only %d banks\n",
-		       MCE_EXTENDED_BANK);
+	b = cap & 0xff;
+	if (b > MAX_NR_BANKS) {
+		printk(KERN_WARNING
+		       "MCE: Using only %u machine check banks out of %u\n",
+			MAX_NR_BANKS, b);
+		b = MAX_NR_BANKS;
 	}
+
+	/* Don't support asymmetric configurations today */
+	WARN_ON(banks != 0 && b != banks);
+	banks = b;
+	if (!bank) {
+		bank = kmalloc(banks * sizeof(u64), GFP_KERNEL);
+		if (!bank)
+			return -ENOMEM;
+		memset(bank, 0xff, banks * sizeof(u64));
+	}
+
 	/* Use accurate RIP reporting if available. */
 	if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
 		rip_msr = MSR_IA32_MCG_EIP;
 
-	/* Log the machine checks left over from the previous reset.
-	   This also clears all registers */
-	do_machine_check(NULL, mce_bootlog ? -1 : -2);
+	return 0;
+}
+
+static void mce_init(void *dummy)
+{
+	u64 cap;
+	int i;
+	mce_banks_t all_banks;
+
+	/*
+	 * Log the machine checks left over from the previous reset.
+	 */
+	bitmap_fill(all_banks, MAX_NR_BANKS);
+	machine_check_poll(MCP_UC, &all_banks);
 
 	set_in_cr4(X86_CR4_MCE);
 
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
 	if (cap & MCG_CTL_P)
 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
 
 	for (i = 0; i < banks; i++) {
-		if (i < NR_SYSFS_BANKS)
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
-		else
-			wrmsrl(MSR_IA32_MC0_CTL+4*i, ~0UL);
-
+		wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]);
 		wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0);
 	}
 }
 
 /* Add per CPU specific workarounds here */
-static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void mce_cpu_quirks(struct cpuinfo_x86 *c)
 {
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if(c->x86 == 15)
+		if (c->x86 == 15 && banks > 4)
 			/* disable GART TBL walk error reporting, which trips off
 			   incorrectly with the IOMMU & 3ware & Cerberus. */
-			clear_bit(10, &bank[4]);
+			clear_bit(10, (unsigned long *)&bank[4]);
 		if(c->x86 <= 17 && mce_bootlog < 0)
 			/* Lots of broken BIOS around that don't clear them
 			   by default and leave crap in there. Don't log. */
@@ -504,20 +629,38 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
 	}
 }
 
+static void mce_init_timer(void)
+{
+	struct timer_list *t = &__get_cpu_var(mce_timer);
+
+	/* data race harmless because everyone sets to the same value */
+	if (!next_interval)
+		next_interval = check_interval * HZ;
+	if (!next_interval)
+		return;
+	setup_timer(t, mcheck_timer, smp_processor_id());
+	t->expires = round_jiffies(jiffies + next_interval);
+	add_timer(t);
+}
+
 /*
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off.
  */
 void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
-	mce_cpu_quirks(c);
+	if (!mce_available(c))
+		return;
 
-	if (mce_dont_init ||
-	    !mce_available(c))
+	if (mce_cap_init() < 0) {
+		mce_dont_init = 1;
 		return;
+	}
+	mce_cpu_quirks(c);
 
 	mce_init(NULL);
 	mce_cpu_features(c);
+	mce_init_timer();
 }
 
 /*
@@ -573,7 +716,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 {
 	unsigned long *cpu_tsc;
 	static DEFINE_MUTEX(mce_read_mutex);
-	unsigned next;
+	unsigned prev, next;
 	char __user *buf = ubuf;
 	int i, err;
 
@@ -592,25 +735,32 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	}
 
 	err = 0;
-	for (i = 0; i < next; i++) {
-		unsigned long start = jiffies;
-
-		while (!mcelog.entry[i].finished) {
-			if (time_after_eq(jiffies, start + 2)) {
-				memset(mcelog.entry + i,0, sizeof(struct mce));
-				goto timeout;
+	prev = 0;
+	do {
+		for (i = prev; i < next; i++) {
+			unsigned long start = jiffies;
+
+			while (!mcelog.entry[i].finished) {
+				if (time_after_eq(jiffies, start + 2)) {
+					memset(mcelog.entry + i, 0,
+					       sizeof(struct mce));
+					goto timeout;
+				}
+				cpu_relax();
 			}
-			cpu_relax();
+			smp_rmb();
+			err |= copy_to_user(buf, mcelog.entry + i,
+					    sizeof(struct mce));
+			buf += sizeof(struct mce);
+timeout:
+			;
 		}
-		smp_rmb();
-		err |= copy_to_user(buf, mcelog.entry + i, sizeof(struct mce));
-		buf += sizeof(struct mce);
- timeout:
-		;
-	}
 
-	memset(mcelog.entry, 0, next * sizeof(struct mce));
-	mcelog.next = 0;
+		memset(mcelog.entry + prev, 0,
+		       (next - prev) * sizeof(struct mce));
+		prev = next;
+		next = cmpxchg(&mcelog.next, prev, 0);
+	} while (next != prev);
 
 	synchronize_sched();
 
@@ -680,20 +830,6 @@ static struct miscdevice mce_log_device = {
 	&mce_chrdev_ops,
 };
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
-
 /*
  * Old style boot options parsing. Only for compatibility.
  */
@@ -703,8 +839,7 @@ static int __init mcheck_disable(char *str)
 	return 1;
 }
 
-/* mce=off disables machine check. Note you can re-enable it later
-   using sysfs.
+/* mce=off disables machine check.
    mce=TOLERANCELEVEL (number, see above)
    mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
    mce=nobootlog Don't log MCEs from before booting. */
@@ -728,6 +863,29 @@ __setup("mce=", mcheck_enable);
  * Sysfs support
  */
 
+/*
+ * Disable machine checks on suspend and shutdown. We can't really handle
+ * them later.
+ */
+static int mce_disable(void)
+{
+	int i;
+
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+	return 0;
+}
+
+static int mce_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return mce_disable();
+}
+
+static int mce_shutdown(struct sys_device *dev)
+{
+	return mce_disable();
+}
+
 /* On resume clear all MCE state. Don't want to see leftovers from the BIOS.
    Only one CPU is active at this time, the others get readded later using
    CPU hotplug. */
@@ -738,20 +896,24 @@ static int mce_resume(struct sys_device *dev)
 	return 0;
 }
 
+static void mce_cpu_restart(void *data)
+{
+	del_timer_sync(&__get_cpu_var(mce_timer));
+	if (mce_available(&current_cpu_data))
+		mce_init(NULL);
+	mce_init_timer();
+}
+
 /* Reinit MCEs after user configuration changes */
 static void mce_restart(void)
 {
-	if (next_interval)
-		cancel_delayed_work(&mcheck_work);
-	/* Timer race is harmless here */
-	on_each_cpu(mce_init, NULL, 1);
 	next_interval = check_interval * HZ;
-	if (next_interval)
-		schedule_delayed_work(&mcheck_work,
-				      round_jiffies_relative(next_interval));
+	on_each_cpu(mce_cpu_restart, NULL, 1);
 }
 
 static struct sysdev_class mce_sysclass = {
+	.suspend = mce_suspend,
+	.shutdown = mce_shutdown,
 	.resume = mce_resume,
 	.name = "machinecheck",
 };
@@ -778,16 +940,26 @@ void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinit
 	}								\
 	static SYSDEV_ATTR(name, 0644, show_ ## name, set_ ## name);
 
-/*
- * TBD should generate these dynamically based on number of available banks.
- * Have only 6 contol banks in /sysfs until then.
- */
-ACCESSOR(bank0ctl,bank[0],mce_restart())
-ACCESSOR(bank1ctl,bank[1],mce_restart())
-ACCESSOR(bank2ctl,bank[2],mce_restart())
-ACCESSOR(bank3ctl,bank[3],mce_restart())
-ACCESSOR(bank4ctl,bank[4],mce_restart())
-ACCESSOR(bank5ctl,bank[5],mce_restart())
+static struct sysdev_attribute *bank_attrs;
+
+static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			 char *buf)
+{
+	u64 b = bank[attr - bank_attrs];
+	return sprintf(buf, "%llx\n", b);
+}
+
+static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
+			const char *buf, size_t siz)
+{
+	char *end;
+	u64 new = simple_strtoull(buf, &end, 0);
+	if (end == buf)
+		return -EINVAL;
+	bank[attr - bank_attrs] = new;
+	mce_restart();
+	return end-buf;
+}
 
 static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
 				char *buf)
@@ -814,8 +986,6 @@ static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
 static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
-	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
-	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
 	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
@@ -845,11 +1015,22 @@ static __cpuinit int mce_create_device(unsigned int cpu)
 		if (err)
 			goto error;
 	}
+	for (i = 0; i < banks; i++) {
+		err = sysdev_create_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+		if (err)
+			goto error2;
+	}
 	cpu_set(cpu, mce_device_initialized);
 
 	return 0;
+error2:
+	while (--i >= 0) {
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+					&bank_attrs[i]);
+	}
 error:
-	while (i--) {
+	while (--i >= 0) {
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 				   mce_attributes[i]);
 	}
@@ -868,15 +1049,46 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
 	for (i = 0; mce_attributes[i]; i++)
 		sysdev_remove_file(&per_cpu(device_mce,cpu),
 			mce_attributes[i]);
+	for (i = 0; i < banks; i++)
+		sysdev_remove_file(&per_cpu(device_mce, cpu),
+			&bank_attrs[i]);
 	sysdev_unregister(&per_cpu(device_mce,cpu));
 	cpu_clear(cpu, mce_device_initialized);
 }
 
+/* Make sure there are no machine checks on offlined CPUs. */
+static void mce_disable_cpu(void *h)
+{
+	int i;
+	unsigned long action = *(unsigned long *)h;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_clear();
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, 0);
+}
+
+static void mce_reenable_cpu(void *h)
+{
+	int i;
+	unsigned long action = *(unsigned long *)h;
+
+	if (!mce_available(&current_cpu_data))
+		return;
+	if (!(action & CPU_TASKS_FROZEN))
+		cmci_reenable();
+	for (i = 0; i < banks; i++)
+		wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]);
+}
+
 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
 static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
+	struct timer_list *t = &per_cpu(mce_timer, cpu);
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -891,6 +1103,21 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 			threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		del_timer_sync(t);
+		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+		break;
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		t->expires = round_jiffies(jiffies + next_interval);
+		add_timer_on(t, cpu);
+		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
+		break;
+	case CPU_POST_DEAD:
+		/* intentionally ignoring frozen here */
+		cmci_rediscover(cpu);
+		break;
 	}
 	return NOTIFY_OK;
 }
@@ -899,6 +1126,34 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
 	.notifier_call = mce_cpu_callback,
 };
 
+static __init int mce_init_banks(void)
+{
+	int i;
+
+	bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
+				GFP_KERNEL);
+	if (!bank_attrs)
+		return -ENOMEM;
+
+	for (i = 0; i < banks; i++) {
+		struct sysdev_attribute *a = &bank_attrs[i];
+		a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i);
+		if (!a->attr.name)
+			goto nomem;
+		a->attr.mode = 0644;
+		a->show = show_bank;
+		a->store = set_bank;
+	}
+	return 0;
+
+nomem:
+	while (--i >= 0)
+		kfree(bank_attrs[i].attr.name);
+	kfree(bank_attrs);
+	bank_attrs = NULL;
+	return -ENOMEM;
+}
+
 static __init int mce_init_device(void)
 {
 	int err;
@@ -906,6 +1161,11 @@ static __init int mce_init_device(void)
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
+
+	err = mce_init_banks();
+	if (err)
+		return err;
+
 	err = sysdev_class_register(&mce_sysclass);
 	if (err)
 		return err;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 9817506dd46..c5a32f92d07 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -79,6 +79,8 @@ static unsigned char shared_bank[NR_BANKS] = {
 
 static DEFINE_PER_CPU(unsigned char, bank_map);	/* see which banks are on */
 
+static void amd_threshold_interrupt(void);
+
 /*
  * CPU Initialization
  */
@@ -174,6 +176,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			tr.reset = 0;
 			tr.old_limit = 0;
 			threshold_restart_bank(&tr);
+
+			mce_threshold_vector = amd_threshold_interrupt;
 		}
 	}
 }
@@ -187,19 +191,13 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
  * the interrupt goes off when error_count reaches threshold_limit.
  * the handler will simply log mcelog w/ software defined bank number.
  */
-asmlinkage void mce_threshold_interrupt(void)
+static void amd_threshold_interrupt(void)
 {
 	unsigned int bank, block;
 	struct mce m;
 	u32 low = 0, high = 0, address = 0;
 
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
-
-	memset(&m, 0, sizeof(m));
-	rdtscll(m.tsc);
-	m.cpu = smp_processor_id();
+	mce_setup(&m);
 
 	/* assume first bank caused it */
 	for (bank = 0; bank < NR_BANKS; ++bank) {
@@ -233,7 +231,8 @@ asmlinkage void mce_threshold_interrupt(void)
 
 			/* Log the machine check that caused the threshold
 			   event. */
-			do_machine_check(NULL, 0);
+			machine_check_poll(MCP_TIMESTAMP,
+					&__get_cpu_var(mce_poll_banks));
 
 			if (high & MASK_OVERFLOW_HI) {
 				rdmsrl(address, m.misc);
@@ -243,13 +242,10 @@ asmlinkage void mce_threshold_interrupt(void)
 				       + bank * NR_BLOCKS
 				       + block;
 				mce_log(&m);
-				goto out;
+				return;
 			}
 		}
 	}
-out:
-	inc_irq_stat(irq_threshold_count);
-	irq_exit();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index aa5e287c98e..aaa7d973093 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -1,6 +1,8 @@
 /*
  * Intel specific MCE features.
  * Copyright 2004 Zwane Mwaikambo <zwane@linuxpower.ca>
+ * Copyright (C) 2008, 2009 Intel Corporation
+ * Author: Andi Kleen
  */
 
 #include <linux/init.h>
@@ -13,6 +15,7 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
+#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
@@ -25,7 +28,7 @@ asmlinkage void smp_thermal_interrupt(void)
 
 	rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
 	if (therm_throt_process(msr_val & 1))
-		mce_log_therm_throt_event(smp_processor_id(), msr_val);
+		mce_log_therm_throt_event(msr_val);
 
 	inc_irq_stat(irq_thermal_count);
 	irq_exit();
@@ -85,7 +88,209 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
+/*
+ * Support for Intel Correct Machine Check Interrupts. This allows
+ * the CPU to raise an interrupt when a corrected machine check happened.
+ * Normally we pick those up using a regular polling timer.
+ * Also supports reliable discovery of shared banks.
+ */
+
+static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
+
+/*
+ * cmci_discover_lock protects against parallel discovery attempts
+ * which could race against each other.
+ */
+static DEFINE_SPINLOCK(cmci_discover_lock);
+
+#define CMCI_THRESHOLD 1
+
+static int cmci_supported(int *banks)
+{
+	u64 cap;
+
+	/*
+	 * Vendor check is not strictly needed, but the initial
+	 * initialization is vendor keyed and this
+	 * makes sure none of the backdoors are entered otherwise.
+	 */
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+	if (!cpu_has_apic || lapic_get_maxlvt() < 6)
+		return 0;
+	rdmsrl(MSR_IA32_MCG_CAP, cap);
+	*banks = min_t(unsigned, MAX_NR_BANKS, cap & 0xff);
+	return !!(cap & MCG_CMCI_P);
+}
+
+/*
+ * The interrupt handler. This is called on every event.
+ * Just call the poller directly to log any events.
+ * This could in theory increase the threshold under high load,
+ * but doesn't for now.
+ */
+static void intel_threshold_interrupt(void)
+{
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	mce_notify_user();
+}
+
+static void print_update(char *type, int *hdr, int num)
+{
+	if (*hdr == 0)
+		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
+	*hdr = 1;
+	printk(KERN_CONT " %s:%d", type, num);
+}
+
+/*
+ * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
+ * on this CPU. Use the algorithm recommended in the SDM to discover shared
+ * banks.
+ */
+static void cmci_discover(int banks, int boot)
+{
+	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
+	int hdr = 0;
+	int i;
+
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		u64 val;
+
+		if (test_bit(i, owned))
+			continue;
+
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Already owned by someone else? */
+		if (val & CMCI_EN) {
+			if (test_and_clear_bit(i, owned) || boot)
+				print_update("SHD", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+			continue;
+		}
+
+		val |= CMCI_EN | CMCI_THRESHOLD;
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+
+		/* Did the enable bit stick? -- the bank supports CMCI */
+		if (val & CMCI_EN) {
+			if (!test_and_set_bit(i, owned) || boot)
+				print_update("CMCI", &hdr, i);
+			__clear_bit(i, __get_cpu_var(mce_poll_banks));
+		} else {
+			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
+		}
+	}
+	spin_unlock(&cmci_discover_lock);
+	if (hdr)
+		printk(KERN_CONT "\n");
+}
+
+/*
+ * Just in case we missed an event during initialization check
+ * all the CMCI owned banks.
+ */
+void cmci_recheck(void)
+{
+	unsigned long flags;
+	int banks;
+
+	if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+		return;
+	local_irq_save(flags);
+	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+	local_irq_restore(flags);
+}
+
+/*
+ * Disable CMCI on this CPU for all banks it owns when it goes down.
+ * This allows other CPUs to claim the banks on rediscovery.
+ */
+void cmci_clear(void)
+{
+	int i;
+	int banks;
+	u64 val;
+
+	if (!cmci_supported(&banks))
+		return;
+	spin_lock(&cmci_discover_lock);
+	for (i = 0; i < banks; i++) {
+		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
+			continue;
+		/* Disable CMCI */
+		rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+		wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
+		__clear_bit(i, __get_cpu_var(mce_banks_owned));
+	}
+	spin_unlock(&cmci_discover_lock);
+}
+
+/*
+ * After a CPU went down cycle through all the others and rediscover
+ * Must run in process context.
+ */
+void cmci_rediscover(int dying)
+{
+	int banks;
+	int cpu;
+	cpumask_var_t old;
+
+	if (!cmci_supported(&banks))
+		return;
+	if (!alloc_cpumask_var(&old, GFP_KERNEL))
+		return;
+	cpumask_copy(old, &current->cpus_allowed);
+
+	for_each_online_cpu (cpu) {
+		if (cpu == dying)
+			continue;
+		if (set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)))
+			continue;
+		/* Recheck banks in case CPUs don't all have the same */
+		if (cmci_supported(&banks))
+			cmci_discover(banks, 0);
+	}
+
+	set_cpus_allowed_ptr(current, old);
+	free_cpumask_var(old);
+}
+
+/*
+ * Reenable CMCI on this CPU in case a CPU down failed.
+ */
+void cmci_reenable(void)
+{
+	int banks;
+	if (cmci_supported(&banks))
+		cmci_discover(banks, 0);
+}
+
+static __cpuinit void intel_init_cmci(void)
+{
+	int banks;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	mce_threshold_vector = intel_threshold_interrupt;
+	cmci_discover(banks, 1);
+	/*
+	 * For CPU #0 this runs with still disabled APIC, but that's
+	 * ok because only the vector is set up. We still do another
+	 * check for the banks later for CPU #0 just to make sure
+	 * to not miss any events.
+	 */
+	apic_write(APIC_LVTCMCI, THRESHOLD_APIC_VECTOR|APIC_DM_FIXED);
+	cmci_recheck();
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
+	intel_init_cmci();
 }
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
new file mode 100644
index 00000000000..23ee9e730f7
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -0,0 +1,29 @@
+/*
+ * Common corrected MCE threshold handler code:
+ */
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+
+#include <asm/irq_vectors.h>
+#include <asm/apic.h>
+#include <asm/idle.h>
+#include <asm/mce.h>
+
+static void default_threshold_interrupt(void)
+{
+	printk(KERN_ERR "Unexpected threshold interrupt at vector %x\n",
+			 THRESHOLD_APIC_VECTOR);
+}
+
+void (*mce_threshold_vector)(void) = default_threshold_interrupt;
+
+asmlinkage void mce_threshold_interrupt(void)
+{
+	exit_idle();
+	irq_enter();
+	inc_irq_stat(irq_threshold_count);
+	mce_threshold_vector();
+	irq_exit();
+	/* Ack only at the end to avoid potential reentry */
+	ack_APIC_irq();
+}
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 169a120587b..87b67e3a765 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -729,7 +729,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
 	ds_resume_pebs(tracer);
 
 	return tracer;
@@ -1029,5 +1029,4 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
 
 void ds_exit_thread(struct task_struct *tsk)
 {
-	WARN_ON(tsk->thread.ds_ctx);
 }
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index b205272ad39..1736acc4d7a 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -469,7 +469,7 @@ void __init efi_enter_virtual_mode(void)
 	efi_memory_desc_t *md;
 	efi_status_t status;
 	unsigned long size;
-	u64 end, systab, addr, npages;
+	u64 end, systab, addr, npages, end_pfn;
 	void *p, *va;
 
 	efi.systab = NULL;
@@ -481,7 +481,10 @@ void __init efi_enter_virtual_mode(void)
 		size = md->num_pages << EFI_PAGE_SHIFT;
 		end = md->phys_addr + size;
 
-		if (PFN_UP(end) <= max_low_pfn_mapped)
+		end_pfn = PFN_UP(end);
+		if (end_pfn <= max_low_pfn_mapped
+		    || (end_pfn > (1UL << (32 - PAGE_SHIFT))
+			&& end_pfn <= max_pfn_mapped))
 			va = __va(md->phys_addr);
 		else
 			va = efi_ioremap(md->phys_addr, size);
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index a4ee29127fd..22c3b7828c5 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -100,24 +100,11 @@ void __init efi_call_phys_epilog(void)
 
 void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size)
 {
-	static unsigned pages_mapped __initdata;
-	unsigned i, pages;
-	unsigned long offset;
+	unsigned long last_map_pfn;
 
-	pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-
-	if (pages_mapped + pages > MAX_EFI_IO_PAGES)
+	last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
+	if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
 		return NULL;
 
-	for (i = 0; i < pages; i++) {
-		__set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped,
-			     phys_addr, PAGE_KERNEL);
-		phys_addr += PAGE_SIZE;
-		pages_mapped++;
-	}
-
-	return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
-					     (pages_mapped - pages)) + offset;
+	return (void __iomem *)__va(phys_addr);
 }
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 83d1836b946..7ba4621c0df 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,8 @@ apicinterrupt UV_BAU_MESSAGE \
 #endif
 apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt GENERIC_INTERRUPT_VECTOR \
+	generic_interrupt smp_generic_interrupt
 
 #ifdef CONFIG_SMP
 apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b0f61f0dcd0..f2f8540a7f3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -136,7 +136,7 @@ int init_fpu(struct task_struct *tsk)
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
 		memset(tsk->thread.xstate, 0, xstate_size);
-		finit();
+		finit_task(tsk);
 		set_stopped_child_used_math(tsk);
 		return 0;
 	}
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aa..b864341dcc4 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -15,6 +15,9 @@
 
 atomic_t irq_err_count;
 
+/* Function pointer for generic interrupt vector handling */
+void (*generic_interrupt_extension)(void) = NULL;
+
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
  * each architecture has to answer this themselves.
@@ -56,6 +59,12 @@ static int show_other_interrupts(struct seq_file *p)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
 #endif
+	if (generic_interrupt_extension) {
+		seq_printf(p, "PLT: ");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ", irq_stats(j)->generic_irqs);
+		seq_printf(p, "  Platform interrupts\n");
+	}
 #ifdef CONFIG_SMP
 	seq_printf(p, "RES: ");
 	for_each_online_cpu(j)
@@ -163,6 +172,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
 #endif
+	if (generic_interrupt_extension)
+		sum += irq_stats(cpu)->generic_irqs;
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
 	sum += irq_stats(cpu)->irq_call_count;
@@ -226,4 +237,27 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	return 1;
 }
 
+/*
+ * Handler for GENERIC_INTERRUPT_VECTOR.
+ */
+void smp_generic_interrupt(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	exit_idle();
+
+	irq_enter();
+
+	inc_irq_stat(generic_irqs);
+
+	if (generic_interrupt_extension)
+		generic_interrupt_extension();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 9dc6b2b2427..3b09634a515 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
+#include <linux/percpu.h>
 
 #include <asm/apic.h>
 
@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
 union irq_ctx {
 	struct thread_info      tinfo;
 	u32                     stack[THREAD_SIZE/sizeof(u32)];
-};
+} __attribute__((aligned(PAGE_SIZE)));
 
-static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
-static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
+static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
+static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
-static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
+static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
 
 static void call_on_stack(void *func, void *stack)
 {
@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 	u32 *isp, arg1, arg2;
 
 	curctx = (union irq_ctx *) current_thread_info();
-	irqctx = hardirq_ctx[smp_processor_id()];
+	irqctx = __get_cpu_var(hardirq_ctx);
 
 	/*
 	 * this is where we switch to the IRQ stack. However, if we are
@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
 {
 	union irq_ctx *irqctx;
 
-	if (hardirq_ctx[cpu])
+	if (per_cpu(hardirq_ctx, cpu))
 		return;
 
-	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(hardirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= HARDIRQ_OFFSET;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	hardirq_ctx[cpu] = irqctx;
+	per_cpu(hardirq_ctx, cpu) = irqctx;
 
-	irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx = &per_cpu(softirq_stack, cpu);
 	irqctx->tinfo.task		= NULL;
 	irqctx->tinfo.exec_domain	= NULL;
 	irqctx->tinfo.cpu		= cpu;
 	irqctx->tinfo.preempt_count	= 0;
 	irqctx->tinfo.addr_limit	= MAKE_MM_SEG(0);
 
-	softirq_ctx[cpu] = irqctx;
+	per_cpu(softirq_ctx, cpu) = irqctx;
 
 	printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
-	       cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
+	       cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
 }
 
 void irq_ctx_exit(int cpu)
 {
-	hardirq_ctx[cpu] = NULL;
+	per_cpu(hardirq_ctx, cpu) = NULL;
 }
 
 asmlinkage void do_softirq(void)
@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
 
 	if (local_softirq_pending()) {
 		curctx = current_thread_info();
-		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx = __get_cpu_var(softirq_ctx);
 		irqctx->tinfo.task = curctx->task;
 		irqctx->tinfo.previous_esp = current_stack_pointer;
 
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 50b8c3a3006..bc132610544 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -175,6 +175,9 @@ void __init native_init_IRQ(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index da481a1e3f3..c7a49e0ffbf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -147,6 +147,9 @@ static void __init apic_intr_init(void)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
+	/* generic IPI for platform specific use */
+	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index f5fc8c781a6..e7368c1da01 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -14,12 +14,12 @@
 #include <linux/ftrace.h>
 #include <linux/suspend.h>
 #include <linux/gfp.h>
+#include <linux/io.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
@@ -63,7 +63,7 @@ static void load_segments(void)
 		"\tmovl %%eax,%%fs\n"
 		"\tmovl %%eax,%%gs\n"
 		"\tmovl %%eax,%%ss\n"
-		::: "eax", "memory");
+		: : : "eax", "memory");
 #undef STR
 #undef __STR
 }
@@ -205,7 +205,8 @@ void machine_kexec(struct kimage *image)
 
 	if (image->preserve_context) {
 #ifdef CONFIG_X86_IO_APIC
-		/* We need to put APICs in legacy mode so that we can
+		/*
+		 * We need to put APICs in legacy mode so that we can
 		 * get timer interrupts in second kernel. kexec/kdump
 		 * paths already have calls to disable_IO_APIC() in
 		 * one form or other. kexec jump path also need
@@ -227,7 +228,8 @@ void machine_kexec(struct kimage *image)
 		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
 						<< PAGE_SHIFT);
 
-	/* The segment registers are funny things, they have both a
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -237,11 +239,12 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
 	image->start = relocate_kernel_ptr((unsigned long)image->head,
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 6993d51b7fd..89cea4d4467 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -12,11 +12,47 @@
 #include <linux/reboot.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
+#include <linux/io.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/io.h>
+
+static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
+				unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	struct page *page;
+	int result = -ENOMEM;
+
+	addr &= PMD_MASK;
+	pgd += pgd_index(addr);
+	if (!pgd_present(*pgd)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pud = (pud_t *)page_address(page);
+		memset(pud, 0, PAGE_SIZE);
+		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+	}
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud)) {
+		page = kimage_alloc_control_pages(image, 0);
+		if (!page)
+			goto out;
+		pmd = (pmd_t *)page_address(page);
+		memset(pmd, 0, PAGE_SIZE);
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+	}
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
+	result = 0;
+out:
+	return result;
+}
 
 static void init_level2_page(pmd_t *level2p, unsigned long addr)
 {
@@ -83,9 +119,8 @@ static int init_level4_page(struct kimage *image, pgd_t *level4p,
 		}
 		level3p = (pud_t *)page_address(page);
 		result = init_level3_page(image, level3p, addr, last_addr);
-		if (result) {
+		if (result)
 			goto out;
-		}
 		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
 		addr += PGDIR_SIZE;
 	}
@@ -156,6 +191,13 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 	result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
 	if (result)
 		return result;
+	/*
+	 * image->start may be outside 0 ~ max_pfn, for example when
+	 * jump back to original kernel from kexeced kernel
+	 */
+	result = init_one_level2_page(image, level4p, image->start);
+	if (result)
+		return result;
 	return init_transition_pgtable(image, level4p);
 }
 
@@ -229,20 +271,45 @@ void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	int save_ftrace_enabled;
 
-	tracer_disable();
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		save_processor_state();
+#endif
+
+	save_ftrace_enabled = __ftrace_enabled_save();
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/*
+		 * We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page) + PAGE_SIZE;
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
 
 	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_TABLE_PAGE] =
 	  (unsigned long)__pa(page_address(image->control_code_page));
 
-	/* The segment registers are funny things, they have both a
+	if (image->type == KEXEC_TYPE_DEFAULT)
+		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
+						<< PAGE_SHIFT);
+
+	/*
+	 * The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
 	 * set to a specific selector, the invisible part is loaded
 	 * with from a table in memory.  At no other time is the
@@ -252,15 +319,25 @@ void machine_kexec(struct kimage *image)
 	 * segments, before I zap the gdt with an invalid value.
 	 */
 	load_segments();
-	/* The gdt & idt are now invalid.
+	/*
+	 * The gdt & idt are now invalid.
 	 * If you want to load them you must set up your own idt & gdt.
 	 */
-	set_gdt(phys_to_virt(0),0);
-	set_idt(phys_to_virt(0),0);
+	set_gdt(phys_to_virt(0), 0);
+	set_idt(phys_to_virt(0), 0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start);
+	image->start = relocate_kernel((unsigned long)image->head,
+				       (unsigned long)page_list,
+				       image->start,
+				       image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		restore_processor_state();
+#endif
+
+	__ftrace_enabled_restore(save_ftrace_enabled);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 37cb1bda1ba..e8192401da4 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -558,6 +558,19 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 
 static struct mpf_intel *mpf_found;
 
+static unsigned long __init get_mpc_size(unsigned long physptr)
+{
+	struct mpc_table *mpc;
+	unsigned long size;
+
+	mpc = early_ioremap(physptr, PAGE_SIZE);
+	size = mpc->length;
+	early_iounmap(mpc, PAGE_SIZE);
+	apic_printk(APIC_VERBOSE, "  mpc: %lx-%lx\n", physptr, physptr + size);
+
+	return size;
+}
+
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
@@ -611,12 +624,16 @@ static void __init __get_smp_config(unsigned int early)
 		construct_default_ISA_mptable(mpf->feature1);
 
 	} else if (mpf->physptr) {
+		struct mpc_table *mpc;
+		unsigned long size;
 
+		size = get_mpc_size(mpf->physptr);
+		mpc = early_ioremap(mpf->physptr, size);
 		/*
 		 * Read the physical hardware table.  Anything here will
 		 * override the defaults.
 		 */
-		if (!smp_read_mpc(phys_to_virt(mpf->physptr), early)) {
+		if (!smp_read_mpc(mpc, early)) {
 #ifdef CONFIG_X86_LOCAL_APIC
 			smp_found_config = 0;
 #endif
@@ -624,8 +641,10 @@ static void __init __get_smp_config(unsigned int early)
 			       "BIOS bug, MP table errors detected!...\n");
 			printk(KERN_ERR "... disabling SMP support. "
 			       "(tell your hw vendor)\n");
+			early_iounmap(mpc, size);
 			return;
 		}
+		early_iounmap(mpc, size);
 
 		if (early)
 			return;
@@ -697,10 +716,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 
 			if (!reserve)
 				return 1;
-			reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE,
+			reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
 					BOOTMEM_DEFAULT);
 			if (mpf->physptr) {
-				unsigned long size = PAGE_SIZE;
+				unsigned long size = get_mpc_size(mpf->physptr);
 #ifdef CONFIG_X86_32
 				/*
 				 * We cannot access to MPC table to compute
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 1cc18d439bb..2aef36d8aca 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -216,6 +216,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"),
 		},
 	},
+	{	/* Handle problems with rebooting on Dell XPS710 */
+		.callback = set_bios_reboot,
+		.ident = "Dell XPS710",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"),
+		},
+	},
 	{ }
 };
 
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 2064d0aa8d2..41235531b11 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -17,7 +17,8 @@
 
 #define PTR(x) (x << 2)
 
-/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
  * ~ control_page + PAGE_SIZE are used as data storage and stack for
  * jumping back
  */
@@ -76,8 +77,10 @@ relocate_kernel:
 	movl	%eax, CP_PA_SWAP_PAGE(%edi)
 	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
 
 	/* switch to new set of page tables */
@@ -97,7 +100,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushl   %edx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging disabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -113,7 +117,8 @@ identity_mapped:
 	/* clear cr4 if applicable */
 	testl	%ecx, %ecx
 	jz	1f
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 * Setting everything to zero seems safe.
 	 */
 	xorl	%eax, %eax
@@ -132,15 +137,18 @@ identity_mapped:
 	call	swap_pages
 	addl	$8, %esp
 
-	/* To be certain of avoiding problems with self-modifying code
+	/*
+	 * To be certain of avoiding problems with self-modifying code
 	 * I need to execute a serializing instruction here.
 	 * So I flush the TLB, it's handy, and not processor dependent.
 	 */
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	/*
+	 * set all of the registers to known values
+	 * leave %esp alone
+	 */
 
 	testl	%esi, %esi
 	jnz 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index d32cfb27a47..4de8f5b3d47 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -19,29 +19,77 @@
 #define PTR(x) (x << 3)
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
+/*
+ * control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define RSP			DATA(0x0)
+#define CR0			DATA(0x8)
+#define CR3			DATA(0x10)
+#define CR4			DATA(0x18)
+
+/* other data */
+#define CP_PA_TABLE_PAGE	DATA(0x20)
+#define CP_PA_SWAP_PAGE		DATA(0x28)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x30)
+
 	.text
 	.align PAGE_SIZE
 	.code64
 	.globl relocate_kernel
 relocate_kernel:
-	/* %rdi indirection_page
+	/*
+	 * %rdi indirection_page
 	 * %rsi page_list
 	 * %rdx start address
+	 * %rcx preserve_context
 	 */
 
+	/* Save the CPU context, used for jumping back */
+	pushq %rbx
+	pushq %rbp
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	pushq %r15
+	pushf
+
+	movq	PTR(VA_CONTROL_PAGE)(%rsi), %r11
+	movq	%rsp, RSP(%r11)
+	movq	%cr0, %rax
+	movq	%rax, CR0(%r11)
+	movq	%cr3, %rax
+	movq	%rax, CR3(%r11)
+	movq	%cr4, %rax
+	movq	%rax, CR4(%r11)
+
 	/* zero out flags, and disable interrupts */
 	pushq $0
 	popfq
 
-	/* get physical address of control page now */
-	/* this is impossible after page table switch */
+	/*
+	 * get physical address of control page now
+	 * this is impossible after page table switch
+	 */
 	movq	PTR(PA_CONTROL_PAGE)(%rsi), %r8
 
 	/* get physical address of page table now too */
-	movq	PTR(PA_TABLE_PAGE)(%rsi), %rcx
+	movq	PTR(PA_TABLE_PAGE)(%rsi), %r9
+
+	/* get physical address of swap page now */
+	movq	PTR(PA_SWAP_PAGE)(%rsi), %r10
+
+	/* save some information for jumping back */
+	movq	%r9, CP_PA_TABLE_PAGE(%r11)
+	movq	%r10, CP_PA_SWAP_PAGE(%r11)
+	movq	%rdi, CP_PA_BACKUP_PAGES_MAP(%r11)
 
 	/* Switch to the identity mapped page tables */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
 
 	/* setup a new stack at the end of the physical control page */
 	lea	PAGE_SIZE(%r8), %rsp
@@ -55,7 +103,8 @@ identity_mapped:
 	/* store the start address on the stack */
 	pushq   %rdx
 
-	/* Set cr0 to a known state:
+	/*
+	 * Set cr0 to a known state:
 	 *  - Paging enabled
 	 *  - Alignment check disabled
 	 *  - Write protect disabled
@@ -68,7 +117,8 @@ identity_mapped:
 	orl	$(X86_CR0_PG | X86_CR0_PE), %eax
 	movq	%rax, %cr0
 
-	/* Set cr4 to a known state:
+	/*
+	 * Set cr4 to a known state:
 	 *  - physical address extension enabled
 	 */
 	movq	$X86_CR4_PAE, %rax
@@ -78,9 +128,87 @@ identity_mapped:
 1:
 
 	/* Flush the TLB (needed?) */
-	movq	%rcx, %cr3
+	movq	%r9, %cr3
+
+	movq	%rcx, %r11
+	call	swap_pages
+
+	/*
+	 * To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB by reloading %cr3 here, it's handy,
+	 * and not processor dependent.
+	 */
+	movq	%cr3, %rax
+	movq	%rax, %cr3
+
+	/*
+	 * set all of the registers to known values
+	 * leave %rsp alone
+	 */
+
+	testq	%r11, %r11
+	jnz 1f
+	xorq	%rax, %rax
+	xorq	%rbx, %rbx
+	xorq    %rcx, %rcx
+	xorq    %rdx, %rdx
+	xorq    %rsi, %rsi
+	xorq    %rdi, %rdi
+	xorq    %rbp, %rbp
+	xorq	%r8,  %r8
+	xorq	%r9,  %r9
+	xorq	%r10, %r9
+	xorq	%r11, %r11
+	xorq	%r12, %r12
+	xorq	%r13, %r13
+	xorq	%r14, %r14
+	xorq	%r15, %r15
+
+	ret
+
+1:
+	popq	%rdx
+	leaq	PAGE_SIZE(%r10), %rsp
+	call	*%rdx
+
+	/* get the re-entry point of the peer system */
+	movq	0(%rsp), %rbp
+	call	1f
+1:
+	popq	%r8
+	subq	$(1b - relocate_kernel), %r8
+	movq	CP_PA_SWAP_PAGE(%r8), %r10
+	movq	CP_PA_BACKUP_PAGES_MAP(%r8), %rdi
+	movq	CP_PA_TABLE_PAGE(%r8), %rax
+	movq	%rax, %cr3
+	lea	PAGE_SIZE(%r8), %rsp
+	call	swap_pages
+	movq	$virtual_mapped, %rax
+	pushq	%rax
+	ret
+
+virtual_mapped:
+	movq	RSP(%r8), %rsp
+	movq	CR4(%r8), %rax
+	movq	%rax, %cr4
+	movq	CR3(%r8), %rax
+	movq	CR0(%r8), %r8
+	movq	%rax, %cr3
+	movq	%r8, %cr0
+	movq	%rbp, %rax
+
+	popf
+	popq	%r15
+	popq	%r14
+	popq	%r13
+	popq	%r12
+	popq	%rbp
+	popq	%rbx
+	ret
 
 	/* Do the copies */
+swap_pages:
 	movq	%rdi, %rcx 	/* Put the page_list in %rcx */
 	xorq	%rdi, %rdi
 	xorq	%rsi, %rsi
@@ -112,36 +240,27 @@ identity_mapped:
 	movq	%rcx,   %rsi  /* For ever source page do a copy */
 	andq	$0xfffffffffffff000, %rsi
 
+	movq	%rdi, %rdx
+	movq	%rsi, %rax
+
+	movq	%r10, %rdi
 	movq	$512,   %rcx
 	rep ; movsq
-	jmp	0b
-3:
-
-	/* To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB by reloading %cr3 here, it's handy,
-	 * and not processor dependent.
-	 */
-	movq	%cr3, %rax
-	movq	%rax, %cr3
 
-	/* set all of the registers to known values */
-	/* leave %rsp alone */
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
-	xorq	%rax, %rax
-	xorq	%rbx, %rbx
-	xorq    %rcx, %rcx
-	xorq    %rdx, %rdx
-	xorq    %rsi, %rsi
-	xorq    %rdi, %rdi
-	xorq    %rbp, %rbp
-	xorq	%r8,  %r8
-	xorq	%r9,  %r9
-	xorq	%r10, %r9
-	xorq	%r11, %r11
-	xorq	%r12, %r12
-	xorq	%r13, %r13
-	xorq	%r14, %r14
-	xorq	%r15, %r15
+	movq	%rdx, %rdi
+	movq	%r10, %rsi
+	movq	$512,   %rcx
+	rep ; movsq
 
+	lea	PAGE_SIZE(%rax), %rsi
+	jmp	0b
+3:
 	ret
+
+	.globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4c54bc0d8ff..f28c56e6bf9 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -202,7 +202,9 @@ struct ist_info ist_info;
 #endif
 
 #else
-struct cpuinfo_x86 boot_cpu_data __read_mostly;
+struct cpuinfo_x86 boot_cpu_data __read_mostly = {
+	.x86_phys_bits = MAX_PHYSMEM_BITS,
+};
 EXPORT_SYMBOL(boot_cpu_data);
 #endif
 
@@ -770,6 +772,9 @@ void __init setup_arch(char **cmdline_p)
 
 	finish_e820_parsing();
 
+	if (efi_enabled)
+		efi_init();
+
 	dmi_scan_machine();
 
 	dmi_check_system(bad_bios_dmi_table);
@@ -789,8 +794,6 @@ void __init setup_arch(char **cmdline_p)
 	insert_resource(&iomem_resource, &data_resource);
 	insert_resource(&iomem_resource, &bss_resource);
 
-	if (efi_enabled)
-		efi_init();
 
 #ifdef CONFIG_X86_32
 	if (ppro_with_ram_bug()) {
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d992e6cff73..efa615f2bf4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -7,6 +7,7 @@
 #include <linux/crash_dump.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/pfn.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
 #include <asm/setup.h>
@@ -41,6 +42,352 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
 };
 EXPORT_SYMBOL(__per_cpu_offset);
 
+/*
+ * On x86_64 symbols referenced from code should be reachable using
+ * 32bit relocations.  Reserve space for static percpu variables in
+ * modules so that they are always served from the first chunk which
+ * is located at the percpu segment base.  On x86_32, anything can
+ * address anywhere.  No need to reserve space in the first chunk.
+ */
+#ifdef CONFIG_X86_64
+#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
+#else
+#define PERCPU_FIRST_CHUNK_RESERVE	0
+#endif
+
+/**
+ * pcpu_need_numa - determine percpu allocation needs to consider NUMA
+ *
+ * If NUMA is not configured or there is only one NUMA node available,
+ * there is no reason to consider NUMA.  This function determines
+ * whether percpu allocation should consider NUMA or not.
+ *
+ * RETURNS:
+ * true if NUMA should be considered; otherwise, false.
+ */
+static bool __init pcpu_need_numa(void)
+{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	pg_data_t *last = NULL;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		int node = early_cpu_to_node(cpu);
+
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			return true;
+
+		last = NODE_DATA(node);
+	}
+#endif
+	return false;
+}
+
+/**
+ * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
+ * @cpu: cpu to allocate for
+ * @size: size allocation in bytes
+ * @align: alignment
+ *
+ * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
+ * does the right thing for NUMA regardless of the current
+ * configuration.
+ *
+ * RETURNS:
+ * Pointer to the allocated area on success, NULL on failure.
+ */
+static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
+					unsigned long align)
+{
+	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	int node = early_cpu_to_node(cpu);
+	void *ptr;
+
+	if (!node_online(node) || !NODE_DATA(node)) {
+		ptr = __alloc_bootmem_nopanic(size, align, goal);
+		pr_info("cpu %d has no node %d or node-local memory\n",
+			cpu, node);
+		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
+			 cpu, size, __pa(ptr));
+	} else {
+		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
+						   size, align, goal);
+		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
+			 "%016lx\n", cpu, size, node, __pa(ptr));
+	}
+	return ptr;
+#else
+	return __alloc_bootmem_nopanic(size, align, goal);
+#endif
+}
+
+/*
+ * Remap allocator
+ *
+ * This allocator uses PMD page as unit.  A PMD page is allocated for
+ * each cpu and each is remapped into vmalloc area using PMD mapping.
+ * As PMD page is quite large, only part of it is used for the first
+ * chunk.  Unused part is returned to the bootmem allocator.
+ *
+ * So, the PMD pages are mapped twice - once to the physical mapping
+ * and to the vmalloc area for the first percpu chunk.  The double
+ * mapping does add one more PMD TLB entry pressure but still is much
+ * better than only using 4k mappings while still being NUMA friendly.
+ */
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static size_t pcpur_size __initdata;
+static void **pcpur_ptrs __initdata;
+
+static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpur_size)
+		return NULL;
+
+	return virt_to_page(pcpur_ptrs[cpu] + off);
+}
+
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	static struct vm_struct vm;
+	pg_data_t *last;
+	size_t ptrs_size, dyn_size;
+	unsigned int cpu;
+	ssize_t ret;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, on non-NUMA, embedding is better.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	last = NULL;
+	for_each_possible_cpu(cpu) {
+		int node = early_cpu_to_node(cpu);
+
+		if (node_online(node) && NODE_DATA(node) &&
+		    last && last != NODE_DATA(node))
+			goto proceed;
+
+		last = NODE_DATA(node);
+	}
+	return -EINVAL;
+
+proceed:
+	/*
+	 * Currently supports only single page.  Supporting multiple
+	 * pages won't be too difficult if it ever becomes necessary.
+	 */
+	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
+	if (pcpur_size > PMD_SIZE) {
+		pr_warning("PERCPU: static data is larger than large page, "
+			   "can't use large page\n");
+		return -EINVAL;
+	}
+	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
+
+	/* allocate pointer array and alloc large pages */
+	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
+	pcpur_ptrs = alloc_bootmem(ptrs_size);
+
+	for_each_possible_cpu(cpu) {
+		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
+		if (!pcpur_ptrs[cpu])
+			goto enomem;
+
+		/*
+		 * Only use pcpur_size bytes and give back the rest.
+		 *
+		 * Ingo: The 2MB up-rounding bootmem is needed to make
+		 * sure the partial 2MB page is still fully RAM - it's
+		 * not well-specified to have a PAT-incompatible area
+		 * (unmapped RAM, device memory, etc.) in that hole.
+		 */
+		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
+			     PMD_SIZE - pcpur_size);
+
+		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
+	}
+
+	/* allocate address and map */
+	vm.flags = VM_ALLOC;
+	vm.size = num_possible_cpus() * PMD_SIZE;
+	vm_area_register_early(&vm, PMD_SIZE);
+
+	for_each_possible_cpu(cpu) {
+		pmd_t *pmd;
+
+		pmd = populate_extra_pmd((unsigned long)vm.addr
+					 + cpu * PMD_SIZE);
+		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
+				     PAGE_KERNEL_LARGE));
+	}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Remapped at %p with large pages, static data "
+		"%zu bytes\n", vm.addr, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE,
+				     PMD_SIZE, dyn_size, vm.addr, NULL);
+	goto out_free_ar;
+
+enomem:
+	for_each_possible_cpu(cpu)
+		if (pcpur_ptrs[cpu])
+			free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
+	return ret;
+}
+#else
+static ssize_t __init setup_pcpu_remap(size_t static_size)
+{
+	return -EINVAL;
+}
+#endif
+
+/*
+ * Embedding allocator
+ *
+ * The first chunk is sized to just contain the static area plus
+ * module and dynamic reserves, and allocated as a contiguous area
+ * using bootmem allocator and used as-is without being mapped into
+ * vmalloc area.  This enables the first chunk to piggy back on the
+ * linear physical PMD mapping and doesn't add any additional pressure
+ * to TLB.  Note that if the needed size is smaller than the minimum
+ * unit size, the leftover is returned to the bootmem allocator.
+ */
+static void *pcpue_ptr __initdata;
+static size_t pcpue_size __initdata;
+static size_t pcpue_unit_size __initdata;
+
+static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
+{
+	size_t off = (size_t)pageno << PAGE_SHIFT;
+
+	if (off >= pcpue_size)
+		return NULL;
+
+	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
+}
+
+static ssize_t __init setup_pcpu_embed(size_t static_size)
+{
+	unsigned int cpu;
+	size_t dyn_size;
+
+	/*
+	 * If large page isn't supported, there's no benefit in doing
+	 * this.  Also, embedding allocation doesn't play well with
+	 * NUMA.
+	 */
+	if (!cpu_has_pse || pcpu_need_numa())
+		return -EINVAL;
+
+	/* allocate and copy */
+	pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
+			       PERCPU_DYNAMIC_RESERVE);
+	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
+	dyn_size = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
+
+	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
+				       PAGE_SIZE);
+	if (!pcpue_ptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
+
+		free_bootmem(__pa(ptr + pcpue_size),
+			     pcpue_unit_size - pcpue_size);
+		memcpy(ptr, __per_cpu_load, static_size);
+	}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
+		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
+
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+				      PERCPU_FIRST_CHUNK_RESERVE,
+				      pcpue_unit_size, dyn_size,
+				      pcpue_ptr, NULL);
+}
+
+/*
+ * 4k page allocator
+ *
+ * This is the basic allocator.  Static percpu area is allocated
+ * page-by-page and most of initialization is done by the generic
+ * setup function.
+ */
+static struct page **pcpu4k_pages __initdata;
+static int pcpu4k_nr_static_pages __initdata;
+
+static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
+{
+	if (pageno < pcpu4k_nr_static_pages)
+		return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
+	return NULL;
+}
+
+static void __init pcpu4k_populate_pte(unsigned long addr)
+{
+	populate_extra_pte(addr);
+}
+
+static ssize_t __init setup_pcpu_4k(size_t static_size)
+{
+	size_t pages_size;
+	unsigned int cpu;
+	int i, j;
+	ssize_t ret;
+
+	pcpu4k_nr_static_pages = PFN_UP(static_size);
+
+	/* unaligned allocations can't be freed, round up to page size */
+	pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
+			       * sizeof(pcpu4k_pages[0]));
+	pcpu4k_pages = alloc_bootmem(pages_size);
+
+	/* allocate and copy */
+	j = 0;
+	for_each_possible_cpu(cpu)
+		for (i = 0; i < pcpu4k_nr_static_pages; i++) {
+			void *ptr;
+
+			ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
+			if (!ptr)
+				goto enomem;
+
+			memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
+			pcpu4k_pages[j++] = virt_to_page(ptr);
+		}
+
+	/* we're ready, commit */
+	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
+		pcpu4k_nr_static_pages, static_size);
+
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
+				     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
+				     pcpu4k_populate_pte);
+	goto out_free_ar;
+
+enomem:
+	while (--j >= 0)
+		free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
+	ret = -ENOMEM;
+out_free_ar:
+	free_bootmem(__pa(pcpu4k_pages), pages_size);
+	return ret;
+}
+
 static inline void setup_percpu_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
@@ -61,38 +408,35 @@ static inline void setup_percpu_segment(int cpu)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size;
-	char *ptr;
-	int cpu;
-
-	/* Copy section for each CPU (we discard the original) */
-	size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
+	size_t static_size = __per_cpu_end - __per_cpu_start;
+	unsigned int cpu;
+	unsigned long delta;
+	size_t pcpu_unit_size;
+	ssize_t ret;
 
 	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
 		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
 
-	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
+	/*
+	 * Allocate percpu area.  If PSE is supported, try to make use
+	 * of large page mappings.  Please read comments on top of
+	 * each allocator for details.
+	 */
+	ret = setup_pcpu_remap(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_embed(static_size);
+	if (ret < 0)
+		ret = setup_pcpu_4k(static_size);
+	if (ret < 0)
+		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
+		      static_size, ret);
 
-	for_each_possible_cpu(cpu) {
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
-#else
-		int node = early_cpu_to_node(cpu);
-		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
-			pr_info("cpu %d has no node %d or node-local memory\n",
-				cpu, node);
-			pr_debug("per cpu data for cpu%d at %016lx\n",
-				 cpu, __pa(ptr));
-		} else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
-				cpu, node, __pa(ptr));
-		}
-#endif
+	pcpu_unit_size = ret;
 
-		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+	/* alrighty, percpu areas up and running */
+	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
+	for_each_possible_cpu(cpu) {
+		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
@@ -125,8 +469,6 @@ void __init setup_per_cpu_areas(void)
 		 */
 		if (cpu == boot_cpu_id)
 			switch_to_new_gdt(cpu);
-
-		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
 
 	/* indicate the early static arrays will soon be gone */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 249334f5080..ef7d10170c3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -114,10 +114,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 atomic_t init_deasserted;
 
-
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
-
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
 /* which logical CPUs are on which nodes */
@@ -271,8 +267,6 @@ static void __cpuinit smp_callin(void)
 	cpumask_set_cpu(cpuid, cpu_callin_mask);
 }
 
-static int __cpuinitdata unsafe_smp;
-
 /*
  * Activate a secondary processor.
  */
@@ -340,76 +334,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 	cpu_idle();
 }
 
-static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
-{
-	/*
-	 * Mask B, Pentium, but not Pentium MMX
-	 */
-	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    c->x86 == 5 &&
-	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
-	    c->x86_model <= 3)
-		/*
-		 * Remember we have B step Pentia with bugs
-		 */
-		smp_b_stepping = 1;
-
-	/*
-	 * Certain Athlons might work (for various values of 'work') in SMP
-	 * but they are not certified as MP capable.
-	 */
-	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
-		if (num_possible_cpus() == 1)
-			goto valid_k7;
-
-		/* Athlon 660/661 is valid. */
-		if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
-		    (c->x86_mask == 1)))
-			goto valid_k7;
-
-		/* Duron 670 is valid */
-		if ((c->x86_model == 7) && (c->x86_mask == 0))
-			goto valid_k7;
-
-		/*
-		 * Athlon 662, Duron 671, and Athlon >model 7 have capability
-		 * bit. It's worth noting that the A5 stepping (662) of some
-		 * Athlon XP's have the MP bit set.
-		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
-		 * more.
-		 */
-		if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
-		    ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
-		     (c->x86_model > 7))
-			if (cpu_has_mp)
-				goto valid_k7;
-
-		/* If we get here, not a certified SMP capable AMD system. */
-		unsafe_smp = 1;
-	}
-
-valid_k7:
-	;
-}
-
-static void __cpuinit smp_checks(void)
-{
-	if (smp_b_stepping)
-		printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
-				    "with B stepping processors.\n");
-
-	/*
-	 * Don't taint if we are running SMP kernel on a single non-MP
-	 * approved Athlon
-	 */
-	if (unsafe_smp && num_online_cpus() > 1) {
-		printk(KERN_INFO "WARNING: This combination of AMD"
-			"processors is not suitable for SMP.\n");
-		add_taint(TAINT_UNSAFE_SMP);
-	}
-}
-
 /*
  * The bootstrap kernel entry code has set these up. Save them for
  * a given CPU
@@ -423,7 +347,6 @@ void __cpuinit smp_store_cpu_info(int id)
 	c->cpu_index = id;
 	if (id != 0)
 		identify_secondary_cpu(c);
-	smp_apply_quirks(c);
 }
 
 
@@ -1193,7 +1116,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	pr_debug("Boot done.\n");
 
 	impress_friends();
-	smp_checks();
 #ifdef CONFIG_X86_IO_APIC
 	setup_ioapic_dest();
 #endif
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f04549afcfe..d038b9c45cf 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -314,8 +314,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	int locals = 0;
 	struct bau_desc *bau_desc;
 
-	WARN_ON(!in_atomic());
-
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
 	uv_cpu = uv_blade_processor_id();
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
new file mode 100644
index 00000000000..2ffb6c53326
--- /dev/null
+++ b/arch/x86/kernel/uv_time.c
@@ -0,0 +1,393 @@
+/*
+ * SGI RTC clock/timer routines.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2009 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Dimitri Sivanich
+ */
+#include <linux/clockchips.h>
+
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv.h>
+#include <asm/apic.h>
+#include <asm/cpu.h>
+
+#define RTC_NAME		"sgi_rtc"
+
+static cycle_t uv_read_rtc(void);
+static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
+static void uv_rtc_timer_setup(enum clock_event_mode,
+				struct clock_event_device *);
+
+static struct clocksource clocksource_uv = {
+	.name		= RTC_NAME,
+	.rating		= 400,
+	.read		= uv_read_rtc,
+	.mask		= (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
+	.shift		= 10,
+	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static struct clock_event_device clock_event_device_uv = {
+	.name		= RTC_NAME,
+	.features	= CLOCK_EVT_FEAT_ONESHOT,
+	.shift		= 20,
+	.rating		= 400,
+	.irq		= -1,
+	.set_next_event	= uv_rtc_next_event,
+	.set_mode	= uv_rtc_timer_setup,
+	.event_handler	= NULL,
+};
+
+static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
+
+/* There is one of these allocated per node */
+struct uv_rtc_timer_head {
+	spinlock_t	lock;
+	/* next cpu waiting for timer, local node relative: */
+	int		next_cpu;
+	/* number of cpus on this node: */
+	int		ncpus;
+	struct {
+		int	lcpu;		/* systemwide logical cpu number */
+		u64	expires;	/* next timer expiration for this cpu */
+	} cpu[1];
+};
+
+/*
+ * Access to uv_rtc_timer_head via blade id.
+ */
+static struct uv_rtc_timer_head		**blade_info __read_mostly;
+
+static int				uv_rtc_enable;
+
+/*
+ * Hardware interface routines
+ */
+
+/* Send IPIs to another node */
+static void uv_rtc_send_IPI(int cpu)
+{
+	unsigned long apicid, val;
+	int pnode;
+
+	apicid = cpu_physical_id(cpu);
+	pnode = uv_apicid_to_pnode(apicid);
+	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+	      (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+	      (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
+
+	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
+}
+
+/* Check for an RTC interrupt pending */
+static int uv_intr_pending(int pnode)
+{
+	return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
+		UVH_EVENT_OCCURRED0_RTC1_MASK;
+}
+
+/* Setup interrupt and return non-zero if early expiration occurred. */
+static int uv_setup_intr(int cpu, u64 expires)
+{
+	u64 val;
+	int pnode = uv_cpu_to_pnode(cpu);
+
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+		UVH_RTC1_INT_CONFIG_M_MASK);
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
+
+	uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
+		UVH_EVENT_OCCURRED0_RTC1_MASK);
+
+	val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
+		((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+
+	/* Set configuration */
+	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
+	/* Initialize comparator value */
+	uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
+
+	return (expires < uv_read_rtc() && !uv_intr_pending(pnode));
+}
+
+/*
+ * Per-cpu timer tracking routines
+ */
+
+static __init void uv_rtc_deallocate_timers(void)
+{
+	int bid;
+
+	for_each_possible_blade(bid) {
+		kfree(blade_info[bid]);
+	}
+	kfree(blade_info);
+}
+
+/* Allocate per-node list of cpu timer expiration times. */
+static __init int uv_rtc_allocate_timers(void)
+{
+	int cpu;
+
+	blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
+	if (!blade_info)
+		return -ENOMEM;
+	memset(blade_info, 0, uv_possible_blades * sizeof(void *));
+
+	for_each_present_cpu(cpu) {
+		int nid = cpu_to_node(cpu);
+		int bid = uv_cpu_to_blade_id(cpu);
+		int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		struct uv_rtc_timer_head *head = blade_info[bid];
+
+		if (!head) {
+			head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
+				(uv_blade_nr_possible_cpus(bid) *
+					2 * sizeof(u64)),
+				GFP_KERNEL, nid);
+			if (!head) {
+				uv_rtc_deallocate_timers();
+				return -ENOMEM;
+			}
+			spin_lock_init(&head->lock);
+			head->ncpus = uv_blade_nr_possible_cpus(bid);
+			head->next_cpu = -1;
+			blade_info[bid] = head;
+		}
+
+		head->cpu[bcpu].lcpu = cpu;
+		head->cpu[bcpu].expires = ULLONG_MAX;
+	}
+
+	return 0;
+}
+
+/* Find and set the next expiring timer.  */
+static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
+{
+	u64 lowest = ULLONG_MAX;
+	int c, bcpu = -1;
+
+	head->next_cpu = -1;
+	for (c = 0; c < head->ncpus; c++) {
+		u64 exp = head->cpu[c].expires;
+		if (exp < lowest) {
+			bcpu = c;
+			lowest = exp;
+		}
+	}
+	if (bcpu >= 0) {
+		head->next_cpu = bcpu;
+		c = head->cpu[bcpu].lcpu;
+		if (uv_setup_intr(c, lowest))
+			/* If we didn't set it up in time, trigger */
+			uv_rtc_send_IPI(c);
+	} else {
+		uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
+			UVH_RTC1_INT_CONFIG_M_MASK);
+	}
+}
+
+/*
+ * Set expiration time for current cpu.
+ *
+ * Returns 1 if we missed the expiration time.
+ */
+static int uv_rtc_set_timer(int cpu, u64 expires)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int next_cpu;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	next_cpu = head->next_cpu;
+	*t = expires;
+	/* Will this one be next to go off? */
+	if (next_cpu < 0 || bcpu == next_cpu ||
+			expires < head->cpu[next_cpu].expires) {
+		head->next_cpu = bcpu;
+		if (uv_setup_intr(cpu, expires)) {
+			*t = ULLONG_MAX;
+			uv_rtc_find_next_timer(head, pnode);
+			spin_unlock_irqrestore(&head->lock, flags);
+			return 1;
+		}
+	}
+
+	spin_unlock_irqrestore(&head->lock, flags);
+	return 0;
+}
+
+/*
+ * Unset expiration time for current cpu.
+ *
+ * Returns 1 if this timer was pending.
+ */
+static int uv_rtc_unset_timer(int cpu)
+{
+	int pnode = uv_cpu_to_pnode(cpu);
+	int bid = uv_cpu_to_blade_id(cpu);
+	struct uv_rtc_timer_head *head = blade_info[bid];
+	int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+	u64 *t = &head->cpu[bcpu].expires;
+	unsigned long flags;
+	int rc = 0;
+
+	spin_lock_irqsave(&head->lock, flags);
+
+	if (head->next_cpu == bcpu && uv_read_rtc() >= *t)
+		rc = 1;
+
+	*t = ULLONG_MAX;
+
+	/* Was the hardware setup for this timer? */
+	if (head->next_cpu == bcpu)
+		uv_rtc_find_next_timer(head, pnode);
+
+	spin_unlock_irqrestore(&head->lock, flags);
+
+	return rc;
+}
+
+
+/*
+ * Kernel interface routines.
+ */
+
+/*
+ * Read the RTC.
+ */
+static cycle_t uv_read_rtc(void)
+{
+	return (cycle_t)uv_read_local_mmr(UVH_RTC);
+}
+
+/*
+ * Program the next event, relative to now
+ */
+static int uv_rtc_next_event(unsigned long delta,
+			     struct clock_event_device *ced)
+{
+	int ced_cpu = cpumask_first(ced->cpumask);
+
+	return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc());
+}
+
+/*
+ * Setup the RTC timer in oneshot mode
+ */
+static void uv_rtc_timer_setup(enum clock_event_mode mode,
+			       struct clock_event_device *evt)
+{
+	int ced_cpu = cpumask_first(evt->cpumask);
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+	case CLOCK_EVT_MODE_RESUME:
+		/* Nothing to do here yet */
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+	case CLOCK_EVT_MODE_SHUTDOWN:
+		uv_rtc_unset_timer(ced_cpu);
+		break;
+	}
+}
+
+static void uv_rtc_interrupt(void)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+	int cpu = smp_processor_id();
+
+	if (!ced || !ced->event_handler)
+		return;
+
+	if (uv_rtc_unset_timer(cpu) != 1)
+		return;
+
+	ced->event_handler(ced);
+}
+
+static int __init uv_enable_rtc(char *str)
+{
+	uv_rtc_enable = 1;
+
+	return 1;
+}
+__setup("uvrtc", uv_enable_rtc);
+
+static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
+{
+	struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
+
+	*ced = clock_event_device_uv;
+	ced->cpumask = cpumask_of(smp_processor_id());
+	clockevents_register_device(ced);
+}
+
+static __init int uv_rtc_setup_clock(void)
+{
+	int rc;
+
+	if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension)
+		return -ENODEV;
+
+	generic_interrupt_extension = uv_rtc_interrupt;
+
+	clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
+				clocksource_uv.shift);
+
+	rc = clocksource_register(&clocksource_uv);
+	if (rc) {
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	/* Setup and register clockevents */
+	rc = uv_rtc_allocate_timers();
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		return rc;
+	}
+
+	clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
+				NSEC_PER_SEC, clock_event_device_uv.shift);
+
+	clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
+						sn_rtc_cycles_per_second;
+
+	clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
+				(NSEC_PER_SEC / sn_rtc_cycles_per_second);
+
+	rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
+	if (rc) {
+		clocksource_unregister(&clocksource_uv);
+		generic_interrupt_extension = NULL;
+		uv_rtc_deallocate_timers();
+	}
+
+	return rc;
+}
+arch_initcall(uv_rtc_setup_clock);
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fbfced6f680..5bf54e40c6e 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -275,3 +275,10 @@ ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 ASSERT((per_cpu__irq_stack_union == 0),
         "irq_stack_union is not at start of per-cpu area");
 #endif
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index f3a5305b8ad..9fe4ddaa8f6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -348,6 +348,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 		 * flush_tlb_user() for both user and kernel mappings unless
 		 * the Page Global Enable (PGE) feature bit is set. */
 		*dx |= 0x00002000;
+		/* We also lie, and say we're family id 5.  6 or greater
+		 * leads to a rdmsr in early_init_intel which we can't handle.
+		 * Family ID is returned as bits 8-12 in ax. */
+		*ax &= 0xFFFFF0FF;
+		*ax |= 0x00000500;
 		break;
 	case 0x80000000:
 		/* Futureproof this a little: if they ask how much extended
@@ -594,19 +599,21 @@ static void __init lguest_init_IRQ(void)
 		/* Some systems map "vectors" to interrupts weirdly.  Lguest has
 		 * a straightforward 1 to 1 mapping, so force that here. */
 		__get_cpu_var(vector_irq)[vector] = i;
-		if (vector != SYSCALL_VECTOR) {
-			set_intr_gate(vector,
-				      interrupt[vector-FIRST_EXTERNAL_VECTOR]);
-			set_irq_chip_and_handler_name(i, &lguest_irq_controller,
-						      handle_level_irq,
-						      "level");
-		}
+		if (vector != SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
 	}
 	/* This call is required to set up for 4k stacks, where we have
 	 * separate stacks for hard and soft interrupts. */
 	irq_ctx_init(smp_processor_id());
 }
 
+void lguest_setup_irq(unsigned int irq)
+{
+	irq_to_desc_alloc_cpu(irq, 0);
+	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
+				      handle_level_irq, "level");
+}
+
 /*
  * Time.
  *
diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c
index 491e737ce54..aa098708877 100644
--- a/arch/x86/math-emu/fpu_aux.c
+++ b/arch/x86/math-emu/fpu_aux.c
@@ -30,20 +30,29 @@ static void fclex(void)
 }
 
 /* Needs to be externally visible */
-void finit(void)
+void finit_task(struct task_struct *tsk)
 {
-	control_word = 0x037f;
-	partial_status = 0;
-	top = 0;		/* We don't keep top in the status word internally. */
-	fpu_tag_word = 0xffff;
+	struct i387_soft_struct *soft = &tsk->thread.xstate->soft;
+	struct address *oaddr, *iaddr;
+	soft->cwd = 0x037f;
+	soft->swd = 0;
+	soft->ftop = 0;	/* We don't keep top in the status word internally. */
+	soft->twd = 0xffff;
 	/* The behaviour is different from that detailed in
 	   Section 15.1.6 of the Intel manual */
-	operand_address.offset = 0;
-	operand_address.selector = 0;
-	instruction_address.offset = 0;
-	instruction_address.selector = 0;
-	instruction_address.opcode = 0;
-	no_ip_update = 1;
+	oaddr = (struct address *)&soft->foo;
+	oaddr->offset = 0;
+	oaddr->selector = 0;
+	iaddr = (struct address *)&soft->fip;
+	iaddr->offset = 0;
+	iaddr->selector = 0;
+	iaddr->opcode = 0;
+	soft->no_update = 1;
+}
+
+void finit(void)
+{
+	finit_task(current);
 }
 
 /*
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0..d11745334a6 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -158,7 +158,6 @@ EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
 
-#ifdef CONFIG_NUMA
 void __init set_highmem_pages_init(void)
 {
 	struct zone *zone;
@@ -182,11 +181,3 @@ void __init set_highmem_pages_init(void)
 	}
 	totalram_pages += totalhigh_pages;
 }
-#else
-void __init set_highmem_pages_init(void)
-{
-	add_highpages_with_active_regions(0, highstart_pfn, highend_pfn);
-
-	totalram_pages += totalhigh_pages;
-}
-#endif /* CONFIG_NUMA */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index ce6a722587d..15219e0d124 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,8 +1,345 @@
+#include <linux/ioport.h>
 #include <linux/swap.h>
+
 #include <asm/cacheflush.h>
+#include <asm/e820.h>
+#include <asm/init.h>
 #include <asm/page.h>
+#include <asm/page_types.h>
 #include <asm/sections.h>
 #include <asm/system.h>
+#include <asm/tlbflush.h>
+
+unsigned long __initdata e820_table_start;
+unsigned long __meminitdata e820_table_end;
+unsigned long __meminitdata e820_table_top;
+
+int after_bootmem;
+
+int direct_gbpages
+#ifdef CONFIG_DIRECT_GBPAGES
+				= 1
+#endif
+;
+
+static void __init find_early_table_space(unsigned long end, int use_pse,
+					  int use_gbpages)
+{
+	unsigned long puds, pmds, ptes, tables, start;
+
+	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
+	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
+
+	if (use_gbpages) {
+		unsigned long extra;
+
+		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
+		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
+	} else
+		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+
+	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
+
+	if (use_pse) {
+		unsigned long extra;
+
+		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
+#ifdef CONFIG_X86_32
+		extra += PMD_SIZE;
+#endif
+		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	} else
+		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
+
+#ifdef CONFIG_X86_32
+	/* for fixmap */
+	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
+#endif
+
+	/*
+	 * RED-PEN putting page tables only on node 0 could
+	 * cause a hotspot and fill up ZONE_DMA. The page tables
+	 * need roughly 0.5KB per GB.
+	 */
+#ifdef CONFIG_X86_32
+	start = 0x7000;
+	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
+#else /* CONFIG_X86_64 */
+	start = 0x8000;
+	e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
+#endif
+	if (e820_table_start == -1UL)
+		panic("Cannot find space for the kernel page tables");
+
+	e820_table_start >>= PAGE_SHIFT;
+	e820_table_end = e820_table_start;
+	e820_table_top = e820_table_start + (tables >> PAGE_SHIFT);
+
+	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT);
+}
+
+struct map_range {
+	unsigned long start;
+	unsigned long end;
+	unsigned page_size_mask;
+};
+
+#ifdef CONFIG_X86_32
+#define NR_RANGE_MR 3
+#else /* CONFIG_X86_64 */
+#define NR_RANGE_MR 5
+#endif
+
+static int save_mr(struct map_range *mr, int nr_range,
+		   unsigned long start_pfn, unsigned long end_pfn,
+		   unsigned long page_size_mask)
+{
+	if (start_pfn < end_pfn) {
+		if (nr_range >= NR_RANGE_MR)
+			panic("run out of range for init_memory_mapping\n");
+		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
+		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
+		mr[nr_range].page_size_mask = page_size_mask;
+		nr_range++;
+	}
+
+	return nr_range;
+}
+
+#ifdef CONFIG_X86_64
+static void __init init_gbpages(void)
+{
+	if (direct_gbpages && cpu_has_gbpages)
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+	else
+		direct_gbpages = 0;
+}
+#else
+static inline void init_gbpages(void)
+{
+}
+#endif
+
+/*
+ * Setup the direct mapping of the physical memory at PAGE_OFFSET.
+ * This runs before bootmem is initialized and gets pages directly from
+ * the physical memory. To access them they are temporarily mapped.
+ */
+unsigned long __init_refok init_memory_mapping(unsigned long start,
+					       unsigned long end)
+{
+	unsigned long page_size_mask = 0;
+	unsigned long start_pfn, end_pfn;
+	unsigned long ret = 0;
+	unsigned long pos;
+
+	struct map_range mr[NR_RANGE_MR];
+	int nr_range, i;
+	int use_pse, use_gbpages;
+
+	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
+
+	if (!after_bootmem)
+		init_gbpages();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	/*
+	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
+	 * This will simplify cpa(), which otherwise needs to support splitting
+	 * large pages into small in interrupt context, etc.
+	 */
+	use_pse = use_gbpages = 0;
+#else
+	use_pse = cpu_has_pse;
+	use_gbpages = direct_gbpages;
+#endif
+
+#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_PAE
+	set_nx();
+	if (nx_enabled)
+		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+#endif
+
+	/* Enable PSE if available */
+	if (cpu_has_pse)
+		set_in_cr4(X86_CR4_PSE);
+
+	/* Enable PGE if available */
+	if (cpu_has_pge) {
+		set_in_cr4(X86_CR4_PGE);
+		__supported_pte_mask |= _PAGE_GLOBAL;
+	}
+#endif
+
+	if (use_gbpages)
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	if (use_pse)
+		page_size_mask |= 1 << PG_LEVEL_2M;
+
+	memset(mr, 0, sizeof(mr));
+	nr_range = 0;
+
+	/* head if not big page alignment ? */
+	start_pfn = start >> PAGE_SHIFT;
+	pos = start_pfn << PAGE_SHIFT;
+#ifdef CONFIG_X86_32
+	/*
+	 * Don't use a large page for the first 2/4MB of memory
+	 * because there are often fixed size MTRRs in there
+	 * and overlapping MTRRs into large pages can cause
+	 * slowdowns.
+	 */
+	if (pos == 0)
+		end_pfn = 1<<(PMD_SHIFT - PAGE_SHIFT);
+	else
+		end_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+				 << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
+			<< (PMD_SHIFT - PAGE_SHIFT);
+#endif
+	if (end_pfn > (end >> PAGE_SHIFT))
+		end_pfn = end >> PAGE_SHIFT;
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* big page (2M) range */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+#ifdef CONFIG_X86_32
+	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+#else /* CONFIG_X86_64 */
+	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
+		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
+#endif
+
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+#ifdef CONFIG_X86_64
+	/* big page (1G) range */
+	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
+			 << (PUD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask &
+				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+
+	/* tail is not big page (1G) alignment */
+	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
+			 << (PMD_SHIFT - PAGE_SHIFT);
+	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
+	if (start_pfn < end_pfn) {
+		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
+				page_size_mask & (1<<PG_LEVEL_2M));
+		pos = end_pfn << PAGE_SHIFT;
+	}
+#endif
+
+	/* tail is not big page (2M) alignment */
+	start_pfn = pos>>PAGE_SHIFT;
+	end_pfn = end>>PAGE_SHIFT;
+	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
+
+	/* try to merge same page size and continuous */
+	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
+		unsigned long old_start;
+		if (mr[i].end != mr[i+1].start ||
+		    mr[i].page_size_mask != mr[i+1].page_size_mask)
+			continue;
+		/* move it */
+		old_start = mr[i].start;
+		memmove(&mr[i], &mr[i+1],
+			(nr_range - 1 - i) * sizeof(struct map_range));
+		mr[i--].start = old_start;
+		nr_range--;
+	}
+
+	for (i = 0; i < nr_range; i++)
+		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
+				mr[i].start, mr[i].end,
+			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
+			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
+
+	/*
+	 * Find space for the kernel direct mapping tables.
+	 *
+	 * Later we should allocate these tables in the local node of the
+	 * memory mapped. Unfortunately this is done currently before the
+	 * nodes are discovered.
+	 */
+	if (!after_bootmem)
+		find_early_table_space(end, use_pse, use_gbpages);
+
+#ifdef CONFIG_X86_32
+	for (i = 0; i < nr_range; i++)
+		kernel_physical_mapping_init(mr[i].start, mr[i].end,
+					     mr[i].page_size_mask);
+	ret = end;
+#else /* CONFIG_X86_64 */
+	for (i = 0; i < nr_range; i++)
+		ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
+						   mr[i].page_size_mask);
+#endif
+
+#ifdef CONFIG_X86_32
+	early_ioremap_page_table_range_init();
+
+	load_cr3(swapper_pg_dir);
+#endif
+
+#ifdef CONFIG_X86_64
+	if (!after_bootmem)
+		mmu_cr4_features = read_cr4();
+#endif
+	__flush_tlb_all();
+
+	if (!after_bootmem && e820_table_end > e820_table_start)
+		reserve_early(e820_table_start << PAGE_SHIFT,
+				 e820_table_end << PAGE_SHIFT, "PGTABLE");
+
+	if (!after_bootmem)
+		early_memtest(start, end);
+
+	return ret >> PAGE_SHIFT;
+}
+
+
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+	if (pagenr <= 256)
+		return 1;
+	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
+		return 0;
+	if (!page_is_ram(pagenr))
+		return 1;
+	return 0;
+}
 
 void free_init_pages(char *what, unsigned long begin, unsigned long end)
 {
@@ -47,3 +384,10 @@ void free_initmem(void)
 			(unsigned long)(&__init_begin),
 			(unsigned long)(&__init_end));
 }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+	free_init_pages("initrd memory", start, end);
+}
+#endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0b087dcd2c1..db81e9a8556 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,6 +49,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
@@ -58,19 +59,14 @@ unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
 
-
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
-static int __initdata after_init_bootmem;
+bool __read_mostly __vmalloc_start_set = false;
 
 static __init void *alloc_low_page(void)
 {
-	unsigned long pfn = table_end++;
+	unsigned long pfn = e820_table_end++;
 	void *adr;
 
-	if (pfn >= table_top)
+	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = __va(pfn * PAGE_SIZE);
@@ -90,7 +86,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
 
 #ifdef CONFIG_X86_PAE
 	if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
-		if (after_init_bootmem)
+		if (after_bootmem)
 			pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
 		else
 			pmd_table = (pmd_t *)alloc_low_page();
@@ -117,7 +113,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
 		pte_t *page_table = NULL;
 
-		if (after_init_bootmem) {
+		if (after_bootmem) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 			page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE);
 #endif
@@ -135,6 +131,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 	return pte_offset_kernel(pmd, 0);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	int pgd_idx = pgd_index(vaddr);
+	int pmd_idx = pmd_index(vaddr);
+
+	return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	int pte_idx = pte_index(vaddr);
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return one_page_table_init(pmd) + pte_idx;
+}
+
 static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 					   unsigned long vaddr, pte_t *lastpte)
 {
@@ -151,12 +164,12 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
 	if (pmd_idx_kmap_begin != pmd_idx_kmap_end
 	    && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
 	    && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
-	    && ((__pa(pte) >> PAGE_SHIFT) < table_start
-		|| (__pa(pte) >> PAGE_SHIFT) >= table_end)) {
+	    && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start
+		|| (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) {
 		pte_t *newpte;
 		int i;
 
-		BUG_ON(after_init_bootmem);
+		BUG_ON(after_bootmem);
 		newpte = alloc_low_page();
 		for (i = 0; i < PTRS_PER_PTE; i++)
 			set_pte(newpte + i, pte[i]);
@@ -225,11 +238,14 @@ static inline int is_kernel_text(unsigned long addr)
  * of max_low_pfn pages, by creating page tables starting from address
  * PAGE_OFFSET:
  */
-static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
-						unsigned long start_pfn,
-						unsigned long end_pfn,
-						int use_pse)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
 {
+	int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
+	unsigned long start_pfn, end_pfn;
+	pgd_t *pgd_base = swapper_pg_dir;
 	int pgd_idx, pmd_idx, pte_ofs;
 	unsigned long pfn;
 	pgd_t *pgd;
@@ -238,6 +254,9 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 	unsigned pages_2m, pages_4k;
 	int mapping_iter;
 
+	start_pfn = start >> PAGE_SHIFT;
+	end_pfn = end >> PAGE_SHIFT;
+
 	/*
 	 * First iteration will setup identity mapping using large/small pages
 	 * based on use_pse, with other attributes same as set by
@@ -352,26 +371,6 @@ repeat:
 		mapping_iter = 2;
 		goto repeat;
 	}
-}
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-	if (pagenr <= 256)
-		return 1;
-	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pagenr))
-		return 1;
 	return 0;
 }
 
@@ -528,8 +527,9 @@ void __init native_pagetable_setup_done(pgd_t *base)
  * be partially populated, and so it avoids stomping on any existing
  * mappings.
  */
-static void __init early_ioremap_page_table_range_init(pgd_t *pgd_base)
+void __init early_ioremap_page_table_range_init(void)
 {
+	pgd_t *pgd_base = swapper_pg_dir;
 	unsigned long vaddr, end;
 
 	/*
@@ -624,7 +624,7 @@ static int __init noexec_setup(char *str)
 }
 early_param("noexec", noexec_setup);
 
-static void __init set_nx(void)
+void __init set_nx(void)
 {
 	unsigned int v[4], l, h;
 
@@ -776,6 +776,8 @@ void __init initmem_init(unsigned long start_pfn,
 #ifdef CONFIG_FLATMEM
 	max_mapnr = num_physpages;
 #endif
+	__vmalloc_start_set = true;
+
 	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
 			pages_to_mb(max_low_pfn));
 
@@ -797,176 +799,66 @@ static void __init zone_sizes_init(void)
 	free_area_init_nodes(max_zone_pfns);
 }
 
+static unsigned long __init setup_node_bootmem(int nodeid,
+				 unsigned long start_pfn,
+				 unsigned long end_pfn,
+				 unsigned long bootmap)
+{
+	unsigned long bootmap_size;
+
+	/* don't touch min_low_pfn */
+	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
+					 bootmap >> PAGE_SHIFT,
+					 start_pfn, end_pfn);
+	printk(KERN_INFO "  node %d low ram: %08lx - %08lx\n",
+		nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+	printk(KERN_INFO "  node %d bootmap %08lx - %08lx\n",
+		 nodeid, bootmap, bootmap + bootmap_size);
+	free_bootmem_with_active_regions(nodeid, end_pfn);
+	early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+
+	return bootmap + bootmap_size;
+}
+
 void __init setup_bootmem_allocator(void)
 {
-	int i;
+	int nodeid;
 	unsigned long bootmap_size, bootmap;
 	/*
 	 * Initialize the boot-time allocator (with low memory only):
 	 */
 	bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
-	bootmap = find_e820_area(min_low_pfn<<PAGE_SHIFT,
-				 max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
+	bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
 				 PAGE_SIZE);
 	if (bootmap == -1L)
 		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
 	reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
 
-	/* don't touch min_low_pfn */
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
-					 min_low_pfn, max_low_pfn);
 	printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",
 		 max_pfn_mapped<<PAGE_SHIFT);
-	printk(KERN_INFO "  low ram: %08lx - %08lx\n",
-		 min_low_pfn<<PAGE_SHIFT, max_low_pfn<<PAGE_SHIFT);
-	printk(KERN_INFO "  bootmap %08lx - %08lx\n",
-		 bootmap, bootmap + bootmap_size);
-	for_each_online_node(i)
-		free_bootmem_with_active_regions(i, max_low_pfn);
-	early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
-
-	after_init_bootmem = 1;
-}
-
-static void __init find_early_table_space(unsigned long end, int use_pse)
-{
-	unsigned long puds, pmds, ptes, tables, start;
-
-	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-
-	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-	if (use_pse) {
-		unsigned long extra;
+	printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
 
-		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-		extra += PMD_SIZE;
-		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	} else
-		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	for_each_online_node(nodeid) {
+		 unsigned long start_pfn, end_pfn;
 
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-	/* for fixmap */
-	tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
-
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-	start = 0x7000;
-	table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
-	if (table_start == -1UL)
-		panic("Cannot find space for the kernel page tables");
-
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables>>PAGE_SHIFT);
-
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT,
-		(table_start << PAGE_SHIFT) + tables);
-}
-
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-						unsigned long end)
-{
-	pgd_t *pgd_base = swapper_pg_dir;
-	unsigned long start_pfn, end_pfn;
-	unsigned long big_page_start;
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-	 * This will simplify cpa(), which otherwise needs to support splitting
-	 * large pages into small in interrupt context, etc.
-	 */
-	int use_pse = 0;
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+		start_pfn = node_start_pfn[nodeid];
+		end_pfn = node_end_pfn[nodeid];
+		if (start_pfn > max_low_pfn)
+			continue;
+		if (end_pfn > max_low_pfn)
+			end_pfn = max_low_pfn;
 #else
-	int use_pse = cpu_has_pse;
-#endif
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 */
-	if (!after_init_bootmem)
-		find_early_table_space(end, use_pse);
-
-#ifdef CONFIG_X86_PAE
-	set_nx();
-	if (nx_enabled)
-		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+		start_pfn = 0;
+		end_pfn = max_low_pfn;
 #endif
-
-	/* Enable PSE if available */
-	if (cpu_has_pse)
-		set_in_cr4(X86_CR4_PSE);
-
-	/* Enable PGE if available */
-	if (cpu_has_pge) {
-		set_in_cr4(X86_CR4_PGE);
-		__supported_pte_mask |= _PAGE_GLOBAL;
-	}
-
-	/*
-	 * Don't use a large page for the first 2/4MB of memory
-	 * because there are often fixed size MTRRs in there
-	 * and overlapping MTRRs into large pages can cause
-	 * slowdowns.
-	 */
-	big_page_start = PMD_SIZE;
-
-	if (start < big_page_start) {
-		start_pfn = start >> PAGE_SHIFT;
-		end_pfn = min(big_page_start>>PAGE_SHIFT, end>>PAGE_SHIFT);
-	} else {
-		/* head is not big page alignment ? */
-		start_pfn = start >> PAGE_SHIFT;
-		end_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
-				 << (PMD_SHIFT - PAGE_SHIFT);
-	}
-	if (start_pfn < end_pfn)
-		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 0);
-
-	/* big page range */
-	start_pfn = ((start + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < (big_page_start >> PAGE_SHIFT))
-		start_pfn =  big_page_start >> PAGE_SHIFT;
-	end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn)
-		kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
-					     use_pse);
-
-	/* tail is not big page alignment ? */
-	start_pfn = end_pfn;
-	if (start_pfn > (big_page_start>>PAGE_SHIFT)) {
-		end_pfn = end >> PAGE_SHIFT;
-		if (start_pfn < end_pfn)
-			kernel_physical_mapping_init(pgd_base, start_pfn,
-							 end_pfn, 0);
+		bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
+						 bootmap);
 	}
 
-	early_ioremap_page_table_range_init(pgd_base);
-
-	load_cr3(swapper_pg_dir);
-
-	__flush_tlb_all();
-
-	if (!after_init_bootmem)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
-
-	if (!after_init_bootmem)
-		early_memtest(start, end);
-
-	return end >> PAGE_SHIFT;
+	after_bootmem = 1;
 }
 
-
 /*
  * paging_init() sets up the page tables - note that the first 8MB are
  * already mapped by head.S.
@@ -1200,13 +1092,6 @@ void mark_rodata_ro(void)
 }
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 724e537432e..54efa57d1c0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -48,6 +48,7 @@
 #include <asm/kdebug.h>
 #include <asm/numa.h>
 #include <asm/cacheflush.h>
+#include <asm/init.h>
 
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -61,12 +62,6 @@ static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-				= 1
-#endif
-;
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
 	direct_gbpages = 0;
@@ -87,12 +82,10 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-int after_bootmem;
-
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int do_not_nx __cpuinitdata;
+static int disable_nx __cpuinitdata;
 
 /*
  * noexec=on|off
@@ -107,9 +100,9 @@ static int __init nonx_setup(char *str)
 		return -EINVAL;
 	if (!strncmp(str, "on", 2)) {
 		__supported_pte_mask |= _PAGE_NX;
-		do_not_nx = 0;
+		disable_nx = 0;
 	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
+		disable_nx = 1;
 		__supported_pte_mask &= ~_PAGE_NX;
 	}
 	return 0;
@@ -121,7 +114,7 @@ void __cpuinit check_efer(void)
 	unsigned long efer;
 
 	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || do_not_nx)
+	if (!(efer & EFER_NX) || disable_nx)
 		__supported_pte_mask &= ~_PAGE_NX;
 }
 
@@ -168,34 +161,51 @@ static __ref void *spp_getpage(void)
 	return ptr;
 }
 
-void
-set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
 {
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
+	if (pgd_none(*pgd)) {
+		pud_t *pud = (pud_t *)spp_getpage();
+		pgd_populate(&init_mm, pgd, pud);
+		if (pud != pud_offset(pgd, 0))
+			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
+			       pud, pud_offset(pgd, 0));
+	}
+	return pud_offset(pgd, vaddr);
+}
 
-	pud = pud_page + pud_index(vaddr);
+static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
+{
 	if (pud_none(*pud)) {
-		pmd = (pmd_t *) spp_getpage();
+		pmd_t *pmd = (pmd_t *) spp_getpage();
 		pud_populate(&init_mm, pud, pmd);
-		if (pmd != pmd_offset(pud, 0)) {
+		if (pmd != pmd_offset(pud, 0))
 			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
-				pmd, pmd_offset(pud, 0));
-			return;
-		}
+			       pmd, pmd_offset(pud, 0));
 	}
-	pmd = pmd_offset(pud, vaddr);
+	return pmd_offset(pud, vaddr);
+}
+
+static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
+{
 	if (pmd_none(*pmd)) {
-		pte = (pte_t *) spp_getpage();
+		pte_t *pte = (pte_t *) spp_getpage();
 		pmd_populate_kernel(&init_mm, pmd, pte);
-		if (pte != pte_offset_kernel(pmd, 0)) {
+		if (pte != pte_offset_kernel(pmd, 0))
 			printk(KERN_ERR "PAGETABLE BUG #02!\n");
-			return;
-		}
 	}
+	return pte_offset_kernel(pmd, vaddr);
+}
+
+void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pud = pud_page + pud_index(vaddr);
+	pmd = fill_pmd(pud, vaddr);
+	pte = fill_pte(pmd, vaddr);
 
-	pte = pte_offset_kernel(pmd, vaddr);
 	set_pte(pte, new_pte);
 
 	/*
@@ -205,8 +215,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
 	__flush_tlb_one(vaddr);
 }
 
-void
-set_pte_vaddr(unsigned long vaddr, pte_t pteval)
+void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 {
 	pgd_t *pgd;
 	pud_t *pud_page;
@@ -223,6 +232,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
 	set_pte_vaddr_pud(pud_page, vaddr, pteval);
 }
 
+pmd_t * __init populate_extra_pmd(unsigned long vaddr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+
+	pgd = pgd_offset_k(vaddr);
+	pud = fill_pud(pgd, vaddr);
+	return fill_pmd(pud, vaddr);
+}
+
+pte_t * __init populate_extra_pte(unsigned long vaddr)
+{
+	pmd_t *pmd;
+
+	pmd = populate_extra_pmd(vaddr);
+	return fill_pte(pmd, vaddr);
+}
+
 /*
  * Create large page table mappings for a range of physical addresses.
  */
@@ -291,13 +318,9 @@ void __init cleanup_highmap(void)
 	}
 }
 
-static unsigned long __initdata table_start;
-static unsigned long __meminitdata table_end;
-static unsigned long __meminitdata table_top;
-
 static __ref void *alloc_low_page(unsigned long *phys)
 {
-	unsigned long pfn = table_end++;
+	unsigned long pfn = e820_table_end++;
 	void *adr;
 
 	if (after_bootmem) {
@@ -307,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
 		return adr;
 	}
 
-	if (pfn >= table_top)
+	if (pfn >= e820_table_top)
 		panic("alloc_low_page: ran out of memory");
 
 	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
@@ -547,58 +570,10 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
 	return phys_pud_init(pud, addr, end, page_size_mask);
 }
 
-static void __init find_early_table_space(unsigned long end, int use_pse,
-					  int use_gbpages)
-{
-	unsigned long puds, pmds, ptes, tables, start;
-
-	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
-	tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
-	if (use_gbpages) {
-		unsigned long extra;
-		extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
-		pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
-	} else
-		pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
-	tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
-
-	if (use_pse) {
-		unsigned long extra;
-		extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
-		ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	} else
-		ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
-
-	/*
-	 * RED-PEN putting page tables only on node 0 could
-	 * cause a hotspot and fill up ZONE_DMA. The page tables
-	 * need roughly 0.5KB per GB.
-	 */
-	start = 0x8000;
-	table_start = find_e820_area(start, end, tables, PAGE_SIZE);
-	if (table_start == -1UL)
-		panic("Cannot find space for the kernel page tables");
-
-	table_start >>= PAGE_SHIFT;
-	table_end = table_start;
-	table_top = table_start + (tables >> PAGE_SHIFT);
-
-	printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
-		end, table_start << PAGE_SHIFT, table_top << PAGE_SHIFT);
-}
-
-static void __init init_gbpages(void)
-{
-	if (direct_gbpages && cpu_has_gbpages)
-		printk(KERN_INFO "Using GB pages for direct mapping\n");
-	else
-		direct_gbpages = 0;
-}
-
-static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
-						unsigned long end,
-						unsigned long page_size_mask)
+unsigned long __init
+kernel_physical_mapping_init(unsigned long start,
+			     unsigned long end,
+			     unsigned long page_size_mask)
 {
 
 	unsigned long next, last_map_addr = end;
@@ -635,176 +610,6 @@ static unsigned long __meminit kernel_physical_mapping_init(unsigned long start,
 	return last_map_addr;
 }
 
-struct map_range {
-	unsigned long start;
-	unsigned long end;
-	unsigned page_size_mask;
-};
-
-#define NR_RANGE_MR 5
-
-static int save_mr(struct map_range *mr, int nr_range,
-		   unsigned long start_pfn, unsigned long end_pfn,
-		   unsigned long page_size_mask)
-{
-
-	if (start_pfn < end_pfn) {
-		if (nr_range >= NR_RANGE_MR)
-			panic("run out of range for init_memory_mapping\n");
-		mr[nr_range].start = start_pfn<<PAGE_SHIFT;
-		mr[nr_range].end   = end_pfn<<PAGE_SHIFT;
-		mr[nr_range].page_size_mask = page_size_mask;
-		nr_range++;
-	}
-
-	return nr_range;
-}
-
-/*
- * Setup the direct mapping of the physical memory at PAGE_OFFSET.
- * This runs before bootmem is initialized and gets pages directly from
- * the physical memory. To access them they are temporarily mapped.
- */
-unsigned long __init_refok init_memory_mapping(unsigned long start,
-					       unsigned long end)
-{
-	unsigned long last_map_addr = 0;
-	unsigned long page_size_mask = 0;
-	unsigned long start_pfn, end_pfn;
-	unsigned long pos;
-
-	struct map_range mr[NR_RANGE_MR];
-	int nr_range, i;
-	int use_pse, use_gbpages;
-
-	printk(KERN_INFO "init_memory_mapping: %016lx-%016lx\n", start, end);
-
-	/*
-	 * Find space for the kernel direct mapping tables.
-	 *
-	 * Later we should allocate these tables in the local node of the
-	 * memory mapped. Unfortunately this is done currently before the
-	 * nodes are discovered.
-	 */
-	if (!after_bootmem)
-		init_gbpages();
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	/*
-	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
-	 * This will simplify cpa(), which otherwise needs to support splitting
-	 * large pages into small in interrupt context, etc.
-	 */
-	use_pse = use_gbpages = 0;
-#else
-	use_pse = cpu_has_pse;
-	use_gbpages = direct_gbpages;
-#endif
-
-	if (use_gbpages)
-		page_size_mask |= 1 << PG_LEVEL_1G;
-	if (use_pse)
-		page_size_mask |= 1 << PG_LEVEL_2M;
-
-	memset(mr, 0, sizeof(mr));
-	nr_range = 0;
-
-	/* head if not big page alignment ?*/
-	start_pfn = start >> PAGE_SHIFT;
-	pos = start_pfn << PAGE_SHIFT;
-	end_pfn = ((pos + (PMD_SIZE - 1)) >> PMD_SHIFT)
-			<< (PMD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > (end >> PAGE_SHIFT))
-		end_pfn = end >> PAGE_SHIFT;
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* big page (2M) range*/
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	if (end_pfn > ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT)))
-		end_pfn = ((end>>PMD_SHIFT)<<(PMD_SHIFT - PAGE_SHIFT));
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* big page (1G) range */
-	start_pfn = ((pos + (PUD_SIZE - 1))>>PUD_SHIFT)
-			 << (PUD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PUD_SHIFT) << (PUD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask &
-				 ((1<<PG_LEVEL_2M)|(1<<PG_LEVEL_1G)));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* tail is not big page (1G) alignment */
-	start_pfn = ((pos + (PMD_SIZE - 1))>>PMD_SHIFT)
-			 << (PMD_SHIFT - PAGE_SHIFT);
-	end_pfn = (end >> PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
-	if (start_pfn < end_pfn) {
-		nr_range = save_mr(mr, nr_range, start_pfn, end_pfn,
-				page_size_mask & (1<<PG_LEVEL_2M));
-		pos = end_pfn << PAGE_SHIFT;
-	}
-
-	/* tail is not big page (2M) alignment */
-	start_pfn = pos>>PAGE_SHIFT;
-	end_pfn = end>>PAGE_SHIFT;
-	nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0);
-
-	/* try to merge same page size and continuous */
-	for (i = 0; nr_range > 1 && i < nr_range - 1; i++) {
-		unsigned long old_start;
-		if (mr[i].end != mr[i+1].start ||
-		    mr[i].page_size_mask != mr[i+1].page_size_mask)
-			continue;
-		/* move it */
-		old_start = mr[i].start;
-		memmove(&mr[i], &mr[i+1],
-			 (nr_range - 1 - i) * sizeof (struct map_range));
-		mr[i--].start = old_start;
-		nr_range--;
-	}
-
-	for (i = 0; i < nr_range; i++)
-		printk(KERN_DEBUG " %010lx - %010lx page %s\n",
-				mr[i].start, mr[i].end,
-			(mr[i].page_size_mask & (1<<PG_LEVEL_1G))?"1G":(
-			 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
-
-	if (!after_bootmem)
-		find_early_table_space(end, use_pse, use_gbpages);
-
-	for (i = 0; i < nr_range; i++)
-		last_map_addr = kernel_physical_mapping_init(
-					mr[i].start, mr[i].end,
-					mr[i].page_size_mask);
-
-	if (!after_bootmem)
-		mmu_cr4_features = read_cr4();
-	__flush_tlb_all();
-
-	if (!after_bootmem && table_end > table_start)
-		reserve_early(table_start << PAGE_SHIFT,
-				 table_end << PAGE_SHIFT, "PGTABLE");
-
-	printk(KERN_INFO "last_map_addr: %lx end: %lx\n",
-			 last_map_addr, end);
-
-	if (!after_bootmem)
-		early_memtest(start, end);
-
-	return last_map_addr >> PAGE_SHIFT;
-}
-
 #ifndef CONFIG_NUMA
 void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 {
@@ -876,28 +681,6 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain address
- * is valid. The argument is a physical page number.
- *
- *
- * On x86, access has to be given to the first megabyte of ram because that area
- * contains bios code and data regions used by X and dosemu and similar apps.
- * Access has to be given to non-kernel-ram areas as well, these contain the PCI
- * mmio resources as well as potential bios/acpi data regions.
- */
-int devmem_is_allowed(unsigned long pagenr)
-{
-	if (pagenr <= 256)
-		return 1;
-	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pagenr))
-		return 1;
-	return 0;
-}
-
-
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
 			 kcore_modules, kcore_vsyscall;
 
@@ -985,13 +768,6 @@ void mark_rodata_ro(void)
 
 #endif
 
-#ifdef CONFIG_BLK_DEV_INITRD
-void free_initrd_mem(unsigned long start, unsigned long end)
-{
-	free_init_pages("initrd memory", start, end);
-}
-#endif
-
 int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
 				   int flags)
 {
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 433f7bd4648..aca924a30ee 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -38,8 +38,7 @@ unsigned long __phys_addr(unsigned long x)
 	} else {
 		VIRTUAL_BUG_ON(x < PAGE_OFFSET);
 		x -= PAGE_OFFSET;
-		VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
-					!phys_addr_valid(x));
+		VIRTUAL_BUG_ON(!phys_addr_valid(x));
 	}
 	return x;
 }
@@ -56,10 +55,8 @@ bool __virt_addr_valid(unsigned long x)
 		if (x < PAGE_OFFSET)
 			return false;
 		x -= PAGE_OFFSET;
-		if (system_state == SYSTEM_BOOTING ?
-				x > MAXMEM : !phys_addr_valid(x)) {
+		if (!phys_addr_valid(x))
 			return false;
-		}
 	}
 
 	return pfn_valid(x >> PAGE_SHIFT);
@@ -76,10 +73,9 @@ static inline int phys_addr_valid(unsigned long addr)
 #ifdef CONFIG_DEBUG_VIRTUAL
 unsigned long __phys_addr(unsigned long x)
 {
-	/* VMALLOC_* aren't constants; not available at the boot time */
+	/* VMALLOC_* aren't constants  */
 	VIRTUAL_BUG_ON(x < PAGE_OFFSET);
-	VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
-		is_vmalloc_addr((void *) x));
+	VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x));
 	return x - PAGE_OFFSET;
 }
 EXPORT_SYMBOL(__phys_addr);
@@ -89,7 +85,9 @@ bool __virt_addr_valid(unsigned long x)
 {
 	if (x < PAGE_OFFSET)
 		return false;
-	if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
+	if (__vmalloc_start_set && is_vmalloc_addr((void *) x))
+		return false;
+	if (x >= FIXADDR_START)
 		return false;
 	return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
 }
@@ -508,13 +506,19 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
 	return &bm_pte[pte_index(addr)];
 }
 
+static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
+
 void __init early_ioremap_init(void)
 {
 	pmd_t *pmd;
+	int i;
 
 	if (early_ioremap_debug)
 		printk(KERN_INFO "early_ioremap_init()\n");
 
+	for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
+		slot_virt[i] = fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
+
 	pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
 	memset(bm_pte, 0, sizeof(bm_pte));
 	pmd_populate_kernel(&init_mm, pmd, bm_pte);
@@ -581,6 +585,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
 
 static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
 static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
+
 static int __init check_early_ioremap_leak(void)
 {
 	int count = 0;
@@ -602,7 +607,8 @@ static int __init check_early_ioremap_leak(void)
 }
 late_initcall(check_early_ioremap_leak);
 
-static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
+static void __init __iomem *
+__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
 {
 	unsigned long offset, last_addr;
 	unsigned int nrpages;
@@ -668,9 +674,9 @@ static void __init __iomem *__early_ioremap(unsigned long phys_addr, unsigned lo
 		--nrpages;
 	}
 	if (early_ioremap_debug)
-		printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
+		printk(KERN_CONT "%08lx + %08lx\n", offset, slot_virt[slot]);
 
-	prev_map[slot] = (void __iomem *)(offset + fix_to_virt(idx0));
+	prev_map[slot] = (void __iomem *)(offset + slot_virt[slot]);
 	return prev_map[slot];
 }
 
@@ -738,8 +744,3 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
 	}
 	prev_map[slot] = NULL;
 }
-
-void __this_fixmap_does_not_exist(void)
-{
-	WARN_ON(1);
-}
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 93d82038af4..6a518dd08a3 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,11 +32,14 @@ struct kmmio_fault_page {
 	struct list_head list;
 	struct kmmio_fault_page *release_next;
 	unsigned long page; /* location of the fault page */
+	bool old_presence; /* page presence prior to arming */
+	bool armed;
 
 	/*
 	 * Number of times this page has been registered as a part
 	 * of a probe. If zero, page is disarmed and this may be freed.
-	 * Used only by writers (RCU).
+	 * Used only by writers (RCU) and post_kmmio_handler().
+	 * Protected by kmmio_lock, when linked into kmmio_page_table.
 	 */
 	int count;
 };
@@ -105,57 +108,85 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 	return NULL;
 }
 
-static void set_page_present(unsigned long addr, bool present,
-							unsigned int *pglevel)
+static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+{
+	pmdval_t v = pmd_val(*pmd);
+	*old = !!(v & _PAGE_PRESENT);
+	v &= ~_PAGE_PRESENT;
+	if (present)
+		v |= _PAGE_PRESENT;
+	set_pmd(pmd, __pmd(v));
+}
+
+static void set_pte_presence(pte_t *pte, bool present, bool *old)
+{
+	pteval_t v = pte_val(*pte);
+	*old = !!(v & _PAGE_PRESENT);
+	v &= ~_PAGE_PRESENT;
+	if (present)
+		v |= _PAGE_PRESENT;
+	set_pte_atomic(pte, __pte(v));
+}
+
+static int set_page_presence(unsigned long addr, bool present, bool *old)
 {
-	pteval_t pteval;
-	pmdval_t pmdval;
 	unsigned int level;
-	pmd_t *pmd;
 	pte_t *pte = lookup_address(addr, &level);
 
 	if (!pte) {
 		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
-		return;
+		return -1;
 	}
 
-	if (pglevel)
-		*pglevel = level;
-
 	switch (level) {
 	case PG_LEVEL_2M:
-		pmd = (pmd_t *)pte;
-		pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT;
-		if (present)
-			pmdval |= _PAGE_PRESENT;
-		set_pmd(pmd, __pmd(pmdval));
+		set_pmd_presence((pmd_t *)pte, present, old);
 		break;
-
 	case PG_LEVEL_4K:
-		pteval = pte_val(*pte) & ~_PAGE_PRESENT;
-		if (present)
-			pteval |= _PAGE_PRESENT;
-		set_pte_atomic(pte, __pte(pteval));
+		set_pte_presence(pte, present, old);
 		break;
-
 	default:
 		pr_err("kmmio: unexpected page level 0x%x.\n", level);
-		return;
+		return -1;
 	}
 
 	__flush_tlb_one(addr);
+	return 0;
 }
 
-/** Mark the given page as not present. Access to it will trigger a fault. */
-static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/*
+ * Mark the given page as not present. Access to it will trigger a fault.
+ *
+ * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
+ * protection is ignored here. RCU read lock is assumed held, so the struct
+ * will not disappear unexpectedly. Furthermore, the caller must guarantee,
+ * that double arming the same virtual address (page) cannot occur.
+ *
+ * Double disarming on the other hand is allowed, and may occur when a fault
+ * and mmiotrace shutdown happen simultaneously.
+ */
+static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	set_page_present(page & PAGE_MASK, false, pglevel);
+	int ret;
+	WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
+	if (f->armed) {
+		pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
+					f->page, f->count, f->old_presence);
+	}
+	ret = set_page_presence(f->page, false, &f->old_presence);
+	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
+	f->armed = true;
+	return ret;
 }
 
-/** Mark the given page as present. */
-static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel)
+/** Restore the given page to saved presence state. */
+static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	set_page_present(page & PAGE_MASK, true, pglevel);
+	bool tmp;
+	int ret = set_page_presence(f->page, f->old_presence, &tmp);
+	WARN_ONCE(ret < 0,
+			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
+	f->armed = false;
 }
 
 /*
@@ -202,28 +233,32 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 
 	ctx = &get_cpu_var(kmmio_ctx);
 	if (ctx->active) {
-		disarm_kmmio_fault_page(faultpage->page, NULL);
 		if (addr == ctx->addr) {
 			/*
-			 * On SMP we sometimes get recursive probe hits on the
-			 * same address. Context is already saved, fall out.
+			 * A second fault on the same page means some other
+			 * condition needs handling by do_page_fault(), the
+			 * page really not being present is the most common.
 			 */
-			pr_debug("kmmio: duplicate probe hit on CPU %d, for "
-						"address 0x%08lx.\n",
-						smp_processor_id(), addr);
-			ret = 1;
-			goto no_kmmio_ctx;
-		}
-		/*
-		 * Prevent overwriting already in-flight context.
-		 * This should not happen, let's hope disarming at least
-		 * prevents a panic.
-		 */
-		pr_emerg("kmmio: recursive probe hit on CPU %d, "
+			pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n",
+					addr, smp_processor_id());
+
+			if (!faultpage->old_presence)
+				pr_info("kmmio: unexpected secondary hit for "
+					"address 0x%08lx on CPU %d.\n", addr,
+					smp_processor_id());
+		} else {
+			/*
+			 * Prevent overwriting already in-flight context.
+			 * This should not happen, let's hope disarming at
+			 * least prevents a panic.
+			 */
+			pr_emerg("kmmio: recursive probe hit on CPU %d, "
 					"for address 0x%08lx. Ignoring.\n",
 					smp_processor_id(), addr);
-		pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
-					ctx->addr);
+			pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
+						ctx->addr);
+			disarm_kmmio_fault_page(faultpage);
+		}
 		goto no_kmmio_ctx;
 	}
 	ctx->active++;
@@ -244,7 +279,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
 	regs->flags &= ~X86_EFLAGS_IF;
 
 	/* Now we set present bit in PTE and single step. */
-	disarm_kmmio_fault_page(ctx->fpage->page, NULL);
+	disarm_kmmio_fault_page(ctx->fpage);
 
 	/*
 	 * If another cpu accesses the same page while we are stepping,
@@ -275,7 +310,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active) {
-		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+		pr_warning("kmmio: spurious debug trap on CPU %d.\n",
 							smp_processor_id());
 		goto out;
 	}
@@ -283,7 +318,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	if (ctx->probe && ctx->probe->post_handler)
 		ctx->probe->post_handler(ctx->probe, condition, regs);
 
-	arm_kmmio_fault_page(ctx->fpage->page, NULL);
+	/* Prevent racing against release_kmmio_fault_page(). */
+	spin_lock(&kmmio_lock);
+	if (ctx->fpage->count)
+		arm_kmmio_fault_page(ctx->fpage);
+	spin_unlock(&kmmio_lock);
 
 	regs->flags &= ~X86_EFLAGS_TF;
 	regs->flags |= ctx->saved_flags;
@@ -315,20 +354,24 @@ static int add_kmmio_fault_page(unsigned long page)
 	f = get_kmmio_fault_page(page);
 	if (f) {
 		if (!f->count)
-			arm_kmmio_fault_page(f->page, NULL);
+			arm_kmmio_fault_page(f);
 		f->count++;
 		return 0;
 	}
 
-	f = kmalloc(sizeof(*f), GFP_ATOMIC);
+	f = kzalloc(sizeof(*f), GFP_ATOMIC);
 	if (!f)
 		return -1;
 
 	f->count = 1;
 	f->page = page;
-	list_add_rcu(&f->list, kmmio_page_list(f->page));
 
-	arm_kmmio_fault_page(f->page, NULL);
+	if (arm_kmmio_fault_page(f)) {
+		kfree(f);
+		return -1;
+	}
+
+	list_add_rcu(&f->list, kmmio_page_list(f->page));
 
 	return 0;
 }
@@ -347,7 +390,7 @@ static void release_kmmio_fault_page(unsigned long page,
 	f->count--;
 	BUG_ON(f->count < 0);
 	if (!f->count) {
-		disarm_kmmio_fault_page(f->page, NULL);
+		disarm_kmmio_fault_page(f);
 		f->release_next = *release_list;
 		*release_list = f;
 	}
@@ -408,23 +451,24 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 
 static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
-	struct kmmio_delayed_release *dr = container_of(
-						head,
-						struct kmmio_delayed_release,
-						rcu);
+	struct kmmio_delayed_release *dr =
+		container_of(head, struct kmmio_delayed_release, rcu);
 	struct kmmio_fault_page *p = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;
+
 	spin_lock_irqsave(&kmmio_lock, flags);
 	while (p) {
-		if (!p->count)
+		if (!p->count) {
 			list_del_rcu(&p->list);
-		else
+			prevp = &p->release_next;
+		} else {
 			*prevp = p->release_next;
-		prevp = &p->release_next;
+		}
 		p = p->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
+
 	/* This is the real RCU destroy call. */
 	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
 }
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 0bcd7883d03..605c8be0621 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -100,6 +100,9 @@ static int __init parse_memtest(char *arg)
 {
 	if (arg)
 		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	else
+		memtest_pattern = ARRAY_SIZE(patterns);
+
 	return 0;
 }
 
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 451fe95a035..3daefa04ace 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -416,10 +416,11 @@ void __init initmem_init(unsigned long start_pfn,
 	for_each_online_node(nid)
 		propagate_e820_map_node(nid);
 
-	for_each_online_node(nid)
+	for_each_online_node(nid) {
 		memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+	}
 
-	NODE_DATA(0)->bdata = &bootmem_node_data[0];
 	setup_bootmem_allocator();
 }
 
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index ab50a8d7402..427fd1b56df 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,5 +1,5 @@
 /*
- * Written by Pekka Paalanen, 2008 <pq@iki.fi>
+ * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
  */
 #include <linux/module.h>
 #include <linux/io.h>
@@ -9,35 +9,74 @@
 
 static unsigned long mmio_address;
 module_param(mmio_address, ulong, 0);
-MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
+MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
+				"(or 8 MB if read_far is non-zero).");
+
+static unsigned long read_far = 0x400100;
+module_param(read_far, ulong, 0);
+MODULE_PARM_DESC(read_far, " Offset of a 32-bit read within 8 MB "
+				"(default: 0x400100).");
+
+static unsigned v16(unsigned i)
+{
+	return i * 12 + 7;
+}
+
+static unsigned v32(unsigned i)
+{
+	return i * 212371 + 13;
+}
 
 static void do_write_test(void __iomem *p)
 {
 	unsigned int i;
+	pr_info(MODULE_NAME ": write test.\n");
 	mmiotrace_printk("Write test.\n");
+
 	for (i = 0; i < 256; i++)
 		iowrite8(i, p + i);
+
 	for (i = 1024; i < (5 * 1024); i += 2)
-		iowrite16(i * 12 + 7, p + i);
+		iowrite16(v16(i), p + i);
+
 	for (i = (5 * 1024); i < (16 * 1024); i += 4)
-		iowrite32(i * 212371 + 13, p + i);
+		iowrite32(v32(i), p + i);
 }
 
 static void do_read_test(void __iomem *p)
 {
 	unsigned int i;
+	unsigned errs[3] = { 0 };
+	pr_info(MODULE_NAME ": read test.\n");
 	mmiotrace_printk("Read test.\n");
+
 	for (i = 0; i < 256; i++)
-		ioread8(p + i);
+		if (ioread8(p + i) != i)
+			++errs[0];
+
 	for (i = 1024; i < (5 * 1024); i += 2)
-		ioread16(p + i);
+		if (ioread16(p + i) != v16(i))
+			++errs[1];
+
 	for (i = (5 * 1024); i < (16 * 1024); i += 4)
-		ioread32(p + i);
+		if (ioread32(p + i) != v32(i))
+			++errs[2];
+
+	mmiotrace_printk("Read errors: 8-bit %d, 16-bit %d, 32-bit %d.\n",
+						errs[0], errs[1], errs[2]);
 }
 
-static void do_test(void)
+static void do_read_far_test(void __iomem *p)
 {
-	void __iomem *p = ioremap_nocache(mmio_address, 0x4000);
+	pr_info(MODULE_NAME ": read far test.\n");
+	mmiotrace_printk("Read far test.\n");
+
+	ioread32(p + read_far);
+}
+
+static void do_test(unsigned long size)
+{
+	void __iomem *p = ioremap_nocache(mmio_address, size);
 	if (!p) {
 		pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
 		return;
@@ -45,11 +84,15 @@ static void do_test(void)
 	mmiotrace_printk("ioremap returned %p.\n", p);
 	do_write_test(p);
 	do_read_test(p);
+	if (read_far && read_far < size - 4)
+		do_read_far_test(p);
 	iounmap(p);
 }
 
 static int __init init(void)
 {
+	unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
+
 	if (mmio_address == 0) {
 		pr_err(MODULE_NAME ": you have to use the module argument "
 							"mmio_address.\n");
@@ -58,10 +101,11 @@ static int __init init(void)
 		return -ENXIO;
 	}
 
-	pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx "
-					"in PCI address space, and writing "
-					"rubbish in there.\n", mmio_address);
-	do_test();
+	pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI "
+		"address space, and writing 16 kB of rubbish in there.\n",
+		 size >> 10, mmio_address);
+	do_test(size);
+	pr_info(MODULE_NAME ": All done.\n");
 	return 0;
 }
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c52f4034c7f..82cd39a6cbd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
 
 	vcpup = &per_cpu(xen_vcpu_info, cpu);
 
-	info.mfn = virt_to_mfn(vcpup);
+	info.mfn = arbitrary_virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
 	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = virt_to_mfn(va);
+		frames[f] = arbitrary_virt_to_mfn((void *)va);
+
 		make_lowmem_page_readonly((void *)va);
+		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
 	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
 	struct multicall_space mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 		break;
 
 	default: {
-		xmaddr_t maddr = virt_to_machine(&dt[entry]);
+		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
 
 		xen_mc_flush();
 		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c..cb6afa4ec95 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	p2m_top[topidx][idx] = mfn;
 }
 
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+	return PFN_DOWN(maddr.maddr);
+}
+
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
 	unsigned long address = (unsigned long)vaddr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815..8d470562ffc 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 	struct vcpu_guest_context *ctxt;
 	struct desc_struct *gdt;
+	unsigned long gdt_mfn;
 
 	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
 		return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->ldt_ents = 0;
 
 	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
 	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 
 	ctxt->user_regs.cs = __KERNEL_CS;