204 files changed, 11999 insertions, 11051 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index da349723d41..6a47bb22657 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -20,6 +20,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
+	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
@@ -70,6 +71,8 @@ config X86
 	select IRQ_FORCED_THREADING
 	select USE_GENERIC_SMP_HELPERS if SMP
 	select HAVE_BPF_JIT if (X86_64 && NET)
+	select CLKEVT_I8253
+	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
@@ -93,6 +96,10 @@ config CLOCKSOURCE_WATCHDOG
 config GENERIC_CLOCKEVENTS
 	def_bool y
 
+config ARCH_CLOCKSOURCE_DATA
+	def_bool y
+	depends on X86_64
+
 config GENERIC_CLOCKEVENTS_BROADCAST
 	def_bool y
 	depends on X86_64 || (X86_32 && X86_LOCAL_APIC)
@@ -384,12 +391,21 @@ config X86_INTEL_CE
 	  This option compiles in support for the CE4100 SOC for settop
 	  boxes and media devices.
 
+config X86_INTEL_MID
+	bool "Intel MID platform support"
+	depends on X86_32
+	depends on X86_EXTENDED_PLATFORM
+	---help---
+	  Select to build a kernel capable of supporting Intel MID platform
+	  systems which do not have the PCI legacy interfaces (Moorestown,
+	  Medfield). If you are building for a PC class system say N here.
+
+if X86_INTEL_MID
+
 config X86_MRST
        bool "Moorestown MID platform"
 	depends on PCI
 	depends on PCI_GOANY
-	depends on X86_32
-	depends on X86_EXTENDED_PLATFORM
 	depends on X86_IO_APIC
 	select APB_TIMER
 	select I2C
@@ -404,6 +420,8 @@ config X86_MRST
 	  nor standard legacy replacement devices/features. e.g. Moorestown does
 	  not contain i8259, i8254, HPET, legacy BIOS, most of the io ports.
 
+endif
+
 config X86_RDC321X
 	bool "RDC R-321x SoC"
 	depends on X86_32
@@ -512,6 +530,18 @@ menuconfig PARAVIRT_GUEST
 
 if PARAVIRT_GUEST
 
+config PARAVIRT_TIME_ACCOUNTING
+	bool "Paravirtual steal time accounting"
+	select PARAVIRT
+	default n
+	---help---
+	  Select this option to enable fine granularity task steal time
+	  accounting. Time spent executing other tasks in parallel with
+	  the current vCPU is discounted from the vCPU power. To account for
+	  that, there can be a small performance impact.
+
+	  If in doubt, say N here.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_CLOCK
@@ -617,6 +647,7 @@ config HPET_EMULATE_RTC
 config APB_TIMER
        def_bool y if MRST
        prompt "Langwell APB Timer Support" if X86_MRST
+       select DW_APB_TIMER
        help
          APB timer is the replacement for 8254, HPET on X86 MID platforms.
          The APBT provides a stable time base on SMP
@@ -680,33 +711,6 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 	  Calgary anyway, pass 'iommu=calgary' on the kernel command line.
 	  If unsure, say Y.
 
-config AMD_IOMMU
-	bool "AMD IOMMU support"
-	select SWIOTLB
-	select PCI_MSI
-	select PCI_IOV
-	depends on X86_64 && PCI && ACPI
-	---help---
-	  With this option you can enable support for AMD IOMMU hardware in
-	  your system. An IOMMU is a hardware component which provides
-	  remapping of DMA memory accesses from devices. With an AMD IOMMU you
-	  can isolate the the DMA memory of different devices and protect the
-	  system from misbehaving device drivers or hardware.
-
-	  You can find out if your system has an AMD IOMMU if you look into
-	  your BIOS for an option to enable it or if you have an IVRS ACPI
-	  table.
-
-config AMD_IOMMU_STATS
-	bool "Export AMD IOMMU statistics to debugfs"
-	depends on AMD_IOMMU
-	select DEBUG_FS
-	---help---
-	  This option enables code in the AMD IOMMU driver to collect various
-	  statistics about whats happening in the driver and exports that
-	  information to userspace via debugfs.
-	  If unsure, say N.
-
 # need this always selected by IOMMU for the VIA workaround
 config SWIOTLB
 	def_bool y if X86_64
@@ -720,9 +724,6 @@ config SWIOTLB
 config IOMMU_HELPER
 	def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
 
-config IOMMU_API
-	def_bool (AMD_IOMMU || DMAR)
-
 config MAXSMP
 	bool "Enable Maximum number of SMP Processors and NUMA Nodes"
 	depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
@@ -1170,7 +1171,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
-	depends on NUMA && PCI
+	depends on X86_64 && NUMA && PCI
 	---help---
 	  Enable AMD NUMA node topology detection.  You should say Y here if
 	  you have a multi processor AMD system. This uses an old method to
@@ -1737,8 +1738,8 @@ menuconfig APM
 	  machines with more than one CPU.
 
 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/power/pm.txt> and the
-	  Battery Powered Linux mini-HOWTO, available from
+	  and more information, read <file:Documentation/power/apm-acpi.txt>
+	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
 	  This driver does not spin down disk drives (see the hdparm(8)
@@ -1905,7 +1906,7 @@ config PCI_BIOS
 # x86-64 doesn't support PCI BIOS access from long mode so always go direct.
 config PCI_DIRECT
 	def_bool y
-	depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC))
+	depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
 
 config PCI_MMCONFIG
 	def_bool y
@@ -1942,55 +1943,6 @@ config PCI_CNB20LE_QUIRK
 
 	  You should say N unless you know you need this.
 
-config DMAR
-	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
-	depends on PCI_MSI && ACPI && EXPERIMENTAL
-	help
-	  DMA remapping (DMAR) devices support enables independent address
-	  translations for Direct Memory Access (DMA) from devices.
-	  These DMA remapping devices are reported via ACPI tables
-	  and include PCI device scope covered by these DMA
-	  remapping devices.
-
-config DMAR_DEFAULT_ON
-	def_bool y
-	prompt "Enable DMA Remapping Devices by default"
-	depends on DMAR
-	help
-	  Selecting this option will enable a DMAR device at boot time if
-	  one is found. If this option is not selected, DMAR support can
-	  be enabled by passing intel_iommu=on to the kernel. It is
-	  recommended you say N here while the DMAR code remains
-	  experimental.
-
-config DMAR_BROKEN_GFX_WA
-	bool "Workaround broken graphics drivers (going away soon)"
-	depends on DMAR && BROKEN
-	---help---
-	  Current Graphics drivers tend to use physical address
-	  for DMA and avoid using DMA APIs. Setting this config
-	  option permits the IOMMU driver to set a unity map for
-	  all the OS-visible memory. Hence the driver can continue
-	  to use physical addresses for DMA, at least until this
-	  option is removed in the 2.6.32 kernel.
-
-config DMAR_FLOPPY_WA
-	def_bool y
-	depends on DMAR
-	---help---
-	  Floppy disk drivers are known to bypass DMA API calls
-	  thereby failing to work when IOMMU is enabled. This
-	  workaround will setup a 1:1 mapping for the first
-	  16MiB to make floppy (an ISA device) work.
-
-config INTR_REMAP
-	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
-	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
-	---help---
-	  Supports Interrupt remapping for IO-APIC and MSI devices.
-	  To use x2apic mode in the CPU's which support x2APIC enhancements or
-	  to support platforms with CPU's having > 8 bit APIC ID, say Y.
-
 source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
@@ -2073,11 +2025,44 @@ config OLPC
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
 
-config OLPC_XO1
-	tristate "OLPC XO-1 support"
-	depends on OLPC && MFD_CS5535
-	---help---
-	  Add support for non-essential features of the OLPC XO-1 laptop.
+config OLPC_XO1_PM
+	bool "OLPC XO-1 Power Management"
+	depends on OLPC && MFD_CS5535 && PM_SLEEP
+	select MFD_CORE
+	---help---
+	  Add support for poweroff and suspend of the OLPC XO-1 laptop.
+
+config OLPC_XO1_RTC
+	bool "OLPC XO-1 Real Time Clock"
+	depends on OLPC_XO1_PM && RTC_DRV_CMOS
+	---help---
+	  Add support for the XO-1 real time clock, which can be used as a
+	  programmable wakeup source.
+
+config OLPC_XO1_SCI
+	bool "OLPC XO-1 SCI extras"
+	depends on OLPC && OLPC_XO1_PM
+	select POWER_SUPPLY
+	select GPIO_CS5535
+	select MFD_CORE
+	---help---
+	  Add support for SCI-based features of the OLPC XO-1 laptop:
+	   - EC-driven system wakeups
+	   - Power button
+	   - Ebook switch
+	   - Lid switch
+	   - AC adapter status updates
+	   - Battery status updates
+
+config OLPC_XO15_SCI
+	bool "OLPC XO-1.5 SCI extras"
+	depends on OLPC && ACPI
+	select POWER_SUPPLY
+	---help---
+	  Add support for SCI-based features of the OLPC XO-1.5 laptop:
+	   - EC-driven system wakeups
+	   - AC adapter status updates
+	   - Battery status updates
 
 endif # X86_32
 
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 6a7cfdf8ff6..e3ca7e0d858 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -312,6 +312,9 @@ config X86_CMPXCHG
 config CMPXCHG_LOCAL
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_DOUBLE
+	def_bool y
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index f7cb086b4ad..95365a82b6a 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -9,12 +9,6 @@
 # Changed by many, many contributors over the years.
 #
 
-# ROOT_DEV specifies the default root-device when making the image.
-# This can be either FLOPPY, CURRENT, /dev/xxxx or empty, in which case
-# the default of FLOPPY is used by 'build'.
-
-ROOT_DEV	:= CURRENT
-
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
@@ -75,8 +69,7 @@ GCOV_PROFILE := n
 $(obj)/bzImage: asflags-y  := $(SVGA_MODE)
 
 quiet_cmd_image = BUILD   $@
-cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \
-	$(ROOT_DEV) > $@
+cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin > $@
 
 $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE
 	$(call if_changed,image)
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index ee3a4ea923a..fdc60a0b3c2 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -130,7 +130,7 @@ static void die(const char * str, ...)
 
 static void usage(void)
 {
-	die("Usage: build setup system [rootdev] [> image]");
+	die("Usage: build setup system [> image]");
 }
 
 int main(int argc, char ** argv)
@@ -138,39 +138,14 @@ int main(int argc, char ** argv)
 	unsigned int i, sz, setup_sectors;
 	int c;
 	u32 sys_size;
-	u8 major_root, minor_root;
 	struct stat sb;
 	FILE *file;
 	int fd;
 	void *kernel;
 	u32 crc = 0xffffffffUL;
 
-	if ((argc < 3) || (argc > 4))
+	if (argc != 3)
 		usage();
-	if (argc > 3) {
-		if (!strcmp(argv[3], "CURRENT")) {
-			if (stat("/", &sb)) {
-				perror("/");
-				die("Couldn't stat /");
-			}
-			major_root = major(sb.st_dev);
-			minor_root = minor(sb.st_dev);
-		} else if (strcmp(argv[3], "FLOPPY")) {
-			if (stat(argv[3], &sb)) {
-				perror(argv[3]);
-				die("Couldn't stat root device.");
-			}
-			major_root = major(sb.st_rdev);
-			minor_root = minor(sb.st_rdev);
-		} else {
-			major_root = 0;
-			minor_root = 0;
-		}
-	} else {
-		major_root = DEFAULT_MAJOR_ROOT;
-		minor_root = DEFAULT_MINOR_ROOT;
-	}
-	fprintf(stderr, "Root device is (%d, %d)\n", major_root, minor_root);
 
 	/* Copy the setup code */
 	file = fopen(argv[1], "r");
@@ -193,8 +168,8 @@ int main(int argc, char ** argv)
 	memset(buf+c, 0, i-c);
 
 	/* Set the default root device */
-	buf[508] = minor_root;
-	buf[509] = major_root;
+	buf[508] = DEFAULT_MINOR_ROOT;
+	buf[509] = DEFAULT_MAJOR_ROOT;
 
 	fprintf(stderr, "Setup is %d bytes (padded to %d bytes).\n", c, i);
 
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 7a6e68e4f74..976aa64d9a2 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -245,7 +245,7 @@ static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
 	crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
 			       & CRYPTO_TFM_RES_MASK);
 
-	return 0;
+	return err;
 }
 
 static int ghash_async_init_tfm(struct crypto_tfm *tfm)
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 588a7aa937e..65577698cab 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -127,15 +127,17 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 
 asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
+	sigset_t blocked;
+
 	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+
+	mask &= _BLOCKABLE;
+	siginitset(&blocked, mask);
+	set_current_blocked(&blocked);
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
+
 	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
@@ -279,10 +281,7 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs)
 		goto badframe;
 
 	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+	set_current_blocked(&set);
 
 	if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
 		goto badframe;
@@ -308,10 +307,7 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs)
 		goto badframe;
 
 	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+	set_current_blocked(&set);
 
 	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
 		goto badframe;
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index c1870dddd32..a0e866d233e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -143,7 +143,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rip,0
 	pushq_cfi %rax
 	cld
-	SAVE_ARGS 0,0,1
+	SAVE_ARGS 0,1,0
  	/* no need to do an access_ok check here because rbp has been
  	   32bit zero extended */ 
 1:	movl	(%rbp),%ebp
@@ -173,7 +173,7 @@ sysexit_from_sys_call:
 	andl  $~0x200,EFLAGS-R11(%rsp) 
 	movl	RIP-R11(%rsp),%edx		/* User %eip */
 	CFI_REGISTER rip,rdx
-	RESTORE_ARGS 1,24,1,1,1,1
+	RESTORE_ARGS 0,24,0,0,0,0
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
@@ -289,7 +289,7 @@ ENTRY(ia32_cstar_target)
 	 * disabled irqs and here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,1,1
+	SAVE_ARGS 8,0,0
 	movl 	%eax,%eax	/* zero extension */
 	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq	%rcx,RIP-ARGOFFSET(%rsp)
@@ -328,7 +328,7 @@ cstar_dispatch:
 	jnz sysretl_audit
 sysretl_from_sys_call:
 	andl $~TS_COMPAT,TI_status(%r10)
-	RESTORE_ARGS 1,-ARG_SKIP,1,1,1
+	RESTORE_ARGS 0,-ARG_SKIP,0,0,0
 	movl RIP-ARGOFFSET(%rsp),%ecx
 	CFI_REGISTER rip,rcx
 	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
@@ -419,7 +419,7 @@ ENTRY(ia32_syscall)
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
-	SAVE_ARGS 0,0,1
+	SAVE_ARGS 0,1,0
 	GET_THREAD_INFO(%r10)
 	orl   $TS_COMPAT,TI_status(%r10)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 5852519b2d0..f6f5c53dc90 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -43,7 +43,7 @@
 #include <asm/mman.h>
 #include <asm/types.h>
 #include <asm/uaccess.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/vgtod.h>
 #include <asm/sys_ia32.h>
 
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 94d420b360d..4554cc6fb96 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -17,8 +17,8 @@
 
 .macro altinstruction_entry orig alt feature orig_len alt_len
 	.align 8
-	.quad \orig
-	.quad \alt
+	.long \orig - .
+	.long \alt - .
 	.word \feature
 	.byte \orig_len
 	.byte \alt_len
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bf535f947e8..23fb6d79f20 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -43,8 +43,8 @@
 #endif
 
 struct alt_instr {
-	u8 *instr;		/* original instruction */
-	u8 *replacement;
+	s32 instr_offset;	/* original instruction */
+	s32 repl_offset;	/* offset to replacement instruction */
 	u16 cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
 	u8  replacementlen;	/* length of new instruction, <= instrlen */
@@ -84,8 +84,8 @@ static inline int alternatives_text_reserved(void *start, void *end)
       "661:\n\t" oldinstr "\n662:\n"					\
       ".section .altinstructions,\"a\"\n"				\
       _ASM_ALIGN "\n"							\
-      _ASM_PTR "661b\n"				/* label           */	\
-      _ASM_PTR "663f\n"				/* new instruction */	\
+      "	 .long 661b - .\n"			/* label           */	\
+      "	 .long 663f - .\n"			/* new instruction */	\
       "	 .word " __stringify(feature) "\n"	/* feature bit     */	\
       "	 .byte 662b-661b\n"			/* sourcelen       */	\
       "	 .byte 664f-663f\n"			/* replacementlen  */	\
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
deleted file mode 100644
index a6863a2dec1..00000000000
--- a/arch/x86/include/asm/amd_iommu.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_H
-#define _ASM_X86_AMD_IOMMU_H
-
-#include <linux/irqreturn.h>
-
-#ifdef CONFIG_AMD_IOMMU
-
-extern int amd_iommu_detect(void);
-
-#else
-
-static inline int amd_iommu_detect(void) { return -ENODEV; }
-
-#endif
-
-#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
deleted file mode 100644
index 55d95eb789b..00000000000
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
-#define _ASM_X86_AMD_IOMMU_PROTO_H
-
-#include <asm/amd_iommu_types.h>
-
-extern int amd_iommu_init_dma_ops(void);
-extern int amd_iommu_init_passthrough(void);
-extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
-extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
-extern void amd_iommu_apply_erratum_63(u16 devid);
-extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
-extern int amd_iommu_init_devices(void);
-extern void amd_iommu_uninit_devices(void);
-extern void amd_iommu_init_notifier(void);
-extern void amd_iommu_init_api(void);
-#ifndef CONFIG_AMD_IOMMU_STATS
-
-static inline void amd_iommu_stats_init(void) { }
-
-#endif /* !CONFIG_AMD_IOMMU_STATS */
-
-static inline bool is_rd890_iommu(struct pci_dev *pdev)
-{
-	return (pdev->vendor == PCI_VENDOR_ID_ATI) &&
-	       (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
-}
-
-static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
-{
-	if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
-		return false;
-
-	return !!(iommu->features & f);
-}
-
-#endif /* _ASM_X86_AMD_IOMMU_PROTO_H  */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
deleted file mode 100644
index 4c998299541..00000000000
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ /dev/null
@@ -1,580 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
-#define _ASM_X86_AMD_IOMMU_TYPES_H
-
-#include <linux/types.h>
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-
-/*
- * Maximum number of IOMMUs supported
- */
-#define MAX_IOMMUS	32
-
-/*
- * some size calculation constants
- */
-#define DEV_TABLE_ENTRY_SIZE		32
-#define ALIAS_TABLE_ENTRY_SIZE		2
-#define RLOOKUP_TABLE_ENTRY_SIZE	(sizeof(void *))
-
-/* Length of the MMIO region for the AMD IOMMU */
-#define MMIO_REGION_LENGTH       0x4000
-
-/* Capability offsets used by the driver */
-#define MMIO_CAP_HDR_OFFSET	0x00
-#define MMIO_RANGE_OFFSET	0x0c
-#define MMIO_MISC_OFFSET	0x10
-
-/* Masks, shifts and macros to parse the device range capability */
-#define MMIO_RANGE_LD_MASK	0xff000000
-#define MMIO_RANGE_FD_MASK	0x00ff0000
-#define MMIO_RANGE_BUS_MASK	0x0000ff00
-#define MMIO_RANGE_LD_SHIFT	24
-#define MMIO_RANGE_FD_SHIFT	16
-#define MMIO_RANGE_BUS_SHIFT	8
-#define MMIO_GET_LD(x)  (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
-#define MMIO_GET_FD(x)  (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
-#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
-#define MMIO_MSI_NUM(x)	((x) & 0x1f)
-
-/* Flag masks for the AMD IOMMU exclusion range */
-#define MMIO_EXCL_ENABLE_MASK 0x01ULL
-#define MMIO_EXCL_ALLOW_MASK  0x02ULL
-
-/* Used offsets into the MMIO space */
-#define MMIO_DEV_TABLE_OFFSET   0x0000
-#define MMIO_CMD_BUF_OFFSET     0x0008
-#define MMIO_EVT_BUF_OFFSET     0x0010
-#define MMIO_CONTROL_OFFSET     0x0018
-#define MMIO_EXCL_BASE_OFFSET   0x0020
-#define MMIO_EXCL_LIMIT_OFFSET  0x0028
-#define MMIO_EXT_FEATURES	0x0030
-#define MMIO_CMD_HEAD_OFFSET	0x2000
-#define MMIO_CMD_TAIL_OFFSET	0x2008
-#define MMIO_EVT_HEAD_OFFSET	0x2010
-#define MMIO_EVT_TAIL_OFFSET	0x2018
-#define MMIO_STATUS_OFFSET	0x2020
-
-
-/* Extended Feature Bits */
-#define FEATURE_PREFETCH	(1ULL<<0)
-#define FEATURE_PPR		(1ULL<<1)
-#define FEATURE_X2APIC		(1ULL<<2)
-#define FEATURE_NX		(1ULL<<3)
-#define FEATURE_GT		(1ULL<<4)
-#define FEATURE_IA		(1ULL<<6)
-#define FEATURE_GA		(1ULL<<7)
-#define FEATURE_HE		(1ULL<<8)
-#define FEATURE_PC		(1ULL<<9)
-
-/* MMIO status bits */
-#define MMIO_STATUS_COM_WAIT_INT_MASK	0x04
-
-/* event logging constants */
-#define EVENT_ENTRY_SIZE	0x10
-#define EVENT_TYPE_SHIFT	28
-#define EVENT_TYPE_MASK		0xf
-#define EVENT_TYPE_ILL_DEV	0x1
-#define EVENT_TYPE_IO_FAULT	0x2
-#define EVENT_TYPE_DEV_TAB_ERR	0x3
-#define EVENT_TYPE_PAGE_TAB_ERR	0x4
-#define EVENT_TYPE_ILL_CMD	0x5
-#define EVENT_TYPE_CMD_HARD_ERR	0x6
-#define EVENT_TYPE_IOTLB_INV_TO	0x7
-#define EVENT_TYPE_INV_DEV_REQ	0x8
-#define EVENT_DEVID_MASK	0xffff
-#define EVENT_DEVID_SHIFT	0
-#define EVENT_DOMID_MASK	0xffff
-#define EVENT_DOMID_SHIFT	0
-#define EVENT_FLAGS_MASK	0xfff
-#define EVENT_FLAGS_SHIFT	0x10
-
-/* feature control bits */
-#define CONTROL_IOMMU_EN        0x00ULL
-#define CONTROL_HT_TUN_EN       0x01ULL
-#define CONTROL_EVT_LOG_EN      0x02ULL
-#define CONTROL_EVT_INT_EN      0x03ULL
-#define CONTROL_COMWAIT_EN      0x04ULL
-#define CONTROL_PASSPW_EN       0x08ULL
-#define CONTROL_RESPASSPW_EN    0x09ULL
-#define CONTROL_COHERENT_EN     0x0aULL
-#define CONTROL_ISOC_EN         0x0bULL
-#define CONTROL_CMDBUF_EN       0x0cULL
-#define CONTROL_PPFLOG_EN       0x0dULL
-#define CONTROL_PPFINT_EN       0x0eULL
-
-/* command specific defines */
-#define CMD_COMPL_WAIT          0x01
-#define CMD_INV_DEV_ENTRY       0x02
-#define CMD_INV_IOMMU_PAGES	0x03
-#define CMD_INV_IOTLB_PAGES	0x04
-#define CMD_INV_ALL		0x08
-
-#define CMD_COMPL_WAIT_STORE_MASK	0x01
-#define CMD_COMPL_WAIT_INT_MASK		0x02
-#define CMD_INV_IOMMU_PAGES_SIZE_MASK	0x01
-#define CMD_INV_IOMMU_PAGES_PDE_MASK	0x02
-
-#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS	0x7fffffffffffffffULL
-
-/* macros and definitions for device table entries */
-#define DEV_ENTRY_VALID         0x00
-#define DEV_ENTRY_TRANSLATION   0x01
-#define DEV_ENTRY_IR            0x3d
-#define DEV_ENTRY_IW            0x3e
-#define DEV_ENTRY_NO_PAGE_FAULT	0x62
-#define DEV_ENTRY_EX            0x67
-#define DEV_ENTRY_SYSMGT1       0x68
-#define DEV_ENTRY_SYSMGT2       0x69
-#define DEV_ENTRY_INIT_PASS     0xb8
-#define DEV_ENTRY_EINT_PASS     0xb9
-#define DEV_ENTRY_NMI_PASS      0xba
-#define DEV_ENTRY_LINT0_PASS    0xbe
-#define DEV_ENTRY_LINT1_PASS    0xbf
-#define DEV_ENTRY_MODE_MASK	0x07
-#define DEV_ENTRY_MODE_SHIFT	0x09
-
-/* constants to configure the command buffer */
-#define CMD_BUFFER_SIZE    8192
-#define CMD_BUFFER_UNINITIALIZED 1
-#define CMD_BUFFER_ENTRIES 512
-#define MMIO_CMD_SIZE_SHIFT 56
-#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
-
-/* constants for event buffer handling */
-#define EVT_BUFFER_SIZE		8192 /* 512 entries */
-#define EVT_LEN_MASK		(0x9ULL << 56)
-
-#define PAGE_MODE_NONE    0x00
-#define PAGE_MODE_1_LEVEL 0x01
-#define PAGE_MODE_2_LEVEL 0x02
-#define PAGE_MODE_3_LEVEL 0x03
-#define PAGE_MODE_4_LEVEL 0x04
-#define PAGE_MODE_5_LEVEL 0x05
-#define PAGE_MODE_6_LEVEL 0x06
-
-#define PM_LEVEL_SHIFT(x)	(12 + ((x) * 9))
-#define PM_LEVEL_SIZE(x)	(((x) < 6) ? \
-				  ((1ULL << PM_LEVEL_SHIFT((x))) - 1): \
-				   (0xffffffffffffffffULL))
-#define PM_LEVEL_INDEX(x, a)	(((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
-#define PM_LEVEL_ENC(x)		(((x) << 9) & 0xe00ULL)
-#define PM_LEVEL_PDE(x, a)	((a) | PM_LEVEL_ENC((x)) | \
-				 IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
-#define PM_PTE_LEVEL(pte)	(((pte) >> 9) & 0x7ULL)
-
-#define PM_MAP_4k		0
-#define PM_ADDR_MASK		0x000ffffffffff000ULL
-#define PM_MAP_MASK(lvl)	(PM_ADDR_MASK & \
-				(~((1ULL << (12 + ((lvl) * 9))) - 1)))
-#define PM_ALIGNED(lvl, addr)	((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
-/*
- * Returns the page table level to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_LEVEL(pagesize) \
-		((__ffs(pagesize) - 12) / 9)
-/*
- * Returns the number of ptes to use for a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_PTE_COUNT(pagesize) \
-		(1ULL << ((__ffs(pagesize) - 12) % 9))
-
-/*
- * Aligns a given io-virtual address to a given page size
- * Pagesize is expected to be a power-of-two
- */
-#define PAGE_SIZE_ALIGN(address, pagesize) \
-		((address) & ~((pagesize) - 1))
-/*
- * Creates an IOMMU PTE for an address an a given pagesize
- * The PTE has no permission bits set
- * Pagesize is expected to be a power-of-two larger than 4096
- */
-#define PAGE_SIZE_PTE(address, pagesize)		\
-		(((address) | ((pagesize) - 1)) &	\
-		 (~(pagesize >> 1)) & PM_ADDR_MASK)
-
-/*
- * Takes a PTE value with mode=0x07 and returns the page size it maps
- */
-#define PTE_PAGE_SIZE(pte) \
-	(1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
-#define IOMMU_PTE_P  (1ULL << 0)
-#define IOMMU_PTE_TV (1ULL << 1)
-#define IOMMU_PTE_U  (1ULL << 59)
-#define IOMMU_PTE_FC (1ULL << 60)
-#define IOMMU_PTE_IR (1ULL << 61)
-#define IOMMU_PTE_IW (1ULL << 62)
-
-#define DTE_FLAG_IOTLB	0x01
-
-#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
-#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
-#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
-#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
-
-#define IOMMU_PROT_MASK 0x03
-#define IOMMU_PROT_IR 0x01
-#define IOMMU_PROT_IW 0x02
-
-/* IOMMU capabilities */
-#define IOMMU_CAP_IOTLB   24
-#define IOMMU_CAP_NPCACHE 26
-#define IOMMU_CAP_EFR     27
-
-#define MAX_DOMAIN_ID 65536
-
-/* FIXME: move this macro to <linux/pci.h> */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
-
-/* Protection domain flags */
-#define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
-#define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
-					      domain for an IOMMU */
-#define PD_PASSTHROUGH_MASK	(1UL << 2) /* domain has no page
-					      translation */
-
-extern bool amd_iommu_dump;
-#define DUMP_printk(format, arg...)					\
-	do {								\
-		if (amd_iommu_dump)						\
-			printk(KERN_INFO "AMD-Vi: " format, ## arg);	\
-	} while(0);
-
-/* global flag if IOMMUs cache non-present entries */
-extern bool amd_iommu_np_cache;
-/* Only true if all IOMMUs support device IOTLBs */
-extern bool amd_iommu_iotlb_sup;
-
-/*
- * Make iterating over all IOMMUs easier
- */
-#define for_each_iommu(iommu) \
-	list_for_each_entry((iommu), &amd_iommu_list, list)
-#define for_each_iommu_safe(iommu, next) \
-	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
-
-#define APERTURE_RANGE_SHIFT	27	/* 128 MB */
-#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT)
-#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT)
-#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */
-#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
-#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)
-
-/*
- * This structure contains generic data for  IOMMU protection domains
- * independent of their use.
- */
-struct protection_domain {
-	struct list_head list;  /* for list of all protection domains */
-	struct list_head dev_list; /* List of all devices in this domain */
-	spinlock_t lock;	/* mostly used to lock the page table*/
-	struct mutex api_lock;	/* protect page tables in the iommu-api path */
-	u16 id;			/* the domain id written to the device table */
-	int mode;		/* paging mode (0-6 levels) */
-	u64 *pt_root;		/* page table root pointer */
-	unsigned long flags;	/* flags to find out type of domain */
-	bool updated;		/* complete domain flush required */
-	unsigned dev_cnt;	/* devices assigned to this domain */
-	unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
-	void *priv;		/* private data */
-
-};
-
-/*
- * This struct contains device specific data for the IOMMU
- */
-struct iommu_dev_data {
-	struct list_head list;		  /* For domain->dev_list */
-	struct device *dev;		  /* Device this data belong to */
-	struct device *alias;		  /* The Alias Device */
-	struct protection_domain *domain; /* Domain the device is bound to */
-	atomic_t bind;			  /* Domain attach reverent count */
-};
-
-/*
- * For dynamic growth the aperture size is split into ranges of 128MB of
- * DMA address space each. This struct represents one such range.
- */
-struct aperture_range {
-
-	/* address allocation bitmap */
-	unsigned long *bitmap;
-
-	/*
-	 * Array of PTE pages for the aperture. In this array we save all the
-	 * leaf pages of the domain page table used for the aperture. This way
-	 * we don't need to walk the page table to find a specific PTE. We can
-	 * just calculate its address in constant time.
-	 */
-	u64 *pte_pages[64];
-
-	unsigned long offset;
-};
-
-/*
- * Data container for a dma_ops specific protection domain
- */
-struct dma_ops_domain {
-	struct list_head list;
-
-	/* generic protection domain information */
-	struct protection_domain domain;
-
-	/* size of the aperture for the mappings */
-	unsigned long aperture_size;
-
-	/* address we start to search for free addresses */
-	unsigned long next_address;
-
-	/* address space relevant data */
-	struct aperture_range *aperture[APERTURE_MAX_RANGES];
-
-	/* This will be set to true when TLB needs to be flushed */
-	bool need_flush;
-
-	/*
-	 * if this is a preallocated domain, keep the device for which it was
-	 * preallocated in this variable
-	 */
-	u16 target_dev;
-};
-
-/*
- * Structure where we save information about one hardware AMD IOMMU in the
- * system.
- */
-struct amd_iommu {
-	struct list_head list;
-
-	/* Index within the IOMMU array */
-	int index;
-
-	/* locks the accesses to the hardware */
-	spinlock_t lock;
-
-	/* Pointer to PCI device of this IOMMU */
-	struct pci_dev *dev;
-
-	/* physical address of MMIO space */
-	u64 mmio_phys;
-	/* virtual address of MMIO space */
-	u8 *mmio_base;
-
-	/* capabilities of that IOMMU read from ACPI */
-	u32 cap;
-
-	/* flags read from acpi table */
-	u8 acpi_flags;
-
-	/* Extended features */
-	u64 features;
-
-	/*
-	 * Capability pointer. There could be more than one IOMMU per PCI
-	 * device function if there are more than one AMD IOMMU capability
-	 * pointers.
-	 */
-	u16 cap_ptr;
-
-	/* pci domain of this IOMMU */
-	u16 pci_seg;
-
-	/* first device this IOMMU handles. read from PCI */
-	u16 first_device;
-	/* last device this IOMMU handles. read from PCI */
-	u16 last_device;
-
-	/* start of exclusion range of that IOMMU */
-	u64 exclusion_start;
-	/* length of exclusion range of that IOMMU */
-	u64 exclusion_length;
-
-	/* command buffer virtual address */
-	u8 *cmd_buf;
-	/* size of command buffer */
-	u32 cmd_buf_size;
-
-	/* size of event buffer */
-	u32 evt_buf_size;
-	/* event buffer virtual address */
-	u8 *evt_buf;
-	/* MSI number for event interrupt */
-	u16 evt_msi_num;
-
-	/* true if interrupts for this IOMMU are already enabled */
-	bool int_enabled;
-
-	/* if one, we need to send a completion wait command */
-	bool need_sync;
-
-	/* default dma_ops domain for that IOMMU */
-	struct dma_ops_domain *default_dom;
-
-	/*
-	 * We can't rely on the BIOS to restore all values on reinit, so we
-	 * need to stash them
-	 */
-
-	/* The iommu BAR */
-	u32 stored_addr_lo;
-	u32 stored_addr_hi;
-
-	/*
-	 * Each iommu has 6 l1s, each of which is documented as having 0x12
-	 * registers
-	 */
-	u32 stored_l1[6][0x12];
-
-	/* The l2 indirect registers */
-	u32 stored_l2[0x83];
-};
-
-/*
- * List with all IOMMUs in the system. This list is not locked because it is
- * only written and read at driver initialization or suspend time
- */
-extern struct list_head amd_iommu_list;
-
-/*
- * Array with pointers to each IOMMU struct
- * The indices are referenced in the protection domains
- */
-extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
-
-/* Number of IOMMUs present in the system */
-extern int amd_iommus_present;
-
-/*
- * Declarations for the global list of all protection domains
- */
-extern spinlock_t amd_iommu_pd_lock;
-extern struct list_head amd_iommu_pd_list;
-
-/*
- * Structure defining one entry in the device table
- */
-struct dev_table_entry {
-	u32 data[8];
-};
-
-/*
- * One entry for unity mappings parsed out of the ACPI table.
- */
-struct unity_map_entry {
-	struct list_head list;
-
-	/* starting device id this entry is used for (including) */
-	u16 devid_start;
-	/* end device id this entry is used for (including) */
-	u16 devid_end;
-
-	/* start address to unity map (including) */
-	u64 address_start;
-	/* end address to unity map (including) */
-	u64 address_end;
-
-	/* required protection */
-	int prot;
-};
-
-/*
- * List of all unity mappings. It is not locked because as runtime it is only
- * read. It is created at ACPI table parsing time.
- */
-extern struct list_head amd_iommu_unity_map;
-
-/*
- * Data structures for device handling
- */
-
-/*
- * Device table used by hardware. Read and write accesses by software are
- * locked with the amd_iommu_pd_table lock.
- */
-extern struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * Alias table to find requestor ids to device ids. Not locked because only
- * read on runtime.
- */
-extern u16 *amd_iommu_alias_table;
-
-/*
- * Reverse lookup table to find the IOMMU which translates a specific device.
- */
-extern struct amd_iommu **amd_iommu_rlookup_table;
-
-/* size of the dma_ops aperture as power of 2 */
-extern unsigned amd_iommu_aperture_order;
-
-/* largest PCI device id we expect translation requests for */
-extern u16 amd_iommu_last_bdf;
-
-/* allocation bitmap for domain ids */
-extern unsigned long *amd_iommu_pd_alloc_bitmap;
-
-/*
- * If true, the addresses will be flushed on unmap time, not when
- * they are reused
- */
-extern bool amd_iommu_unmap_flush;
-
-/* takes bus and device/function and returns the device id
- * FIXME: should that be in generic PCI code? */
-static inline u16 calc_devid(u8 bus, u8 devfn)
-{
-	return (((u16)bus) << 8) | devfn;
-}
-
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-struct __iommu_counter {
-	char *name;
-	struct dentry *dent;
-	u64 value;
-};
-
-#define DECLARE_STATS_COUNTER(nm) \
-	static struct __iommu_counter nm = {	\
-		.name = #nm,			\
-	}
-
-#define INC_STATS_COUNTER(name)		name.value += 1
-#define ADD_STATS_COUNTER(name, x)	name.value += (x)
-#define SUB_STATS_COUNTER(name, x)	name.value -= (x)
-
-#else /* CONFIG_AMD_IOMMU_STATS */
-
-#define DECLARE_STATS_COUNTER(name)
-#define INC_STATS_COUNTER(name)
-#define ADD_STATS_COUNTER(name, x)
-#define SUB_STATS_COUNTER(name, x)
-
-#endif /* CONFIG_AMD_IOMMU_STATS */
-
-#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index af60d8a2e28..0acbac299e4 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -18,24 +18,6 @@
 
 #ifdef CONFIG_APB_TIMER
 
-/* Langwell DW APB timer registers */
-#define APBTMR_N_LOAD_COUNT    0x00
-#define APBTMR_N_CURRENT_VALUE 0x04
-#define APBTMR_N_CONTROL       0x08
-#define APBTMR_N_EOI           0x0c
-#define APBTMR_N_INT_STATUS    0x10
-
-#define APBTMRS_INT_STATUS     0xa0
-#define APBTMRS_EOI            0xa4
-#define APBTMRS_RAW_INT_STATUS 0xa8
-#define APBTMRS_COMP_VERSION   0xac
-#define APBTMRS_REG_SIZE       0x14
-
-/* register bits */
-#define APBTMR_CONTROL_ENABLE  (1<<0)
-#define APBTMR_CONTROL_MODE_PERIODIC   (1<<1) /*1: periodic 0:free running */
-#define APBTMR_CONTROL_INT     (1<<2)
-
 /* default memory mapped register base */
 #define LNW_SCU_ADDR           0xFF100000
 #define LNW_EXT_TIMER_OFFSET   0x1B800
@@ -43,14 +25,13 @@
 #define LNW_EXT_TIMER_PGOFFSET         0x800
 
 /* APBT clock speed range from PCLK to fabric base, 25-100MHz */
-#define APBT_MAX_FREQ          50
-#define APBT_MIN_FREQ          1
+#define APBT_MAX_FREQ          50000000
+#define APBT_MIN_FREQ          1000000
 #define APBT_MMAP_SIZE         1024
 
 #define APBT_DEV_USED  1
 
 extern void apbt_time_init(void);
-extern struct clock_event_device *global_clock_event;
 extern unsigned long apbt_quick_calibrate(void);
 extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
 extern void apbt_setup_secondary_clock(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 4a0b7c7e2cc..7b3ca8324b6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -8,7 +8,7 @@
 #include <asm/cpufeature.h>
 #include <asm/processor.h>
 #include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #include <asm/system.h>
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
index b3ed1e1460f..9412d6558c8 100644
--- a/arch/x86/include/asm/asm.h
+++ b/arch/x86/include/asm/asm.h
@@ -3,9 +3,11 @@
 
 #ifdef __ASSEMBLY__
 # define __ASM_FORM(x)	x
+# define __ASM_FORM_COMMA(x) x,
 # define __ASM_EX_SEC	.section __ex_table, "a"
 #else
 # define __ASM_FORM(x)	" " #x " "
+# define __ASM_FORM_COMMA(x) " " #x ","
 # define __ASM_EX_SEC	" .section __ex_table,\"a\"\n"
 #endif
 
@@ -15,7 +17,8 @@
 # define __ASM_SEL(a,b) __ASM_FORM(b)
 #endif
 
-#define __ASM_SIZE(inst)	__ASM_SEL(inst##l, inst##q)
+#define __ASM_SIZE(inst, ...)	__ASM_SEL(inst##l##__VA_ARGS__, \
+					  inst##q##__VA_ARGS__)
 #define __ASM_REG(reg)		__ASM_SEL(e##reg, r##reg)
 
 #define _ASM_PTR	__ASM_SEL(.long, .quad)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 952a826ac4e..10572e309ab 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -221,15 +221,15 @@ static inline int atomic_xchg(atomic_t *v, int new)
 }
 
 /**
- * atomic_add_unless - add unless the number is already a given value
+ * __atomic_add_unless - add unless the number is already a given value
  * @v: pointer of type atomic_t
  * @a: the amount to add to v...
  * @u: ...unless v is equal to u.
  *
  * Atomically adds @a to @v, so long as @v was not already @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Returns the old value of @v.
  */
-static inline int atomic_add_unless(atomic_t *v, int a, int u)
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
@@ -241,10 +241,9 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 			break;
 		c = old;
 	}
-	return c != (u);
+	return c;
 }
 
-#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
 
 /*
  * atomic_dec_if_positive - decrement by 1 if old value positive
@@ -319,5 +318,4 @@ static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
 # include "atomic64_64.h"
 #endif
 
-#include <asm-generic/atomic-long.h>
 #endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 2a934aa19a4..24098aafce0 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -263,7 +263,7 @@ static inline int atomic64_add_negative(long long i, atomic64_t *v)
  * @u: ...unless v is equal to u.
  *
  * Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Returns the old value of @v.
  */
 static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 {
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 49fd1ea2295..017594d403f 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -202,7 +202,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new)
  * @u: ...unless v is equal to u.
  *
  * Atomically adds @a to @v, so long as it was not @u.
- * Returns non-zero if @v was not @u, and zero otherwise.
+ * Returns the old value of @v.
  */
 static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
 {
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 69d58131bc8..1775d6e5920 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -458,10 +458,7 @@ static inline int fls(int x)
 
 #include <asm-generic/bitops/le.h>
 
-#define ext2_set_bit_atomic(lock, nr, addr)			\
-	test_and_set_bit((nr), (unsigned long *)(addr))
-#define ext2_clear_bit_atomic(lock, nr, addr)			\
-	test_and_clear_bit((nr), (unsigned long *)(addr))
+#include <asm-generic/bitops/ext2-atomic-setbit.h>
 
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 30af5a83216..a9e3a740f69 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,6 +46,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 */
 
+#include "dwarf2.h"
 
 /*
  * 64-bit system call stack frame layout defines and helpers, for
@@ -84,72 +85,57 @@ For 32-bit we have the following conventions - kernel is built with
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
-	.macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
+	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
-	movq  %rdi, 8*8(%rsp)
-	CFI_REL_OFFSET	rdi, 8*8
-	movq  %rsi, 7*8(%rsp)
-	CFI_REL_OFFSET	rsi, 7*8
-	movq  %rdx, 6*8(%rsp)
-	CFI_REL_OFFSET	rdx, 6*8
-	.if \norcx
-	.else
-	movq  %rcx, 5*8(%rsp)
-	CFI_REL_OFFSET	rcx, 5*8
+	movq_cfi rdi, 8*8
+	movq_cfi rsi, 7*8
+	movq_cfi rdx, 6*8
+
+	.if \save_rcx
+	movq_cfi rcx, 5*8
 	.endif
-	movq  %rax, 4*8(%rsp)
-	CFI_REL_OFFSET	rax, 4*8
-	.if \nor891011
-	.else
-	movq  %r8, 3*8(%rsp)
-	CFI_REL_OFFSET	r8,  3*8
-	movq  %r9, 2*8(%rsp)
-	CFI_REL_OFFSET	r9,  2*8
-	movq  %r10, 1*8(%rsp)
-	CFI_REL_OFFSET	r10, 1*8
-	movq  %r11, (%rsp)
-	CFI_REL_OFFSET	r11, 0*8
+
+	movq_cfi rax, 4*8
+
+	.if \save_r891011
+	movq_cfi r8,  3*8
+	movq_cfi r9,  2*8
+	movq_cfi r10, 1*8
+	movq_cfi r11, 0*8
 	.endif
+
 	.endm
 
 #define ARG_SKIP	(9*8)
 
-	.macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
-			    skipr8910=0, skiprdx=0
-	.if \skipr11
-	.else
-	movq (%rsp), %r11
-	CFI_RESTORE r11
+	.macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
+			    rstor_r8910=1, rstor_rdx=1
+	.if \rstor_r11
+	movq_cfi_restore 0*8, r11
 	.endif
-	.if \skipr8910
-	.else
-	movq 1*8(%rsp), %r10
-	CFI_RESTORE r10
-	movq 2*8(%rsp), %r9
-	CFI_RESTORE r9
-	movq 3*8(%rsp), %r8
-	CFI_RESTORE r8
+
+	.if \rstor_r8910
+	movq_cfi_restore 1*8, r10
+	movq_cfi_restore 2*8, r9
+	movq_cfi_restore 3*8, r8
 	.endif
-	.if \skiprax
-	.else
-	movq 4*8(%rsp), %rax
-	CFI_RESTORE rax
+
+	.if \rstor_rax
+	movq_cfi_restore 4*8, rax
 	.endif
-	.if \skiprcx
-	.else
-	movq 5*8(%rsp), %rcx
-	CFI_RESTORE rcx
+
+	.if \rstor_rcx
+	movq_cfi_restore 5*8, rcx
 	.endif
-	.if \skiprdx
-	.else
-	movq 6*8(%rsp), %rdx
-	CFI_RESTORE rdx
+
+	.if \rstor_rdx
+	movq_cfi_restore 6*8, rdx
 	.endif
-	movq 7*8(%rsp), %rsi
-	CFI_RESTORE rsi
-	movq 8*8(%rsp), %rdi
-	CFI_RESTORE rdi
+
+	movq_cfi_restore 7*8, rsi
+	movq_cfi_restore 8*8, rdi
+
 	.if ARG_SKIP+\addskip > 0
 	addq $ARG_SKIP+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	-(ARG_SKIP+\addskip)
@@ -176,33 +162,21 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro SAVE_REST
 	subq $REST_SKIP, %rsp
 	CFI_ADJUST_CFA_OFFSET	REST_SKIP
-	movq %rbx, 5*8(%rsp)
-	CFI_REL_OFFSET	rbx, 5*8
-	movq %rbp, 4*8(%rsp)
-	CFI_REL_OFFSET	rbp, 4*8
-	movq %r12, 3*8(%rsp)
-	CFI_REL_OFFSET	r12, 3*8
-	movq %r13, 2*8(%rsp)
-	CFI_REL_OFFSET	r13, 2*8
-	movq %r14, 1*8(%rsp)
-	CFI_REL_OFFSET	r14, 1*8
-	movq %r15, (%rsp)
-	CFI_REL_OFFSET	r15, 0*8
+	movq_cfi rbx, 5*8
+	movq_cfi rbp, 4*8
+	movq_cfi r12, 3*8
+	movq_cfi r13, 2*8
+	movq_cfi r14, 1*8
+	movq_cfi r15, 0*8
 	.endm
 
 	.macro RESTORE_REST
-	movq (%rsp),     %r15
-	CFI_RESTORE r15
-	movq 1*8(%rsp),  %r14
-	CFI_RESTORE r14
-	movq 2*8(%rsp),  %r13
-	CFI_RESTORE r13
-	movq 3*8(%rsp),  %r12
-	CFI_RESTORE r12
-	movq 4*8(%rsp),  %rbp
-	CFI_RESTORE rbp
-	movq 5*8(%rsp),  %rbx
-	CFI_RESTORE rbx
+	movq_cfi_restore 0*8, r15
+	movq_cfi_restore 1*8, r14
+	movq_cfi_restore 2*8, r13
+	movq_cfi_restore 3*8, r12
+	movq_cfi_restore 4*8, rbp
+	movq_cfi_restore 5*8, rbx
 	addq $REST_SKIP, %rsp
 	CFI_ADJUST_CFA_OFFSET	-(REST_SKIP)
 	.endm
@@ -214,7 +188,7 @@ For 32-bit we have the following conventions - kernel is built with
 
 	.macro RESTORE_ALL addskip=0
 	RESTORE_REST
-	RESTORE_ARGS 0, \addskip
+	RESTORE_ARGS 1, \addskip
 	.endm
 
 	.macro icebp
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
new file mode 100644
index 00000000000..0bdbbb3b9ce
--- /dev/null
+++ b/arch/x86/include/asm/clocksource.h
@@ -0,0 +1,18 @@
+/* x86-specific clocksource additions */
+
+#ifndef _ASM_X86_CLOCKSOURCE_H
+#define _ASM_X86_CLOCKSOURCE_H
+
+#ifdef CONFIG_X86_64
+
+#define VCLOCK_NONE 0  /* No vDSO clock available.	*/
+#define VCLOCK_TSC  1  /* vDSO should use vread_tsc.	*/
+#define VCLOCK_HPET 2  /* vDSO should use vread_hpet.	*/
+
+struct arch_clocksource_data {
+	int vclock_mode;
+};
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* _ASM_X86_CLOCKSOURCE_H */
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index 284a6e8f7ce..3deb7250624 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -280,4 +280,52 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
 
 #endif
 
+#define cmpxchg8b(ptr, o1, o2, n1, n2)				\
+({								\
+	char __ret;						\
+	__typeof__(o2) __dummy;					\
+	__typeof__(*(ptr)) __old1 = (o1);			\
+	__typeof__(o2) __old2 = (o2);				\
+	__typeof__(*(ptr)) __new1 = (n1);			\
+	__typeof__(o2) __new2 = (n2);				\
+	asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1"	\
+		       : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\
+		       : "a" (__old1), "d"(__old2),		\
+		         "b" (__new1), "c" (__new2)		\
+		       : "memory");				\
+	__ret; })
+
+
+#define cmpxchg8b_local(ptr, o1, o2, n1, n2)			\
+({								\
+	char __ret;						\
+	__typeof__(o2) __dummy;					\
+	__typeof__(*(ptr)) __old1 = (o1);			\
+	__typeof__(o2) __old2 = (o2);				\
+	__typeof__(*(ptr)) __new1 = (n1);			\
+	__typeof__(o2) __new2 = (n2);				\
+	asm volatile("cmpxchg8b %2; setz %1"			\
+		       : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\
+		       : "a" (__old), "d"(__old2),		\
+		         "b" (__new1), "c" (__new2),		\
+		       : "memory");				\
+	__ret; })
+
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2)				\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+	VM_BUG_ON((unsigned long)(ptr) % 8);				\
+	cmpxchg8b((ptr), (o1), (o2), (n1), (n2));			\
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2)			\
+({									\
+       BUILD_BUG_ON(sizeof(*(ptr)) != 4);				\
+       VM_BUG_ON((unsigned long)(ptr) % 8);				\
+       cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2));			\
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx8
+
 #endif /* _ASM_X86_CMPXCHG_32_H */
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 423ae58aa02..7cf5c0a2443 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -151,4 +151,49 @@ extern void __cmpxchg_wrong_size(void);
 	cmpxchg_local((ptr), (o), (n));					\
 })
 
+#define cmpxchg16b(ptr, o1, o2, n1, n2)				\
+({								\
+	char __ret;						\
+	__typeof__(o2) __junk;					\
+	__typeof__(*(ptr)) __old1 = (o1);			\
+	__typeof__(o2) __old2 = (o2);				\
+	__typeof__(*(ptr)) __new1 = (n1);			\
+	__typeof__(o2) __new2 = (n2);				\
+	asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1"	\
+		       : "=d"(__junk), "=a"(__ret), "+m" (*ptr)	\
+		       : "b"(__new1), "c"(__new2),		\
+		         "a"(__old1), "d"(__old2));		\
+	__ret; })
+
+
+#define cmpxchg16b_local(ptr, o1, o2, n1, n2)			\
+({								\
+	char __ret;						\
+	__typeof__(o2) __junk;					\
+	__typeof__(*(ptr)) __old1 = (o1);			\
+	__typeof__(o2) __old2 = (o2);				\
+	__typeof__(*(ptr)) __new1 = (n1);			\
+	__typeof__(o2) __new2 = (n2);				\
+	asm volatile("cmpxchg16b %2;setz %1"			\
+		       : "=d"(__junk), "=a"(__ret), "+m" (*ptr)	\
+		       : "b"(__new1), "c"(__new2),		\
+		         "a"(__old1), "d"(__old2));		\
+	__ret; })
+
+#define cmpxchg_double(ptr, o1, o2, n1, n2)				\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	VM_BUG_ON((unsigned long)(ptr) % 16);				\
+	cmpxchg16b((ptr), (o1), (o2), (n1), (n2));			\
+})
+
+#define cmpxchg_double_local(ptr, o1, o2, n1, n2)			\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	VM_BUG_ON((unsigned long)(ptr) % 16);				\
+	cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2));		\
+})
+
+#define system_has_cmpxchg_double() cpu_has_cx16
+
 #endif /* _ASM_X86_CMPXCHG_64_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 71cc3800712..4258aac99a6 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -288,6 +288,8 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)
 #define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)
 #define cpu_has_perfctr_core	boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
+#define cpu_has_cx8		boot_cpu_has(X86_FEATURE_CX8)
+#define cpu_has_cx16		boot_cpu_has(X86_FEATURE_CX16)
 
 #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
 # define cpu_has_invlpg		1
@@ -331,8 +333,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			 "2:\n"
 			 ".section .altinstructions,\"a\"\n"
 			 _ASM_ALIGN "\n"
-			 _ASM_PTR "1b\n"
-			 _ASM_PTR "0\n" 	/* no replacement */
+			 " .long 1b - .\n"
+			 " .long 0\n"		/* no replacement */
 			 " .word %P0\n"		/* feature bit */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
@@ -349,8 +351,8 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			     "2:\n"
 			     ".section .altinstructions,\"a\"\n"
 			     _ASM_ALIGN "\n"
-			     _ASM_PTR "1b\n"
-			     _ASM_PTR "3f\n"
+			     " .long 1b - .\n"
+			     " .long 3f - .\n"
 			     " .word %P1\n"		/* feature bit */
 			     " .byte 2b - 1b\n"		/* source len */
 			     " .byte 4f - 3f\n"		/* replacement len */
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 409a649204a..9b3b4f2754c 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -1,30 +1,7 @@
 #ifndef _ASM_X86_DELAY_H
 #define _ASM_X86_DELAY_H
 
-/*
- * Copyright (C) 1993 Linus Torvalds
- *
- * Delay routines calling functions in arch/x86/lib/delay.c
- */
-
-/* Undefined functions to get compile-time errors */
-extern void __bad_udelay(void);
-extern void __bad_ndelay(void);
-
-extern void __udelay(unsigned long usecs);
-extern void __ndelay(unsigned long nsecs);
-extern void __const_udelay(unsigned long xloops);
-extern void __delay(unsigned long loops);
-
-/* 0x10c7 is 2**32 / 1000000 (rounded up) */
-#define udelay(n) (__builtin_constant_p(n) ? \
-	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
-	__udelay(n))
-
-/* 0x5 is 2**32 / 1000000000 (rounded up) */
-#define ndelay(n) (__builtin_constant_p(n) ? \
-	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
-	__ndelay(n))
+#include <asm-generic/delay.h>
 
 void use_tsc_delay(void);
 
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 1cd6d26a0a8..0baa628e330 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -53,8 +53,4 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
 #endif
 
-#ifdef CONFIG_X86_MCE
-BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR)
-#endif
-
 #endif
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4729b2b6311..460c74e4852 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -78,6 +78,7 @@ enum fixed_addresses {
 	VSYSCALL_LAST_PAGE,
 	VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
 			    + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
+	VVAR_PAGE,
 	VSYSCALL_HPET,
 #endif
 	FIX_DBGP_BASE,
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 2c6fc9e6281..3b629f47eb6 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,5 +1,6 @@
 #ifdef __ASSEMBLY__
 
+#include <asm/asm.h>
 #include <asm/dwarf2.h>
 
 /* The annotation hides the frame from the unwinder and makes it look
@@ -7,13 +8,13 @@
    frame pointer later */
 #ifdef CONFIG_FRAME_POINTER
 	.macro FRAME
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp,0
-	movl %esp,%ebp
+	__ASM_SIZE(push,_cfi)	%__ASM_REG(bp)
+	CFI_REL_OFFSET		__ASM_REG(bp), 0
+	__ASM_SIZE(mov)		%__ASM_REG(sp), %__ASM_REG(bp)
 	.endm
 	.macro ENDFRAME
-	popl_cfi %ebp
-	CFI_RESTORE ebp
+	__ASM_SIZE(pop,_cfi)	%__ASM_REG(bp)
+	CFI_RESTORE		__ASM_REG(bp)
 	.endm
 #else
 	.macro FRAME
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index bb9efe8706e..09199052060 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -21,7 +21,7 @@
 #include <linux/profile.h>
 #include <linux/smp.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/irq.h>
 #include <asm/sections.h>
 
@@ -34,7 +34,6 @@ extern void irq_work_interrupt(void);
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
-extern void mce_self_interrupt(void);
 
 extern void invalidate_interrupt(void);
 extern void invalidate_interrupt0(void);
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
deleted file mode 100644
index 65aaa91d585..00000000000
--- a/arch/x86/include/asm/i8253.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef _ASM_X86_I8253_H
-#define _ASM_X86_I8253_H
-
-/* i8253A PIT registers */
-#define PIT_MODE		0x43
-#define PIT_CH0			0x40
-#define PIT_CH2			0x42
-
-#define PIT_LATCH	LATCH
-
-extern raw_spinlock_t i8253_lock;
-
-extern struct clock_event_device *global_clock_event;
-
-extern void setup_pit_timer(void);
-
-#define inb_pit		inb_p
-#define outb_pit	outb_p
-
-#endif /* _ASM_X86_I8253_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index d02804d650c..d8e8eefbe24 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -40,8 +40,6 @@
 #include <linux/compiler.h>
 #include <asm/page.h>
 
-#include <xen/xen.h>
-
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
 { type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -334,6 +332,7 @@ extern void fixup_early_ioremap(void);
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
 #ifdef CONFIG_XEN
+#include <xen/xen.h>
 struct bio_vec;
 
 extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e976ee3b3e..f9a320984a1 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -17,7 +17,8 @@
  *  Vectors   0 ...  31 : system traps and exceptions - hardcoded events
  *  Vectors  32 ... 127 : device interrupts
  *  Vector  128         : legacy int80 syscall interface
- *  Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
+ *  Vector  204         : legacy x86_64 vsyscall emulation
+ *  Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts
  *  Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
  *
  * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
@@ -50,6 +51,9 @@
 #ifdef CONFIG_X86_32
 # define SYSCALL_VECTOR			0x80
 #endif
+#ifdef CONFIG_X86_64
+# define VSYSCALL_EMU_VECTOR		0xcc
+#endif
 
 /*
  * Vectors 0x30-0x3f are used for ISA interrupts.
@@ -109,11 +113,6 @@
 
 #define UV_BAU_MESSAGE			0xf5
 
-/*
- * Self IPI vector for machine checks
- */
-#define MCE_SELF_VECTOR			0xf4
-
 /* Xen vector callback to receive events in a HVM domain */
 #define XEN_HVM_EVTCHN_CALLBACK		0xf3
 
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 5745ce8bf10..bba3cf88e62 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -60,23 +60,24 @@ static inline void native_halt(void)
 #include <asm/paravirt.h>
 #else
 #ifndef __ASSEMBLY__
+#include <linux/types.h>
 
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
 {
 	return native_save_fl();
 }
 
-static inline void arch_local_irq_restore(unsigned long flags)
+static inline notrace void arch_local_irq_restore(unsigned long flags)
 {
 	native_restore_fl(flags);
 }
 
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
 {
 	native_irq_disable();
 }
 
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
 {
 	native_irq_enable();
 }
@@ -102,7 +103,7 @@ static inline void halt(void)
 /*
  * For spinlocks, etc:
  */
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
 {
 	unsigned long flags = arch_local_save_flags();
 	arch_local_irq_disable();
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index fe2cc6e105f..d73f1571bde 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,6 @@ extern void show_registers(struct pt_regs *regs);
 extern void show_trace(struct task_struct *t, struct pt_regs *regs,
 		       unsigned long *sp, unsigned long bp);
 extern void __show_regs(struct pt_regs *regs, int all);
-extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 #ifdef CONFIG_KEXEC
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0049211959c..6040d115ef5 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -229,7 +229,26 @@ struct read_cache {
 	unsigned long end;
 };
 
-struct decode_cache {
+struct x86_emulate_ctxt {
+	struct x86_emulate_ops *ops;
+
+	/* Register state before/after emulation. */
+	unsigned long eflags;
+	unsigned long eip; /* eip before instruction emulation */
+	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
+	int mode;
+
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
+
+	bool guest_mode; /* guest running a nested guest */
+	bool perm_ok; /* do not check permissions if true */
+	bool only_vendor_specific_insn;
+
+	bool have_exception;
+	struct x86_exception exception;
+
+	/* decode cache */
 	u8 twobyte;
 	u8 b;
 	u8 intercept;
@@ -246,8 +265,6 @@ struct decode_cache {
 	unsigned int d;
 	int (*execute)(struct x86_emulate_ctxt *ctxt);
 	int (*check_perm)(struct x86_emulate_ctxt *ctxt);
-	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
@@ -255,34 +272,14 @@ struct decode_cache {
 	u8 modrm_rm;
 	u8 modrm_seg;
 	bool rip_relative;
+	unsigned long _eip;
+	/* Fields above regs are cleared together. */
+	unsigned long regs[NR_VCPU_REGS];
 	struct fetch_cache fetch;
 	struct read_cache io_read;
 	struct read_cache mem_read;
 };
 
-struct x86_emulate_ctxt {
-	struct x86_emulate_ops *ops;
-
-	/* Register state before/after emulation. */
-	unsigned long eflags;
-	unsigned long eip; /* eip before instruction emulation */
-	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
-	int mode;
-
-	/* interruptibility state, as a result of execution of STI or MOV SS */
-	int interruptibility;
-
-	bool guest_mode; /* guest running a nested guest */
-	bool perm_ok; /* do not check permissions if true */
-	bool only_vendor_specific_insn;
-
-	bool have_exception;
-	struct x86_exception exception;
-
-	/* decode cache */
-	struct decode_cache decode;
-};
-
 /* Repeat String Operation Prefix */
 #define REPE_PREFIX	0xf3
 #define REPNE_PREFIX	0xf2
@@ -373,6 +370,5 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code);
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-		     struct x86_emulate_ops *ops, int irq);
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2ac8e2ee89..dd51c83aa5d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -48,7 +48,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
-			  | X86_CR4_OSXSAVE \
+			  | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -205,6 +205,7 @@ union kvm_mmu_page_role {
 		unsigned invalid:1;
 		unsigned nxe:1;
 		unsigned cr0_wp:1;
+		unsigned smep_andnot_wp:1;
 	};
 };
 
@@ -227,15 +228,17 @@ struct kvm_mmu_page {
 	 * in this shadow page.
 	 */
 	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	bool multimapped;         /* More than one parent_pte? */
 	bool unsync;
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
-	union {
-		u64 *parent_pte;               /* !multimapped */
-		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
-	};
+	unsigned long parent_ptes;	/* Reverse mapping for parent_pte */
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+	int clear_spte_count;
+#endif
+
+	struct rcu_head rcu;
 };
 
 struct kvm_pv_mmu_op_buffer {
@@ -269,8 +272,6 @@ struct kvm_mmu {
 	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
 			    struct x86_exception *exception);
 	gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
-	void (*prefetch_page)(struct kvm_vcpu *vcpu,
-			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -346,8 +347,7 @@ struct kvm_vcpu_arch {
 	 * put it here to avoid allocation */
 	struct kvm_pv_mmu_op_buffer mmu_op_buffer;
 
-	struct kvm_mmu_memory_cache mmu_pte_chain_cache;
-	struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+	struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
 	struct kvm_mmu_memory_cache mmu_page_cache;
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
@@ -393,6 +393,15 @@ struct kvm_vcpu_arch {
 	unsigned int hw_tsc_khz;
 	unsigned int time_offset;
 	struct page *time_page;
+
+	struct {
+		u64 msr_val;
+		u64 last_steal;
+		u64 accum_steal;
+		struct gfn_to_hva_cache stime;
+		struct kvm_steal_time steal;
+	} st;
+
 	u64 last_guest_tsc;
 	u64 last_kernel_ns;
 	u64 last_tsc_nsec;
@@ -419,6 +428,11 @@ struct kvm_vcpu_arch {
 	u64 mcg_ctl;
 	u64 *mce_banks;
 
+	/* Cache MMIO info */
+	u64 mmio_gva;
+	unsigned access;
+	gfn_t mmio_gfn;
+
 	/* used for guest single stepping over the given code position */
 	unsigned long singlestep_rip;
 
@@ -441,6 +455,7 @@ struct kvm_arch {
 	unsigned int n_used_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
+	unsigned int indirect_shadow_pages;
 	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
@@ -477,6 +492,8 @@ struct kvm_arch {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 
+	atomic_t reader_counter;
+
 	#ifdef CONFIG_KVM_MMU_AUDIT
 	int audit_point;
 	#endif
@@ -559,7 +576,7 @@ struct kvm_x86_ops {
 	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
 	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
-	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+	int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
 	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
@@ -636,7 +653,6 @@ void kvm_mmu_module_exit(void);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
@@ -830,11 +846,12 @@ enum {
 asmlinkage void kvm_spurious_fault(void);
 extern bool kvm_rebooting;
 
-#define __kvm_handle_fault_on_reboot(insn) \
+#define ____kvm_handle_fault_on_reboot(insn, cleanup_insn)	\
 	"666: " insn "\n\t" \
 	"668: \n\t"                           \
 	".pushsection .fixup, \"ax\" \n" \
 	"667: \n\t" \
+	cleanup_insn "\n\t"		      \
 	"cmpb $0, kvm_rebooting \n\t"	      \
 	"jne 668b \n\t"      		      \
 	__ASM_SIZE(push) " $666b \n\t"	      \
@@ -844,6 +861,9 @@ extern bool kvm_rebooting;
 	_ASM_PTR " 666b, 667b \n\t" \
 	".popsection"
 
+#define __kvm_handle_fault_on_reboot(insn)		\
+	____kvm_handle_fault_on_reboot(insn, "")
+
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index a427bf77a93..734c3767cfa 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,7 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
 #define KVM_FEATURE_ASYNC_PF		4
+#define KVM_FEATURE_STEAL_TIME		5
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -30,10 +31,23 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+#define KVM_MSR_ENABLED 1
 /* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
 #define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
+#define MSR_KVM_STEAL_TIME  0x4b564d03
+
+struct kvm_steal_time {
+	__u64 steal;
+	__u32 version;
+	__u32 flags;
+	__u32 pad[12];
+};
+
+#define KVM_STEAL_ALIGNMENT_BITS 5
+#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
+#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
 
 #define KVM_MAX_MMU_OP_BATCH           32
 
@@ -178,6 +192,7 @@ void __init kvm_guest_init(void);
 void kvm_async_pf_task_wait(u32 token);
 void kvm_async_pf_task_wake(u32 token);
 u32 kvm_read_and_reset_pf_reason(void);
+extern void kvm_disable_steal_time(void);
 #else
 #define kvm_guest_init() do { } while (0)
 #define kvm_async_pf_task_wait(T) do {} while(0)
@@ -186,6 +201,11 @@ static inline u32 kvm_read_and_reset_pf_reason(void)
 {
 	return 0;
 }
+
+static inline void kvm_disable_steal_time(void)
+{
+	return;
+}
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index b60f2924c41..879fd7d3387 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -61,6 +61,7 @@ hcall(unsigned long call,
 		     : "memory");
 	return call;
 }
+/*:*/
 
 /* Can't use our min() macro here: needs to be a constant */
 #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 2e9972468a5..9cdae5d47e8 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -4,7 +4,7 @@
 #include <linux/percpu.h>
 
 #include <asm/system.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/asm.h>
 
 typedef struct {
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 021979a6e23..c9321f34e55 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -8,6 +8,7 @@
  * Machine Check support for x86
  */
 
+/* MCG_CAP register defines */
 #define MCG_BANKCNT_MASK	0xff         /* Number of Banks */
 #define MCG_CTL_P		(1ULL<<8)    /* MCG_CTL register available */
 #define MCG_EXT_P		(1ULL<<9)    /* Extended registers available */
@@ -17,10 +18,12 @@
 #define MCG_EXT_CNT(c)		(((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
 #define MCG_SER_P	 	(1ULL<<24)   /* MCA recovery/new status bits */
 
+/* MCG_STATUS register defines */
 #define MCG_STATUS_RIPV  (1ULL<<0)   /* restart ip valid */
 #define MCG_STATUS_EIPV  (1ULL<<1)   /* ip points to correct instruction */
 #define MCG_STATUS_MCIP  (1ULL<<2)   /* machine check in progress */
 
+/* MCi_STATUS register defines */
 #define MCI_STATUS_VAL   (1ULL<<63)  /* valid error */
 #define MCI_STATUS_OVER  (1ULL<<62)  /* previous errors lost */
 #define MCI_STATUS_UC    (1ULL<<61)  /* uncorrected error */
@@ -31,12 +34,14 @@
 #define MCI_STATUS_S	 (1ULL<<56)  /* Signaled machine check */
 #define MCI_STATUS_AR	 (1ULL<<55)  /* Action required */
 
-/* MISC register defines */
-#define MCM_ADDR_SEGOFF  0	/* segment offset */
-#define MCM_ADDR_LINEAR  1	/* linear address */
-#define MCM_ADDR_PHYS	 2	/* physical address */
-#define MCM_ADDR_MEM	 3	/* memory address */
-#define MCM_ADDR_GENERIC 7	/* generic */
+/* MCi_MISC register defines */
+#define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)
+#define MCI_MISC_ADDR_MODE(m)	(((m) >> 6) & 7)
+#define  MCI_MISC_ADDR_SEGOFF	0	/* segment offset */
+#define  MCI_MISC_ADDR_LINEAR	1	/* linear address */
+#define  MCI_MISC_ADDR_PHYS	2	/* physical address */
+#define  MCI_MISC_ADDR_MEM	3	/* memory address */
+#define  MCI_MISC_ADDR_GENERIC	7	/* generic */
 
 /* CTL2 register defines */
 #define MCI_CTL2_CMCI_EN		(1ULL << 30)
@@ -119,7 +124,7 @@ extern struct atomic_notifier_head x86_mce_decoder_chain;
 
 #include <linux/percpu.h>
 #include <linux/init.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 extern int mce_disabled;
 extern int mce_p5_enabled;
@@ -144,7 +149,7 @@ static inline void enable_p5_mce(void) {}
 
 void mce_setup(struct mce *m);
 void mce_log(struct mce *m);
-DECLARE_PER_CPU(struct sys_device, mce_dev);
+DECLARE_PER_CPU(struct sys_device, mce_sysdev);
 
 /*
  * Maximum banks number.
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8b5393ec108..69021528b43 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -2,7 +2,7 @@
 #define _ASM_X86_MMU_CONTEXT_H
 
 #include <asm/desc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include <asm/paravirt.h>
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ffa037f28d3..55728e12147 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -34,15 +34,15 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
  *    64Gb / 4096bytes/page = 16777216 pages
  */
 #define MAX_NR_PAGES 16777216
-#define MAX_ELEMENTS 1024
-#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
+#define MAX_SECTIONS 1024
+#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS)
 
 extern s8 physnode_map[];
 
 static inline int pfn_to_nid(unsigned long pfn)
 {
 #ifdef CONFIG_NUMA
-	return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
+	return((int) physnode_map[(pfn) / PAGES_PER_SECTION]);
 #else
 	return 0;
 #endif
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 485b4f1f079..d52609aeeab 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -259,6 +259,9 @@
 #define MSR_IA32_TEMPERATURE_TARGET	0x000001a2
 
 #define MSR_IA32_ENERGY_PERF_BIAS	0x000001b0
+#define ENERGY_PERF_BIAS_PERFORMANCE	0
+#define ENERGY_PERF_BIAS_NORMAL		6
+#define ENERGY_PERF_BIAS_POWERSAVE	15
 
 #define MSR_IA32_PACKAGE_THERM_STATUS		0x000001b1
 
@@ -438,6 +441,18 @@
 #define MSR_IA32_VMX_VMCS_ENUM          0x0000048a
 #define MSR_IA32_VMX_PROCBASED_CTLS2    0x0000048b
 #define MSR_IA32_VMX_EPT_VPID_CAP       0x0000048c
+#define MSR_IA32_VMX_TRUE_PINBASED_CTLS  0x0000048d
+#define MSR_IA32_VMX_TRUE_PROCBASED_CTLS 0x0000048e
+#define MSR_IA32_VMX_TRUE_EXIT_CTLS      0x0000048f
+#define MSR_IA32_VMX_TRUE_ENTRY_CTLS     0x00000490
+
+/* VMX_BASIC bits and bitmasks */
+#define VMX_BASIC_VMCS_SIZE_SHIFT	32
+#define VMX_BASIC_64		0x0001000000000000LLU
+#define VMX_BASIC_MEM_TYPE_SHIFT	50
+#define VMX_BASIC_MEM_TYPE_MASK	0x003c000000000000LLU
+#define VMX_BASIC_MEM_TYPE_WB	6LLU
+#define VMX_BASIC_INOUT		0x0040000000000000LLU
 
 /* AMD-V MSRs */
 
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 5ca6801b75f..87bdbca72f9 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,6 +13,7 @@ struct olpc_platform_t {
 
 #define OLPC_F_PRESENT		0x01
 #define OLPC_F_DCON		0x02
+#define OLPC_F_EC_WIDE_SCI	0x04
 
 #ifdef CONFIG_OLPC
 
@@ -62,6 +63,13 @@ static inline int olpc_board_at_least(uint32_t rev)
 	return olpc_platform_info.boardrev >= rev;
 }
 
+extern void olpc_ec_wakeup_set(u16 value);
+extern void olpc_ec_wakeup_clear(u16 value);
+extern bool olpc_ec_wakeup_available(void);
+
+extern int olpc_ec_mask_write(u16 bits);
+extern int olpc_ec_sci_query(u16 *sci_value);
+
 #else
 
 static inline int machine_is_olpc(void)
@@ -74,6 +82,20 @@ static inline int olpc_has_dcon(void)
 	return 0;
 }
 
+static inline void olpc_ec_wakeup_set(u16 value) { }
+static inline void olpc_ec_wakeup_clear(u16 value) { }
+
+static inline bool olpc_ec_wakeup_available(void)
+{
+	return false;
+}
+
+#endif
+
+#ifdef CONFIG_OLPC_XO1_PM
+extern void do_olpc_suspend_lowlevel(void);
+extern void olpc_xo1_pm_wakeup_set(u16 value);
+extern void olpc_xo1_pm_wakeup_clear(u16 value);
 #endif
 
 extern int pci_olpc_init(void);
@@ -83,14 +105,19 @@ extern int pci_olpc_init(void);
 extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
 		unsigned char *outbuf, size_t outlen);
 
-extern int olpc_ec_mask_set(uint8_t bits);
-extern int olpc_ec_mask_unset(uint8_t bits);
-
 /* EC commands */
 
-#define EC_FIRMWARE_REV		0x08
-#define EC_WLAN_ENTER_RESET	0x35
-#define EC_WLAN_LEAVE_RESET	0x25
+#define EC_FIRMWARE_REV			0x08
+#define EC_WRITE_SCI_MASK		0x1b
+#define EC_WAKE_UP_WLAN			0x24
+#define EC_WLAN_LEAVE_RESET		0x25
+#define EC_READ_EB_MODE			0x2a
+#define EC_SET_SCI_INHIBIT		0x32
+#define EC_SET_SCI_INHIBIT_RELEASE	0x34
+#define EC_WLAN_ENTER_RESET		0x35
+#define EC_WRITE_EXT_SCI_MASK		0x38
+#define EC_SCI_QUERY			0x84
+#define EC_EXT_SCI_QUERY		0x85
 
 /* SCI source values */
 
@@ -99,10 +126,12 @@ extern int olpc_ec_mask_unset(uint8_t bits);
 #define EC_SCI_SRC_BATTERY	0x02
 #define EC_SCI_SRC_BATSOC	0x04
 #define EC_SCI_SRC_BATERR	0x08
-#define EC_SCI_SRC_EBOOK	0x10
-#define EC_SCI_SRC_WLAN		0x20
+#define EC_SCI_SRC_EBOOK	0x10	/* XO-1 only */
+#define EC_SCI_SRC_WLAN		0x20	/* XO-1 only */
 #define EC_SCI_SRC_ACPWR	0x40
-#define EC_SCI_SRC_ALL		0x7F
+#define EC_SCI_SRC_BATCRIT	0x80
+#define EC_SCI_SRC_GPWAKE	0x100	/* XO-1.5 only */
+#define EC_SCI_SRC_ALL		0x1FF
 
 /* GPIO assignments */
 
@@ -116,7 +145,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
 #define OLPC_GPIO_SMB_CLK      14
 #define OLPC_GPIO_SMB_DATA     15
 #define OLPC_GPIO_WORKAUX	geode_gpio(24)
-#define OLPC_GPIO_LID		geode_gpio(26)
-#define OLPC_GPIO_ECSCI		geode_gpio(27)
+#define OLPC_GPIO_LID		26
+#define OLPC_GPIO_ECSCI		27
 
 #endif /* _ASM_X86_OLPC_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ebbc4d8ab17..a7d2db9a74f 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -230,6 +230,15 @@ static inline unsigned long long paravirt_sched_clock(void)
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
 }
 
+struct jump_label_key;
+extern struct jump_label_key paravirt_steal_enabled;
+extern struct jump_label_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+	return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
+}
+
 static inline unsigned long long paravirt_read_pmc(int counter)
 {
 	return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 82885099c86..2c765216311 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -89,6 +89,7 @@ struct pv_lazy_ops {
 
 struct pv_time_ops {
 	unsigned long long (*sched_clock)(void);
+	unsigned long long (*steal_clock)(int cpu);
 	unsigned long (*get_tsc_khz)(void);
 };
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index a0a9779084d..3470c9d0ebb 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -388,12 +388,9 @@ do {									\
 #define __this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
-/*
- * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
- * faster than an xchg with forced lock semantics.
- */
-#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
-#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_xchg_1(pcp, val)	percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_2(pcp, val)	percpu_xchg_op(pcp, val)
+#define __this_cpu_xchg_4(pcp, val)	percpu_xchg_op(pcp, val)
 
 #define this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
@@ -485,6 +482,8 @@ do {									\
 #define __this_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
 #define __this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define this_cpu_read_8(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_write_8(pcp, val)	percpu_to_op("mov", (pcp), val)
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index d9d4dae305f..094fb30817a 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -152,6 +152,11 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 	(regs)->bp = caller_frame_pointer();			\
 	(regs)->cs = __KERNEL_CS;				\
 	regs->flags = 0;					\
+	asm volatile(						\
+		_ASM_MOV "%%"_ASM_SP ", %0\n"			\
+		: "=m" ((regs)->sp)				\
+		:: "memory"					\
+	);							\
 }
 
 #else
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index 56fd9e3abbd..4f7e67e2345 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -102,6 +102,14 @@
 #define P4_CONFIG_HT			(1ULL << P4_CONFIG_HT_SHIFT)
 
 /*
+ * If an event has alias it should be marked
+ * with a special bit. (Don't forget to check
+ * P4_PEBS_CONFIG_MASK and related bits on
+ * modification.)
+ */
+#define P4_CONFIG_ALIASABLE		(1 << 9)
+
+/*
  * The bits we allow to pass for RAW events
  */
 #define P4_CONFIG_MASK_ESCR		\
@@ -123,6 +131,31 @@
 	(p4_config_pack_escr(P4_CONFIG_MASK_ESCR))	| \
 	(p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
 
+/*
+ * In case of event aliasing we need to preserve some
+ * caller bits, otherwise the mapping won't be complete.
+ */
+#define P4_CONFIG_EVENT_ALIAS_MASK			  \
+	(p4_config_pack_escr(P4_CONFIG_MASK_ESCR)	| \
+	 p4_config_pack_cccr(P4_CCCR_EDGE		| \
+			     P4_CCCR_THRESHOLD_MASK	| \
+			     P4_CCCR_COMPLEMENT		| \
+			     P4_CCCR_COMPARE))
+
+#define  P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS		  \
+	((P4_CONFIG_HT)					| \
+	 p4_config_pack_escr(P4_ESCR_T0_OS		| \
+			     P4_ESCR_T0_USR		| \
+			     P4_ESCR_T1_OS		| \
+			     P4_ESCR_T1_USR)		| \
+	 p4_config_pack_cccr(P4_CCCR_OVF		| \
+			     P4_CCCR_CASCADE		| \
+			     P4_CCCR_FORCE_OVF		| \
+			     P4_CCCR_THREAD_ANY		| \
+			     P4_CCCR_OVF_PMI_T0		| \
+			     P4_CCCR_OVF_PMI_T1		| \
+			     P4_CONFIG_ALIASABLE))
+
 static inline bool p4_is_event_cascaded(u64 config)
 {
 	u32 cccr = p4_config_unpack_cccr(config);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d56187c6b83..013286a10c2 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -107,7 +107,8 @@
 #define __PAGE_KERNEL_NOCACHE		(__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
 #define __PAGE_KERNEL_UC_MINUS		(__PAGE_KERNEL | _PAGE_PCD)
 #define __PAGE_KERNEL_VSYSCALL		(__PAGE_KERNEL_RX | _PAGE_USER)
-#define __PAGE_KERNEL_VSYSCALL_NOCACHE	(__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
+#define __PAGE_KERNEL_VVAR		(__PAGE_KERNEL_RO | _PAGE_USER)
+#define __PAGE_KERNEL_VVAR_NOCACHE	(__PAGE_KERNEL_VVAR | _PAGE_PCD | _PAGE_PWT)
 #define __PAGE_KERNEL_LARGE		(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_NOCACHE	(__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
@@ -129,7 +130,8 @@
 #define PAGE_KERNEL_LARGE_NOCACHE	__pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
 #define PAGE_KERNEL_LARGE_EXEC		__pgprot(__PAGE_KERNEL_LARGE_EXEC)
 #define PAGE_KERNEL_VSYSCALL		__pgprot(__PAGE_KERNEL_VSYSCALL)
-#define PAGE_KERNEL_VSYSCALL_NOCACHE	__pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
+#define PAGE_KERNEL_VVAR		__pgprot(__PAGE_KERNEL_VVAR)
+#define PAGE_KERNEL_VVAR_NOCACHE	__pgprot(__PAGE_KERNEL_VVAR_NOCACHE)
 
 #define PAGE_KERNEL_IO			__pgprot(__PAGE_KERNEL_IO)
 #define PAGE_KERNEL_IO_NOCACHE		__pgprot(__PAGE_KERNEL_IO_NOCACHE)
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 59ab4dffa37..2dddb317bb3 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -59,6 +59,7 @@
 #define X86_CR4_OSFXSR	0x00000200 /* enable fast FPU save and restore */
 #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
 #define X86_CR4_VMXE	0x00002000 /* enable VMX virtualization */
+#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */
 #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
 #define X86_CR4_SMEP	0x00100000 /* enable SMEP support */
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 219371546af..0d1171c9772 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -751,8 +751,6 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 		     :: "a" (eax), "c" (ecx));
 }
 
-extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
-
 extern void select_idle_routine(const struct cpuinfo_x86 *c);
 extern void init_amd_e400_c1e_mask(void);
 
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
index 971e0b46446..644dd885f05 100644
--- a/arch/x86/include/asm/prom.h
+++ b/arch/x86/include/asm/prom.h
@@ -19,7 +19,7 @@
 #include <linux/pci.h>
 
 #include <asm/irq.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/setup.h>
 #include <asm/irq_controller.h>
 
@@ -30,17 +30,6 @@ extern void add_dtb(u64 data);
 extern void x86_add_irq_domains(void);
 void __cpuinit x86_of_pci_init(void);
 void x86_dtb_init(void);
-
-static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
-{
-	return pdev ? pdev->dev.of_node : NULL;
-}
-
-static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
-{
-	return pci_device_to_OF_node(bus->self);
-}
-
 #else
 static inline void add_dtb(u64 data) { }
 static inline void x86_add_irq_domains(void) { }
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
index 6a8c0d64510..a5370a03d90 100644
--- a/arch/x86/include/asm/rwlock.h
+++ b/arch/x86/include/asm/rwlock.h
@@ -1,7 +1,48 @@
 #ifndef _ASM_X86_RWLOCK_H
 #define _ASM_X86_RWLOCK_H
 
-#define RW_LOCK_BIAS		 0x01000000
+#include <asm/asm.h>
+
+#if CONFIG_NR_CPUS <= 2048
+
+#ifndef __ASSEMBLY__
+typedef union {
+	s32 lock;
+	s32 write;
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS		0x00100000
+#define READ_LOCK_SIZE(insn)	__ASM_FORM(insn##l)
+#define READ_LOCK_ATOMIC(n)	atomic_##n
+#define WRITE_LOCK_ADD(n)	__ASM_FORM_COMMA(addl n)
+#define WRITE_LOCK_SUB(n)	__ASM_FORM_COMMA(subl n)
+#define WRITE_LOCK_CMP		RW_LOCK_BIAS
+
+#else /* CONFIG_NR_CPUS > 2048 */
+
+#include <linux/const.h>
+
+#ifndef __ASSEMBLY__
+typedef union {
+	s64 lock;
+	struct {
+		u32 read;
+		s32 write;
+	};
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS		(_AC(1,L) << 32)
+#define READ_LOCK_SIZE(insn)	__ASM_FORM(insn##q)
+#define READ_LOCK_ATOMIC(n)	atomic64_##n
+#define WRITE_LOCK_ADD(n)	__ASM_FORM(incl)
+#define WRITE_LOCK_SUB(n)	__ASM_FORM(decl)
+#define WRITE_LOCK_CMP		1
+
+#endif /* CONFIG_NR_CPUS */
+
+#define __ARCH_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
 
 /* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
 
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index cd84f7208f7..5e641715c3f 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -162,7 +162,7 @@
 #define GDT_ENTRY_DEFAULT_USER32_CS 4
 #define GDT_ENTRY_DEFAULT_USER_DS 5
 #define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS   (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
+#define __USER32_CS   (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
 #define __USER32_DS	__USER_DS
 
 #define GDT_ENTRY_TSS 8	/* needs two entries */
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 725b7783199..49adfd7bb4a 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -10,7 +10,11 @@ static inline void smpboot_clear_io_apic_irqs(void)
 
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc_lock, flags);
 	CMOS_WRITE(0xa, 0xf);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 	local_flush_tlb();
 	pr_debug("1.\n");
 	*((volatile unsigned short *)phys_to_virt(apic->trampoline_phys_high)) =
@@ -23,6 +27,8 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 
 static inline void smpboot_restore_warm_reset_vector(void)
 {
+	unsigned long flags;
+
 	/*
 	 * Install writable page 0 entry to set BIOS data area.
 	 */
@@ -32,7 +38,9 @@ static inline void smpboot_restore_warm_reset_vector(void)
 	 * Paranoid:  Set warm reset code and vector here back
 	 * to default values.
 	 */
+	spin_lock_irqsave(&rtc_lock, flags);
 	CMOS_WRITE(0, 0xf);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	*((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
 }
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3089f70c0c5..ee67edf86fd 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -1,8 +1,7 @@
 #ifndef _ASM_X86_SPINLOCK_H
 #define _ASM_X86_SPINLOCK_H
 
-#include <asm/atomic.h>
-#include <asm/rwlock.h>
+#include <linux/atomic.h>
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <linux/compiler.h>
@@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
  */
 static inline int arch_read_can_lock(arch_rwlock_t *lock)
 {
-	return (int)(lock)->lock > 0;
+	return lock->lock > 0;
 }
 
 /**
@@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arch_rwlock_t *lock)
  */
 static inline int arch_write_can_lock(arch_rwlock_t *lock)
 {
-	return (lock)->lock == RW_LOCK_BIAS;
+	return lock->write == WRITE_LOCK_CMP;
 }
 
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+	asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
 		     "jns 1f\n"
 		     "call __read_lock_failed\n\t"
 		     "1:\n"
@@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
 
 static inline void arch_write_lock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+	asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
 		     "jz 1f\n"
 		     "call __write_lock_failed\n\t"
 		     "1:\n"
-		     ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+		     ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+		     : "memory");
 }
 
 static inline int arch_read_trylock(arch_rwlock_t *lock)
 {
-	atomic_t *count = (atomic_t *)lock;
+	READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
 
-	if (atomic_dec_return(count) >= 0)
+	if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
 		return 1;
-	atomic_inc(count);
+	READ_LOCK_ATOMIC(inc)(count);
 	return 0;
 }
 
 static inline int arch_write_trylock(arch_rwlock_t *lock)
 {
-	atomic_t *count = (atomic_t *)lock;
+	atomic_t *count = (atomic_t *)&lock->write;
 
-	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+	if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
 		return 1;
-	atomic_add(RW_LOCK_BIAS, count);
+	atomic_add(WRITE_LOCK_CMP, count);
 	return 0;
 }
 
 static inline void arch_read_unlock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+	asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+		     :"+m" (rw->lock) : : "memory");
 }
 
 static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX "addl %1, %0"
-		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+	asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+		     : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
 }
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
 #define arch_spin_relax(lock)	cpu_relax()
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index dcb48b2edc1..7c7a486fcb6 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -11,10 +11,6 @@ typedef struct arch_spinlock {
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
 
-typedef struct {
-	unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+#include <asm/rwlock.h>
 
 #endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1f2e61e2898..a1fe5c127b5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -21,7 +21,7 @@ struct task_struct;
 struct exec_domain;
 #include <asm/processor.h>
 #include <asm/ftrace.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 7bdec4e9b73..92b8aec0697 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -1,10 +1,12 @@
 #ifndef _ASM_X86_TIME_H
 #define _ASM_X86_TIME_H
 
-extern void hpet_time_init(void);
-
+#include <linux/clocksource.h>
 #include <asm/mc146818rtc.h>
 
+extern void hpet_time_init(void);
 extern void time_init(void);
 
+extern struct clock_event_device *global_clock_event;
+
 #endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0310da67307..2bae0a513b4 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_TRAPS_H
 #define _ASM_X86_TRAPS_H
 
+#include <linux/kprobes.h>
+
 #include <asm/debugreg.h>
 #include <asm/siginfo.h>			/* TRAP_TRACE, ... */
 
@@ -38,6 +40,7 @@ asmlinkage void alignment_check(void);
 asmlinkage void machine_check(void);
 #endif /* CONFIG_X86_MCE */
 asmlinkage void simd_coprocessor_error(void);
+asmlinkage void emulate_vsyscall(void);
 
 dotraplinkage void do_divide_error(struct pt_regs *, long);
 dotraplinkage void do_debug(struct pt_regs *, long);
@@ -64,6 +67,7 @@ dotraplinkage void do_alignment_check(struct pt_regs *, long);
 dotraplinkage void do_machine_check(struct pt_regs *, long);
 #endif
 dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
+dotraplinkage void do_emulate_vsyscall(struct pt_regs *, long);
 #ifdef CONFIG_X86_32
 dotraplinkage void do_iret_error(struct pt_regs *, long);
 #endif
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 9db5583b6d3..83e2efd181e 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,10 +51,6 @@ extern int unsynchronized_tsc(void);
 extern int check_tsc_unstable(void);
 extern unsigned long native_calibrate_tsc(void);
 
-#ifdef CONFIG_X86_64
-extern cycles_t vread_tsc(void);
-#endif
-
 /*
  * Boot-time check whether the TSCs are synchronized across
  * all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 99ddd148a76..36361bf6fdd 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -555,6 +555,9 @@ struct __large_struct { unsigned long buf[100]; };
 
 #endif /* CONFIG_X86_WP_WORKS_OK */
 
+extern unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n);
+
 /*
  * movsl can be slow when source and dest are not both 8-byte aligned
  */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index a291c40efd4..37d369859c8 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -67,7 +67,7 @@
  *  we're using 655us, similar to UV1: 65 units of 10us
  */
 #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
-#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL)
+#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
 
 #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD	(is_uv1_hub() ?			\
 		UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD :			\
@@ -106,12 +106,20 @@
 #define DS_SOURCE_TIMEOUT		3
 /*
  * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
- * values 1 and 5 will not occur
+ * values 1 and 3 will not occur
+ *        Decoded meaning              ERROR  BUSY    AUX ERR
+ * -------------------------------     ----   -----   -------
+ * IDLE                                 0       0        0
+ * BUSY (active)                        0       1        0
+ * SW Ack Timeout (destination)         1       0        0
+ * SW Ack INTD rejected (strong NACK)   1       0        1
+ * Source Side Time Out Detected        1       1        0
+ * Destination Side PUT Failed          1       1        1
  */
 #define UV2H_DESC_IDLE			0
-#define UV2H_DESC_DEST_TIMEOUT		2
-#define UV2H_DESC_DEST_STRONG_NACK	3
-#define UV2H_DESC_BUSY			4
+#define UV2H_DESC_BUSY			2
+#define UV2H_DESC_DEST_TIMEOUT		4
+#define UV2H_DESC_DEST_STRONG_NACK	5
 #define UV2H_DESC_SOURCE_TIMEOUT	6
 #define UV2H_DESC_DEST_PUT_ERR		7
 
@@ -183,7 +191,7 @@
  * 'base_dest_nasid' field of the header corresponds to the
  * destination nodeID associated with that specified bit.
  */
-struct bau_targ_hubmask {
+struct pnmask {
 	unsigned long		bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
 };
 
@@ -314,7 +322,7 @@ struct bau_msg_header {
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct bau_targ_hubmask	distribution;
+	struct pnmask			distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
@@ -488,6 +496,7 @@ struct bau_control {
 	struct bau_control	*uvhub_master;
 	struct bau_control	*socket_master;
 	struct ptc_stats	*statp;
+	cpumask_t		*cpumask;
 	unsigned long		timeout_interval;
 	unsigned long		set_bau_on_time;
 	atomic_t		active_descriptor_count;
@@ -526,90 +535,90 @@ struct bau_control {
 	struct hub_and_pnode	*thp;
 };
 
-static unsigned long read_mmr_uv2_status(void)
+static inline unsigned long read_mmr_uv2_status(void)
 {
 	return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
 }
 
-static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
+static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
 }
 
-static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
+static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
 }
 
-static void write_mmr_activation(unsigned long index)
+static inline void write_mmr_activation(unsigned long index)
 {
 	write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
 }
 
-static void write_gmmr_activation(int pnode, unsigned long mmr_image)
+static inline void write_gmmr_activation(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
 }
 
-static void write_mmr_payload_first(int pnode, unsigned long mmr_image)
+static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
 }
 
-static void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
+static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
 }
 
-static void write_mmr_payload_last(int pnode, unsigned long mmr_image)
+static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
 }
 
-static void write_mmr_misc_control(int pnode, unsigned long mmr_image)
+static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image)
 {
 	write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
 }
 
-static unsigned long read_mmr_misc_control(int pnode)
+static inline unsigned long read_mmr_misc_control(int pnode)
 {
 	return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
 }
 
-static void write_mmr_sw_ack(unsigned long mr)
+static inline void write_mmr_sw_ack(unsigned long mr)
 {
 	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
 }
 
-static unsigned long read_mmr_sw_ack(void)
+static inline unsigned long read_mmr_sw_ack(void)
 {
 	return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
 }
 
-static unsigned long read_gmmr_sw_ack(int pnode)
+static inline unsigned long read_gmmr_sw_ack(int pnode)
 {
 	return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
 }
 
-static void write_mmr_data_config(int pnode, unsigned long mr)
+static inline void write_mmr_data_config(int pnode, unsigned long mr)
 {
 	uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
 }
 
-static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp)
+static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp)
 {
 	return constant_test_bit(uvhub, &dstp->bits[0]);
 }
-static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp)
+static inline void bau_uvhub_set(int pnode, struct pnmask *dstp)
 {
 	__set_bit(pnode, &dstp->bits[0]);
 }
-static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp,
+static inline void bau_uvhubs_clear(struct pnmask *dstp,
 				    int nbits)
 {
 	bitmap_zero(&dstp->bits[0], nbits);
 }
-static inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp)
+static inline int bau_uvhub_weight(struct pnmask *dstp)
 {
 	return bitmap_weight((unsigned long *)&dstp->bits[0],
 				UV_DISTRIBUTION_SIZE);
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index 4be52c86344..10474fb1185 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -61,1689 +61,2016 @@
 /* Compat: if this #define is present, UV headers support UV2 */
 #define UV2_HUB_IS_SUPPORTED	1
 
-/* KABI compat: if this #define is present, KABI hacks are present */
-#define UV2_HUB_KABI_HACKS	1
-
 /* ========================================================================= */
 /*                          UVH_BAU_DATA_BROADCAST                           */
 /* ========================================================================= */
-#define UVH_BAU_DATA_BROADCAST 0x61688UL
-#define UVH_BAU_DATA_BROADCAST_32 0x440
+#define UVH_BAU_DATA_BROADCAST				0x61688UL
+#define UVH_BAU_DATA_BROADCAST_32			0x440
 
-#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
-#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
+#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT		0
+#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK		0x0000000000000001UL
 
 union uvh_bau_data_broadcast_u {
-    unsigned long	v;
-    struct uvh_bau_data_broadcast_s {
-	unsigned long	enable :  1;  /* RW */
-	unsigned long	rsvd_1_63: 63;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_bau_data_broadcast_s {
+		unsigned long	enable:1;			/* RW */
+		unsigned long	rsvd_1_63:63;
+	} s;
 };
 
 /* ========================================================================= */
 /*                           UVH_BAU_DATA_CONFIG                             */
 /* ========================================================================= */
-#define UVH_BAU_DATA_CONFIG 0x61680UL
-#define UVH_BAU_DATA_CONFIG_32 0x438
-
-#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
-#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
-#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
-#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
-#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_BAU_DATA_CONFIG_P_SHFT 13
-#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_BAU_DATA_CONFIG_T_SHFT 15
-#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_BAU_DATA_CONFIG_M_SHFT 16
-#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
-#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_BAU_DATA_CONFIG				0x61680UL
+#define UVH_BAU_DATA_CONFIG_32				0x438
+
+#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT			0
+#define UVH_BAU_DATA_CONFIG_DM_SHFT			8
+#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT		11
+#define UVH_BAU_DATA_CONFIG_STATUS_SHFT			12
+#define UVH_BAU_DATA_CONFIG_P_SHFT			13
+#define UVH_BAU_DATA_CONFIG_T_SHFT			15
+#define UVH_BAU_DATA_CONFIG_M_SHFT			16
+#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT		32
+#define UVH_BAU_DATA_CONFIG_VECTOR_MASK			0x00000000000000ffUL
+#define UVH_BAU_DATA_CONFIG_DM_MASK			0x0000000000000700UL
+#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK		0x0000000000000800UL
+#define UVH_BAU_DATA_CONFIG_STATUS_MASK			0x0000000000001000UL
+#define UVH_BAU_DATA_CONFIG_P_MASK			0x0000000000002000UL
+#define UVH_BAU_DATA_CONFIG_T_MASK			0x0000000000008000UL
+#define UVH_BAU_DATA_CONFIG_M_MASK			0x0000000000010000UL
+#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK		0xffffffff00000000UL
 
 union uvh_bau_data_config_u {
-    unsigned long	v;
-    struct uvh_bau_data_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_bau_data_config_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	dm:3;				/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	status:1;			/* RO */
+		unsigned long	p:1;				/* RO */
+		unsigned long	rsvd_14:1;
+		unsigned long	t:1;				/* RO */
+		unsigned long	m:1;				/* RW */
+		unsigned long	rsvd_17_31:15;
+		unsigned long	apic_id:32;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                           UVH_EVENT_OCCURRED0                             */
 /* ========================================================================= */
-#define UVH_EVENT_OCCURRED0 0x70000UL
-#define UVH_EVENT_OCCURRED0_32 0x5e8
-
-#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
-#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
-#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
-#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
-#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
-#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
-#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
-#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
-#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
-#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
-#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
-#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
-#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
-#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
-#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
-#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
-#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
-#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
-#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
-#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
-#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
-#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
-#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
-#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
-#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
-#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
-#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
-#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
-#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
-#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
-#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
-#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
-#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
-#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
-#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
-#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
-#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
-#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
-#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
-#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
-#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
-#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
-#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
-#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
-#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
-#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
-#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
-#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
-#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
-#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
-#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
-#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
-#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
-#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
-#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
-#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
-#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
-#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
-#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
-#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
-#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
-#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
-#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
-#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
-#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
-#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
-#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
-#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
-#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
-#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
-#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
-#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
-
-#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
-#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
-#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
-#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
-#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
-#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
-#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
-#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
-#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
-#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
-#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
-#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
-#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
-#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
-#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
-#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
-#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
-#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
-#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
-#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
-#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
-#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
-#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
-#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
-#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
-#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
-#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
-#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
-#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
-#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
-#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
-#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
-#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
-#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
-#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
-#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
-#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
-#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
-#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
-#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
-#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
-#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
-#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
-#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
-#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
-#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
-#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
-#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
-#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
-#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
-#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
-#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
-#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
-#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
-#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
-#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
-#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
-#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
-#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
-#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
-#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
-#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
+#define UVH_EVENT_OCCURRED0				0x70000UL
+#define UVH_EVENT_OCCURRED0_32				0x5e8
+
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT		0
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT		1
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT		2
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT		3
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT		4
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT		5
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT		6
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT		7
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT		8
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT		9
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT		10
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT		11
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT		12
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT		13
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT		14
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT		15
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT		16
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT		17
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT		18
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT		19
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT		20
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT		21
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT	22
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT		23
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT		24
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT		25
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT		26
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT		27
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT		28
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT		29
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT		30
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT		31
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT		32
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT		33
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT		34
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT		35
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT		36
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT		37
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT		38
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT		39
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT		40
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT		41
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT		42
+#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT		43
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT	44
+#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT		45
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT		46
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT		47
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT		48
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT		49
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT		50
+#define UV1H_EVENT_OCCURRED0_RTC0_SHFT			51
+#define UV1H_EVENT_OCCURRED0_RTC1_SHFT			52
+#define UV1H_EVENT_OCCURRED0_RTC2_SHFT			53
+#define UV1H_EVENT_OCCURRED0_RTC3_SHFT			54
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT		55
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT	56
+#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK		0x0000000000000001UL
+#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK		0x0000000000000002UL
+#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK		0x0000000000000004UL
+#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK		0x0000000000000008UL
+#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK		0x0000000000000010UL
+#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK		0x0000000000000020UL
+#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK		0x0000000000000040UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK		0x0000000000000080UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK		0x0000000000000100UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK		0x0000000000000200UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK		0x0000000000000400UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK		0x0000000000000800UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK		0x0000000000001000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK		0x0000000000002000UL
+#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK		0x0000000000004000UL
+#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK		0x0000000000008000UL
+#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK		0x0000000000010000UL
+#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK		0x0000000000020000UL
+#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK		0x0000000000040000UL
+#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK		0x0000000000080000UL
+#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK		0x0000000000100000UL
+#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK		0x0000000000200000UL
+#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK	0x0000000000400000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK		0x0000000000800000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK		0x0000000001000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK		0x0000000002000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK		0x0000000004000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK		0x0000000008000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK		0x0000000010000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK		0x0000000020000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK		0x0000000040000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK		0x0000000080000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK		0x0000000100000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK		0x0000000200000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK		0x0000000400000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK		0x0000000800000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK		0x0000001000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK		0x0000002000000000UL
+#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK		0x0000004000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK		0x0000008000000000UL
+#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK		0x0000010000000000UL
+#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK		0x0000020000000000UL
+#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK		0x0000040000000000UL
+#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK		0x0000080000000000UL
+#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK	0x0000100000000000UL
+#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK		0x0000200000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK		0x0000400000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK		0x0000800000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK		0x0001000000000000UL
+#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK		0x0002000000000000UL
+#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK		0x0004000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC0_MASK			0x0008000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC1_MASK			0x0010000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC2_MASK			0x0020000000000000UL
+#define UV1H_EVENT_OCCURRED0_RTC3_MASK			0x0040000000000000UL
+#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK		0x0080000000000000UL
+#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK	0x0100000000000000UL
+
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT		0
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT		1
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT		2
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT		3
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT		4
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT		5
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT		6
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT		7
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT		8
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT		9
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT		10
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT		11
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT		12
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT		13
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT		14
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT		15
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT		16
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT		17
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT		18
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT		19
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT		20
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT		21
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT		22
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT		23
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT		24
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT		25
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT		26
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT		27
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT		28
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT		29
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT		30
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT	31
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT		32
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT		33
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT		34
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT		35
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT		36
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT		37
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT		38
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT		39
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT		40
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT		41
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT		42
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT		43
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT		44
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT		45
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT		46
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT		47
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT		48
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT		49
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT		50
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT		51
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT	52
+#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT		53
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT		54
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT		55
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT		56
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT		57
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT		58
+#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK		0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK		0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK		0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK		0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK		0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK		0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK		0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK		0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK		0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK		0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK		0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK		0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK		0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK		0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK		0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK		0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK		0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK		0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK		0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK		0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK		0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK		0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK		0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK		0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK		0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK		0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK		0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK		0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK		0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK		0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK		0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK	0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK		0x0000000100000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK		0x0000000200000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK		0x0000000400000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK		0x0000000800000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK		0x0000001000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK		0x0000002000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK		0x0000004000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK		0x0000008000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK		0x0000010000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK		0x0000020000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK		0x0000040000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK		0x0000080000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK		0x0000100000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK		0x0000200000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK		0x0000400000000000UL
+#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK		0x0000800000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK		0x0001000000000000UL
+#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK		0x0002000000000000UL
+#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK		0x0004000000000000UL
+#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK		0x0008000000000000UL
+#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK	0x0010000000000000UL
+#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK		0x0020000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK		0x0040000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK		0x0080000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK		0x0100000000000000UL
+#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK		0x0200000000000000UL
+#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK		0x0400000000000000UL
 
 union uvh_event_occurred0_u {
-    unsigned long	v;
-    struct uv1h_event_occurred0_s {
-	unsigned long	lb_hcerr             :  1;  /* RW, W1C */
-	unsigned long	gr0_hcerr            :  1;  /* RW, W1C */
-	unsigned long	gr1_hcerr            :  1;  /* RW, W1C */
-	unsigned long	lh_hcerr             :  1;  /* RW, W1C */
-	unsigned long	rh_hcerr             :  1;  /* RW, W1C */
-	unsigned long	xn_hcerr             :  1;  /* RW, W1C */
-	unsigned long	si_hcerr             :  1;  /* RW, W1C */
-	unsigned long	lb_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	gr0_aoerr0           :  1;  /* RW, W1C */
-	unsigned long	gr1_aoerr0           :  1;  /* RW, W1C */
-	unsigned long	lh_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	rh_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	xn_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	si_aoerr0            :  1;  /* RW, W1C */
-	unsigned long	lb_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	gr0_aoerr1           :  1;  /* RW, W1C */
-	unsigned long	gr1_aoerr1           :  1;  /* RW, W1C */
-	unsigned long	lh_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	rh_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	xn_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	si_aoerr1            :  1;  /* RW, W1C */
-	unsigned long	rh_vpi_int           :  1;  /* RW, W1C */
-	unsigned long	system_shutdown_int  :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_0         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_1         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_2         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_3         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_4         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_5         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_6         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_7         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_8         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_9         :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_10        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_11        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_12        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_13        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_14        :  1;  /* RW, W1C */
-	unsigned long	lb_irq_int_15        :  1;  /* RW, W1C */
-	unsigned long	l1_nmi_int           :  1;  /* RW, W1C */
-	unsigned long	stop_clock           :  1;  /* RW, W1C */
-	unsigned long	asic_to_l1           :  1;  /* RW, W1C */
-	unsigned long	l1_to_asic           :  1;  /* RW, W1C */
-	unsigned long	ltc_int              :  1;  /* RW, W1C */
-	unsigned long	la_seq_trigger       :  1;  /* RW, W1C */
-	unsigned long	ipi_int              :  1;  /* RW, W1C */
-	unsigned long	extio_int0           :  1;  /* RW, W1C */
-	unsigned long	extio_int1           :  1;  /* RW, W1C */
-	unsigned long	extio_int2           :  1;  /* RW, W1C */
-	unsigned long	extio_int3           :  1;  /* RW, W1C */
-	unsigned long	profile_int          :  1;  /* RW, W1C */
-	unsigned long	rtc0                 :  1;  /* RW, W1C */
-	unsigned long	rtc1                 :  1;  /* RW, W1C */
-	unsigned long	rtc2                 :  1;  /* RW, W1C */
-	unsigned long	rtc3                 :  1;  /* RW, W1C */
-	unsigned long	bau_data             :  1;  /* RW, W1C */
-	unsigned long	power_management_req :  1;  /* RW, W1C */
-	unsigned long	rsvd_57_63           :  7;  /*    */
-    } s1;
-    struct uv2h_event_occurred0_s {
-	unsigned long	lb_hcerr            :  1;  /* RW */
-	unsigned long	qp_hcerr            :  1;  /* RW */
-	unsigned long	rh_hcerr            :  1;  /* RW */
-	unsigned long	lh0_hcerr           :  1;  /* RW */
-	unsigned long	lh1_hcerr           :  1;  /* RW */
-	unsigned long	gr0_hcerr           :  1;  /* RW */
-	unsigned long	gr1_hcerr           :  1;  /* RW */
-	unsigned long	ni0_hcerr           :  1;  /* RW */
-	unsigned long	ni1_hcerr           :  1;  /* RW */
-	unsigned long	lb_aoerr0           :  1;  /* RW */
-	unsigned long	qp_aoerr0           :  1;  /* RW */
-	unsigned long	rh_aoerr0           :  1;  /* RW */
-	unsigned long	lh0_aoerr0          :  1;  /* RW */
-	unsigned long	lh1_aoerr0          :  1;  /* RW */
-	unsigned long	gr0_aoerr0          :  1;  /* RW */
-	unsigned long	gr1_aoerr0          :  1;  /* RW */
-	unsigned long	xb_aoerr0           :  1;  /* RW */
-	unsigned long	rt_aoerr0           :  1;  /* RW */
-	unsigned long	ni0_aoerr0          :  1;  /* RW */
-	unsigned long	ni1_aoerr0          :  1;  /* RW */
-	unsigned long	lb_aoerr1           :  1;  /* RW */
-	unsigned long	qp_aoerr1           :  1;  /* RW */
-	unsigned long	rh_aoerr1           :  1;  /* RW */
-	unsigned long	lh0_aoerr1          :  1;  /* RW */
-	unsigned long	lh1_aoerr1          :  1;  /* RW */
-	unsigned long	gr0_aoerr1          :  1;  /* RW */
-	unsigned long	gr1_aoerr1          :  1;  /* RW */
-	unsigned long	xb_aoerr1           :  1;  /* RW */
-	unsigned long	rt_aoerr1           :  1;  /* RW */
-	unsigned long	ni0_aoerr1          :  1;  /* RW */
-	unsigned long	ni1_aoerr1          :  1;  /* RW */
-	unsigned long	system_shutdown_int :  1;  /* RW */
-	unsigned long	lb_irq_int_0        :  1;  /* RW */
-	unsigned long	lb_irq_int_1        :  1;  /* RW */
-	unsigned long	lb_irq_int_2        :  1;  /* RW */
-	unsigned long	lb_irq_int_3        :  1;  /* RW */
-	unsigned long	lb_irq_int_4        :  1;  /* RW */
-	unsigned long	lb_irq_int_5        :  1;  /* RW */
-	unsigned long	lb_irq_int_6        :  1;  /* RW */
-	unsigned long	lb_irq_int_7        :  1;  /* RW */
-	unsigned long	lb_irq_int_8        :  1;  /* RW */
-	unsigned long	lb_irq_int_9        :  1;  /* RW */
-	unsigned long	lb_irq_int_10       :  1;  /* RW */
-	unsigned long	lb_irq_int_11       :  1;  /* RW */
-	unsigned long	lb_irq_int_12       :  1;  /* RW */
-	unsigned long	lb_irq_int_13       :  1;  /* RW */
-	unsigned long	lb_irq_int_14       :  1;  /* RW */
-	unsigned long	lb_irq_int_15       :  1;  /* RW */
-	unsigned long	l1_nmi_int          :  1;  /* RW */
-	unsigned long	stop_clock          :  1;  /* RW */
-	unsigned long	asic_to_l1          :  1;  /* RW */
-	unsigned long	l1_to_asic          :  1;  /* RW */
-	unsigned long	la_seq_trigger      :  1;  /* RW */
-	unsigned long	ipi_int             :  1;  /* RW */
-	unsigned long	extio_int0          :  1;  /* RW */
-	unsigned long	extio_int1          :  1;  /* RW */
-	unsigned long	extio_int2          :  1;  /* RW */
-	unsigned long	extio_int3          :  1;  /* RW */
-	unsigned long	profile_int         :  1;  /* RW */
-	unsigned long	rsvd_59_63          :  5;  /*    */
-    } s2;
+	unsigned long	v;
+	struct uv1h_event_occurred0_s {
+		unsigned long	lb_hcerr:1;			/* RW, W1C */
+		unsigned long	gr0_hcerr:1;			/* RW, W1C */
+		unsigned long	gr1_hcerr:1;			/* RW, W1C */
+		unsigned long	lh_hcerr:1;			/* RW, W1C */
+		unsigned long	rh_hcerr:1;			/* RW, W1C */
+		unsigned long	xn_hcerr:1;			/* RW, W1C */
+		unsigned long	si_hcerr:1;			/* RW, W1C */
+		unsigned long	lb_aoerr0:1;			/* RW, W1C */
+		unsigned long	gr0_aoerr0:1;			/* RW, W1C */
+		unsigned long	gr1_aoerr0:1;			/* RW, W1C */
+		unsigned long	lh_aoerr0:1;			/* RW, W1C */
+		unsigned long	rh_aoerr0:1;			/* RW, W1C */
+		unsigned long	xn_aoerr0:1;			/* RW, W1C */
+		unsigned long	si_aoerr0:1;			/* RW, W1C */
+		unsigned long	lb_aoerr1:1;			/* RW, W1C */
+		unsigned long	gr0_aoerr1:1;			/* RW, W1C */
+		unsigned long	gr1_aoerr1:1;			/* RW, W1C */
+		unsigned long	lh_aoerr1:1;			/* RW, W1C */
+		unsigned long	rh_aoerr1:1;			/* RW, W1C */
+		unsigned long	xn_aoerr1:1;			/* RW, W1C */
+		unsigned long	si_aoerr1:1;			/* RW, W1C */
+		unsigned long	rh_vpi_int:1;			/* RW, W1C */
+		unsigned long	system_shutdown_int:1;		/* RW, W1C */
+		unsigned long	lb_irq_int_0:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_1:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_2:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_3:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_4:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_5:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_6:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_7:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_8:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_9:1;			/* RW, W1C */
+		unsigned long	lb_irq_int_10:1;		/* RW, W1C */
+		unsigned long	lb_irq_int_11:1;		/* RW, W1C */
+		unsigned long	lb_irq_int_12:1;		/* RW, W1C */
+		unsigned long	lb_irq_int_13:1;		/* RW, W1C */
+		unsigned long	lb_irq_int_14:1;		/* RW, W1C */
+		unsigned long	lb_irq_int_15:1;		/* RW, W1C */
+		unsigned long	l1_nmi_int:1;			/* RW, W1C */
+		unsigned long	stop_clock:1;			/* RW, W1C */
+		unsigned long	asic_to_l1:1;			/* RW, W1C */
+		unsigned long	l1_to_asic:1;			/* RW, W1C */
+		unsigned long	ltc_int:1;			/* RW, W1C */
+		unsigned long	la_seq_trigger:1;		/* RW, W1C */
+		unsigned long	ipi_int:1;			/* RW, W1C */
+		unsigned long	extio_int0:1;			/* RW, W1C */
+		unsigned long	extio_int1:1;			/* RW, W1C */
+		unsigned long	extio_int2:1;			/* RW, W1C */
+		unsigned long	extio_int3:1;			/* RW, W1C */
+		unsigned long	profile_int:1;			/* RW, W1C */
+		unsigned long	rtc0:1;				/* RW, W1C */
+		unsigned long	rtc1:1;				/* RW, W1C */
+		unsigned long	rtc2:1;				/* RW, W1C */
+		unsigned long	rtc3:1;				/* RW, W1C */
+		unsigned long	bau_data:1;			/* RW, W1C */
+		unsigned long	power_management_req:1;		/* RW, W1C */
+		unsigned long	rsvd_57_63:7;
+	} s1;
+	struct uv2h_event_occurred0_s {
+		unsigned long	lb_hcerr:1;			/* RW */
+		unsigned long	qp_hcerr:1;			/* RW */
+		unsigned long	rh_hcerr:1;			/* RW */
+		unsigned long	lh0_hcerr:1;			/* RW */
+		unsigned long	lh1_hcerr:1;			/* RW */
+		unsigned long	gr0_hcerr:1;			/* RW */
+		unsigned long	gr1_hcerr:1;			/* RW */
+		unsigned long	ni0_hcerr:1;			/* RW */
+		unsigned long	ni1_hcerr:1;			/* RW */
+		unsigned long	lb_aoerr0:1;			/* RW */
+		unsigned long	qp_aoerr0:1;			/* RW */
+		unsigned long	rh_aoerr0:1;			/* RW */
+		unsigned long	lh0_aoerr0:1;			/* RW */
+		unsigned long	lh1_aoerr0:1;			/* RW */
+		unsigned long	gr0_aoerr0:1;			/* RW */
+		unsigned long	gr1_aoerr0:1;			/* RW */
+		unsigned long	xb_aoerr0:1;			/* RW */
+		unsigned long	rt_aoerr0:1;			/* RW */
+		unsigned long	ni0_aoerr0:1;			/* RW */
+		unsigned long	ni1_aoerr0:1;			/* RW */
+		unsigned long	lb_aoerr1:1;			/* RW */
+		unsigned long	qp_aoerr1:1;			/* RW */
+		unsigned long	rh_aoerr1:1;			/* RW */
+		unsigned long	lh0_aoerr1:1;			/* RW */
+		unsigned long	lh1_aoerr1:1;			/* RW */
+		unsigned long	gr0_aoerr1:1;			/* RW */
+		unsigned long	gr1_aoerr1:1;			/* RW */
+		unsigned long	xb_aoerr1:1;			/* RW */
+		unsigned long	rt_aoerr1:1;			/* RW */
+		unsigned long	ni0_aoerr1:1;			/* RW */
+		unsigned long	ni1_aoerr1:1;			/* RW */
+		unsigned long	system_shutdown_int:1;		/* RW */
+		unsigned long	lb_irq_int_0:1;			/* RW */
+		unsigned long	lb_irq_int_1:1;			/* RW */
+		unsigned long	lb_irq_int_2:1;			/* RW */
+		unsigned long	lb_irq_int_3:1;			/* RW */
+		unsigned long	lb_irq_int_4:1;			/* RW */
+		unsigned long	lb_irq_int_5:1;			/* RW */
+		unsigned long	lb_irq_int_6:1;			/* RW */
+		unsigned long	lb_irq_int_7:1;			/* RW */
+		unsigned long	lb_irq_int_8:1;			/* RW */
+		unsigned long	lb_irq_int_9:1;			/* RW */
+		unsigned long	lb_irq_int_10:1;		/* RW */
+		unsigned long	lb_irq_int_11:1;		/* RW */
+		unsigned long	lb_irq_int_12:1;		/* RW */
+		unsigned long	lb_irq_int_13:1;		/* RW */
+		unsigned long	lb_irq_int_14:1;		/* RW */
+		unsigned long	lb_irq_int_15:1;		/* RW */
+		unsigned long	l1_nmi_int:1;			/* RW */
+		unsigned long	stop_clock:1;			/* RW */
+		unsigned long	asic_to_l1:1;			/* RW */
+		unsigned long	l1_to_asic:1;			/* RW */
+		unsigned long	la_seq_trigger:1;		/* RW */
+		unsigned long	ipi_int:1;			/* RW */
+		unsigned long	extio_int0:1;			/* RW */
+		unsigned long	extio_int1:1;			/* RW */
+		unsigned long	extio_int2:1;			/* RW */
+		unsigned long	extio_int3:1;			/* RW */
+		unsigned long	profile_int:1;			/* RW */
+		unsigned long	rsvd_59_63:5;
+	} s2;
 };
 
 /* ========================================================================= */
 /*                        UVH_EVENT_OCCURRED0_ALIAS                          */
 /* ========================================================================= */
-#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
-#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
+#define UVH_EVENT_OCCURRED0_ALIAS			0x0000000000070008UL
+#define UVH_EVENT_OCCURRED0_ALIAS_32			0x5f0
 
 /* ========================================================================= */
 /*                         UVH_GR0_TLB_INT0_CONFIG                           */
 /* ========================================================================= */
-#define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL
-
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT0_CONFIG				0x61b00UL
+
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT		0
+#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT			8
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT		11
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT		12
+#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT			13
+#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT			15
+#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT			16
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT		32
+#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK		0x00000000000000ffUL
+#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK			0x0000000000000700UL
+#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK		0x0000000000000800UL
+#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK		0x0000000000001000UL
+#define UVH_GR0_TLB_INT0_CONFIG_P_MASK			0x0000000000002000UL
+#define UVH_GR0_TLB_INT0_CONFIG_T_MASK			0x0000000000008000UL
+#define UVH_GR0_TLB_INT0_CONFIG_M_MASK			0x0000000000010000UL
+#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK		0xffffffff00000000UL
 
 union uvh_gr0_tlb_int0_config_u {
-    unsigned long	v;
-    struct uvh_gr0_tlb_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_gr0_tlb_int0_config_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	dm:3;				/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	status:1;			/* RO */
+		unsigned long	p:1;				/* RO */
+		unsigned long	rsvd_14:1;
+		unsigned long	t:1;				/* RO */
+		unsigned long	m:1;				/* RW */
+		unsigned long	rsvd_17_31:15;
+		unsigned long	apic_id:32;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                         UVH_GR0_TLB_INT1_CONFIG                           */
 /* ========================================================================= */
-#define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL
-
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR0_TLB_INT1_CONFIG				0x61b40UL
+
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT		0
+#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT			8
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT		11
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT		12
+#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT			13
+#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT			15
+#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT			16
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT		32
+#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK		0x00000000000000ffUL
+#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK			0x0000000000000700UL
+#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK		0x0000000000000800UL
+#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK		0x0000000000001000UL
+#define UVH_GR0_TLB_INT1_CONFIG_P_MASK			0x0000000000002000UL
+#define UVH_GR0_TLB_INT1_CONFIG_T_MASK			0x0000000000008000UL
+#define UVH_GR0_TLB_INT1_CONFIG_M_MASK			0x0000000000010000UL
+#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK		0xffffffff00000000UL
 
 union uvh_gr0_tlb_int1_config_u {
-    unsigned long	v;
-    struct uvh_gr0_tlb_int1_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_gr0_tlb_int1_config_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	dm:3;				/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	status:1;			/* RO */
+		unsigned long	p:1;				/* RO */
+		unsigned long	rsvd_14:1;
+		unsigned long	t:1;				/* RO */
+		unsigned long	m:1;				/* RW */
+		unsigned long	rsvd_17_31:15;
+		unsigned long	apic_id:32;			/* RW */
+	} s;
+};
+
+/* ========================================================================= */
+/*                         UVH_GR0_TLB_MMR_CONTROL                           */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL
+#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL
+#define UVH_GR0_TLB_MMR_CONTROL (is_uv1_hub() ?				\
+			UV1H_GR0_TLB_MMR_CONTROL :			\
+			UV2H_GR0_TLB_MMR_CONTROL)
+
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT		0
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT		12
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT	16
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT	20
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT		30
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT		31
+#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK		0x0000000000000fffUL
+#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK		0x0000000000003000UL
+#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK	0x0000000000010000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK	0x0000000000100000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK		0x0000000040000000UL
+#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK		0x0000000080000000UL
+
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT		0
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT		12
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT	16
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT	20
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT		30
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT		31
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT	48
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT	52
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT	54
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT	56
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT	60
+#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK		0x0000000000000fffUL
+#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK		0x0000000000003000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK	0x0000000000010000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK	0x0000000000100000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK		0x0000000040000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK		0x0000000080000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK	0x0001000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK	0x0010000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK	0x0040000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK	0x0100000000000000UL
+#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK	0x1000000000000000UL
+
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT		0
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT		12
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT	16
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT	20
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT		30
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT		31
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT	32
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT	48
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT	52
+#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK		0x0000000000000fffUL
+#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK		0x0000000000003000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK	0x0000000000010000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK	0x0000000000100000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK		0x0000000040000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK		0x0000000080000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK	0x0000000100000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK	0x0001000000000000UL
+#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK	0x0010000000000000UL
+
+union uvh_gr0_tlb_mmr_control_u {
+	unsigned long	v;
+	struct uvh_gr0_tlb_mmr_control_s {
+		unsigned long	index:12;			/* RW */
+		unsigned long	mem_sel:2;			/* RW */
+		unsigned long	rsvd_14_15:2;
+		unsigned long	auto_valid_en:1;		/* RW */
+		unsigned long	rsvd_17_19:3;
+		unsigned long	mmr_hash_index_en:1;		/* RW */
+		unsigned long	rsvd_21_29:9;
+		unsigned long	mmr_write:1;			/* WP */
+		unsigned long	mmr_read:1;			/* WP */
+		unsigned long	rsvd_32_63:32;
+	} s;
+	struct uv1h_gr0_tlb_mmr_control_s {
+		unsigned long	index:12;			/* RW */
+		unsigned long	mem_sel:2;			/* RW */
+		unsigned long	rsvd_14_15:2;
+		unsigned long	auto_valid_en:1;		/* RW */
+		unsigned long	rsvd_17_19:3;
+		unsigned long	mmr_hash_index_en:1;		/* RW */
+		unsigned long	rsvd_21_29:9;
+		unsigned long	mmr_write:1;			/* WP */
+		unsigned long	mmr_read:1;			/* WP */
+		unsigned long	rsvd_32_47:16;
+		unsigned long	mmr_inj_con:1;			/* RW */
+		unsigned long	rsvd_49_51:3;
+		unsigned long	mmr_inj_tlbram:1;		/* RW */
+		unsigned long	rsvd_53:1;
+		unsigned long	mmr_inj_tlbpgsize:1;		/* RW */
+		unsigned long	rsvd_55:1;
+		unsigned long	mmr_inj_tlbrreg:1;		/* RW */
+		unsigned long	rsvd_57_59:3;
+		unsigned long	mmr_inj_tlblruv:1;		/* RW */
+		unsigned long	rsvd_61_63:3;
+	} s1;
+	struct uv2h_gr0_tlb_mmr_control_s {
+		unsigned long	index:12;			/* RW */
+		unsigned long	mem_sel:2;			/* RW */
+		unsigned long	rsvd_14_15:2;
+		unsigned long	auto_valid_en:1;		/* RW */
+		unsigned long	rsvd_17_19:3;
+		unsigned long	mmr_hash_index_en:1;		/* RW */
+		unsigned long	rsvd_21_29:9;
+		unsigned long	mmr_write:1;			/* WP */
+		unsigned long	mmr_read:1;			/* WP */
+		unsigned long	mmr_op_done:1;			/* RW */
+		unsigned long	rsvd_33_47:15;
+		unsigned long	mmr_inj_con:1;			/* RW */
+		unsigned long	rsvd_49_51:3;
+		unsigned long	mmr_inj_tlbram:1;		/* RW */
+		unsigned long	rsvd_53_63:11;
+	} s2;
+};
+
+/* ========================================================================= */
+/*                       UVH_GR0_TLB_MMR_READ_DATA_HI                        */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI (is_uv1_hub() ?			\
+			UV1H_GR0_TLB_MMR_READ_DATA_HI :			\
+			UV2H_GR0_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT		0
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT		41
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT		43
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT	44
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK		0x000001ffffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK		0x0000060000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK		0x0000080000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK	0x0000100000000000UL
+
+union uvh_gr0_tlb_mmr_read_data_hi_u {
+	unsigned long	v;
+	struct uvh_gr0_tlb_mmr_read_data_hi_s {
+		unsigned long	pfn:41;				/* RO */
+		unsigned long	gaa:2;				/* RO */
+		unsigned long	dirty:1;			/* RO */
+		unsigned long	larger:1;			/* RO */
+		unsigned long	rsvd_45_63:19;
+	} s;
+};
+
+/* ========================================================================= */
+/*                       UVH_GR0_TLB_MMR_READ_DATA_LO                        */
+/* ========================================================================= */
+#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL
+#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO (is_uv1_hub() ?			\
+			UV1H_GR0_TLB_MMR_READ_DATA_LO :			\
+			UV2H_GR0_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT		0
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT		39
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT		63
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK		0x0000007fffffffffUL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK		0x7fffff8000000000UL
+#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK		0x8000000000000000UL
+
+union uvh_gr0_tlb_mmr_read_data_lo_u {
+	unsigned long	v;
+	struct uvh_gr0_tlb_mmr_read_data_lo_s {
+		unsigned long	vpn:39;				/* RO */
+		unsigned long	asid:24;			/* RO */
+		unsigned long	valid:1;			/* RO */
+	} s;
 };
 
 /* ========================================================================= */
 /*                         UVH_GR1_TLB_INT0_CONFIG                           */
 /* ========================================================================= */
-#define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL
-
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT0_CONFIG				0x61f00UL
+
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT		0
+#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT			8
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT		11
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT		12
+#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT			13
+#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT			15
+#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT			16
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT		32
+#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK		0x00000000000000ffUL
+#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK			0x0000000000000700UL
+#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK		0x0000000000000800UL
+#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK		0x0000000000001000UL
+#define UVH_GR1_TLB_INT0_CONFIG_P_MASK			0x0000000000002000UL
+#define UVH_GR1_TLB_INT0_CONFIG_T_MASK			0x0000000000008000UL
+#define UVH_GR1_TLB_INT0_CONFIG_M_MASK			0x0000000000010000UL
+#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK		0xffffffff00000000UL
 
 union uvh_gr1_tlb_int0_config_u {
-    unsigned long	v;
-    struct uvh_gr1_tlb_int0_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_gr1_tlb_int0_config_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	dm:3;				/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	status:1;			/* RO */
+		unsigned long	p:1;				/* RO */
+		unsigned long	rsvd_14:1;
+		unsigned long	t:1;				/* RO */
+		unsigned long	m:1;				/* RW */
+		unsigned long	rsvd_17_31:15;
+		unsigned long	apic_id:32;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                         UVH_GR1_TLB_INT1_CONFIG                           */
 /* ========================================================================= */
-#define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL
-
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0
-#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8
-#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11
-#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12
-#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13
-#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15
-#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16
-#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32
-#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_GR1_TLB_INT1_CONFIG				0x61f40UL
+
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT		0
+#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT			8
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT		11
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT		12
+#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT			13
+#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT			15
+#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT			16
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT		32
+#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK		0x00000000000000ffUL
+#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK			0x0000000000000700UL
+#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK		0x0000000000000800UL
+#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK		0x0000000000001000UL
+#define UVH_GR1_TLB_INT1_CONFIG_P_MASK			0x0000000000002000UL
+#define UVH_GR1_TLB_INT1_CONFIG_T_MASK			0x0000000000008000UL
+#define UVH_GR1_TLB_INT1_CONFIG_M_MASK			0x0000000000010000UL
+#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK		0xffffffff00000000UL
 
 union uvh_gr1_tlb_int1_config_u {
-    unsigned long	v;
-    struct uvh_gr1_tlb_int1_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_gr1_tlb_int1_config_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	dm:3;				/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	status:1;			/* RO */
+		unsigned long	p:1;				/* RO */
+		unsigned long	rsvd_14:1;
+		unsigned long	t:1;				/* RO */
+		unsigned long	m:1;				/* RW */
+		unsigned long	rsvd_17_31:15;
+		unsigned long	apic_id:32;			/* RW */
+	} s;
+};
+
+/* ========================================================================= */
+/*                         UVH_GR1_TLB_MMR_CONTROL                           */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL
+#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL
+#define UVH_GR1_TLB_MMR_CONTROL (is_uv1_hub() ?				\
+			UV1H_GR1_TLB_MMR_CONTROL :			\
+			UV2H_GR1_TLB_MMR_CONTROL)
+
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT		0
+#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT		12
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT	16
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT	20
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT		30
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT		31
+#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK		0x0000000000000fffUL
+#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK		0x0000000000003000UL
+#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK	0x0000000000010000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK	0x0000000000100000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK		0x0000000040000000UL
+#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK		0x0000000080000000UL
+
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT		0
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT		12
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT	16
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT	20
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT		30
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT		31
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT	48
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT	52
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT	54
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT	56
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT	60
+#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK		0x0000000000000fffUL
+#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK		0x0000000000003000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK	0x0000000000010000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK	0x0000000000100000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK		0x0000000040000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK		0x0000000080000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK	0x0001000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK	0x0010000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK	0x0040000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK	0x0100000000000000UL
+#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK	0x1000000000000000UL
+
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT		0
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT		12
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT	16
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT	20
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT		30
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT		31
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT	32
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT	48
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT	52
+#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK		0x0000000000000fffUL
+#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK		0x0000000000003000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK	0x0000000000010000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK	0x0000000000100000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK		0x0000000040000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK		0x0000000080000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK	0x0000000100000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK	0x0001000000000000UL
+#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK	0x0010000000000000UL
+
+union uvh_gr1_tlb_mmr_control_u {
+	unsigned long	v;
+	struct uvh_gr1_tlb_mmr_control_s {
+		unsigned long	index:12;			/* RW */
+		unsigned long	mem_sel:2;			/* RW */
+		unsigned long	rsvd_14_15:2;
+		unsigned long	auto_valid_en:1;		/* RW */
+		unsigned long	rsvd_17_19:3;
+		unsigned long	mmr_hash_index_en:1;		/* RW */
+		unsigned long	rsvd_21_29:9;
+		unsigned long	mmr_write:1;			/* WP */
+		unsigned long	mmr_read:1;			/* WP */
+		unsigned long	rsvd_32_63:32;
+	} s;
+	struct uv1h_gr1_tlb_mmr_control_s {
+		unsigned long	index:12;			/* RW */
+		unsigned long	mem_sel:2;			/* RW */
+		unsigned long	rsvd_14_15:2;
+		unsigned long	auto_valid_en:1;		/* RW */
+		unsigned long	rsvd_17_19:3;
+		unsigned long	mmr_hash_index_en:1;		/* RW */
+		unsigned long	rsvd_21_29:9;
+		unsigned long	mmr_write:1;			/* WP */
+		unsigned long	mmr_read:1;			/* WP */
+		unsigned long	rsvd_32_47:16;
+		unsigned long	mmr_inj_con:1;			/* RW */
+		unsigned long	rsvd_49_51:3;
+		unsigned long	mmr_inj_tlbram:1;		/* RW */
+		unsigned long	rsvd_53:1;
+		unsigned long	mmr_inj_tlbpgsize:1;		/* RW */
+		unsigned long	rsvd_55:1;
+		unsigned long	mmr_inj_tlbrreg:1;		/* RW */
+		unsigned long	rsvd_57_59:3;
+		unsigned long	mmr_inj_tlblruv:1;		/* RW */
+		unsigned long	rsvd_61_63:3;
+	} s1;
+	struct uv2h_gr1_tlb_mmr_control_s {
+		unsigned long	index:12;			/* RW */
+		unsigned long	mem_sel:2;			/* RW */
+		unsigned long	rsvd_14_15:2;
+		unsigned long	auto_valid_en:1;		/* RW */
+		unsigned long	rsvd_17_19:3;
+		unsigned long	mmr_hash_index_en:1;		/* RW */
+		unsigned long	rsvd_21_29:9;
+		unsigned long	mmr_write:1;			/* WP */
+		unsigned long	mmr_read:1;			/* WP */
+		unsigned long	mmr_op_done:1;			/* RW */
+		unsigned long	rsvd_33_47:15;
+		unsigned long	mmr_inj_con:1;			/* RW */
+		unsigned long	rsvd_49_51:3;
+		unsigned long	mmr_inj_tlbram:1;		/* RW */
+		unsigned long	rsvd_53_63:11;
+	} s2;
+};
+
+/* ========================================================================= */
+/*                       UVH_GR1_TLB_MMR_READ_DATA_HI                        */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI (is_uv1_hub() ?			\
+			UV1H_GR1_TLB_MMR_READ_DATA_HI :			\
+			UV2H_GR1_TLB_MMR_READ_DATA_HI)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT		0
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT		41
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT		43
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT	44
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK		0x000001ffffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK		0x0000060000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK		0x0000080000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK	0x0000100000000000UL
+
+union uvh_gr1_tlb_mmr_read_data_hi_u {
+	unsigned long	v;
+	struct uvh_gr1_tlb_mmr_read_data_hi_s {
+		unsigned long	pfn:41;				/* RO */
+		unsigned long	gaa:2;				/* RO */
+		unsigned long	dirty:1;			/* RO */
+		unsigned long	larger:1;			/* RO */
+		unsigned long	rsvd_45_63:19;
+	} s;
+};
+
+/* ========================================================================= */
+/*                       UVH_GR1_TLB_MMR_READ_DATA_LO                        */
+/* ========================================================================= */
+#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL
+#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO (is_uv1_hub() ?			\
+			UV1H_GR1_TLB_MMR_READ_DATA_LO :			\
+			UV2H_GR1_TLB_MMR_READ_DATA_LO)
+
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT		0
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT		39
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT		63
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK		0x0000007fffffffffUL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK		0x7fffff8000000000UL
+#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK		0x8000000000000000UL
+
+union uvh_gr1_tlb_mmr_read_data_lo_u {
+	unsigned long	v;
+	struct uvh_gr1_tlb_mmr_read_data_lo_s {
+		unsigned long	vpn:39;				/* RO */
+		unsigned long	asid:24;			/* RO */
+		unsigned long	valid:1;			/* RO */
+	} s;
 };
 
 /* ========================================================================= */
 /*                               UVH_INT_CMPB                                */
 /* ========================================================================= */
-#define UVH_INT_CMPB 0x22080UL
+#define UVH_INT_CMPB					0x22080UL
 
-#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
-#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
+#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT		0
+#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK		0x00ffffffffffffffUL
 
 union uvh_int_cmpb_u {
-    unsigned long	v;
-    struct uvh_int_cmpb_s {
-	unsigned long	real_time_cmpb : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_int_cmpb_s {
+		unsigned long	real_time_cmpb:56;		/* RW */
+		unsigned long	rsvd_56_63:8;
+	} s;
 };
 
 /* ========================================================================= */
 /*                               UVH_INT_CMPC                                */
 /* ========================================================================= */
-#define UVH_INT_CMPC 0x22100UL
+#define UVH_INT_CMPC					0x22100UL
 
-#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT	0
-#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT	0
-#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT	(is_uv1_hub() ?		\
-			UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT :	\
-			UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT)
-#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK	0xffffffffffffffUL
-#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK	0xffffffffffffffUL
-#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK	(is_uv1_hub() ?		\
-			UV1H_INT_CMPC_REAL_TIME_CMPC_MASK :	\
-			UV2H_INT_CMPC_REAL_TIME_CMPC_MASK)
+#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT		0
+#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK		0xffffffffffffffUL
 
 union uvh_int_cmpc_u {
-    unsigned long	v;
-    struct uvh_int_cmpc_s {
-	unsigned long	real_time_cmpc : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_int_cmpc_s {
+		unsigned long	real_time_cmpc:56;		/* RW */
+		unsigned long	rsvd_56_63:8;
+	} s;
 };
 
 /* ========================================================================= */
 /*                               UVH_INT_CMPD                                */
 /* ========================================================================= */
-#define UVH_INT_CMPD 0x22180UL
+#define UVH_INT_CMPD					0x22180UL
 
-#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT	0
-#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT	0
-#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT	(is_uv1_hub() ?		\
-			UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT :	\
-			UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT)
-#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK	0xffffffffffffffUL
-#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK	0xffffffffffffffUL
-#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK	(is_uv1_hub() ?		\
-			UV1H_INT_CMPD_REAL_TIME_CMPD_MASK :	\
-			UV2H_INT_CMPD_REAL_TIME_CMPD_MASK)
+#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT		0
+#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK		0xffffffffffffffUL
 
 union uvh_int_cmpd_u {
-    unsigned long	v;
-    struct uvh_int_cmpd_s {
-	unsigned long	real_time_cmpd : 56;  /* RW */
-	unsigned long	rsvd_56_63     :  8;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_int_cmpd_s {
+		unsigned long	real_time_cmpd:56;		/* RW */
+		unsigned long	rsvd_56_63:8;
+	} s;
 };
 
 /* ========================================================================= */
 /*                               UVH_IPI_INT                                 */
 /* ========================================================================= */
-#define UVH_IPI_INT 0x60500UL
-#define UVH_IPI_INT_32 0x348
+#define UVH_IPI_INT					0x60500UL
+#define UVH_IPI_INT_32					0x348
 
-#define UVH_IPI_INT_VECTOR_SHFT 0
-#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
-#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
-#define UVH_IPI_INT_DESTMODE_SHFT 11
-#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_IPI_INT_APIC_ID_SHFT 16
-#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
-#define UVH_IPI_INT_SEND_SHFT 63
-#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
+#define UVH_IPI_INT_VECTOR_SHFT				0
+#define UVH_IPI_INT_DELIVERY_MODE_SHFT			8
+#define UVH_IPI_INT_DESTMODE_SHFT			11
+#define UVH_IPI_INT_APIC_ID_SHFT			16
+#define UVH_IPI_INT_SEND_SHFT				63
+#define UVH_IPI_INT_VECTOR_MASK				0x00000000000000ffUL
+#define UVH_IPI_INT_DELIVERY_MODE_MASK			0x0000000000000700UL
+#define UVH_IPI_INT_DESTMODE_MASK			0x0000000000000800UL
+#define UVH_IPI_INT_APIC_ID_MASK			0x0000ffffffff0000UL
+#define UVH_IPI_INT_SEND_MASK				0x8000000000000000UL
 
 union uvh_ipi_int_u {
-    unsigned long	v;
-    struct uvh_ipi_int_s {
-	unsigned long	vector_       :  8;  /* RW */
-	unsigned long	delivery_mode :  3;  /* RW */
-	unsigned long	destmode      :  1;  /* RW */
-	unsigned long	rsvd_12_15    :  4;  /*    */
-	unsigned long	apic_id       : 32;  /* RW */
-	unsigned long	rsvd_48_62    : 15;  /*    */
-	unsigned long	send          :  1;  /* WP */
-    } s;
+	unsigned long	v;
+	struct uvh_ipi_int_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	delivery_mode:3;		/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	rsvd_12_15:4;
+		unsigned long	apic_id:32;			/* RW */
+		unsigned long	rsvd_48_62:15;
+		unsigned long	send:1;				/* WP */
+	} s;
 };
 
 /* ========================================================================= */
 /*                   UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST                     */
 /* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST		0x320050UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32		0x9c0
 
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
 
 union uvh_lb_bau_intd_payload_queue_first_u {
-    unsigned long	v;
-    struct uvh_lb_bau_intd_payload_queue_first_s {
-	unsigned long	rsvd_0_3:  4;  /*    */
-	unsigned long	address : 39;  /* RW */
-	unsigned long	rsvd_43_48:  6;  /*    */
-	unsigned long	node_id : 14;  /* RW */
-	unsigned long	rsvd_63 :  1;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_intd_payload_queue_first_s {
+		unsigned long	rsvd_0_3:4;
+		unsigned long	address:39;			/* RW */
+		unsigned long	rsvd_43_48:6;
+		unsigned long	node_id:14;			/* RW */
+		unsigned long	rsvd_63:1;
+	} s;
 };
 
 /* ========================================================================= */
 /*                    UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST                     */
 /* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST		0x320060UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32		0x9c8
 
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT	4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK	0x000007fffffffff0UL
 
 union uvh_lb_bau_intd_payload_queue_last_u {
-    unsigned long	v;
-    struct uvh_lb_bau_intd_payload_queue_last_s {
-	unsigned long	rsvd_0_3:  4;  /*    */
-	unsigned long	address : 39;  /* RW */
-	unsigned long	rsvd_43_63: 21;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_intd_payload_queue_last_s {
+		unsigned long	rsvd_0_3:4;
+		unsigned long	address:39;			/* RW */
+		unsigned long	rsvd_43_63:21;
+	} s;
 };
 
 /* ========================================================================= */
 /*                    UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL                     */
 /* ========================================================================= */
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL		0x320070UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32		0x9d0
 
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
-#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT	4
+#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK	0x000007fffffffff0UL
 
 union uvh_lb_bau_intd_payload_queue_tail_u {
-    unsigned long	v;
-    struct uvh_lb_bau_intd_payload_queue_tail_s {
-	unsigned long	rsvd_0_3:  4;  /*    */
-	unsigned long	address : 39;  /* RW */
-	unsigned long	rsvd_43_63: 21;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_intd_payload_queue_tail_s {
+		unsigned long	rsvd_0_3:4;
+		unsigned long	address:39;			/* RW */
+		unsigned long	rsvd_43_63:21;
+	} s;
 };
 
 /* ========================================================================= */
 /*                   UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE                    */
 /* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE		0x320080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32		0xa68
 
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
 
 union uvh_lb_bau_intd_software_acknowledge_u {
-    unsigned long	v;
-    struct uvh_lb_bau_intd_software_acknowledge_s {
-	unsigned long	pending_0 :  1;  /* RW, W1C */
-	unsigned long	pending_1 :  1;  /* RW, W1C */
-	unsigned long	pending_2 :  1;  /* RW, W1C */
-	unsigned long	pending_3 :  1;  /* RW, W1C */
-	unsigned long	pending_4 :  1;  /* RW, W1C */
-	unsigned long	pending_5 :  1;  /* RW, W1C */
-	unsigned long	pending_6 :  1;  /* RW, W1C */
-	unsigned long	pending_7 :  1;  /* RW, W1C */
-	unsigned long	timeout_0 :  1;  /* RW, W1C */
-	unsigned long	timeout_1 :  1;  /* RW, W1C */
-	unsigned long	timeout_2 :  1;  /* RW, W1C */
-	unsigned long	timeout_3 :  1;  /* RW, W1C */
-	unsigned long	timeout_4 :  1;  /* RW, W1C */
-	unsigned long	timeout_5 :  1;  /* RW, W1C */
-	unsigned long	timeout_6 :  1;  /* RW, W1C */
-	unsigned long	timeout_7 :  1;  /* RW, W1C */
-	unsigned long	rsvd_16_63: 48;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_intd_software_acknowledge_s {
+		unsigned long	pending_0:1;			/* RW, W1C */
+		unsigned long	pending_1:1;			/* RW, W1C */
+		unsigned long	pending_2:1;			/* RW, W1C */
+		unsigned long	pending_3:1;			/* RW, W1C */
+		unsigned long	pending_4:1;			/* RW, W1C */
+		unsigned long	pending_5:1;			/* RW, W1C */
+		unsigned long	pending_6:1;			/* RW, W1C */
+		unsigned long	pending_7:1;			/* RW, W1C */
+		unsigned long	timeout_0:1;			/* RW, W1C */
+		unsigned long	timeout_1:1;			/* RW, W1C */
+		unsigned long	timeout_2:1;			/* RW, W1C */
+		unsigned long	timeout_3:1;			/* RW, W1C */
+		unsigned long	timeout_4:1;			/* RW, W1C */
+		unsigned long	timeout_5:1;			/* RW, W1C */
+		unsigned long	timeout_6:1;			/* RW, W1C */
+		unsigned long	timeout_7:1;			/* RW, W1C */
+		unsigned long	rsvd_16_63:48;
+	} s;
 };
 
 /* ========================================================================= */
 /*                UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS                 */
 /* ========================================================================= */
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
-#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS	0x0000000000320088UL
+#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32	0xa70
 
 /* ========================================================================= */
 /*                         UVH_LB_BAU_MISC_CONTROL                           */
 /* ========================================================================= */
-#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
-#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
-
-#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
-#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
-#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
-#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
-#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL				0x320170UL
+#define UVH_LB_BAU_MISC_CONTROL_32			0xa10
+
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT	0
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT		8
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT	9
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT	10
 #define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
-#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
 #define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
-#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
-#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
 #define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
-#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
-#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
-#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
 #define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
-#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
 #define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
-#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
 #define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
-#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
 #define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
-#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK	0x00000000000000ffUL
+#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK		0x0000000000000100UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK	0x0000000000000200UL
+#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK	0x0000000000000400UL
+#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
 #define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
 
-#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
-#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
-#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
-#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
-#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
-#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
-#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
-#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT	0
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT		8
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT	9
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT	10
 #define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
-#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
 #define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
-#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
 #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
-#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
 #define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
-#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
 #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
-#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
 #define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
-#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
 #define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
-#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
 #define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
-#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
 #define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
-#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
 #define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
-#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
 #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT		48
+#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK	0x00000000000000ffUL
+#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK		0x0000000000000100UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK	0x0000000000000200UL
+#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK	0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
 #define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
-#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
-#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
-
-#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
-#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
-#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
-#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
-#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
-#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
-#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
-#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
+#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK		0xffff000000000000UL
+
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT	0
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT		8
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT	9
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT	10
 #define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
-#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
 #define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
-#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
 #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
-#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
 #define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
-#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
 #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
-#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
 #define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
-#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
 #define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
-#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
 #define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
-#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
 #define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
-#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
-#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
-#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
-#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
-#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
-#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT	30
 #define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
-#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
-#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
-#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
-#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT		48
+#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK	0x00000000000000ffUL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK		0x0000000000000100UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK	0x0000000000000200UL
+#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK	0x0000000000000400UL
+#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
+#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
+#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
+#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
+#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK	0x0000000040000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
 #define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
-#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
-#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
+#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK		0xffff000000000000UL
 
 union uvh_lb_bau_misc_control_u {
-    unsigned long	v;
-    struct uvh_lb_bau_misc_control_s {
-	unsigned long	rejection_delay                    :  8;  /* RW */
-	unsigned long	apic_mode                          :  1;  /* RW */
-	unsigned long	force_broadcast                    :  1;  /* RW */
-	unsigned long	force_lock_nop                     :  1;  /* RW */
-	unsigned long	qpi_agent_presence_vector          :  3;  /* RW */
-	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
-	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
-	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
-	unsigned long	enable_dual_mapping_mode           :  1;  /* RW */
-	unsigned long	vga_io_port_decode_enable          :  1;  /* RW */
-	unsigned long	vga_io_port_16_bit_decode          :  1;  /* RW */
-	unsigned long	suppress_dest_registration         :  1;  /* RW */
-	unsigned long	programmed_initial_priority        :  3;  /* RW */
-	unsigned long	use_incoming_priority              :  1;  /* RW */
-	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
-	unsigned long	rsvd_29_63    : 35;
-    } s;
-    struct uv1h_lb_bau_misc_control_s {
-	unsigned long	rejection_delay                    :  8;  /* RW */
-	unsigned long	apic_mode                          :  1;  /* RW */
-	unsigned long	force_broadcast                    :  1;  /* RW */
-	unsigned long	force_lock_nop                     :  1;  /* RW */
-	unsigned long	qpi_agent_presence_vector          :  3;  /* RW */
-	unsigned long	descriptor_fetch_mode              :  1;  /* RW */
-	unsigned long	enable_intd_soft_ack_mode          :  1;  /* RW */
-	unsigned long	intd_soft_ack_timeout_period       :  4;  /* RW */
-	unsigned long	enable_dual_mapping_mode           :  1;  /* RW */
-	unsigned long	vga_io_port_decode_enable          :  1;  /* RW */
-	unsigned long	vga_io_port_16_bit_decode          :  1;  /* RW */
-	unsigned long	suppress_dest_registration         :  1;  /* RW */
-	unsigned long	programmed_initial_priority        :  3;  /* RW */
-	unsigned long	use_incoming_priority              :  1;  /* RW */
-	unsigned long	enable_programmed_initial_priority :  1;  /* RW */
-	unsigned long	rsvd_29_47                         : 19;  /*    */
-	unsigned long	fun                                : 16;  /* RW */
-    } s1;
-    struct uv2h_lb_bau_misc_control_s {
-	unsigned long	rejection_delay                      :  8;  /* RW */
-	unsigned long	apic_mode                            :  1;  /* RW */
-	unsigned long	force_broadcast                      :  1;  /* RW */
-	unsigned long	force_lock_nop                       :  1;  /* RW */
-	unsigned long	qpi_agent_presence_vector            :  3;  /* RW */
-	unsigned long	descriptor_fetch_mode                :  1;  /* RW */
-	unsigned long	enable_intd_soft_ack_mode            :  1;  /* RW */
-	unsigned long	intd_soft_ack_timeout_period         :  4;  /* RW */
-	unsigned long	enable_dual_mapping_mode             :  1;  /* RW */
-	unsigned long	vga_io_port_decode_enable            :  1;  /* RW */
-	unsigned long	vga_io_port_16_bit_decode            :  1;  /* RW */
-	unsigned long	suppress_dest_registration           :  1;  /* RW */
-	unsigned long	programmed_initial_priority          :  3;  /* RW */
-	unsigned long	use_incoming_priority                :  1;  /* RW */
-	unsigned long	enable_programmed_initial_priority   :  1;  /* RW */
-	unsigned long	enable_automatic_apic_mode_selection :  1;  /* RW */
-	unsigned long	apic_mode_status                     :  1;  /* RO */
-	unsigned long	suppress_interrupts_to_self          :  1;  /* RW */
-	unsigned long	enable_lock_based_system_flush       :  1;  /* RW */
-	unsigned long	enable_extended_sb_status            :  1;  /* RW */
-	unsigned long	suppress_int_prio_udt_to_self        :  1;  /* RW */
-	unsigned long	use_legacy_descriptor_formats        :  1;  /* RW */
-	unsigned long	rsvd_36_47                           : 12;  /*    */
-	unsigned long	fun                                  : 16;  /* RW */
-    } s2;
+	unsigned long	v;
+	struct uvh_lb_bau_misc_control_s {
+		unsigned long	rejection_delay:8;		/* RW */
+		unsigned long	apic_mode:1;			/* RW */
+		unsigned long	force_broadcast:1;		/* RW */
+		unsigned long	force_lock_nop:1;		/* RW */
+		unsigned long	qpi_agent_presence_vector:3;	/* RW */
+		unsigned long	descriptor_fetch_mode:1;	/* RW */
+		unsigned long	enable_intd_soft_ack_mode:1;	/* RW */
+		unsigned long	intd_soft_ack_timeout_period:4;	/* RW */
+		unsigned long	enable_dual_mapping_mode:1;	/* RW */
+		unsigned long	vga_io_port_decode_enable:1;	/* RW */
+		unsigned long	vga_io_port_16_bit_decode:1;	/* RW */
+		unsigned long	suppress_dest_registration:1;	/* RW */
+		unsigned long	programmed_initial_priority:3;	/* RW */
+		unsigned long	use_incoming_priority:1;	/* RW */
+		unsigned long	enable_programmed_initial_priority:1;/* RW */
+		unsigned long	rsvd_29_63:35;
+	} s;
+	struct uv1h_lb_bau_misc_control_s {
+		unsigned long	rejection_delay:8;		/* RW */
+		unsigned long	apic_mode:1;			/* RW */
+		unsigned long	force_broadcast:1;		/* RW */
+		unsigned long	force_lock_nop:1;		/* RW */
+		unsigned long	qpi_agent_presence_vector:3;	/* RW */
+		unsigned long	descriptor_fetch_mode:1;	/* RW */
+		unsigned long	enable_intd_soft_ack_mode:1;	/* RW */
+		unsigned long	intd_soft_ack_timeout_period:4;	/* RW */
+		unsigned long	enable_dual_mapping_mode:1;	/* RW */
+		unsigned long	vga_io_port_decode_enable:1;	/* RW */
+		unsigned long	vga_io_port_16_bit_decode:1;	/* RW */
+		unsigned long	suppress_dest_registration:1;	/* RW */
+		unsigned long	programmed_initial_priority:3;	/* RW */
+		unsigned long	use_incoming_priority:1;	/* RW */
+		unsigned long	enable_programmed_initial_priority:1;/* RW */
+		unsigned long	rsvd_29_47:19;
+		unsigned long	fun:16;				/* RW */
+	} s1;
+	struct uv2h_lb_bau_misc_control_s {
+		unsigned long	rejection_delay:8;		/* RW */
+		unsigned long	apic_mode:1;			/* RW */
+		unsigned long	force_broadcast:1;		/* RW */
+		unsigned long	force_lock_nop:1;		/* RW */
+		unsigned long	qpi_agent_presence_vector:3;	/* RW */
+		unsigned long	descriptor_fetch_mode:1;	/* RW */
+		unsigned long	enable_intd_soft_ack_mode:1;	/* RW */
+		unsigned long	intd_soft_ack_timeout_period:4;	/* RW */
+		unsigned long	enable_dual_mapping_mode:1;	/* RW */
+		unsigned long	vga_io_port_decode_enable:1;	/* RW */
+		unsigned long	vga_io_port_16_bit_decode:1;	/* RW */
+		unsigned long	suppress_dest_registration:1;	/* RW */
+		unsigned long	programmed_initial_priority:3;	/* RW */
+		unsigned long	use_incoming_priority:1;	/* RW */
+		unsigned long	enable_programmed_initial_priority:1;/* RW */
+		unsigned long	enable_automatic_apic_mode_selection:1;/* RW */
+		unsigned long	apic_mode_status:1;		/* RO */
+		unsigned long	suppress_interrupts_to_self:1;	/* RW */
+		unsigned long	enable_lock_based_system_flush:1;/* RW */
+		unsigned long	enable_extended_sb_status:1;	/* RW */
+		unsigned long	suppress_int_prio_udt_to_self:1;/* RW */
+		unsigned long	use_legacy_descriptor_formats:1;/* RW */
+		unsigned long	rsvd_36_47:12;
+		unsigned long	fun:16;				/* RW */
+	} s2;
 };
 
 /* ========================================================================= */
 /*                     UVH_LB_BAU_SB_ACTIVATION_CONTROL                      */
 /* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL		0x320020UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32		0x9a8
 
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
-#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT	0
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT	62
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT	63
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK	0x000000000000003fUL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK	0x4000000000000000UL
+#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK	0x8000000000000000UL
 
 union uvh_lb_bau_sb_activation_control_u {
-    unsigned long	v;
-    struct uvh_lb_bau_sb_activation_control_s {
-	unsigned long	index :  6;  /* RW */
-	unsigned long	rsvd_6_61: 56;  /*    */
-	unsigned long	push  :  1;  /* WP */
-	unsigned long	init  :  1;  /* WP */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_sb_activation_control_s {
+		unsigned long	index:6;			/* RW */
+		unsigned long	rsvd_6_61:56;
+		unsigned long	push:1;				/* WP */
+		unsigned long	init:1;				/* WP */
+	} s;
 };
 
 /* ========================================================================= */
 /*                    UVH_LB_BAU_SB_ACTIVATION_STATUS_0                      */
 /* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0		0x320030UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32		0x9b0
 
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT	0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK	0xffffffffffffffffUL
 
 union uvh_lb_bau_sb_activation_status_0_u {
-    unsigned long	v;
-    struct uvh_lb_bau_sb_activation_status_0_s {
-	unsigned long	status : 64;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_sb_activation_status_0_s {
+		unsigned long	status:64;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                    UVH_LB_BAU_SB_ACTIVATION_STATUS_1                      */
 /* ========================================================================= */
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1		0x320040UL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32		0x9b8
 
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
-#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT	0
+#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK	0xffffffffffffffffUL
 
 union uvh_lb_bau_sb_activation_status_1_u {
-    unsigned long	v;
-    struct uvh_lb_bau_sb_activation_status_1_s {
-	unsigned long	status : 64;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_sb_activation_status_1_s {
+		unsigned long	status:64;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                      UVH_LB_BAU_SB_DESCRIPTOR_BASE                        */
 /* ========================================================================= */
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE			0x320010UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32		0x9a0
 
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
-#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT	12
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT	49
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK	0x000007fffffff000UL
+#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK	0x7ffe000000000000UL
 
 union uvh_lb_bau_sb_descriptor_base_u {
-    unsigned long	v;
-    struct uvh_lb_bau_sb_descriptor_base_s {
-	unsigned long	rsvd_0_11    : 12;  /*    */
-	unsigned long	page_address : 31;  /* RW */
-	unsigned long	rsvd_43_48   :  6;  /*    */
-	unsigned long	node_id      : 14;  /* RW */
-	unsigned long	rsvd_63      :  1;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_lb_bau_sb_descriptor_base_s {
+		unsigned long	rsvd_0_11:12;
+		unsigned long	page_address:31;		/* RW */
+		unsigned long	rsvd_43_48:6;
+		unsigned long	node_id:14;			/* RW */
+		unsigned long	rsvd_63:1;
+	} s;
 };
 
 /* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
-#define UVH_NODE_ID 0x0UL
-
-#define UVH_NODE_ID_FORCE1_SHFT 0
-#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UVH_NODE_ID_MANUFACTURER_SHFT 1
-#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UVH_NODE_ID_PART_NUMBER_SHFT 12
-#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UVH_NODE_ID_REVISION_SHFT 28
-#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UVH_NODE_ID_NODE_ID_SHFT 32
-#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-
-#define UV1H_NODE_ID_FORCE1_SHFT 0
-#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
-#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
-#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UV1H_NODE_ID_REVISION_SHFT 28
-#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UV1H_NODE_ID_NODE_ID_SHFT 32
-#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
-#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
-#define UV1H_NODE_ID_NI_PORT_SHFT 56
-#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
-
-#define UV2H_NODE_ID_FORCE1_SHFT 0
-#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
-#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
-#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
-#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
-#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
-#define UV2H_NODE_ID_REVISION_SHFT 28
-#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
-#define UV2H_NODE_ID_NODE_ID_SHFT 32
-#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
-#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
-#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
-#define UV2H_NODE_ID_NI_PORT_SHFT 57
-#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
+#define UVH_NODE_ID					0x0UL
+
+#define UVH_NODE_ID_FORCE1_SHFT				0
+#define UVH_NODE_ID_MANUFACTURER_SHFT			1
+#define UVH_NODE_ID_PART_NUMBER_SHFT			12
+#define UVH_NODE_ID_REVISION_SHFT			28
+#define UVH_NODE_ID_NODE_ID_SHFT			32
+#define UVH_NODE_ID_FORCE1_MASK				0x0000000000000001UL
+#define UVH_NODE_ID_MANUFACTURER_MASK			0x0000000000000ffeUL
+#define UVH_NODE_ID_PART_NUMBER_MASK			0x000000000ffff000UL
+#define UVH_NODE_ID_REVISION_MASK			0x00000000f0000000UL
+#define UVH_NODE_ID_NODE_ID_MASK			0x00007fff00000000UL
+
+#define UV1H_NODE_ID_FORCE1_SHFT			0
+#define UV1H_NODE_ID_MANUFACTURER_SHFT			1
+#define UV1H_NODE_ID_PART_NUMBER_SHFT			12
+#define UV1H_NODE_ID_REVISION_SHFT			28
+#define UV1H_NODE_ID_NODE_ID_SHFT			32
+#define UV1H_NODE_ID_NODES_PER_BIT_SHFT			48
+#define UV1H_NODE_ID_NI_PORT_SHFT			56
+#define UV1H_NODE_ID_FORCE1_MASK			0x0000000000000001UL
+#define UV1H_NODE_ID_MANUFACTURER_MASK			0x0000000000000ffeUL
+#define UV1H_NODE_ID_PART_NUMBER_MASK			0x000000000ffff000UL
+#define UV1H_NODE_ID_REVISION_MASK			0x00000000f0000000UL
+#define UV1H_NODE_ID_NODE_ID_MASK			0x00007fff00000000UL
+#define UV1H_NODE_ID_NODES_PER_BIT_MASK			0x007f000000000000UL
+#define UV1H_NODE_ID_NI_PORT_MASK			0x0f00000000000000UL
+
+#define UV2H_NODE_ID_FORCE1_SHFT			0
+#define UV2H_NODE_ID_MANUFACTURER_SHFT			1
+#define UV2H_NODE_ID_PART_NUMBER_SHFT			12
+#define UV2H_NODE_ID_REVISION_SHFT			28
+#define UV2H_NODE_ID_NODE_ID_SHFT			32
+#define UV2H_NODE_ID_NODES_PER_BIT_SHFT			50
+#define UV2H_NODE_ID_NI_PORT_SHFT			57
+#define UV2H_NODE_ID_FORCE1_MASK			0x0000000000000001UL
+#define UV2H_NODE_ID_MANUFACTURER_MASK			0x0000000000000ffeUL
+#define UV2H_NODE_ID_PART_NUMBER_MASK			0x000000000ffff000UL
+#define UV2H_NODE_ID_REVISION_MASK			0x00000000f0000000UL
+#define UV2H_NODE_ID_NODE_ID_MASK			0x00007fff00000000UL
+#define UV2H_NODE_ID_NODES_PER_BIT_MASK			0x01fc000000000000UL
+#define UV2H_NODE_ID_NI_PORT_MASK			0x3e00000000000000UL
 
 union uvh_node_id_u {
-    unsigned long	v;
-    struct uvh_node_id_s {
-	unsigned long	force1        :  1;  /* RO */
-	unsigned long	manufacturer  : 11;  /* RO */
-	unsigned long	part_number   : 16;  /* RO */
-	unsigned long	revision      :  4;  /* RO */
-	unsigned long	node_id       : 15;  /* RW */
-	unsigned long	rsvd_47_63    : 17;
-    } s;
-    struct uv1h_node_id_s {
-	unsigned long	force1        :  1;  /* RO */
-	unsigned long	manufacturer  : 11;  /* RO */
-	unsigned long	part_number   : 16;  /* RO */
-	unsigned long	revision      :  4;  /* RO */
-	unsigned long	node_id       : 15;  /* RW */
-	unsigned long	rsvd_47       :  1;  /*    */
-	unsigned long	nodes_per_bit :  7;  /* RW */
-	unsigned long	rsvd_55       :  1;  /*    */
-	unsigned long	ni_port       :  4;  /* RO */
-	unsigned long	rsvd_60_63    :  4;  /*    */
-    } s1;
-    struct uv2h_node_id_s {
-	unsigned long	force1        :  1;  /* RO */
-	unsigned long	manufacturer  : 11;  /* RO */
-	unsigned long	part_number   : 16;  /* RO */
-	unsigned long	revision      :  4;  /* RO */
-	unsigned long	node_id       : 15;  /* RW */
-	unsigned long	rsvd_47_49    :  3;  /*    */
-	unsigned long	nodes_per_bit :  7;  /* RO */
-	unsigned long	ni_port       :  5;  /* RO */
-	unsigned long	rsvd_62_63    :  2;  /*    */
-    } s2;
+	unsigned long	v;
+	struct uvh_node_id_s {
+		unsigned long	force1:1;			/* RO */
+		unsigned long	manufacturer:11;		/* RO */
+		unsigned long	part_number:16;			/* RO */
+		unsigned long	revision:4;			/* RO */
+		unsigned long	node_id:15;			/* RW */
+		unsigned long	rsvd_47_63:17;
+	} s;
+	struct uv1h_node_id_s {
+		unsigned long	force1:1;			/* RO */
+		unsigned long	manufacturer:11;		/* RO */
+		unsigned long	part_number:16;			/* RO */
+		unsigned long	revision:4;			/* RO */
+		unsigned long	node_id:15;			/* RW */
+		unsigned long	rsvd_47:1;
+		unsigned long	nodes_per_bit:7;		/* RW */
+		unsigned long	rsvd_55:1;
+		unsigned long	ni_port:4;			/* RO */
+		unsigned long	rsvd_60_63:4;
+	} s1;
+	struct uv2h_node_id_s {
+		unsigned long	force1:1;			/* RO */
+		unsigned long	manufacturer:11;		/* RO */
+		unsigned long	part_number:16;			/* RO */
+		unsigned long	revision:4;			/* RO */
+		unsigned long	node_id:15;			/* RW */
+		unsigned long	rsvd_47_49:3;
+		unsigned long	nodes_per_bit:7;		/* RO */
+		unsigned long	ni_port:5;			/* RO */
+		unsigned long	rsvd_62_63:2;
+	} s2;
 };
 
 /* ========================================================================= */
 /*                          UVH_NODE_PRESENT_TABLE                           */
 /* ========================================================================= */
-#define UVH_NODE_PRESENT_TABLE 0x1400UL
-#define UVH_NODE_PRESENT_TABLE_DEPTH 16
+#define UVH_NODE_PRESENT_TABLE				0x1400UL
+#define UVH_NODE_PRESENT_TABLE_DEPTH			16
 
-#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
-#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
+#define UVH_NODE_PRESENT_TABLE_NODES_SHFT		0
+#define UVH_NODE_PRESENT_TABLE_NODES_MASK		0xffffffffffffffffUL
 
 union uvh_node_present_table_u {
-    unsigned long	v;
-    struct uvh_node_present_table_s {
-	unsigned long	nodes : 64;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_node_present_table_s {
+		unsigned long	nodes:64;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                 UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR                  */
 /* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR	0x16000c8UL
 
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+		unsigned long	rsvd_0_23:24;
+		unsigned long	base:8;				/* RW */
+		unsigned long	rsvd_32_47:16;
+		unsigned long	m_alias:5;			/* RW */
+		unsigned long	rsvd_53_62:10;
+		unsigned long	enable:1;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                 UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR                  */
 /* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR	0x16000d8UL
 
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+		unsigned long	rsvd_0_23:24;
+		unsigned long	base:8;				/* RW */
+		unsigned long	rsvd_32_47:16;
+		unsigned long	m_alias:5;			/* RW */
+		unsigned long	rsvd_53_62:10;
+		unsigned long	enable:1;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                 UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR                  */
 /* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR	0x16000e8UL
 
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
-#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
 #define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
-	unsigned long	rsvd_0_23: 24;  /*    */
-	unsigned long	base    :  8;  /* RW */
-	unsigned long	rsvd_32_47: 16;  /*    */
-	unsigned long	m_alias :  5;  /* RW */
-	unsigned long	rsvd_53_62: 10;  /*    */
-	unsigned long	enable  :  1;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+		unsigned long	rsvd_0_23:24;
+		unsigned long	base:8;				/* RW */
+		unsigned long	rsvd_32_47:16;
+		unsigned long	m_alias:5;			/* RW */
+		unsigned long	rsvd_53_62:10;
+		unsigned long	enable:1;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR                  */
 /* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR	0x16000d0UL
 
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
 
 union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
+		unsigned long	rsvd_0_23:24;
+		unsigned long	dest_base:22;			/* RW */
+		unsigned long	rsvd_46_63:18;
+	} s;
 };
 
 /* ========================================================================= */
 /*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR                  */
 /* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR	0x16000e0UL
 
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
 
 union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
+		unsigned long	rsvd_0_23:24;
+		unsigned long	dest_base:22;			/* RW */
+		unsigned long	rsvd_46_63:18;
+	} s;
 };
 
 /* ========================================================================= */
 /*                UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR                  */
 /* ========================================================================= */
-#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
+#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR	0x16000f0UL
 
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
 
 union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
-	unsigned long	rsvd_0_23 : 24;  /*    */
-	unsigned long	dest_base : 22;  /* RW */
-	unsigned long	rsvd_46_63: 18;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
+		unsigned long	rsvd_0_23:24;
+		unsigned long	dest_base:22;			/* RW */
+		unsigned long	rsvd_46_63:18;
+	} s;
 };
 
 /* ========================================================================= */
 /*                          UVH_RH_GAM_CONFIG_MMR                            */
 /* ========================================================================= */
-#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+#define UVH_RH_GAM_CONFIG_MMR				0x1600000UL
 
-#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
-#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
-#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
-#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT		0
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT		6
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK		0x000000000000003fUL
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK		0x00000000000003c0UL
 
-#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
-#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
-#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
-#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
-#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
-#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT		0
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT		6
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT		12
+#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK		0x000000000000003fUL
+#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK		0x00000000000003c0UL
+#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK		0x0000000000001000UL
 
-#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
-#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
-#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
-#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT		0
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT		6
+#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK		0x000000000000003fUL
+#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK		0x00000000000003c0UL
 
 union uvh_rh_gam_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_config_mmr_s {
-	unsigned long	m_skt     :  6;  /* RW */
-	unsigned long	n_skt     :  4;  /* RW */
-	unsigned long	rsvd_10_63    : 54;
-    } s;
-    struct uv1h_rh_gam_config_mmr_s {
-	unsigned long	m_skt     :  6;  /* RW */
-	unsigned long	n_skt     :  4;  /* RW */
-	unsigned long	rsvd_10_11:  2;  /*    */
-	unsigned long	mmiol_cfg :  1;  /* RW */
-	unsigned long	rsvd_13_63: 51;  /*    */
-    } s1;
-    struct uv2h_rh_gam_config_mmr_s {
-	unsigned long	m_skt :  6;  /* RW */
-	unsigned long	n_skt :  4;  /* RW */
-	unsigned long	rsvd_10_63: 54;  /*    */
-    } s2;
+	unsigned long	v;
+	struct uvh_rh_gam_config_mmr_s {
+		unsigned long	m_skt:6;			/* RW */
+		unsigned long	n_skt:4;			/* RW */
+		unsigned long	rsvd_10_63:54;
+	} s;
+	struct uv1h_rh_gam_config_mmr_s {
+		unsigned long	m_skt:6;			/* RW */
+		unsigned long	n_skt:4;			/* RW */
+		unsigned long	rsvd_10_11:2;
+		unsigned long	mmiol_cfg:1;			/* RW */
+		unsigned long	rsvd_13_63:51;
+	} s1;
+	struct uv2h_rh_gam_config_mmr_s {
+		unsigned long	m_skt:6;			/* RW */
+		unsigned long	n_skt:4;			/* RW */
+		unsigned long	rsvd_10_63:54;
+	} s2;
 };
 
 /* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR		0x1600010UL
 
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT	28
+#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffff0000000UL
 
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT	28
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT	48
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT	52
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT	63
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffff0000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK	0x0001000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK	0x00f0000000000000UL
+#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK	0x8000000000000000UL
 
-#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
-#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
-#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
-#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
-#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT	28
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT	52
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT	63
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffff0000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK	0x00f0000000000000UL
+#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK	0x8000000000000000UL
 
 union uvh_rh_gam_gru_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_gru_overlay_config_mmr_s {
-	unsigned long	rsvd_0_27: 28;  /*    */
-	unsigned long	base   : 18;  /* RW */
-	unsigned long	rsvd_46_62    : 17;
-	unsigned long	enable :  1;  /* RW */
-    } s;
-    struct uv1h_rh_gam_gru_overlay_config_mmr_s {
-	unsigned long	rsvd_0_27: 28;  /*    */
-	unsigned long	base   : 18;  /* RW */
-	unsigned long	rsvd_46_47:  2;  /*    */
-	unsigned long	gr4    :  1;  /* RW */
-	unsigned long	rsvd_49_51:  3;  /*    */
-	unsigned long	n_gru  :  4;  /* RW */
-	unsigned long	rsvd_56_62:  7;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s1;
-    struct uv2h_rh_gam_gru_overlay_config_mmr_s {
-	unsigned long	rsvd_0_27: 28;  /*    */
-	unsigned long	base   : 18;  /* RW */
-	unsigned long	rsvd_46_51:  6;  /*    */
-	unsigned long	n_gru  :  4;  /* RW */
-	unsigned long	rsvd_56_62:  7;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s2;
+	unsigned long	v;
+	struct uvh_rh_gam_gru_overlay_config_mmr_s {
+		unsigned long	rsvd_0_27:28;
+		unsigned long	base:18;			/* RW */
+		unsigned long	rsvd_46_62:17;
+		unsigned long	enable:1;			/* RW */
+	} s;
+	struct uv1h_rh_gam_gru_overlay_config_mmr_s {
+		unsigned long	rsvd_0_27:28;
+		unsigned long	base:18;			/* RW */
+		unsigned long	rsvd_46_47:2;
+		unsigned long	gr4:1;				/* RW */
+		unsigned long	rsvd_49_51:3;
+		unsigned long	n_gru:4;			/* RW */
+		unsigned long	rsvd_56_62:7;
+		unsigned long	enable:1;			/* RW */
+	} s1;
+	struct uv2h_rh_gam_gru_overlay_config_mmr_s {
+		unsigned long	rsvd_0_27:28;
+		unsigned long	base:18;			/* RW */
+		unsigned long	rsvd_46_51:6;
+		unsigned long	n_gru:4;			/* RW */
+		unsigned long	rsvd_56_62:7;
+		unsigned long	enable:1;			/* RW */
+	} s2;
 };
 
 /* ========================================================================= */
 /*                   UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR                     */
 /* ========================================================================= */
-#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
+#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR		0x1600030UL
 
-#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
-#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
-#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
-#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
-#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
-#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT	30
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT	46
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT	52
 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003fffc0000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK	0x000fc00000000000UL
+#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK	0x00f0000000000000UL
 #define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
 
-#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
-#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
-#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
-#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
-#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
-#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT	27
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT	46
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT	52
 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffff8000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK	0x000fc00000000000UL
+#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK	0x00f0000000000000UL
 #define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
 
 union uvh_rh_gam_mmioh_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
-	unsigned long	rsvd_0_29: 30;  /*    */
-	unsigned long	base   : 16;  /* RW */
-	unsigned long	m_io   :  6;  /* RW */
-	unsigned long	n_io   :  4;  /* RW */
-	unsigned long	rsvd_56_62:  7;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s1;
-    struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
-	unsigned long	rsvd_0_26: 27;  /*    */
-	unsigned long	base   : 19;  /* RW */
-	unsigned long	m_io   :  6;  /* RW */
-	unsigned long	n_io   :  4;  /* RW */
-	unsigned long	rsvd_56_62:  7;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s2;
+	unsigned long	v;
+	struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
+		unsigned long	rsvd_0_29:30;
+		unsigned long	base:16;			/* RW */
+		unsigned long	m_io:6;				/* RW */
+		unsigned long	n_io:4;				/* RW */
+		unsigned long	rsvd_56_62:7;
+		unsigned long	enable:1;			/* RW */
+	} s1;
+	struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
+		unsigned long	rsvd_0_26:27;
+		unsigned long	base:19;			/* RW */
+		unsigned long	m_io:6;				/* RW */
+		unsigned long	n_io:4;				/* RW */
+		unsigned long	rsvd_56_62:7;
+		unsigned long	enable:1;			/* RW */
+	} s2;
 };
 
 /* ========================================================================= */
 /*                    UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR		0x1600028UL
 
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT	26
+#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffffc000000UL
 
-#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT	26
 #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT	63
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffffc000000UL
 #define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
-#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK	0x8000000000000000UL
 
-#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
-#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
-#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
-#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT	26
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT	63
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK	0x00003ffffc000000UL
+#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK	0x8000000000000000UL
 
 union uvh_rh_gam_mmr_overlay_config_mmr_u {
-    unsigned long	v;
-    struct uvh_rh_gam_mmr_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base     : 20;  /* RW */
-	unsigned long	rsvd_46_62    : 17;
-	unsigned long	enable   :  1;  /* RW */
-    } s;
-    struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base     : 20;  /* RW */
-	unsigned long	dual_hub :  1;  /* RW */
-	unsigned long	rsvd_47_62: 16;  /*    */
-	unsigned long	enable   :  1;  /* RW */
-    } s1;
-    struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
-	unsigned long	rsvd_0_25: 26;  /*    */
-	unsigned long	base   : 20;  /* RW */
-	unsigned long	rsvd_46_62: 17;  /*    */
-	unsigned long	enable :  1;  /* RW */
-    } s2;
+	unsigned long	v;
+	struct uvh_rh_gam_mmr_overlay_config_mmr_s {
+		unsigned long	rsvd_0_25:26;
+		unsigned long	base:20;			/* RW */
+		unsigned long	rsvd_46_62:17;
+		unsigned long	enable:1;			/* RW */
+	} s;
+	struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
+		unsigned long	rsvd_0_25:26;
+		unsigned long	base:20;			/* RW */
+		unsigned long	dual_hub:1;			/* RW */
+		unsigned long	rsvd_47_62:16;
+		unsigned long	enable:1;			/* RW */
+	} s1;
+	struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
+		unsigned long	rsvd_0_25:26;
+		unsigned long	base:20;			/* RW */
+		unsigned long	rsvd_46_62:17;
+		unsigned long	enable:1;			/* RW */
+	} s2;
 };
 
 /* ========================================================================= */
 /*                                 UVH_RTC                                   */
 /* ========================================================================= */
-#define UVH_RTC 0x340000UL
+#define UVH_RTC						0x340000UL
 
-#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
-#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
+#define UVH_RTC_REAL_TIME_CLOCK_SHFT			0
+#define UVH_RTC_REAL_TIME_CLOCK_MASK			0x00ffffffffffffffUL
 
 union uvh_rtc_u {
-    unsigned long	v;
-    struct uvh_rtc_s {
-	unsigned long	real_time_clock : 56;  /* RW */
-	unsigned long	rsvd_56_63      :  8;  /*    */
-    } s;
+	unsigned long	v;
+	struct uvh_rtc_s {
+		unsigned long	real_time_clock:56;		/* RW */
+		unsigned long	rsvd_56_63:8;
+	} s;
 };
 
 /* ========================================================================= */
 /*                           UVH_RTC1_INT_CONFIG                             */
 /* ========================================================================= */
-#define UVH_RTC1_INT_CONFIG 0x615c0UL
-
-#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
-#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
-#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
-#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
-#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
-#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
-#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
-#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
-#define UVH_RTC1_INT_CONFIG_P_SHFT 13
-#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
-#define UVH_RTC1_INT_CONFIG_T_SHFT 15
-#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
-#define UVH_RTC1_INT_CONFIG_M_SHFT 16
-#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
-#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
-#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
+#define UVH_RTC1_INT_CONFIG				0x615c0UL
+
+#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT			0
+#define UVH_RTC1_INT_CONFIG_DM_SHFT			8
+#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT		11
+#define UVH_RTC1_INT_CONFIG_STATUS_SHFT			12
+#define UVH_RTC1_INT_CONFIG_P_SHFT			13
+#define UVH_RTC1_INT_CONFIG_T_SHFT			15
+#define UVH_RTC1_INT_CONFIG_M_SHFT			16
+#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT		32
+#define UVH_RTC1_INT_CONFIG_VECTOR_MASK			0x00000000000000ffUL
+#define UVH_RTC1_INT_CONFIG_DM_MASK			0x0000000000000700UL
+#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK		0x0000000000000800UL
+#define UVH_RTC1_INT_CONFIG_STATUS_MASK			0x0000000000001000UL
+#define UVH_RTC1_INT_CONFIG_P_MASK			0x0000000000002000UL
+#define UVH_RTC1_INT_CONFIG_T_MASK			0x0000000000008000UL
+#define UVH_RTC1_INT_CONFIG_M_MASK			0x0000000000010000UL
+#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK		0xffffffff00000000UL
 
 union uvh_rtc1_int_config_u {
-    unsigned long	v;
-    struct uvh_rtc1_int_config_s {
-	unsigned long	vector_  :  8;  /* RW */
-	unsigned long	dm       :  3;  /* RW */
-	unsigned long	destmode :  1;  /* RW */
-	unsigned long	status   :  1;  /* RO */
-	unsigned long	p        :  1;  /* RO */
-	unsigned long	rsvd_14  :  1;  /*    */
-	unsigned long	t        :  1;  /* RO */
-	unsigned long	m        :  1;  /* RW */
-	unsigned long	rsvd_17_31: 15;  /*    */
-	unsigned long	apic_id  : 32;  /* RW */
-    } s;
+	unsigned long	v;
+	struct uvh_rtc1_int_config_s {
+		unsigned long	vector_:8;			/* RW */
+		unsigned long	dm:3;				/* RW */
+		unsigned long	destmode:1;			/* RW */
+		unsigned long	status:1;			/* RO */
+		unsigned long	p:1;				/* RO */
+		unsigned long	rsvd_14:1;
+		unsigned long	t:1;				/* RO */
+		unsigned long	m:1;				/* RW */
+		unsigned long	rsvd_17_31:15;
+		unsigned long	apic_id:32;			/* RW */
+	} s;
 };
 
 /* ========================================================================= */
 /*                               UVH_SCRATCH5                                */
 /* ========================================================================= */
-#define UVH_SCRATCH5 0x2d0200UL
-#define UVH_SCRATCH5_32 0x778
+#define UVH_SCRATCH5					0x2d0200UL
+#define UVH_SCRATCH5_32					0x778
 
-#define UVH_SCRATCH5_SCRATCH5_SHFT 0
-#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
+#define UVH_SCRATCH5_SCRATCH5_SHFT			0
+#define UVH_SCRATCH5_SCRATCH5_MASK			0xffffffffffffffffUL
 
 union uvh_scratch5_u {
-    unsigned long	v;
-    struct uvh_scratch5_s {
-	unsigned long	scratch5 : 64;  /* RW, W1CS */
-    } s;
+	unsigned long	v;
+	struct uvh_scratch5_s {
+		unsigned long	scratch5:64;			/* RW, W1CS */
+	} s;
 };
 
 /* ========================================================================= */
 /*                           UV2H_EVENT_OCCURRED2                            */
 /* ========================================================================= */
-#define UV2H_EVENT_OCCURRED2 0x70100UL
-#define UV2H_EVENT_OCCURRED2_32 0xb68
-
-#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
-#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
-#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
-#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
-#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
-#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
-#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
-#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
-#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
-#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
-#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
-#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
-#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
-#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
-#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
-#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
-#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
-#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
-#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
-#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
-#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
-#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
-#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
-#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
-#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
-#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
-#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
-#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
-#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
-#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
-#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
-#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
-#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
-#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
-#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
-#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
-#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
-#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
-#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
-#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
-#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
-#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
-#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
-#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
-#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
-#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
-#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
-#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
-#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
-#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
-#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
-#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
-#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
-#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
-#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
-#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
-#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
-#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
+#define UV2H_EVENT_OCCURRED2				0x70100UL
+#define UV2H_EVENT_OCCURRED2_32				0xb68
+
+#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT			0
+#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT			1
+#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT			2
+#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT			3
+#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT			4
+#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT			5
+#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT			6
+#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT			7
+#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT			8
+#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT			9
+#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT		10
+#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT		11
+#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT		12
+#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT		13
+#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT		14
+#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT		15
+#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT		16
+#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT		17
+#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT		18
+#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT		19
+#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT		20
+#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT		21
+#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT		22
+#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT		23
+#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT		24
+#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT		25
+#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT		26
+#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT		27
+#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT		28
+#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT		29
+#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT		30
+#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT		31
+#define UV2H_EVENT_OCCURRED2_RTC_0_MASK			0x0000000000000001UL
+#define UV2H_EVENT_OCCURRED2_RTC_1_MASK			0x0000000000000002UL
+#define UV2H_EVENT_OCCURRED2_RTC_2_MASK			0x0000000000000004UL
+#define UV2H_EVENT_OCCURRED2_RTC_3_MASK			0x0000000000000008UL
+#define UV2H_EVENT_OCCURRED2_RTC_4_MASK			0x0000000000000010UL
+#define UV2H_EVENT_OCCURRED2_RTC_5_MASK			0x0000000000000020UL
+#define UV2H_EVENT_OCCURRED2_RTC_6_MASK			0x0000000000000040UL
+#define UV2H_EVENT_OCCURRED2_RTC_7_MASK			0x0000000000000080UL
+#define UV2H_EVENT_OCCURRED2_RTC_8_MASK			0x0000000000000100UL
+#define UV2H_EVENT_OCCURRED2_RTC_9_MASK			0x0000000000000200UL
+#define UV2H_EVENT_OCCURRED2_RTC_10_MASK		0x0000000000000400UL
+#define UV2H_EVENT_OCCURRED2_RTC_11_MASK		0x0000000000000800UL
+#define UV2H_EVENT_OCCURRED2_RTC_12_MASK		0x0000000000001000UL
+#define UV2H_EVENT_OCCURRED2_RTC_13_MASK		0x0000000000002000UL
+#define UV2H_EVENT_OCCURRED2_RTC_14_MASK		0x0000000000004000UL
+#define UV2H_EVENT_OCCURRED2_RTC_15_MASK		0x0000000000008000UL
+#define UV2H_EVENT_OCCURRED2_RTC_16_MASK		0x0000000000010000UL
+#define UV2H_EVENT_OCCURRED2_RTC_17_MASK		0x0000000000020000UL
+#define UV2H_EVENT_OCCURRED2_RTC_18_MASK		0x0000000000040000UL
+#define UV2H_EVENT_OCCURRED2_RTC_19_MASK		0x0000000000080000UL
+#define UV2H_EVENT_OCCURRED2_RTC_20_MASK		0x0000000000100000UL
+#define UV2H_EVENT_OCCURRED2_RTC_21_MASK		0x0000000000200000UL
+#define UV2H_EVENT_OCCURRED2_RTC_22_MASK		0x0000000000400000UL
+#define UV2H_EVENT_OCCURRED2_RTC_23_MASK		0x0000000000800000UL
+#define UV2H_EVENT_OCCURRED2_RTC_24_MASK		0x0000000001000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_25_MASK		0x0000000002000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_26_MASK		0x0000000004000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_27_MASK		0x0000000008000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_28_MASK		0x0000000010000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_29_MASK		0x0000000020000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_30_MASK		0x0000000040000000UL
+#define UV2H_EVENT_OCCURRED2_RTC_31_MASK		0x0000000080000000UL
 
 union uv2h_event_occurred2_u {
-    unsigned long	v;
-    struct uv2h_event_occurred2_s {
-	unsigned long	rtc_0  :  1;  /* RW */
-	unsigned long	rtc_1  :  1;  /* RW */
-	unsigned long	rtc_2  :  1;  /* RW */
-	unsigned long	rtc_3  :  1;  /* RW */
-	unsigned long	rtc_4  :  1;  /* RW */
-	unsigned long	rtc_5  :  1;  /* RW */
-	unsigned long	rtc_6  :  1;  /* RW */
-	unsigned long	rtc_7  :  1;  /* RW */
-	unsigned long	rtc_8  :  1;  /* RW */
-	unsigned long	rtc_9  :  1;  /* RW */
-	unsigned long	rtc_10 :  1;  /* RW */
-	unsigned long	rtc_11 :  1;  /* RW */
-	unsigned long	rtc_12 :  1;  /* RW */
-	unsigned long	rtc_13 :  1;  /* RW */
-	unsigned long	rtc_14 :  1;  /* RW */
-	unsigned long	rtc_15 :  1;  /* RW */
-	unsigned long	rtc_16 :  1;  /* RW */
-	unsigned long	rtc_17 :  1;  /* RW */
-	unsigned long	rtc_18 :  1;  /* RW */
-	unsigned long	rtc_19 :  1;  /* RW */
-	unsigned long	rtc_20 :  1;  /* RW */
-	unsigned long	rtc_21 :  1;  /* RW */
-	unsigned long	rtc_22 :  1;  /* RW */
-	unsigned long	rtc_23 :  1;  /* RW */
-	unsigned long	rtc_24 :  1;  /* RW */
-	unsigned long	rtc_25 :  1;  /* RW */
-	unsigned long	rtc_26 :  1;  /* RW */
-	unsigned long	rtc_27 :  1;  /* RW */
-	unsigned long	rtc_28 :  1;  /* RW */
-	unsigned long	rtc_29 :  1;  /* RW */
-	unsigned long	rtc_30 :  1;  /* RW */
-	unsigned long	rtc_31 :  1;  /* RW */
-	unsigned long	rsvd_32_63: 32;  /*    */
-    } s1;
+	unsigned long	v;
+	struct uv2h_event_occurred2_s {
+		unsigned long	rtc_0:1;			/* RW */
+		unsigned long	rtc_1:1;			/* RW */
+		unsigned long	rtc_2:1;			/* RW */
+		unsigned long	rtc_3:1;			/* RW */
+		unsigned long	rtc_4:1;			/* RW */
+		unsigned long	rtc_5:1;			/* RW */
+		unsigned long	rtc_6:1;			/* RW */
+		unsigned long	rtc_7:1;			/* RW */
+		unsigned long	rtc_8:1;			/* RW */
+		unsigned long	rtc_9:1;			/* RW */
+		unsigned long	rtc_10:1;			/* RW */
+		unsigned long	rtc_11:1;			/* RW */
+		unsigned long	rtc_12:1;			/* RW */
+		unsigned long	rtc_13:1;			/* RW */
+		unsigned long	rtc_14:1;			/* RW */
+		unsigned long	rtc_15:1;			/* RW */
+		unsigned long	rtc_16:1;			/* RW */
+		unsigned long	rtc_17:1;			/* RW */
+		unsigned long	rtc_18:1;			/* RW */
+		unsigned long	rtc_19:1;			/* RW */
+		unsigned long	rtc_20:1;			/* RW */
+		unsigned long	rtc_21:1;			/* RW */
+		unsigned long	rtc_22:1;			/* RW */
+		unsigned long	rtc_23:1;			/* RW */
+		unsigned long	rtc_24:1;			/* RW */
+		unsigned long	rtc_25:1;			/* RW */
+		unsigned long	rtc_26:1;			/* RW */
+		unsigned long	rtc_27:1;			/* RW */
+		unsigned long	rtc_28:1;			/* RW */
+		unsigned long	rtc_29:1;			/* RW */
+		unsigned long	rtc_30:1;			/* RW */
+		unsigned long	rtc_31:1;			/* RW */
+		unsigned long	rsvd_32_63:32;
+	} s1;
 };
 
 /* ========================================================================= */
 /*                        UV2H_EVENT_OCCURRED2_ALIAS                         */
 /* ========================================================================= */
-#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
-#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
+#define UV2H_EVENT_OCCURRED2_ALIAS			0x70108UL
+#define UV2H_EVENT_OCCURRED2_ALIAS_32			0xb70
 
 /* ========================================================================= */
 /*                    UV2H_LB_BAU_SB_ACTIVATION_STATUS_2                     */
 /* ========================================================================= */
-#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
-#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2		0x320130UL
+#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32		0x9f0
 
 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
 #define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
 
 union uv2h_lb_bau_sb_activation_status_2_u {
-    unsigned long	v;
-    struct uv2h_lb_bau_sb_activation_status_2_s {
-	unsigned long	aux_error : 64;  /* RW */
-    } s1;
+	unsigned long	v;
+	struct uv2h_lb_bau_sb_activation_status_2_s {
+		unsigned long	aux_error:64;			/* RW */
+	} s1;
 };
 
 /* ========================================================================= */
 /*                   UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK                    */
 /* ========================================================================= */
-#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
-#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK		0x320130UL
+#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32		0x9f0
 
 #define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
 #define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
 
 union uv1h_lb_target_physical_apic_id_mask_u {
-    unsigned long	v;
-    struct uv1h_lb_target_physical_apic_id_mask_s {
-	unsigned long	bit_enables : 32;  /* RW */
-	unsigned long	rsvd_32_63  : 32;  /*    */
-    } s1;
+	unsigned long	v;
+	struct uv1h_lb_target_physical_apic_id_mask_s {
+		unsigned long	bit_enables:32;			/* RW */
+		unsigned long	rsvd_32_63:32;
+	} s1;
 };
 
 
-#endif /* __ASM_UV_MMRS_X86_H__ */
+#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 646b4c1ca69..815285bcace 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -11,10 +11,9 @@ struct vsyscall_gtod_data {
 	time_t		wall_time_sec;
 	u32		wall_time_nsec;
 
-	int		sysctl_enabled;
 	struct timezone sys_tz;
 	struct { /* extract of a clocksource struct */
-		cycle_t (*vread)(void);
+		int vclock_mode;
 		cycle_t	cycle_last;
 		cycle_t	mask;
 		u32	mult;
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 84471b81046..2caf290e989 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -132,6 +132,8 @@ enum vmcs_field {
 	GUEST_IA32_PAT_HIGH		= 0x00002805,
 	GUEST_IA32_EFER			= 0x00002806,
 	GUEST_IA32_EFER_HIGH		= 0x00002807,
+	GUEST_IA32_PERF_GLOBAL_CTRL	= 0x00002808,
+	GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -144,6 +146,8 @@ enum vmcs_field {
 	HOST_IA32_PAT_HIGH		= 0x00002c01,
 	HOST_IA32_EFER			= 0x00002c02,
 	HOST_IA32_EFER_HIGH		= 0x00002c03,
+	HOST_IA32_PERF_GLOBAL_CTRL	= 0x00002c04,
+	HOST_IA32_PERF_GLOBAL_CTRL_HIGH	= 0x00002c05,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
@@ -426,4 +430,43 @@ struct vmx_msr_entry {
 	u64 value;
 } __aligned(16);
 
+/*
+ * Exit Qualifications for entry failure during or after loading guest state
+ */
+#define ENTRY_FAIL_DEFAULT		0
+#define ENTRY_FAIL_PDPTE		2
+#define ENTRY_FAIL_NMI			3
+#define ENTRY_FAIL_VMCS_LINK_PTR	4
+
+/*
+ * VM-instruction error numbers
+ */
+enum vm_instruction_error_number {
+	VMXERR_VMCALL_IN_VMX_ROOT_OPERATION = 1,
+	VMXERR_VMCLEAR_INVALID_ADDRESS = 2,
+	VMXERR_VMCLEAR_VMXON_POINTER = 3,
+	VMXERR_VMLAUNCH_NONCLEAR_VMCS = 4,
+	VMXERR_VMRESUME_NONLAUNCHED_VMCS = 5,
+	VMXERR_VMRESUME_AFTER_VMXOFF = 6,
+	VMXERR_ENTRY_INVALID_CONTROL_FIELD = 7,
+	VMXERR_ENTRY_INVALID_HOST_STATE_FIELD = 8,
+	VMXERR_VMPTRLD_INVALID_ADDRESS = 9,
+	VMXERR_VMPTRLD_VMXON_POINTER = 10,
+	VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID = 11,
+	VMXERR_UNSUPPORTED_VMCS_COMPONENT = 12,
+	VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT = 13,
+	VMXERR_VMXON_IN_VMX_ROOT_OPERATION = 15,
+	VMXERR_ENTRY_INVALID_EXECUTIVE_VMCS_POINTER = 16,
+	VMXERR_ENTRY_NONLAUNCHED_EXECUTIVE_VMCS = 17,
+	VMXERR_ENTRY_EXECUTIVE_VMCS_POINTER_NOT_VMXON_POINTER = 18,
+	VMXERR_VMCALL_NONCLEAR_VMCS = 19,
+	VMXERR_VMCALL_INVALID_VM_EXIT_CONTROL_FIELDS = 20,
+	VMXERR_VMCALL_INCORRECT_MSEG_REVISION_ID = 22,
+	VMXERR_VMXOFF_UNDER_DUAL_MONITOR_TREATMENT_OF_SMIS_AND_SMM = 23,
+	VMXERR_VMCALL_INVALID_SMM_MONITOR_FEATURES = 24,
+	VMXERR_ENTRY_INVALID_VM_EXECUTION_CONTROL_FIELDS_IN_EXECUTIVE_VMCS = 25,
+	VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS = 26,
+	VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID = 28,
+};
+
 #endif
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d55597351f6..60107072c28 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,10 +16,6 @@ enum vsyscall_num {
 #ifdef __KERNEL__
 #include <linux/seqlock.h>
 
-/* Definitions for CONFIG_GENERIC_TIME definitions */
-#define __vsyscall_fn \
-	__attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
-
 #define VGETCPU_RDTSCP	1
 #define VGETCPU_LSL	2
 
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
index 341b3559452..de656ac2af4 100644
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -10,15 +10,14 @@
  * In normal kernel code, they are used like any other variable.
  * In user code, they are accessed through the VVAR macro.
  *
- * Each of these variables lives in the vsyscall page, and each
- * one needs a unique offset within the little piece of the page
- * reserved for vvars.  Specify that offset in DECLARE_VVAR.
- * (There are 896 bytes available.  If you mess up, the linker will
- * catch it.)
+ * These variables live in a page of kernel data that has an extra RO
+ * mapping for userspace.  Each variable needs a unique offset within
+ * that page; specify that offset with the DECLARE_VVAR macro.  (If
+ * you mess up, the linker will catch it.)
  */
 
-/* Offset of vars within vsyscall page */
-#define VSYSCALL_VARS_OFFSET (3072 + 128)
+/* Base address of vvars.  This is not ABI. */
+#define VVAR_ADDRESS (-10*1024*1024 - 4096)
 
 #if defined(__VVAR_KERNEL_LDS)
 
@@ -26,17 +25,17 @@
  * right place.
  */
 #define DECLARE_VVAR(offset, type, name) \
-	EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
+	EMIT_VVAR(name, offset)
 
 #else
 
 #define DECLARE_VVAR(offset, type, name)				\
 	static type const * const vvaraddr_ ## name =			\
-		(void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
+		(void *)(VVAR_ADDRESS + (offset));
 
 #define DEFINE_VVAR(type, name)						\
-	type __vvar_ ## name						\
-	__attribute__((section(".vsyscall_var_" #name), aligned(16)))
+	type name							\
+	__attribute__((section(".vvar_" #name), aligned(16)))
 
 #define VVAR(name) (*vvaraddr_ ## name)
 
@@ -45,8 +44,7 @@
 /* DECLARE_VVAR(offset, type, name) */
 
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
-DECLARE_VVAR(8, int, vgetcpu_mode)
+DECLARE_VVAR(16, int, vgetcpu_mode)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
 
 #undef DECLARE_VVAR
-#undef VSYSCALL_VARS_OFFSET
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index d240ea95051..417777de5a4 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -39,6 +39,8 @@
 #include <linux/string.h>
 #include <linux/types.h>
 
+#include <trace/events/xen.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -459,6 +461,8 @@ MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
 {
 	mcl->op = __HYPERVISOR_fpu_taskswitch;
 	mcl->args[0] = set;
+
+	trace_xen_mc_entry(mcl, 1);
 }
 
 static inline void
@@ -475,6 +479,8 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
 		mcl->args[2] = new_val.pte >> 32;
 		mcl->args[3] = flags;
 	}
+
+	trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 3 : 4);
 }
 
 static inline void
@@ -485,6 +491,8 @@ MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
 	mcl->args[0] = cmd;
 	mcl->args[1] = (unsigned long)uop;
 	mcl->args[2] = count;
+
+	trace_xen_mc_entry(mcl, 3);
 }
 
 static inline void
@@ -504,6 +512,8 @@ MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long v
 		mcl->args[3] = flags;
 		mcl->args[4] = domid;
 	}
+
+	trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5);
 }
 
 static inline void
@@ -520,6 +530,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 		mcl->args[2] = desc.a;
 		mcl->args[3] = desc.b;
 	}
+
+	trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 2 : 4);
 }
 
 static inline void
@@ -528,6 +540,8 @@ MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
 	mcl->op = __HYPERVISOR_memory_op;
 	mcl->args[0] = cmd;
 	mcl->args[1] = (unsigned long)arg;
+
+	trace_xen_mc_entry(mcl, 2);
 }
 
 static inline void
@@ -539,6 +553,8 @@ MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
 	mcl->args[1] = count;
 	mcl->args[2] = (unsigned long)success_count;
 	mcl->args[3] = domid;
+
+	trace_xen_mc_entry(mcl, 4);
 }
 
 static inline void
@@ -550,6 +566,8 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
 	mcl->args[1] = count;
 	mcl->args[2] = (unsigned long)success_count;
 	mcl->args[3] = domid;
+
+	trace_xen_mc_entry(mcl, 4);
 }
 
 static inline void
@@ -558,6 +576,8 @@ MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
 	mcl->op = __HYPERVISOR_set_gdt;
 	mcl->args[0] = (unsigned long)frames;
 	mcl->args[1] = entries;
+
+	trace_xen_mc_entry(mcl, 2);
 }
 
 static inline void
@@ -567,6 +587,8 @@ MULTI_stack_switch(struct multicall_entry *mcl,
 	mcl->op = __HYPERVISOR_stack_switch;
 	mcl->args[0] = ss;
 	mcl->args[1] = esp;
+
+	trace_xen_mc_entry(mcl, 2);
 }
 
 #endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index 4fbda9a3f33..968d57dd54c 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -14,13 +14,14 @@ static inline int pci_xen_hvm_init(void)
 }
 #endif
 #if defined(CONFIG_XEN_DOM0)
-void __init xen_setup_pirqs(void);
+int __init pci_xen_initial_domain(void);
 int xen_find_device_domain_owner(struct pci_dev *dev);
 int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
 int xen_unregister_device_domain_owner(struct pci_dev *dev);
 #else
-static inline void __init xen_setup_pirqs(void)
+static inline int __init pci_xen_initial_domain(void)
 {
+	return -1;
 }
 static inline int xen_find_device_domain_owner(struct pci_dev *dev)
 {
diff --git a/arch/x86/include/asm/xen/trace_types.h b/arch/x86/include/asm/xen/trace_types.h
new file mode 100644
index 00000000000..21e1874c0a0
--- /dev/null
+++ b/arch/x86/include/asm/xen/trace_types.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_XEN_TRACE_TYPES_H
+#define _ASM_XEN_TRACE_TYPES_H
+
+enum xen_mc_flush_reason {
+	XEN_MC_FL_NONE,		/* explicit flush */
+	XEN_MC_FL_BATCH,	/* out of hypercall space */
+	XEN_MC_FL_ARGS,		/* out of argument space */
+	XEN_MC_FL_CALLBACK,	/* out of callback space */
+};
+
+enum xen_mc_extend_args {
+	XEN_MC_XE_OK,
+	XEN_MC_XE_BAD_OP,
+	XEN_MC_XE_NO_SPACE
+};
+typedef void (*xen_mc_callback_fn_t)(void *);
+
+#endif	/* _ASM_XEN_TRACE_TYPES_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4daee..04105574c8e 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,17 +24,12 @@ endif
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o	:= $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o		:= $(nostackp)
-CFLAGS_vread_tsc_64.o	:= $(nostackp)
 CFLAGS_paravirt.o	:= $(nostackp)
 GCOV_PROFILE_vsyscall_64.o	:= n
 GCOV_PROFILE_hpet.o		:= n
 GCOV_PROFILE_tsc.o		:= n
-GCOV_PROFILE_vread_tsc_64.o	:= n
 GCOV_PROFILE_paravirt.o		:= n
 
-# vread_tsc_64 is hot and should be fully optimized:
-CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
-
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
@@ -43,7 +38,8 @@ obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
-obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o vread_tsc_64.o
+obj-$(CONFIG_X86_64)	+= syscall_64.o vsyscall_64.o
+obj-$(CONFIG_X86_64)	+= vsyscall_emu_64.o
 obj-y			+= bootflag.o e820.o
 obj-y			+= pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -123,7 +119,6 @@ ifeq ($(CONFIG_X86_64),y)
 
 	obj-$(CONFIG_GART_IOMMU)	+= amd_gart_64.o aperture_64.o
 	obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
-	obj-$(CONFIG_AMD_IOMMU)		+= amd_iommu_init.o amd_iommu.o
 
 	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 5812404a0d4..f50e7fb2a20 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -149,6 +149,29 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+	if (!need_resched()) {
+		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__mwait(ax, cx);
+	}
+}
+
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index a81f2d52f86..c6382281624 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -14,7 +14,6 @@
 #include <asm/pgtable.h>
 #include <asm/mce.h>
 #include <asm/nmi.h>
-#include <asm/vsyscall.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
@@ -250,7 +249,6 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
 
 extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
-extern char __vsyscall_0;
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
 /* Replace instructions with better alternatives for this CPU type.
@@ -263,6 +261,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 					 struct alt_instr *end)
 {
 	struct alt_instr *a;
+	u8 *instr, *replacement;
 	u8 insnbuf[MAX_PATCH_LEN];
 
 	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
@@ -276,25 +275,23 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	 * order.
 	 */
 	for (a = start; a < end; a++) {
-		u8 *instr = a->instr;
+		instr = (u8 *)&a->instr_offset + a->instr_offset;
+		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
 		BUG_ON(a->cpuid >= NCAPINTS*32);
 		if (!boot_cpu_has(a->cpuid))
 			continue;
-#ifdef CONFIG_X86_64
-		/* vsyscall code is not mapped yet. resolve it manually. */
-		if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
-			instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
-			DPRINTK("%s: vsyscall fixup: %p => %p\n",
-				__func__, a->instr, instr);
-		}
-#endif
-		memcpy(insnbuf, a->replacement, a->replacementlen);
+
+		memcpy(insnbuf, replacement, a->replacementlen);
+
+		/* 0xe8 is a relative jump; fix the offset. */
 		if (*insnbuf == 0xe8 && a->replacementlen == 5)
-		    *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
+		    *(s32 *)(insnbuf + 1) += replacement - instr;
+
 		add_nops(insnbuf + a->replacementlen,
 			 a->instrlen - a->replacementlen);
+
 		text_poke_early(instr, insnbuf, a->instrlen);
 	}
 }
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b117efd24f7..8a439d364b9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -30,7 +30,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/io.h>
 #include <linux/gfp.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/mtrr.h>
 #include <asm/pgtable.h>
 #include <asm/proto.h>
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
deleted file mode 100644
index 7c3a95e54ec..00000000000
--- a/arch/x86/kernel/amd_iommu.c
+++ /dev/null
@@ -1,2764 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/pci-ats.h>
-#include <linux/bitmap.h>
-#include <linux/slab.h>
-#include <linux/debugfs.h>
-#include <linux/scatterlist.h>
-#include <linux/dma-mapping.h>
-#include <linux/iommu-helper.h>
-#include <linux/iommu.h>
-#include <linux/delay.h>
-#include <asm/proto.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/dma.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-
-#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
-
-#define LOOP_TIMEOUT	100000
-
-static DEFINE_RWLOCK(amd_iommu_devtable_lock);
-
-/* A list of preallocated protection domains */
-static LIST_HEAD(iommu_pd_list);
-static DEFINE_SPINLOCK(iommu_pd_list_lock);
-
-/*
- * Domain for untranslated devices - only allocated
- * if iommu=pt passed on kernel cmd line.
- */
-static struct protection_domain *pt_domain;
-
-static struct iommu_ops amd_iommu_ops;
-
-/*
- * general struct to manage commands send to an IOMMU
- */
-struct iommu_cmd {
-	u32 data[4];
-};
-
-static void update_domain(struct protection_domain *domain);
-
-/****************************************************************************
- *
- * Helper functions
- *
- ****************************************************************************/
-
-static inline u16 get_device_id(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-
-	return calc_devid(pdev->bus->number, pdev->devfn);
-}
-
-static struct iommu_dev_data *get_dev_data(struct device *dev)
-{
-	return dev->archdata.iommu;
-}
-
-/*
- * In this function the list of preallocated protection domains is traversed to
- * find the domain for a specific device
- */
-static struct dma_ops_domain *find_protection_domain(u16 devid)
-{
-	struct dma_ops_domain *entry, *ret = NULL;
-	unsigned long flags;
-	u16 alias = amd_iommu_alias_table[devid];
-
-	if (list_empty(&iommu_pd_list))
-		return NULL;
-
-	spin_lock_irqsave(&iommu_pd_list_lock, flags);
-
-	list_for_each_entry(entry, &iommu_pd_list, list) {
-		if (entry->target_dev == devid ||
-		    entry->target_dev == alias) {
-			ret = entry;
-			break;
-		}
-	}
-
-	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
-	return ret;
-}
-
-/*
- * This function checks if the driver got a valid device from the caller to
- * avoid dereferencing invalid pointers.
- */
-static bool check_device(struct device *dev)
-{
-	u16 devid;
-
-	if (!dev || !dev->dma_mask)
-		return false;
-
-	/* No device or no PCI device */
-	if (dev->bus != &pci_bus_type)
-		return false;
-
-	devid = get_device_id(dev);
-
-	/* Out of our scope? */
-	if (devid > amd_iommu_last_bdf)
-		return false;
-
-	if (amd_iommu_rlookup_table[devid] == NULL)
-		return false;
-
-	return true;
-}
-
-static int iommu_init_device(struct device *dev)
-{
-	struct iommu_dev_data *dev_data;
-	struct pci_dev *pdev;
-	u16 devid, alias;
-
-	if (dev->archdata.iommu)
-		return 0;
-
-	dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
-	if (!dev_data)
-		return -ENOMEM;
-
-	dev_data->dev = dev;
-
-	devid = get_device_id(dev);
-	alias = amd_iommu_alias_table[devid];
-	pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
-	if (pdev)
-		dev_data->alias = &pdev->dev;
-	else {
-		kfree(dev_data);
-		return -ENOTSUPP;
-	}
-
-	atomic_set(&dev_data->bind, 0);
-
-	dev->archdata.iommu = dev_data;
-
-
-	return 0;
-}
-
-static void iommu_ignore_device(struct device *dev)
-{
-	u16 devid, alias;
-
-	devid = get_device_id(dev);
-	alias = amd_iommu_alias_table[devid];
-
-	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
-	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
-
-	amd_iommu_rlookup_table[devid] = NULL;
-	amd_iommu_rlookup_table[alias] = NULL;
-}
-
-static void iommu_uninit_device(struct device *dev)
-{
-	kfree(dev->archdata.iommu);
-}
-
-void __init amd_iommu_uninit_devices(void)
-{
-	struct pci_dev *pdev = NULL;
-
-	for_each_pci_dev(pdev) {
-
-		if (!check_device(&pdev->dev))
-			continue;
-
-		iommu_uninit_device(&pdev->dev);
-	}
-}
-
-int __init amd_iommu_init_devices(void)
-{
-	struct pci_dev *pdev = NULL;
-	int ret = 0;
-
-	for_each_pci_dev(pdev) {
-
-		if (!check_device(&pdev->dev))
-			continue;
-
-		ret = iommu_init_device(&pdev->dev);
-		if (ret == -ENOTSUPP)
-			iommu_ignore_device(&pdev->dev);
-		else if (ret)
-			goto out_free;
-	}
-
-	return 0;
-
-out_free:
-
-	amd_iommu_uninit_devices();
-
-	return ret;
-}
-#ifdef CONFIG_AMD_IOMMU_STATS
-
-/*
- * Initialization code for statistics collection
- */
-
-DECLARE_STATS_COUNTER(compl_wait);
-DECLARE_STATS_COUNTER(cnt_map_single);
-DECLARE_STATS_COUNTER(cnt_unmap_single);
-DECLARE_STATS_COUNTER(cnt_map_sg);
-DECLARE_STATS_COUNTER(cnt_unmap_sg);
-DECLARE_STATS_COUNTER(cnt_alloc_coherent);
-DECLARE_STATS_COUNTER(cnt_free_coherent);
-DECLARE_STATS_COUNTER(cross_page);
-DECLARE_STATS_COUNTER(domain_flush_single);
-DECLARE_STATS_COUNTER(domain_flush_all);
-DECLARE_STATS_COUNTER(alloced_io_mem);
-DECLARE_STATS_COUNTER(total_map_requests);
-
-static struct dentry *stats_dir;
-static struct dentry *de_fflush;
-
-static void amd_iommu_stats_add(struct __iommu_counter *cnt)
-{
-	if (stats_dir == NULL)
-		return;
-
-	cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
-				       &cnt->value);
-}
-
-static void amd_iommu_stats_init(void)
-{
-	stats_dir = debugfs_create_dir("amd-iommu", NULL);
-	if (stats_dir == NULL)
-		return;
-
-	de_fflush  = debugfs_create_bool("fullflush", 0444, stats_dir,
-					 (u32 *)&amd_iommu_unmap_flush);
-
-	amd_iommu_stats_add(&compl_wait);
-	amd_iommu_stats_add(&cnt_map_single);
-	amd_iommu_stats_add(&cnt_unmap_single);
-	amd_iommu_stats_add(&cnt_map_sg);
-	amd_iommu_stats_add(&cnt_unmap_sg);
-	amd_iommu_stats_add(&cnt_alloc_coherent);
-	amd_iommu_stats_add(&cnt_free_coherent);
-	amd_iommu_stats_add(&cross_page);
-	amd_iommu_stats_add(&domain_flush_single);
-	amd_iommu_stats_add(&domain_flush_all);
-	amd_iommu_stats_add(&alloced_io_mem);
-	amd_iommu_stats_add(&total_map_requests);
-}
-
-#endif
-
-/****************************************************************************
- *
- * Interrupt handling functions
- *
- ****************************************************************************/
-
-static void dump_dte_entry(u16 devid)
-{
-	int i;
-
-	for (i = 0; i < 8; ++i)
-		pr_err("AMD-Vi: DTE[%d]: %08x\n", i,
-			amd_iommu_dev_table[devid].data[i]);
-}
-
-static void dump_command(unsigned long phys_addr)
-{
-	struct iommu_cmd *cmd = phys_to_virt(phys_addr);
-	int i;
-
-	for (i = 0; i < 4; ++i)
-		pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
-}
-
-static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
-{
-	u32 *event = __evt;
-	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
-	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
-	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
-	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
-	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
-
-	printk(KERN_ERR "AMD-Vi: Event logged [");
-
-	switch (type) {
-	case EVENT_TYPE_ILL_DEV:
-		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		dump_dte_entry(devid);
-		break;
-	case EVENT_TYPE_IO_FAULT:
-		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
-		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       domid, address, flags);
-		break;
-	case EVENT_TYPE_DEV_TAB_ERR:
-		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		break;
-	case EVENT_TYPE_PAGE_TAB_ERR:
-		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
-		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       domid, address, flags);
-		break;
-	case EVENT_TYPE_ILL_CMD:
-		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
-		dump_command(address);
-		break;
-	case EVENT_TYPE_CMD_HARD_ERR:
-		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
-		       "flags=0x%04x]\n", address, flags);
-		break;
-	case EVENT_TYPE_IOTLB_INV_TO:
-		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
-		       "address=0x%016llx]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address);
-		break;
-	case EVENT_TYPE_INV_DEV_REQ:
-		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
-		       "address=0x%016llx flags=0x%04x]\n",
-		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
-		       address, flags);
-		break;
-	default:
-		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
-	}
-}
-
-static void iommu_poll_events(struct amd_iommu *iommu)
-{
-	u32 head, tail;
-	unsigned long flags;
-
-	spin_lock_irqsave(&iommu->lock, flags);
-
-	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
-	while (head != tail) {
-		iommu_print_event(iommu, iommu->evt_buf + head);
-		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
-	}
-
-	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-
-	spin_unlock_irqrestore(&iommu->lock, flags);
-}
-
-irqreturn_t amd_iommu_int_thread(int irq, void *data)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_poll_events(iommu);
-
-	return IRQ_HANDLED;
-}
-
-irqreturn_t amd_iommu_int_handler(int irq, void *data)
-{
-	return IRQ_WAKE_THREAD;
-}
-
-/****************************************************************************
- *
- * IOMMU command queuing functions
- *
- ****************************************************************************/
-
-static int wait_on_sem(volatile u64 *sem)
-{
-	int i = 0;
-
-	while (*sem == 0 && i < LOOP_TIMEOUT) {
-		udelay(1);
-		i += 1;
-	}
-
-	if (i == LOOP_TIMEOUT) {
-		pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static void copy_cmd_to_buffer(struct amd_iommu *iommu,
-			       struct iommu_cmd *cmd,
-			       u32 tail)
-{
-	u8 *target;
-
-	target = iommu->cmd_buf + tail;
-	tail   = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
-
-	/* Copy command to buffer */
-	memcpy(target, cmd, sizeof(*cmd));
-
-	/* Tell the IOMMU about it */
-	writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-}
-
-static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
-{
-	WARN_ON(address & 0x7ULL);
-
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd->data[1] = upper_32_bits(__pa(address));
-	cmd->data[2] = 1;
-	CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
-}
-
-static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
-{
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[0] = devid;
-	CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
-}
-
-static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
-				  size_t size, u16 domid, int pde)
-{
-	u64 pages;
-	int s;
-
-	pages = iommu_num_pages(address, size, PAGE_SIZE);
-	s     = 0;
-
-	if (pages > 1) {
-		/*
-		 * If we have to flush more than one page, flush all
-		 * TLB entries for this domain
-		 */
-		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-		s = 1;
-	}
-
-	address &= PAGE_MASK;
-
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[1] |= domid;
-	cmd->data[2]  = lower_32_bits(address);
-	cmd->data[3]  = upper_32_bits(address);
-	CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
-	if (s) /* size bit - we flush more than one 4kb page */
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-	if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
-}
-
-static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
-				  u64 address, size_t size)
-{
-	u64 pages;
-	int s;
-
-	pages = iommu_num_pages(address, size, PAGE_SIZE);
-	s     = 0;
-
-	if (pages > 1) {
-		/*
-		 * If we have to flush more than one page, flush all
-		 * TLB entries for this domain
-		 */
-		address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
-		s = 1;
-	}
-
-	address &= PAGE_MASK;
-
-	memset(cmd, 0, sizeof(*cmd));
-	cmd->data[0]  = devid;
-	cmd->data[0] |= (qdep & 0xff) << 24;
-	cmd->data[1]  = devid;
-	cmd->data[2]  = lower_32_bits(address);
-	cmd->data[3]  = upper_32_bits(address);
-	CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
-	if (s)
-		cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-}
-
-static void build_inv_all(struct iommu_cmd *cmd)
-{
-	memset(cmd, 0, sizeof(*cmd));
-	CMD_SET_TYPE(cmd, CMD_INV_ALL);
-}
-
-/*
- * Writes the command to the IOMMUs command buffer and informs the
- * hardware about the new command.
- */
-static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
-{
-	u32 left, tail, head, next_tail;
-	unsigned long flags;
-
-	WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
-
-again:
-	spin_lock_irqsave(&iommu->lock, flags);
-
-	head      = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	tail      = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-	next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
-	left      = (head - next_tail) % iommu->cmd_buf_size;
-
-	if (left <= 2) {
-		struct iommu_cmd sync_cmd;
-		volatile u64 sem = 0;
-		int ret;
-
-		build_completion_wait(&sync_cmd, (u64)&sem);
-		copy_cmd_to_buffer(iommu, &sync_cmd, tail);
-
-		spin_unlock_irqrestore(&iommu->lock, flags);
-
-		if ((ret = wait_on_sem(&sem)) != 0)
-			return ret;
-
-		goto again;
-	}
-
-	copy_cmd_to_buffer(iommu, cmd, tail);
-
-	/* We need to sync now to make sure all commands are processed */
-	iommu->need_sync = true;
-
-	spin_unlock_irqrestore(&iommu->lock, flags);
-
-	return 0;
-}
-
-/*
- * This function queues a completion wait command into the command
- * buffer of an IOMMU
- */
-static int iommu_completion_wait(struct amd_iommu *iommu)
-{
-	struct iommu_cmd cmd;
-	volatile u64 sem = 0;
-	int ret;
-
-	if (!iommu->need_sync)
-		return 0;
-
-	build_completion_wait(&cmd, (u64)&sem);
-
-	ret = iommu_queue_command(iommu, &cmd);
-	if (ret)
-		return ret;
-
-	return wait_on_sem(&sem);
-}
-
-static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
-{
-	struct iommu_cmd cmd;
-
-	build_inv_dte(&cmd, devid);
-
-	return iommu_queue_command(iommu, &cmd);
-}
-
-static void iommu_flush_dte_all(struct amd_iommu *iommu)
-{
-	u32 devid;
-
-	for (devid = 0; devid <= 0xffff; ++devid)
-		iommu_flush_dte(iommu, devid);
-
-	iommu_completion_wait(iommu);
-}
-
-/*
- * This function uses heavy locking and may disable irqs for some time. But
- * this is no issue because it is only called during resume.
- */
-static void iommu_flush_tlb_all(struct amd_iommu *iommu)
-{
-	u32 dom_id;
-
-	for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
-		struct iommu_cmd cmd;
-		build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
-				      dom_id, 1);
-		iommu_queue_command(iommu, &cmd);
-	}
-
-	iommu_completion_wait(iommu);
-}
-
-static void iommu_flush_all(struct amd_iommu *iommu)
-{
-	struct iommu_cmd cmd;
-
-	build_inv_all(&cmd);
-
-	iommu_queue_command(iommu, &cmd);
-	iommu_completion_wait(iommu);
-}
-
-void iommu_flush_all_caches(struct amd_iommu *iommu)
-{
-	if (iommu_feature(iommu, FEATURE_IA)) {
-		iommu_flush_all(iommu);
-	} else {
-		iommu_flush_dte_all(iommu);
-		iommu_flush_tlb_all(iommu);
-	}
-}
-
-/*
- * Command send function for flushing on-device TLB
- */
-static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	struct amd_iommu *iommu;
-	struct iommu_cmd cmd;
-	u16 devid;
-	int qdep;
-
-	qdep  = pci_ats_queue_depth(pdev);
-	devid = get_device_id(dev);
-	iommu = amd_iommu_rlookup_table[devid];
-
-	build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
-
-	return iommu_queue_command(iommu, &cmd);
-}
-
-/*
- * Command send function for invalidating a device table entry
- */
-static int device_flush_dte(struct device *dev)
-{
-	struct amd_iommu *iommu;
-	struct pci_dev *pdev;
-	u16 devid;
-	int ret;
-
-	pdev  = to_pci_dev(dev);
-	devid = get_device_id(dev);
-	iommu = amd_iommu_rlookup_table[devid];
-
-	ret = iommu_flush_dte(iommu, devid);
-	if (ret)
-		return ret;
-
-	if (pci_ats_enabled(pdev))
-		ret = device_flush_iotlb(dev, 0, ~0UL);
-
-	return ret;
-}
-
-/*
- * TLB invalidation function which is called from the mapping functions.
- * It invalidates a single PTE if the range to flush is within a single
- * page. Otherwise it flushes the whole TLB of the IOMMU.
- */
-static void __domain_flush_pages(struct protection_domain *domain,
-				 u64 address, size_t size, int pde)
-{
-	struct iommu_dev_data *dev_data;
-	struct iommu_cmd cmd;
-	int ret = 0, i;
-
-	build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
-
-	for (i = 0; i < amd_iommus_present; ++i) {
-		if (!domain->dev_iommu[i])
-			continue;
-
-		/*
-		 * Devices of this domain are behind this IOMMU
-		 * We need a TLB flush
-		 */
-		ret |= iommu_queue_command(amd_iommus[i], &cmd);
-	}
-
-	list_for_each_entry(dev_data, &domain->dev_list, list) {
-		struct pci_dev *pdev = to_pci_dev(dev_data->dev);
-
-		if (!pci_ats_enabled(pdev))
-			continue;
-
-		ret |= device_flush_iotlb(dev_data->dev, address, size);
-	}
-
-	WARN_ON(ret);
-}
-
-static void domain_flush_pages(struct protection_domain *domain,
-			       u64 address, size_t size)
-{
-	__domain_flush_pages(domain, address, size, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain */
-static void domain_flush_tlb(struct protection_domain *domain)
-{
-	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
-}
-
-/* Flush the whole IO/TLB for a given protection domain - including PDE */
-static void domain_flush_tlb_pde(struct protection_domain *domain)
-{
-	__domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
-}
-
-static void domain_flush_complete(struct protection_domain *domain)
-{
-	int i;
-
-	for (i = 0; i < amd_iommus_present; ++i) {
-		if (!domain->dev_iommu[i])
-			continue;
-
-		/*
-		 * Devices of this domain are behind this IOMMU
-		 * We need to wait for completion of all commands.
-		 */
-		iommu_completion_wait(amd_iommus[i]);
-	}
-}
-
-
-/*
- * This function flushes the DTEs for all devices in domain
- */
-static void domain_flush_devices(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-	unsigned long flags;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	list_for_each_entry(dev_data, &domain->dev_list, list)
-		device_flush_dte(dev_data->dev);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/****************************************************************************
- *
- * The functions below are used the create the page table mappings for
- * unity mapped regions.
- *
- ****************************************************************************/
-
-/*
- * This function is used to add another level to an IO page table. Adding
- * another level increases the size of the address space by 9 bits to a size up
- * to 64 bits.
- */
-static bool increase_address_space(struct protection_domain *domain,
-				   gfp_t gfp)
-{
-	u64 *pte;
-
-	if (domain->mode == PAGE_MODE_6_LEVEL)
-		/* address space already 64 bit large */
-		return false;
-
-	pte = (void *)get_zeroed_page(gfp);
-	if (!pte)
-		return false;
-
-	*pte             = PM_LEVEL_PDE(domain->mode,
-					virt_to_phys(domain->pt_root));
-	domain->pt_root  = pte;
-	domain->mode    += 1;
-	domain->updated  = true;
-
-	return true;
-}
-
-static u64 *alloc_pte(struct protection_domain *domain,
-		      unsigned long address,
-		      unsigned long page_size,
-		      u64 **pte_page,
-		      gfp_t gfp)
-{
-	int level, end_lvl;
-	u64 *pte, *page;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	while (address > PM_LEVEL_SIZE(domain->mode))
-		increase_address_space(domain, gfp);
-
-	level   = domain->mode - 1;
-	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-	address = PAGE_SIZE_ALIGN(address, page_size);
-	end_lvl = PAGE_SIZE_LEVEL(page_size);
-
-	while (level > end_lvl) {
-		if (!IOMMU_PTE_PRESENT(*pte)) {
-			page = (u64 *)get_zeroed_page(gfp);
-			if (!page)
-				return NULL;
-			*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
-		}
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*pte) != level)
-			return NULL;
-
-		level -= 1;
-
-		pte = IOMMU_PTE_PAGE(*pte);
-
-		if (pte_page && level == end_lvl)
-			*pte_page = pte;
-
-		pte = &pte[PM_LEVEL_INDEX(level, address)];
-	}
-
-	return pte;
-}
-
-/*
- * This function checks if there is a PTE for a given dma address. If
- * there is one, it returns the pointer to it.
- */
-static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
-{
-	int level;
-	u64 *pte;
-
-	if (address > PM_LEVEL_SIZE(domain->mode))
-		return NULL;
-
-	level   =  domain->mode - 1;
-	pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
-
-	while (level > 0) {
-
-		/* Not Present */
-		if (!IOMMU_PTE_PRESENT(*pte))
-			return NULL;
-
-		/* Large PTE */
-		if (PM_PTE_LEVEL(*pte) == 0x07) {
-			unsigned long pte_mask, __pte;
-
-			/*
-			 * If we have a series of large PTEs, make
-			 * sure to return a pointer to the first one.
-			 */
-			pte_mask = PTE_PAGE_SIZE(*pte);
-			pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
-			__pte    = ((unsigned long)pte) & pte_mask;
-
-			return (u64 *)__pte;
-		}
-
-		/* No level skipping support yet */
-		if (PM_PTE_LEVEL(*pte) != level)
-			return NULL;
-
-		level -= 1;
-
-		/* Walk to the next level */
-		pte = IOMMU_PTE_PAGE(*pte);
-		pte = &pte[PM_LEVEL_INDEX(level, address)];
-	}
-
-	return pte;
-}
-
-/*
- * Generic mapping functions. It maps a physical address into a DMA
- * address space. It allocates the page table pages if necessary.
- * In the future it can be extended to a generic mapping function
- * supporting all features of AMD IOMMU page tables like level skipping
- * and full 64 bit address spaces.
- */
-static int iommu_map_page(struct protection_domain *dom,
-			  unsigned long bus_addr,
-			  unsigned long phys_addr,
-			  int prot,
-			  unsigned long page_size)
-{
-	u64 __pte, *pte;
-	int i, count;
-
-	if (!(prot & IOMMU_PROT_MASK))
-		return -EINVAL;
-
-	bus_addr  = PAGE_ALIGN(bus_addr);
-	phys_addr = PAGE_ALIGN(phys_addr);
-	count     = PAGE_SIZE_PTE_COUNT(page_size);
-	pte       = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
-
-	for (i = 0; i < count; ++i)
-		if (IOMMU_PTE_PRESENT(pte[i]))
-			return -EBUSY;
-
-	if (page_size > PAGE_SIZE) {
-		__pte = PAGE_SIZE_PTE(phys_addr, page_size);
-		__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
-	} else
-		__pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
-	if (prot & IOMMU_PROT_IR)
-		__pte |= IOMMU_PTE_IR;
-	if (prot & IOMMU_PROT_IW)
-		__pte |= IOMMU_PTE_IW;
-
-	for (i = 0; i < count; ++i)
-		pte[i] = __pte;
-
-	update_domain(dom);
-
-	return 0;
-}
-
-static unsigned long iommu_unmap_page(struct protection_domain *dom,
-				      unsigned long bus_addr,
-				      unsigned long page_size)
-{
-	unsigned long long unmap_size, unmapped;
-	u64 *pte;
-
-	BUG_ON(!is_power_of_2(page_size));
-
-	unmapped = 0;
-
-	while (unmapped < page_size) {
-
-		pte = fetch_pte(dom, bus_addr);
-
-		if (!pte) {
-			/*
-			 * No PTE for this address
-			 * move forward in 4kb steps
-			 */
-			unmap_size = PAGE_SIZE;
-		} else if (PM_PTE_LEVEL(*pte) == 0) {
-			/* 4kb PTE found for this address */
-			unmap_size = PAGE_SIZE;
-			*pte       = 0ULL;
-		} else {
-			int count, i;
-
-			/* Large PTE found which maps this address */
-			unmap_size = PTE_PAGE_SIZE(*pte);
-			count      = PAGE_SIZE_PTE_COUNT(unmap_size);
-			for (i = 0; i < count; i++)
-				pte[i] = 0ULL;
-		}
-
-		bus_addr  = (bus_addr & ~(unmap_size - 1)) + unmap_size;
-		unmapped += unmap_size;
-	}
-
-	BUG_ON(!is_power_of_2(unmapped));
-
-	return unmapped;
-}
-
-/*
- * This function checks if a specific unity mapping entry is needed for
- * this specific IOMMU.
- */
-static int iommu_for_unity_map(struct amd_iommu *iommu,
-			       struct unity_map_entry *entry)
-{
-	u16 bdf, i;
-
-	for (i = entry->devid_start; i <= entry->devid_end; ++i) {
-		bdf = amd_iommu_alias_table[i];
-		if (amd_iommu_rlookup_table[bdf] == iommu)
-			return 1;
-	}
-
-	return 0;
-}
-
-/*
- * This function actually applies the mapping to the page table of the
- * dma_ops domain.
- */
-static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
-			     struct unity_map_entry *e)
-{
-	u64 addr;
-	int ret;
-
-	for (addr = e->address_start; addr < e->address_end;
-	     addr += PAGE_SIZE) {
-		ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
-				     PAGE_SIZE);
-		if (ret)
-			return ret;
-		/*
-		 * if unity mapping is in aperture range mark the page
-		 * as allocated in the aperture
-		 */
-		if (addr < dma_dom->aperture_size)
-			__set_bit(addr >> PAGE_SHIFT,
-				  dma_dom->aperture[0]->bitmap);
-	}
-
-	return 0;
-}
-
-/*
- * Init the unity mappings for a specific IOMMU in the system
- *
- * Basically iterates over all unity mapping entries and applies them to
- * the default domain DMA of that IOMMU if necessary.
- */
-static int iommu_init_unity_mappings(struct amd_iommu *iommu)
-{
-	struct unity_map_entry *entry;
-	int ret;
-
-	list_for_each_entry(entry, &amd_iommu_unity_map, list) {
-		if (!iommu_for_unity_map(iommu, entry))
-			continue;
-		ret = dma_ops_unity_map(iommu->default_dom, entry);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-/*
- * Inits the unity mappings required for a specific device
- */
-static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
-					  u16 devid)
-{
-	struct unity_map_entry *e;
-	int ret;
-
-	list_for_each_entry(e, &amd_iommu_unity_map, list) {
-		if (!(devid >= e->devid_start && devid <= e->devid_end))
-			continue;
-		ret = dma_ops_unity_map(dma_dom, e);
-		if (ret)
-			return ret;
-	}
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the address allocator for the dma_ops
- * interface functions. They work like the allocators in the other IOMMU
- * drivers. Its basically a bitmap which marks the allocated pages in
- * the aperture. Maybe it could be enhanced in the future to a more
- * efficient allocator.
- *
- ****************************************************************************/
-
-/*
- * The address allocator core functions.
- *
- * called with domain->lock held
- */
-
-/*
- * Used to reserve address ranges in the aperture (e.g. for exclusion
- * ranges.
- */
-static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
-				      unsigned long start_page,
-				      unsigned int pages)
-{
-	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
-
-	if (start_page + pages > last_page)
-		pages = last_page - start_page;
-
-	for (i = start_page; i < start_page + pages; ++i) {
-		int index = i / APERTURE_RANGE_PAGES;
-		int page  = i % APERTURE_RANGE_PAGES;
-		__set_bit(page, dom->aperture[index]->bitmap);
-	}
-}
-
-/*
- * This function is used to add a new aperture range to an existing
- * aperture in case of dma_ops domain allocation or address allocation
- * failure.
- */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
-			   bool populate, gfp_t gfp)
-{
-	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	struct amd_iommu *iommu;
-	unsigned long i;
-
-#ifdef CONFIG_IOMMU_STRESS
-	populate = false;
-#endif
-
-	if (index >= APERTURE_MAX_RANGES)
-		return -ENOMEM;
-
-	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
-	if (!dma_dom->aperture[index])
-		return -ENOMEM;
-
-	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
-	if (!dma_dom->aperture[index]->bitmap)
-		goto out_free;
-
-	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
-
-	if (populate) {
-		unsigned long address = dma_dom->aperture_size;
-		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
-		u64 *pte, *pte_page;
-
-		for (i = 0; i < num_ptes; ++i) {
-			pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
-					&pte_page, gfp);
-			if (!pte)
-				goto out_free;
-
-			dma_dom->aperture[index]->pte_pages[i] = pte_page;
-
-			address += APERTURE_RANGE_SIZE / 64;
-		}
-	}
-
-	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
-
-	/* Initialize the exclusion range if necessary */
-	for_each_iommu(iommu) {
-		if (iommu->exclusion_start &&
-		    iommu->exclusion_start >= dma_dom->aperture[index]->offset
-		    && iommu->exclusion_start < dma_dom->aperture_size) {
-			unsigned long startpage;
-			int pages = iommu_num_pages(iommu->exclusion_start,
-						    iommu->exclusion_length,
-						    PAGE_SIZE);
-			startpage = iommu->exclusion_start >> PAGE_SHIFT;
-			dma_ops_reserve_addresses(dma_dom, startpage, pages);
-		}
-	}
-
-	/*
-	 * Check for areas already mapped as present in the new aperture
-	 * range and mark those pages as reserved in the allocator. Such
-	 * mappings may already exist as a result of requested unity
-	 * mappings for devices.
-	 */
-	for (i = dma_dom->aperture[index]->offset;
-	     i < dma_dom->aperture_size;
-	     i += PAGE_SIZE) {
-		u64 *pte = fetch_pte(&dma_dom->domain, i);
-		if (!pte || !IOMMU_PTE_PRESENT(*pte))
-			continue;
-
-		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
-	}
-
-	update_domain(&dma_dom->domain);
-
-	return 0;
-
-out_free:
-	update_domain(&dma_dom->domain);
-
-	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
-
-	kfree(dma_dom->aperture[index]);
-	dma_dom->aperture[index] = NULL;
-
-	return -ENOMEM;
-}
-
-static unsigned long dma_ops_area_alloc(struct device *dev,
-					struct dma_ops_domain *dom,
-					unsigned int pages,
-					unsigned long align_mask,
-					u64 dma_mask,
-					unsigned long start)
-{
-	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
-	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
-	int i = start >> APERTURE_RANGE_SHIFT;
-	unsigned long boundary_size;
-	unsigned long address = -1;
-	unsigned long limit;
-
-	next_bit >>= PAGE_SHIFT;
-
-	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-			PAGE_SIZE) >> PAGE_SHIFT;
-
-	for (;i < max_index; ++i) {
-		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
-
-		if (dom->aperture[i]->offset >= dma_mask)
-			break;
-
-		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
-					       dma_mask >> PAGE_SHIFT);
-
-		address = iommu_area_alloc(dom->aperture[i]->bitmap,
-					   limit, next_bit, pages, 0,
-					    boundary_size, align_mask);
-		if (address != -1) {
-			address = dom->aperture[i]->offset +
-				  (address << PAGE_SHIFT);
-			dom->next_address = address + (pages << PAGE_SHIFT);
-			break;
-		}
-
-		next_bit = 0;
-	}
-
-	return address;
-}
-
-static unsigned long dma_ops_alloc_addresses(struct device *dev,
-					     struct dma_ops_domain *dom,
-					     unsigned int pages,
-					     unsigned long align_mask,
-					     u64 dma_mask)
-{
-	unsigned long address;
-
-#ifdef CONFIG_IOMMU_STRESS
-	dom->next_address = 0;
-	dom->need_flush = true;
-#endif
-
-	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-				     dma_mask, dom->next_address);
-
-	if (address == -1) {
-		dom->next_address = 0;
-		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-					     dma_mask, 0);
-		dom->need_flush = true;
-	}
-
-	if (unlikely(address == -1))
-		address = DMA_ERROR_CODE;
-
-	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
-
-	return address;
-}
-
-/*
- * The address free function.
- *
- * called with domain->lock held
- */
-static void dma_ops_free_addresses(struct dma_ops_domain *dom,
-				   unsigned long address,
-				   unsigned int pages)
-{
-	unsigned i = address >> APERTURE_RANGE_SHIFT;
-	struct aperture_range *range = dom->aperture[i];
-
-	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
-
-#ifdef CONFIG_IOMMU_STRESS
-	if (i < 4)
-		return;
-#endif
-
-	if (address >= dom->next_address)
-		dom->need_flush = true;
-
-	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
-
-	bitmap_clear(range->bitmap, address, pages);
-
-}
-
-/****************************************************************************
- *
- * The next functions belong to the domain allocation. A domain is
- * allocated for every IOMMU as the default domain. If device isolation
- * is enabled, every device get its own domain. The most important thing
- * about domains is the page table mapping the DMA address space they
- * contain.
- *
- ****************************************************************************/
-
-/*
- * This function adds a protection domain to the global protection domain list
- */
-static void add_domain_to_list(struct protection_domain *domain)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-	list_add(&domain->list, &amd_iommu_pd_list);
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-/*
- * This function removes a protection domain to the global
- * protection domain list
- */
-static void del_domain_from_list(struct protection_domain *domain)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&amd_iommu_pd_lock, flags);
-	list_del(&domain->list);
-	spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
-}
-
-static u16 domain_id_alloc(void)
-{
-	unsigned long flags;
-	int id;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID);
-	BUG_ON(id == 0);
-	if (id > 0 && id < MAX_DOMAIN_ID)
-		__set_bit(id, amd_iommu_pd_alloc_bitmap);
-	else
-		id = 0;
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	return id;
-}
-
-static void domain_id_free(int id)
-{
-	unsigned long flags;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	if (id > 0 && id < MAX_DOMAIN_ID)
-		__clear_bit(id, amd_iommu_pd_alloc_bitmap);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void free_pagetable(struct protection_domain *domain)
-{
-	int i, j;
-	u64 *p1, *p2, *p3;
-
-	p1 = domain->pt_root;
-
-	if (!p1)
-		return;
-
-	for (i = 0; i < 512; ++i) {
-		if (!IOMMU_PTE_PRESENT(p1[i]))
-			continue;
-
-		p2 = IOMMU_PTE_PAGE(p1[i]);
-		for (j = 0; j < 512; ++j) {
-			if (!IOMMU_PTE_PRESENT(p2[j]))
-				continue;
-			p3 = IOMMU_PTE_PAGE(p2[j]);
-			free_page((unsigned long)p3);
-		}
-
-		free_page((unsigned long)p2);
-	}
-
-	free_page((unsigned long)p1);
-
-	domain->pt_root = NULL;
-}
-
-/*
- * Free a domain, only used if something went wrong in the
- * allocation path and we need to free an already allocated page table
- */
-static void dma_ops_domain_free(struct dma_ops_domain *dom)
-{
-	int i;
-
-	if (!dom)
-		return;
-
-	del_domain_from_list(&dom->domain);
-
-	free_pagetable(&dom->domain);
-
-	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
-		if (!dom->aperture[i])
-			continue;
-		free_page((unsigned long)dom->aperture[i]->bitmap);
-		kfree(dom->aperture[i]);
-	}
-
-	kfree(dom);
-}
-
-/*
- * Allocates a new protection domain usable for the dma_ops functions.
- * It also initializes the page table and the address allocator data
- * structures required for the dma_ops interface
- */
-static struct dma_ops_domain *dma_ops_domain_alloc(void)
-{
-	struct dma_ops_domain *dma_dom;
-
-	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
-	if (!dma_dom)
-		return NULL;
-
-	spin_lock_init(&dma_dom->domain.lock);
-
-	dma_dom->domain.id = domain_id_alloc();
-	if (dma_dom->domain.id == 0)
-		goto free_dma_dom;
-	INIT_LIST_HEAD(&dma_dom->domain.dev_list);
-	dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
-	dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
-	dma_dom->domain.flags = PD_DMA_OPS_MASK;
-	dma_dom->domain.priv = dma_dom;
-	if (!dma_dom->domain.pt_root)
-		goto free_dma_dom;
-
-	dma_dom->need_flush = false;
-	dma_dom->target_dev = 0xffff;
-
-	add_domain_to_list(&dma_dom->domain);
-
-	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
-		goto free_dma_dom;
-
-	/*
-	 * mark the first page as allocated so we never return 0 as
-	 * a valid dma-address. So we can use 0 as error value
-	 */
-	dma_dom->aperture[0]->bitmap[0] = 1;
-	dma_dom->next_address = 0;
-
-
-	return dma_dom;
-
-free_dma_dom:
-	dma_ops_domain_free(dma_dom);
-
-	return NULL;
-}
-
-/*
- * little helper function to check whether a given protection domain is a
- * dma_ops domain
- */
-static bool dma_ops_domain(struct protection_domain *domain)
-{
-	return domain->flags & PD_DMA_OPS_MASK;
-}
-
-static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
-{
-	u64 pte_root = virt_to_phys(domain->pt_root);
-	u32 flags = 0;
-
-	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
-		    << DEV_ENTRY_MODE_SHIFT;
-	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
-
-	if (ats)
-		flags |= DTE_FLAG_IOTLB;
-
-	amd_iommu_dev_table[devid].data[3] |= flags;
-	amd_iommu_dev_table[devid].data[2]  = domain->id;
-	amd_iommu_dev_table[devid].data[1]  = upper_32_bits(pte_root);
-	amd_iommu_dev_table[devid].data[0]  = lower_32_bits(pte_root);
-}
-
-static void clear_dte_entry(u16 devid)
-{
-	/* remove entry from the device table seen by the hardware */
-	amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
-	amd_iommu_dev_table[devid].data[1] = 0;
-	amd_iommu_dev_table[devid].data[2] = 0;
-
-	amd_iommu_apply_erratum_63(devid);
-}
-
-static void do_attach(struct device *dev, struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-	struct amd_iommu *iommu;
-	struct pci_dev *pdev;
-	bool ats = false;
-	u16 devid;
-
-	devid    = get_device_id(dev);
-	iommu    = amd_iommu_rlookup_table[devid];
-	dev_data = get_dev_data(dev);
-	pdev     = to_pci_dev(dev);
-
-	if (amd_iommu_iotlb_sup)
-		ats = pci_ats_enabled(pdev);
-
-	/* Update data structures */
-	dev_data->domain = domain;
-	list_add(&dev_data->list, &domain->dev_list);
-	set_dte_entry(devid, domain, ats);
-
-	/* Do reference counting */
-	domain->dev_iommu[iommu->index] += 1;
-	domain->dev_cnt                 += 1;
-
-	/* Flush the DTE entry */
-	device_flush_dte(dev);
-}
-
-static void do_detach(struct device *dev)
-{
-	struct iommu_dev_data *dev_data;
-	struct amd_iommu *iommu;
-	u16 devid;
-
-	devid    = get_device_id(dev);
-	iommu    = amd_iommu_rlookup_table[devid];
-	dev_data = get_dev_data(dev);
-
-	/* decrease reference counters */
-	dev_data->domain->dev_iommu[iommu->index] -= 1;
-	dev_data->domain->dev_cnt                 -= 1;
-
-	/* Update data structures */
-	dev_data->domain = NULL;
-	list_del(&dev_data->list);
-	clear_dte_entry(devid);
-
-	/* Flush the DTE entry */
-	device_flush_dte(dev);
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int __attach_device(struct device *dev,
-			   struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data, *alias_data;
-	int ret;
-
-	dev_data   = get_dev_data(dev);
-	alias_data = get_dev_data(dev_data->alias);
-
-	if (!alias_data)
-		return -EINVAL;
-
-	/* lock domain */
-	spin_lock(&domain->lock);
-
-	/* Some sanity checks */
-	ret = -EBUSY;
-	if (alias_data->domain != NULL &&
-	    alias_data->domain != domain)
-		goto out_unlock;
-
-	if (dev_data->domain != NULL &&
-	    dev_data->domain != domain)
-		goto out_unlock;
-
-	/* Do real assignment */
-	if (dev_data->alias != dev) {
-		alias_data = get_dev_data(dev_data->alias);
-		if (alias_data->domain == NULL)
-			do_attach(dev_data->alias, domain);
-
-		atomic_inc(&alias_data->bind);
-	}
-
-	if (dev_data->domain == NULL)
-		do_attach(dev, domain);
-
-	atomic_inc(&dev_data->bind);
-
-	ret = 0;
-
-out_unlock:
-
-	/* ready */
-	spin_unlock(&domain->lock);
-
-	return ret;
-}
-
-/*
- * If a device is not yet associated with a domain, this function does
- * assigns it visible for the hardware
- */
-static int attach_device(struct device *dev,
-			 struct protection_domain *domain)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	unsigned long flags;
-	int ret;
-
-	if (amd_iommu_iotlb_sup)
-		pci_enable_ats(pdev, PAGE_SHIFT);
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	ret = __attach_device(dev, domain);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	/*
-	 * We might boot into a crash-kernel here. The crashed kernel
-	 * left the caches in the IOMMU dirty. So we have to flush
-	 * here to evict all dirty stuff.
-	 */
-	domain_flush_tlb_pde(domain);
-
-	return ret;
-}
-
-/*
- * Removes a device from a protection domain (unlocked)
- */
-static void __detach_device(struct device *dev)
-{
-	struct iommu_dev_data *dev_data = get_dev_data(dev);
-	struct iommu_dev_data *alias_data;
-	struct protection_domain *domain;
-	unsigned long flags;
-
-	BUG_ON(!dev_data->domain);
-
-	domain = dev_data->domain;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	if (dev_data->alias != dev) {
-		alias_data = get_dev_data(dev_data->alias);
-		if (atomic_dec_and_test(&alias_data->bind))
-			do_detach(dev_data->alias);
-	}
-
-	if (atomic_dec_and_test(&dev_data->bind))
-		do_detach(dev);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	/*
-	 * If we run in passthrough mode the device must be assigned to the
-	 * passthrough domain if it is detached from any other domain.
-	 * Make sure we can deassign from the pt_domain itself.
-	 */
-	if (iommu_pass_through &&
-	    (dev_data->domain == NULL && domain != pt_domain))
-		__attach_device(dev, pt_domain);
-}
-
-/*
- * Removes a device from a protection domain (with devtable_lock held)
- */
-static void detach_device(struct device *dev)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	unsigned long flags;
-
-	/* lock device table */
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	__detach_device(dev);
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
-		pci_disable_ats(pdev);
-}
-
-/*
- * Find out the protection domain structure for a given PCI device. This
- * will give us the pointer to the page table root for example.
- */
-static struct protection_domain *domain_for_device(struct device *dev)
-{
-	struct protection_domain *dom;
-	struct iommu_dev_data *dev_data, *alias_data;
-	unsigned long flags;
-	u16 devid;
-
-	devid      = get_device_id(dev);
-	dev_data   = get_dev_data(dev);
-	alias_data = get_dev_data(dev_data->alias);
-	if (!alias_data)
-		return NULL;
-
-	read_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	dom = dev_data->domain;
-	if (dom == NULL &&
-	    alias_data->domain != NULL) {
-		__attach_device(dev, alias_data->domain);
-		dom = alias_data->domain;
-	}
-
-	read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-
-	return dom;
-}
-
-static int device_change_notifier(struct notifier_block *nb,
-				  unsigned long action, void *data)
-{
-	struct device *dev = data;
-	u16 devid;
-	struct protection_domain *domain;
-	struct dma_ops_domain *dma_domain;
-	struct amd_iommu *iommu;
-	unsigned long flags;
-
-	if (!check_device(dev))
-		return 0;
-
-	devid  = get_device_id(dev);
-	iommu  = amd_iommu_rlookup_table[devid];
-
-	switch (action) {
-	case BUS_NOTIFY_UNBOUND_DRIVER:
-
-		domain = domain_for_device(dev);
-
-		if (!domain)
-			goto out;
-		if (iommu_pass_through)
-			break;
-		detach_device(dev);
-		break;
-	case BUS_NOTIFY_ADD_DEVICE:
-
-		iommu_init_device(dev);
-
-		domain = domain_for_device(dev);
-
-		/* allocate a protection domain if a device is added */
-		dma_domain = find_protection_domain(devid);
-		if (dma_domain)
-			goto out;
-		dma_domain = dma_ops_domain_alloc();
-		if (!dma_domain)
-			goto out;
-		dma_domain->target_dev = devid;
-
-		spin_lock_irqsave(&iommu_pd_list_lock, flags);
-		list_add_tail(&dma_domain->list, &iommu_pd_list);
-		spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
-
-		break;
-	case BUS_NOTIFY_DEL_DEVICE:
-
-		iommu_uninit_device(dev);
-
-	default:
-		goto out;
-	}
-
-	device_flush_dte(dev);
-	iommu_completion_wait(iommu);
-
-out:
-	return 0;
-}
-
-static struct notifier_block device_nb = {
-	.notifier_call = device_change_notifier,
-};
-
-void amd_iommu_init_notifier(void)
-{
-	bus_register_notifier(&pci_bus_type, &device_nb);
-}
-
-/*****************************************************************************
- *
- * The next functions belong to the dma_ops mapping/unmapping code.
- *
- *****************************************************************************/
-
-/*
- * In the dma_ops path we only have the struct device. This function
- * finds the corresponding IOMMU, the protection domain and the
- * requestor id for a given device.
- * If the device is not yet associated with a domain this is also done
- * in this function.
- */
-static struct protection_domain *get_domain(struct device *dev)
-{
-	struct protection_domain *domain;
-	struct dma_ops_domain *dma_dom;
-	u16 devid = get_device_id(dev);
-
-	if (!check_device(dev))
-		return ERR_PTR(-EINVAL);
-
-	domain = domain_for_device(dev);
-	if (domain != NULL && !dma_ops_domain(domain))
-		return ERR_PTR(-EBUSY);
-
-	if (domain != NULL)
-		return domain;
-
-	/* Device not bount yet - bind it */
-	dma_dom = find_protection_domain(devid);
-	if (!dma_dom)
-		dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
-	attach_device(dev, &dma_dom->domain);
-	DUMP_printk("Using protection domain %d for device %s\n",
-		    dma_dom->domain.id, dev_name(dev));
-
-	return &dma_dom->domain;
-}
-
-static void update_device_table(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data;
-
-	list_for_each_entry(dev_data, &domain->dev_list, list) {
-		struct pci_dev *pdev = to_pci_dev(dev_data->dev);
-		u16 devid = get_device_id(dev_data->dev);
-		set_dte_entry(devid, domain, pci_ats_enabled(pdev));
-	}
-}
-
-static void update_domain(struct protection_domain *domain)
-{
-	if (!domain->updated)
-		return;
-
-	update_device_table(domain);
-
-	domain_flush_devices(domain);
-	domain_flush_tlb_pde(domain);
-
-	domain->updated = false;
-}
-
-/*
- * This function fetches the PTE for a given address in the aperture
- */
-static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
-			    unsigned long address)
-{
-	struct aperture_range *aperture;
-	u64 *pte, *pte_page;
-
-	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
-	if (!aperture)
-		return NULL;
-
-	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
-	if (!pte) {
-		pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
-				GFP_ATOMIC);
-		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
-	} else
-		pte += PM_LEVEL_INDEX(0, address);
-
-	update_domain(&dom->domain);
-
-	return pte;
-}
-
-/*
- * This is the generic map function. It maps one 4kb page at paddr to
- * the given address in the DMA address space for the domain.
- */
-static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
-				     unsigned long address,
-				     phys_addr_t paddr,
-				     int direction)
-{
-	u64 *pte, __pte;
-
-	WARN_ON(address > dom->aperture_size);
-
-	paddr &= PAGE_MASK;
-
-	pte  = dma_ops_get_pte(dom, address);
-	if (!pte)
-		return DMA_ERROR_CODE;
-
-	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
-
-	if (direction == DMA_TO_DEVICE)
-		__pte |= IOMMU_PTE_IR;
-	else if (direction == DMA_FROM_DEVICE)
-		__pte |= IOMMU_PTE_IW;
-	else if (direction == DMA_BIDIRECTIONAL)
-		__pte |= IOMMU_PTE_IR | IOMMU_PTE_IW;
-
-	WARN_ON(*pte);
-
-	*pte = __pte;
-
-	return (dma_addr_t)address;
-}
-
-/*
- * The generic unmapping function for on page in the DMA address space.
- */
-static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
-				 unsigned long address)
-{
-	struct aperture_range *aperture;
-	u64 *pte;
-
-	if (address >= dom->aperture_size)
-		return;
-
-	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
-	if (!aperture)
-		return;
-
-	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
-	if (!pte)
-		return;
-
-	pte += PM_LEVEL_INDEX(0, address);
-
-	WARN_ON(!*pte);
-
-	*pte = 0ULL;
-}
-
-/*
- * This function contains common code for mapping of a physically
- * contiguous memory region into DMA address space. It is used by all
- * mapping functions provided with this IOMMU driver.
- * Must be called with the domain lock held.
- */
-static dma_addr_t __map_single(struct device *dev,
-			       struct dma_ops_domain *dma_dom,
-			       phys_addr_t paddr,
-			       size_t size,
-			       int dir,
-			       bool align,
-			       u64 dma_mask)
-{
-	dma_addr_t offset = paddr & ~PAGE_MASK;
-	dma_addr_t address, start, ret;
-	unsigned int pages;
-	unsigned long align_mask = 0;
-	int i;
-
-	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
-	paddr &= PAGE_MASK;
-
-	INC_STATS_COUNTER(total_map_requests);
-
-	if (pages > 1)
-		INC_STATS_COUNTER(cross_page);
-
-	if (align)
-		align_mask = (1UL << get_order(size)) - 1;
-
-retry:
-	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
-					  dma_mask);
-	if (unlikely(address == DMA_ERROR_CODE)) {
-		/*
-		 * setting next_address here will let the address
-		 * allocator only scan the new allocated range in the
-		 * first run. This is a small optimization.
-		 */
-		dma_dom->next_address = dma_dom->aperture_size;
-
-		if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
-			goto out;
-
-		/*
-		 * aperture was successfully enlarged by 128 MB, try
-		 * allocation again
-		 */
-		goto retry;
-	}
-
-	start = address;
-	for (i = 0; i < pages; ++i) {
-		ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
-		if (ret == DMA_ERROR_CODE)
-			goto out_unmap;
-
-		paddr += PAGE_SIZE;
-		start += PAGE_SIZE;
-	}
-	address += offset;
-
-	ADD_STATS_COUNTER(alloced_io_mem, size);
-
-	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
-		domain_flush_tlb(&dma_dom->domain);
-		dma_dom->need_flush = false;
-	} else if (unlikely(amd_iommu_np_cache))
-		domain_flush_pages(&dma_dom->domain, address, size);
-
-out:
-	return address;
-
-out_unmap:
-
-	for (--i; i >= 0; --i) {
-		start -= PAGE_SIZE;
-		dma_ops_domain_unmap(dma_dom, start);
-	}
-
-	dma_ops_free_addresses(dma_dom, address, pages);
-
-	return DMA_ERROR_CODE;
-}
-
-/*
- * Does the reverse of the __map_single function. Must be called with
- * the domain lock held too
- */
-static void __unmap_single(struct dma_ops_domain *dma_dom,
-			   dma_addr_t dma_addr,
-			   size_t size,
-			   int dir)
-{
-	dma_addr_t flush_addr;
-	dma_addr_t i, start;
-	unsigned int pages;
-
-	if ((dma_addr == DMA_ERROR_CODE) ||
-	    (dma_addr + size > dma_dom->aperture_size))
-		return;
-
-	flush_addr = dma_addr;
-	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
-	dma_addr &= PAGE_MASK;
-	start = dma_addr;
-
-	for (i = 0; i < pages; ++i) {
-		dma_ops_domain_unmap(dma_dom, start);
-		start += PAGE_SIZE;
-	}
-
-	SUB_STATS_COUNTER(alloced_io_mem, size);
-
-	dma_ops_free_addresses(dma_dom, dma_addr, pages);
-
-	if (amd_iommu_unmap_flush || dma_dom->need_flush) {
-		domain_flush_pages(&dma_dom->domain, flush_addr, size);
-		dma_dom->need_flush = false;
-	}
-}
-
-/*
- * The exported map_single function for dma_ops.
- */
-static dma_addr_t map_page(struct device *dev, struct page *page,
-			   unsigned long offset, size_t size,
-			   enum dma_data_direction dir,
-			   struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-	dma_addr_t addr;
-	u64 dma_mask;
-	phys_addr_t paddr = page_to_phys(page) + offset;
-
-	INC_STATS_COUNTER(cnt_map_single);
-
-	domain = get_domain(dev);
-	if (PTR_ERR(domain) == -EINVAL)
-		return (dma_addr_t)paddr;
-	else if (IS_ERR(domain))
-		return DMA_ERROR_CODE;
-
-	dma_mask = *dev->dma_mask;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	addr = __map_single(dev, domain->priv, paddr, size, dir, false,
-			    dma_mask);
-	if (addr == DMA_ERROR_CODE)
-		goto out;
-
-	domain_flush_complete(domain);
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return addr;
-}
-
-/*
- * The exported unmap_single function for dma_ops.
- */
-static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
-		       enum dma_data_direction dir, struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-
-	INC_STATS_COUNTER(cnt_unmap_single);
-
-	domain = get_domain(dev);
-	if (IS_ERR(domain))
-		return;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	__unmap_single(domain->priv, dma_addr, size, dir);
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * This is a special map_sg function which is used if we should map a
- * device which is not handled by an AMD IOMMU in the system.
- */
-static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
-			   int nelems, int dir)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sglist, s, nelems, i) {
-		s->dma_address = (dma_addr_t)sg_phys(s);
-		s->dma_length  = s->length;
-	}
-
-	return nelems;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static int map_sg(struct device *dev, struct scatterlist *sglist,
-		  int nelems, enum dma_data_direction dir,
-		  struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-	int i;
-	struct scatterlist *s;
-	phys_addr_t paddr;
-	int mapped_elems = 0;
-	u64 dma_mask;
-
-	INC_STATS_COUNTER(cnt_map_sg);
-
-	domain = get_domain(dev);
-	if (PTR_ERR(domain) == -EINVAL)
-		return map_sg_no_iommu(dev, sglist, nelems, dir);
-	else if (IS_ERR(domain))
-		return 0;
-
-	dma_mask = *dev->dma_mask;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	for_each_sg(sglist, s, nelems, i) {
-		paddr = sg_phys(s);
-
-		s->dma_address = __map_single(dev, domain->priv,
-					      paddr, s->length, dir, false,
-					      dma_mask);
-
-		if (s->dma_address) {
-			s->dma_length = s->length;
-			mapped_elems++;
-		} else
-			goto unmap;
-	}
-
-	domain_flush_complete(domain);
-
-out:
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return mapped_elems;
-unmap:
-	for_each_sg(sglist, s, mapped_elems, i) {
-		if (s->dma_address)
-			__unmap_single(domain->priv, s->dma_address,
-				       s->dma_length, dir);
-		s->dma_address = s->dma_length = 0;
-	}
-
-	mapped_elems = 0;
-
-	goto out;
-}
-
-/*
- * The exported map_sg function for dma_ops (handles scatter-gather
- * lists).
- */
-static void unmap_sg(struct device *dev, struct scatterlist *sglist,
-		     int nelems, enum dma_data_direction dir,
-		     struct dma_attrs *attrs)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-	struct scatterlist *s;
-	int i;
-
-	INC_STATS_COUNTER(cnt_unmap_sg);
-
-	domain = get_domain(dev);
-	if (IS_ERR(domain))
-		return;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	for_each_sg(sglist, s, nelems, i) {
-		__unmap_single(domain->priv, s->dma_address,
-			       s->dma_length, dir);
-		s->dma_address = s->dma_length = 0;
-	}
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-}
-
-/*
- * The exported alloc_coherent function for dma_ops.
- */
-static void *alloc_coherent(struct device *dev, size_t size,
-			    dma_addr_t *dma_addr, gfp_t flag)
-{
-	unsigned long flags;
-	void *virt_addr;
-	struct protection_domain *domain;
-	phys_addr_t paddr;
-	u64 dma_mask = dev->coherent_dma_mask;
-
-	INC_STATS_COUNTER(cnt_alloc_coherent);
-
-	domain = get_domain(dev);
-	if (PTR_ERR(domain) == -EINVAL) {
-		virt_addr = (void *)__get_free_pages(flag, get_order(size));
-		*dma_addr = __pa(virt_addr);
-		return virt_addr;
-	} else if (IS_ERR(domain))
-		return NULL;
-
-	dma_mask  = dev->coherent_dma_mask;
-	flag     &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-	flag     |= __GFP_ZERO;
-
-	virt_addr = (void *)__get_free_pages(flag, get_order(size));
-	if (!virt_addr)
-		return NULL;
-
-	paddr = virt_to_phys(virt_addr);
-
-	if (!dma_mask)
-		dma_mask = *dev->dma_mask;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	*dma_addr = __map_single(dev, domain->priv, paddr,
-				 size, DMA_BIDIRECTIONAL, true, dma_mask);
-
-	if (*dma_addr == DMA_ERROR_CODE) {
-		spin_unlock_irqrestore(&domain->lock, flags);
-		goto out_free;
-	}
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-	return virt_addr;
-
-out_free:
-
-	free_pages((unsigned long)virt_addr, get_order(size));
-
-	return NULL;
-}
-
-/*
- * The exported free_coherent function for dma_ops.
- */
-static void free_coherent(struct device *dev, size_t size,
-			  void *virt_addr, dma_addr_t dma_addr)
-{
-	unsigned long flags;
-	struct protection_domain *domain;
-
-	INC_STATS_COUNTER(cnt_free_coherent);
-
-	domain = get_domain(dev);
-	if (IS_ERR(domain))
-		goto free_mem;
-
-	spin_lock_irqsave(&domain->lock, flags);
-
-	__unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-
-	domain_flush_complete(domain);
-
-	spin_unlock_irqrestore(&domain->lock, flags);
-
-free_mem:
-	free_pages((unsigned long)virt_addr, get_order(size));
-}
-
-/*
- * This function is called by the DMA layer to find out if we can handle a
- * particular device. It is part of the dma_ops.
- */
-static int amd_iommu_dma_supported(struct device *dev, u64 mask)
-{
-	return check_device(dev);
-}
-
-/*
- * The function for pre-allocating protection domains.
- *
- * If the driver core informs the DMA layer if a driver grabs a device
- * we don't need to preallocate the protection domains anymore.
- * For now we have to.
- */
-static void prealloc_protection_domains(void)
-{
-	struct pci_dev *dev = NULL;
-	struct dma_ops_domain *dma_dom;
-	u16 devid;
-
-	for_each_pci_dev(dev) {
-
-		/* Do we handle this device? */
-		if (!check_device(&dev->dev))
-			continue;
-
-		/* Is there already any domain for it? */
-		if (domain_for_device(&dev->dev))
-			continue;
-
-		devid = get_device_id(&dev->dev);
-
-		dma_dom = dma_ops_domain_alloc();
-		if (!dma_dom)
-			continue;
-		init_unity_mappings_for_device(dma_dom, devid);
-		dma_dom->target_dev = devid;
-
-		attach_device(&dev->dev, &dma_dom->domain);
-
-		list_add_tail(&dma_dom->list, &iommu_pd_list);
-	}
-}
-
-static struct dma_map_ops amd_iommu_dma_ops = {
-	.alloc_coherent = alloc_coherent,
-	.free_coherent = free_coherent,
-	.map_page = map_page,
-	.unmap_page = unmap_page,
-	.map_sg = map_sg,
-	.unmap_sg = unmap_sg,
-	.dma_supported = amd_iommu_dma_supported,
-};
-
-static unsigned device_dma_ops_init(void)
-{
-	struct pci_dev *pdev = NULL;
-	unsigned unhandled = 0;
-
-	for_each_pci_dev(pdev) {
-		if (!check_device(&pdev->dev)) {
-			unhandled += 1;
-			continue;
-		}
-
-		pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
-	}
-
-	return unhandled;
-}
-
-/*
- * The function which clues the AMD IOMMU driver into dma_ops.
- */
-
-void __init amd_iommu_init_api(void)
-{
-	register_iommu(&amd_iommu_ops);
-}
-
-int __init amd_iommu_init_dma_ops(void)
-{
-	struct amd_iommu *iommu;
-	int ret, unhandled;
-
-	/*
-	 * first allocate a default protection domain for every IOMMU we
-	 * found in the system. Devices not assigned to any other
-	 * protection domain will be assigned to the default one.
-	 */
-	for_each_iommu(iommu) {
-		iommu->default_dom = dma_ops_domain_alloc();
-		if (iommu->default_dom == NULL)
-			return -ENOMEM;
-		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
-		ret = iommu_init_unity_mappings(iommu);
-		if (ret)
-			goto free_domains;
-	}
-
-	/*
-	 * Pre-allocate the protection domains for each device.
-	 */
-	prealloc_protection_domains();
-
-	iommu_detected = 1;
-	swiotlb = 0;
-
-	/* Make the driver finally visible to the drivers */
-	unhandled = device_dma_ops_init();
-	if (unhandled && max_pfn > MAX_DMA32_PFN) {
-		/* There are unhandled devices - initialize swiotlb for them */
-		swiotlb = 1;
-	}
-
-	amd_iommu_stats_init();
-
-	return 0;
-
-free_domains:
-
-	for_each_iommu(iommu) {
-		if (iommu->default_dom)
-			dma_ops_domain_free(iommu->default_dom);
-	}
-
-	return ret;
-}
-
-/*****************************************************************************
- *
- * The following functions belong to the exported interface of AMD IOMMU
- *
- * This interface allows access to lower level functions of the IOMMU
- * like protection domain handling and assignement of devices to domains
- * which is not possible with the dma_ops interface.
- *
- *****************************************************************************/
-
-static void cleanup_domain(struct protection_domain *domain)
-{
-	struct iommu_dev_data *dev_data, *next;
-	unsigned long flags;
-
-	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-
-	list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
-		struct device *dev = dev_data->dev;
-
-		__detach_device(dev);
-		atomic_set(&dev_data->bind, 0);
-	}
-
-	write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
-}
-
-static void protection_domain_free(struct protection_domain *domain)
-{
-	if (!domain)
-		return;
-
-	del_domain_from_list(domain);
-
-	if (domain->id)
-		domain_id_free(domain->id);
-
-	kfree(domain);
-}
-
-static struct protection_domain *protection_domain_alloc(void)
-{
-	struct protection_domain *domain;
-
-	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
-	if (!domain)
-		return NULL;
-
-	spin_lock_init(&domain->lock);
-	mutex_init(&domain->api_lock);
-	domain->id = domain_id_alloc();
-	if (!domain->id)
-		goto out_err;
-	INIT_LIST_HEAD(&domain->dev_list);
-
-	add_domain_to_list(domain);
-
-	return domain;
-
-out_err:
-	kfree(domain);
-
-	return NULL;
-}
-
-static int amd_iommu_domain_init(struct iommu_domain *dom)
-{
-	struct protection_domain *domain;
-
-	domain = protection_domain_alloc();
-	if (!domain)
-		goto out_free;
-
-	domain->mode    = PAGE_MODE_3_LEVEL;
-	domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!domain->pt_root)
-		goto out_free;
-
-	dom->priv = domain;
-
-	return 0;
-
-out_free:
-	protection_domain_free(domain);
-
-	return -ENOMEM;
-}
-
-static void amd_iommu_domain_destroy(struct iommu_domain *dom)
-{
-	struct protection_domain *domain = dom->priv;
-
-	if (!domain)
-		return;
-
-	if (domain->dev_cnt > 0)
-		cleanup_domain(domain);
-
-	BUG_ON(domain->dev_cnt != 0);
-
-	free_pagetable(domain);
-
-	protection_domain_free(domain);
-
-	dom->priv = NULL;
-}
-
-static void amd_iommu_detach_device(struct iommu_domain *dom,
-				    struct device *dev)
-{
-	struct iommu_dev_data *dev_data = dev->archdata.iommu;
-	struct amd_iommu *iommu;
-	u16 devid;
-
-	if (!check_device(dev))
-		return;
-
-	devid = get_device_id(dev);
-
-	if (dev_data->domain != NULL)
-		detach_device(dev);
-
-	iommu = amd_iommu_rlookup_table[devid];
-	if (!iommu)
-		return;
-
-	device_flush_dte(dev);
-	iommu_completion_wait(iommu);
-}
-
-static int amd_iommu_attach_device(struct iommu_domain *dom,
-				   struct device *dev)
-{
-	struct protection_domain *domain = dom->priv;
-	struct iommu_dev_data *dev_data;
-	struct amd_iommu *iommu;
-	int ret;
-	u16 devid;
-
-	if (!check_device(dev))
-		return -EINVAL;
-
-	dev_data = dev->archdata.iommu;
-
-	devid = get_device_id(dev);
-
-	iommu = amd_iommu_rlookup_table[devid];
-	if (!iommu)
-		return -EINVAL;
-
-	if (dev_data->domain)
-		detach_device(dev);
-
-	ret = attach_device(dev, domain);
-
-	iommu_completion_wait(iommu);
-
-	return ret;
-}
-
-static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
-			 phys_addr_t paddr, int gfp_order, int iommu_prot)
-{
-	unsigned long page_size = 0x1000UL << gfp_order;
-	struct protection_domain *domain = dom->priv;
-	int prot = 0;
-	int ret;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= IOMMU_PROT_IR;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= IOMMU_PROT_IW;
-
-	mutex_lock(&domain->api_lock);
-	ret = iommu_map_page(domain, iova, paddr, prot, page_size);
-	mutex_unlock(&domain->api_lock);
-
-	return ret;
-}
-
-static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
-			   int gfp_order)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long page_size, unmap_size;
-
-	page_size  = 0x1000UL << gfp_order;
-
-	mutex_lock(&domain->api_lock);
-	unmap_size = iommu_unmap_page(domain, iova, page_size);
-	mutex_unlock(&domain->api_lock);
-
-	domain_flush_tlb_pde(domain);
-
-	return get_order(unmap_size);
-}
-
-static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
-					  unsigned long iova)
-{
-	struct protection_domain *domain = dom->priv;
-	unsigned long offset_mask;
-	phys_addr_t paddr;
-	u64 *pte, __pte;
-
-	pte = fetch_pte(domain, iova);
-
-	if (!pte || !IOMMU_PTE_PRESENT(*pte))
-		return 0;
-
-	if (PM_PTE_LEVEL(*pte) == 0)
-		offset_mask = PAGE_SIZE - 1;
-	else
-		offset_mask = PTE_PAGE_SIZE(*pte) - 1;
-
-	__pte = *pte & PM_ADDR_MASK;
-	paddr = (__pte & ~offset_mask) | (iova & offset_mask);
-
-	return paddr;
-}
-
-static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
-				    unsigned long cap)
-{
-	switch (cap) {
-	case IOMMU_CAP_CACHE_COHERENCY:
-		return 1;
-	}
-
-	return 0;
-}
-
-static struct iommu_ops amd_iommu_ops = {
-	.domain_init = amd_iommu_domain_init,
-	.domain_destroy = amd_iommu_domain_destroy,
-	.attach_dev = amd_iommu_attach_device,
-	.detach_dev = amd_iommu_detach_device,
-	.map = amd_iommu_map,
-	.unmap = amd_iommu_unmap,
-	.iova_to_phys = amd_iommu_iova_to_phys,
-	.domain_has_cap = amd_iommu_domain_has_cap,
-};
-
-/*****************************************************************************
- *
- * The next functions do a basic initialization of IOMMU for pass through
- * mode
- *
- * In passthrough mode the IOMMU is initialized and enabled but not used for
- * DMA-API translation.
- *
- *****************************************************************************/
-
-int __init amd_iommu_init_passthrough(void)
-{
-	struct amd_iommu *iommu;
-	struct pci_dev *dev = NULL;
-	u16 devid;
-
-	/* allocate passthrough domain */
-	pt_domain = protection_domain_alloc();
-	if (!pt_domain)
-		return -ENOMEM;
-
-	pt_domain->mode |= PAGE_MODE_NONE;
-
-	for_each_pci_dev(dev) {
-		if (!check_device(&dev->dev))
-			continue;
-
-		devid = get_device_id(&dev->dev);
-
-		iommu = amd_iommu_rlookup_table[devid];
-		if (!iommu)
-			continue;
-
-		attach_device(&dev->dev, pt_domain);
-	}
-
-	pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
-
-	return 0;
-}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
deleted file mode 100644
index bfc8453bd98..00000000000
--- a/arch/x86/kernel/amd_iommu_init.c
+++ /dev/null
@@ -1,1572 +0,0 @@
-/*
- * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
- * Author: Joerg Roedel <joerg.roedel@amd.com>
- *         Leo Duran <leo.duran@amd.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
- */
-
-#include <linux/pci.h>
-#include <linux/acpi.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/syscore_ops.h>
-#include <linux/interrupt.h>
-#include <linux/msi.h>
-#include <asm/pci-direct.h>
-#include <asm/amd_iommu_proto.h>
-#include <asm/amd_iommu_types.h>
-#include <asm/amd_iommu.h>
-#include <asm/iommu.h>
-#include <asm/gart.h>
-#include <asm/x86_init.h>
-#include <asm/iommu_table.h>
-/*
- * definitions for the ACPI scanning code
- */
-#define IVRS_HEADER_LENGTH 48
-
-#define ACPI_IVHD_TYPE                  0x10
-#define ACPI_IVMD_TYPE_ALL              0x20
-#define ACPI_IVMD_TYPE                  0x21
-#define ACPI_IVMD_TYPE_RANGE            0x22
-
-#define IVHD_DEV_ALL                    0x01
-#define IVHD_DEV_SELECT                 0x02
-#define IVHD_DEV_SELECT_RANGE_START     0x03
-#define IVHD_DEV_RANGE_END              0x04
-#define IVHD_DEV_ALIAS                  0x42
-#define IVHD_DEV_ALIAS_RANGE            0x43
-#define IVHD_DEV_EXT_SELECT             0x46
-#define IVHD_DEV_EXT_SELECT_RANGE       0x47
-
-#define IVHD_FLAG_HT_TUN_EN_MASK        0x01
-#define IVHD_FLAG_PASSPW_EN_MASK        0x02
-#define IVHD_FLAG_RESPASSPW_EN_MASK     0x04
-#define IVHD_FLAG_ISOC_EN_MASK          0x08
-
-#define IVMD_FLAG_EXCL_RANGE            0x08
-#define IVMD_FLAG_UNITY_MAP             0x01
-
-#define ACPI_DEVFLAG_INITPASS           0x01
-#define ACPI_DEVFLAG_EXTINT             0x02
-#define ACPI_DEVFLAG_NMI                0x04
-#define ACPI_DEVFLAG_SYSMGT1            0x10
-#define ACPI_DEVFLAG_SYSMGT2            0x20
-#define ACPI_DEVFLAG_LINT0              0x40
-#define ACPI_DEVFLAG_LINT1              0x80
-#define ACPI_DEVFLAG_ATSDIS             0x10000000
-
-/*
- * ACPI table definitions
- *
- * These data structures are laid over the table to parse the important values
- * out of it.
- */
-
-/*
- * structure describing one IOMMU in the ACPI table. Typically followed by one
- * or more ivhd_entrys.
- */
-struct ivhd_header {
-	u8 type;
-	u8 flags;
-	u16 length;
-	u16 devid;
-	u16 cap_ptr;
-	u64 mmio_phys;
-	u16 pci_seg;
-	u16 info;
-	u32 reserved;
-} __attribute__((packed));
-
-/*
- * A device entry describing which devices a specific IOMMU translates and
- * which requestor ids they use.
- */
-struct ivhd_entry {
-	u8 type;
-	u16 devid;
-	u8 flags;
-	u32 ext;
-} __attribute__((packed));
-
-/*
- * An AMD IOMMU memory definition structure. It defines things like exclusion
- * ranges for devices and regions that should be unity mapped.
- */
-struct ivmd_header {
-	u8 type;
-	u8 flags;
-	u16 length;
-	u16 devid;
-	u16 aux;
-	u64 resv;
-	u64 range_start;
-	u64 range_length;
-} __attribute__((packed));
-
-bool amd_iommu_dump;
-
-static int __initdata amd_iommu_detected;
-static bool __initdata amd_iommu_disabled;
-
-u16 amd_iommu_last_bdf;			/* largest PCI device id we have
-					   to handle */
-LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
-					   we find in ACPI */
-bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
-
-LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
-					   system */
-
-/* Array to assign indices to IOMMUs*/
-struct amd_iommu *amd_iommus[MAX_IOMMUS];
-int amd_iommus_present;
-
-/* IOMMUs have a non-present cache? */
-bool amd_iommu_np_cache __read_mostly;
-bool amd_iommu_iotlb_sup __read_mostly = true;
-
-/*
- * The ACPI table parsing functions set this variable on an error
- */
-static int __initdata amd_iommu_init_err;
-
-/*
- * List of protection domains - used during resume
- */
-LIST_HEAD(amd_iommu_pd_list);
-spinlock_t amd_iommu_pd_lock;
-
-/*
- * Pointer to the device table which is shared by all AMD IOMMUs
- * it is indexed by the PCI device id or the HT unit id and contains
- * information about the domain the device belongs to as well as the
- * page table root pointer.
- */
-struct dev_table_entry *amd_iommu_dev_table;
-
-/*
- * The alias table is a driver specific data structure which contains the
- * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
- * More than one device can share the same requestor id.
- */
-u16 *amd_iommu_alias_table;
-
-/*
- * The rlookup table is used to find the IOMMU which is responsible
- * for a specific device. It is also indexed by the PCI device id.
- */
-struct amd_iommu **amd_iommu_rlookup_table;
-
-/*
- * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
- * to know which ones are already in use.
- */
-unsigned long *amd_iommu_pd_alloc_bitmap;
-
-static u32 dev_table_size;	/* size of the device table */
-static u32 alias_table_size;	/* size of the alias table */
-static u32 rlookup_table_size;	/* size if the rlookup table */
-
-/*
- * This function flushes all internal caches of
- * the IOMMU used by this driver.
- */
-extern void iommu_flush_all_caches(struct amd_iommu *iommu);
-
-static inline void update_last_devid(u16 devid)
-{
-	if (devid > amd_iommu_last_bdf)
-		amd_iommu_last_bdf = devid;
-}
-
-static inline unsigned long tbl_size(int entry_size)
-{
-	unsigned shift = PAGE_SHIFT +
-			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
-
-	return 1UL << shift;
-}
-
-/* Access to l1 and l2 indexed register spaces */
-
-static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
-{
-	u32 val;
-
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-	pci_read_config_dword(iommu->dev, 0xfc, &val);
-	return val;
-}
-
-static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
-{
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
-	pci_write_config_dword(iommu->dev, 0xfc, val);
-	pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
-}
-
-static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
-{
-	u32 val;
-
-	pci_write_config_dword(iommu->dev, 0xf0, address);
-	pci_read_config_dword(iommu->dev, 0xf4, &val);
-	return val;
-}
-
-static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
-{
-	pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
-	pci_write_config_dword(iommu->dev, 0xf4, val);
-}
-
-/****************************************************************************
- *
- * AMD IOMMU MMIO register space handling functions
- *
- * These functions are used to program the IOMMU device registers in
- * MMIO space required for that driver.
- *
- ****************************************************************************/
-
-/*
- * This function set the exclusion range in the IOMMU. DMA accesses to the
- * exclusion range are passed through untranslated
- */
-static void iommu_set_exclusion_range(struct amd_iommu *iommu)
-{
-	u64 start = iommu->exclusion_start & PAGE_MASK;
-	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
-	u64 entry;
-
-	if (!iommu->exclusion_start)
-		return;
-
-	entry = start | MMIO_EXCL_ENABLE_MASK;
-	memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET,
-			&entry, sizeof(entry));
-
-	entry = limit;
-	memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET,
-			&entry, sizeof(entry));
-}
-
-/* Programs the physical address of the device table into the IOMMU hardware */
-static void __init iommu_set_device_table(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->mmio_base == NULL);
-
-	entry = virt_to_phys(amd_iommu_dev_table);
-	entry |= (dev_table_size >> 12) - 1;
-	memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET,
-			&entry, sizeof(entry));
-}
-
-/* Generic functions to enable/disable certain features of the IOMMU. */
-static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
-{
-	u32 ctrl;
-
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
-	ctrl |= (1 << bit);
-	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
-{
-	u32 ctrl;
-
-	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
-	ctrl &= ~(1 << bit);
-	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
-}
-
-/* Function to enable the hardware */
-static void iommu_enable(struct amd_iommu *iommu)
-{
-	static const char * const feat_str[] = {
-		"PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
-		"IA", "GA", "HE", "PC", NULL
-	};
-	int i;
-
-	printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
-	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
-
-	if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
-		printk(KERN_CONT " extended features: ");
-		for (i = 0; feat_str[i]; ++i)
-			if (iommu_feature(iommu, (1ULL << i)))
-				printk(KERN_CONT " %s", feat_str[i]);
-	}
-	printk(KERN_CONT "\n");
-
-	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
-}
-
-static void iommu_disable(struct amd_iommu *iommu)
-{
-	/* Disable command buffer */
-	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
-	/* Disable event logging and event interrupts */
-	iommu_feature_disable(iommu, CONTROL_EVT_INT_EN);
-	iommu_feature_disable(iommu, CONTROL_EVT_LOG_EN);
-
-	/* Disable IOMMU hardware itself */
-	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
-}
-
-/*
- * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
- * the system has one.
- */
-static u8 * __init iommu_map_mmio_space(u64 address)
-{
-	u8 *ret;
-
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
-		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
-			address);
-		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
-		return NULL;
-	}
-
-	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
-	if (ret != NULL)
-		return ret;
-
-	release_mem_region(address, MMIO_REGION_LENGTH);
-
-	return NULL;
-}
-
-static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
-{
-	if (iommu->mmio_base)
-		iounmap(iommu->mmio_base);
-	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
-}
-
-/****************************************************************************
- *
- * The functions below belong to the first pass of AMD IOMMU ACPI table
- * parsing. In this pass we try to find out the highest device id this
- * code has to handle. Upon this information the size of the shared data
- * structures is determined later.
- *
- ****************************************************************************/
-
-/*
- * This function calculates the length of a given IVHD entry
- */
-static inline int ivhd_entry_length(u8 *ivhd)
-{
-	return 0x04 << (*ivhd >> 6);
-}
-
-/*
- * This function reads the last device id the IOMMU has to handle from the PCI
- * capability header for this IOMMU
- */
-static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
-{
-	u32 cap;
-
-	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
-
-	return 0;
-}
-
-/*
- * After reading the highest device id from the IOMMU PCI capability header
- * this function looks if there is a higher device id defined in the ACPI table
- */
-static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
-{
-	u8 *p = (void *)h, *end = (void *)h;
-	struct ivhd_entry *dev;
-
-	p += sizeof(*h);
-	end += h->length;
-
-	find_last_devid_on_pci(PCI_BUS(h->devid),
-			PCI_SLOT(h->devid),
-			PCI_FUNC(h->devid),
-			h->cap_ptr);
-
-	while (p < end) {
-		dev = (struct ivhd_entry *)p;
-		switch (dev->type) {
-		case IVHD_DEV_SELECT:
-		case IVHD_DEV_RANGE_END:
-		case IVHD_DEV_ALIAS:
-		case IVHD_DEV_EXT_SELECT:
-			/* all the above subfield types refer to device ids */
-			update_last_devid(dev->devid);
-			break;
-		default:
-			break;
-		}
-		p += ivhd_entry_length(p);
-	}
-
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/*
- * Iterate over all IVHD entries in the ACPI table and find the highest device
- * id which we need to handle. This is the first of three functions which parse
- * the ACPI table. So we check the checksum here.
- */
-static int __init find_last_devid_acpi(struct acpi_table_header *table)
-{
-	int i;
-	u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table;
-	struct ivhd_header *h;
-
-	/*
-	 * Validate checksum here so we don't need to do it when
-	 * we actually parse the table
-	 */
-	for (i = 0; i < table->length; ++i)
-		checksum += p[i];
-	if (checksum != 0) {
-		/* ACPI table corrupt */
-		amd_iommu_init_err = -ENODEV;
-		return 0;
-	}
-
-	p += IVRS_HEADER_LENGTH;
-
-	end += table->length;
-	while (p < end) {
-		h = (struct ivhd_header *)p;
-		switch (h->type) {
-		case ACPI_IVHD_TYPE:
-			find_last_devid_from_ivhd(h);
-			break;
-		default:
-			break;
-		}
-		p += h->length;
-	}
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The following functions belong the the code path which parses the ACPI table
- * the second time. In this ACPI parsing iteration we allocate IOMMU specific
- * data structures, initialize the device/alias/rlookup table and also
- * basically initialize the hardware.
- *
- ****************************************************************************/
-
-/*
- * Allocates the command buffer. This buffer is per AMD IOMMU. We can
- * write commands to that buffer later and the IOMMU will execute them
- * asynchronously
- */
-static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
-{
-	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-			get_order(CMD_BUFFER_SIZE));
-
-	if (cmd_buf == NULL)
-		return NULL;
-
-	iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
-
-	return cmd_buf;
-}
-
-/*
- * This function resets the command buffer if the IOMMU stopped fetching
- * commands from it.
- */
-void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu)
-{
-	iommu_feature_disable(iommu, CONTROL_CMDBUF_EN);
-
-	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-}
-
-/*
- * This function writes the command buffer address to the hardware and
- * enables it.
- */
-static void iommu_enable_command_buffer(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->cmd_buf == NULL);
-
-	entry = (u64)virt_to_phys(iommu->cmd_buf);
-	entry |= MMIO_CMD_SIZE_512;
-
-	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-		    &entry, sizeof(entry));
-
-	amd_iommu_reset_cmd_buffer(iommu);
-	iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
-}
-
-static void __init free_command_buffer(struct amd_iommu *iommu)
-{
-	free_pages((unsigned long)iommu->cmd_buf,
-		   get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
-}
-
-/* allocates the memory where the IOMMU will log its events to */
-static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
-{
-	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-						get_order(EVT_BUFFER_SIZE));
-
-	if (iommu->evt_buf == NULL)
-		return NULL;
-
-	iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-	return iommu->evt_buf;
-}
-
-static void iommu_enable_event_buffer(struct amd_iommu *iommu)
-{
-	u64 entry;
-
-	BUG_ON(iommu->evt_buf == NULL);
-
-	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
-
-	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
-		    &entry, sizeof(entry));
-
-	/* set head and tail to zero manually */
-	writel(0x00, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
-	writel(0x00, iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
-
-	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-}
-
-static void __init free_event_buffer(struct amd_iommu *iommu)
-{
-	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
-}
-
-/* sets a specific bit in the device table entry. */
-static void set_dev_entry_bit(u16 devid, u8 bit)
-{
-	int i = (bit >> 5) & 0x07;
-	int _bit = bit & 0x1f;
-
-	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
-}
-
-static int get_dev_entry_bit(u16 devid, u8 bit)
-{
-	int i = (bit >> 5) & 0x07;
-	int _bit = bit & 0x1f;
-
-	return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit;
-}
-
-
-void amd_iommu_apply_erratum_63(u16 devid)
-{
-	int sysmgt;
-
-	sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) |
-		 (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1);
-
-	if (sysmgt == 0x01)
-		set_dev_entry_bit(devid, DEV_ENTRY_IW);
-}
-
-/* Writes the specific IOMMU for a device into the rlookup table */
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-	amd_iommu_rlookup_table[devid] = iommu;
-}
-
-/*
- * This function takes the device specific flags read from the ACPI
- * table and sets up the device table entry with that information
- */
-static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
-					   u16 devid, u32 flags, u32 ext_flags)
-{
-	if (flags & ACPI_DEVFLAG_INITPASS)
-		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
-	if (flags & ACPI_DEVFLAG_EXTINT)
-		set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS);
-	if (flags & ACPI_DEVFLAG_NMI)
-		set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS);
-	if (flags & ACPI_DEVFLAG_SYSMGT1)
-		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1);
-	if (flags & ACPI_DEVFLAG_SYSMGT2)
-		set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2);
-	if (flags & ACPI_DEVFLAG_LINT0)
-		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
-	if (flags & ACPI_DEVFLAG_LINT1)
-		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-
-	amd_iommu_apply_erratum_63(devid);
-
-	set_iommu_for_device(iommu, devid);
-}
-
-/*
- * Reads the device exclusion range from ACPI and initialize IOMMU with
- * it
- */
-static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
-{
-	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
-
-	if (!(m->flags & IVMD_FLAG_EXCL_RANGE))
-		return;
-
-	if (iommu) {
-		/*
-		 * We only can configure exclusion ranges per IOMMU, not
-		 * per device. But we can enable the exclusion range per
-		 * device. This is done here
-		 */
-		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
-		iommu->exclusion_start = m->range_start;
-		iommu->exclusion_length = m->range_length;
-	}
-}
-
-/*
- * This function reads some important data from the IOMMU PCI space and
- * initializes the driver data structure with it. It reads the hardware
- * capabilities and the first/last device entries
- */
-static void __init init_iommu_from_pci(struct amd_iommu *iommu)
-{
-	int cap_ptr = iommu->cap_ptr;
-	u32 range, misc, low, high;
-	int i, j;
-
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
-			      &iommu->cap);
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
-			      &range);
-	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
-			      &misc);
-
-	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
-					 MMIO_GET_FD(range));
-	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
-					MMIO_GET_LD(range));
-	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
-
-	if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
-		amd_iommu_iotlb_sup = false;
-
-	/* read extended feature bits */
-	low  = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
-	high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
-
-	iommu->features = ((u64)high << 32) | low;
-
-	if (!is_rd890_iommu(iommu->dev))
-		return;
-
-	/*
-	 * Some rd890 systems may not be fully reconfigured by the BIOS, so
-	 * it's necessary for us to store this information so it can be
-	 * reprogrammed on resume
-	 */
-
-	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			      &iommu->stored_addr_lo);
-	pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
-			      &iommu->stored_addr_hi);
-
-	/* Low bit locks writes to configuration space */
-	iommu->stored_addr_lo &= ~1;
-
-	for (i = 0; i < 6; i++)
-		for (j = 0; j < 0x12; j++)
-			iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
-
-	for (i = 0; i < 0x83; i++)
-		iommu->stored_l2[i] = iommu_read_l2(iommu, i);
-}
-
-/*
- * Takes a pointer to an AMD IOMMU entry in the ACPI table and
- * initializes the hardware and our data structures with it.
- */
-static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
-					struct ivhd_header *h)
-{
-	u8 *p = (u8 *)h;
-	u8 *end = p, flags = 0;
-	u16 devid = 0, devid_start = 0, devid_to = 0;
-	u32 dev_i, ext_flags = 0;
-	bool alias = false;
-	struct ivhd_entry *e;
-
-	/*
-	 * First save the recommended feature enable bits from ACPI
-	 */
-	iommu->acpi_flags = h->flags;
-
-	/*
-	 * Done. Now parse the device entries
-	 */
-	p += sizeof(struct ivhd_header);
-	end += h->length;
-
-
-	while (p < end) {
-		e = (struct ivhd_entry *)p;
-		switch (e->type) {
-		case IVHD_DEV_ALL:
-
-			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
-				    " last device %02x:%02x.%x flags: %02x\n",
-				    PCI_BUS(iommu->first_device),
-				    PCI_SLOT(iommu->first_device),
-				    PCI_FUNC(iommu->first_device),
-				    PCI_BUS(iommu->last_device),
-				    PCI_SLOT(iommu->last_device),
-				    PCI_FUNC(iommu->last_device),
-				    e->flags);
-
-			for (dev_i = iommu->first_device;
-					dev_i <= iommu->last_device; ++dev_i)
-				set_dev_entry_from_acpi(iommu, dev_i,
-							e->flags, 0);
-			break;
-		case IVHD_DEV_SELECT:
-
-			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
-				    "flags: %02x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags);
-
-			devid = e->devid;
-			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
-			break;
-		case IVHD_DEV_SELECT_RANGE_START:
-
-			DUMP_printk("  DEV_SELECT_RANGE_START\t "
-				    "devid: %02x:%02x.%x flags: %02x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags);
-
-			devid_start = e->devid;
-			flags = e->flags;
-			ext_flags = 0;
-			alias = false;
-			break;
-		case IVHD_DEV_ALIAS:
-
-			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
-				    "flags: %02x devid_to: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags,
-				    PCI_BUS(e->ext >> 8),
-				    PCI_SLOT(e->ext >> 8),
-				    PCI_FUNC(e->ext >> 8));
-
-			devid = e->devid;
-			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(iommu, devid   , e->flags, 0);
-			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
-			amd_iommu_alias_table[devid] = devid_to;
-			break;
-		case IVHD_DEV_ALIAS_RANGE:
-
-			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
-				    "devid: %02x:%02x.%x flags: %02x "
-				    "devid_to: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags,
-				    PCI_BUS(e->ext >> 8),
-				    PCI_SLOT(e->ext >> 8),
-				    PCI_FUNC(e->ext >> 8));
-
-			devid_start = e->devid;
-			flags = e->flags;
-			devid_to = e->ext >> 8;
-			ext_flags = 0;
-			alias = true;
-			break;
-		case IVHD_DEV_EXT_SELECT:
-
-			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
-				    "flags: %02x ext: %08x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags, e->ext);
-
-			devid = e->devid;
-			set_dev_entry_from_acpi(iommu, devid, e->flags,
-						e->ext);
-			break;
-		case IVHD_DEV_EXT_SELECT_RANGE:
-
-			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
-				    "%02x:%02x.%x flags: %02x ext: %08x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid),
-				    e->flags, e->ext);
-
-			devid_start = e->devid;
-			flags = e->flags;
-			ext_flags = e->ext;
-			alias = false;
-			break;
-		case IVHD_DEV_RANGE_END:
-
-			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
-				    PCI_BUS(e->devid),
-				    PCI_SLOT(e->devid),
-				    PCI_FUNC(e->devid));
-
-			devid = e->devid;
-			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
-				if (alias) {
-					amd_iommu_alias_table[dev_i] = devid_to;
-					set_dev_entry_from_acpi(iommu,
-						devid_to, flags, ext_flags);
-				}
-				set_dev_entry_from_acpi(iommu, dev_i,
-							flags, ext_flags);
-			}
-			break;
-		default:
-			break;
-		}
-
-		p += ivhd_entry_length(p);
-	}
-}
-
-/* Initializes the device->iommu mapping for the driver */
-static int __init init_iommu_devices(struct amd_iommu *iommu)
-{
-	u32 i;
-
-	for (i = iommu->first_device; i <= iommu->last_device; ++i)
-		set_iommu_for_device(iommu, i);
-
-	return 0;
-}
-
-static void __init free_iommu_one(struct amd_iommu *iommu)
-{
-	free_command_buffer(iommu);
-	free_event_buffer(iommu);
-	iommu_unmap_mmio_space(iommu);
-}
-
-static void __init free_iommu_all(void)
-{
-	struct amd_iommu *iommu, *next;
-
-	for_each_iommu_safe(iommu, next) {
-		list_del(&iommu->list);
-		free_iommu_one(iommu);
-		kfree(iommu);
-	}
-}
-
-/*
- * This function clues the initialization function for one IOMMU
- * together and also allocates the command buffer and programs the
- * hardware. It does NOT enable the IOMMU. This is done afterwards.
- */
-static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
-{
-	spin_lock_init(&iommu->lock);
-
-	/* Add IOMMU to internal data structures */
-	list_add_tail(&iommu->list, &amd_iommu_list);
-	iommu->index             = amd_iommus_present++;
-
-	if (unlikely(iommu->index >= MAX_IOMMUS)) {
-		WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
-		return -ENOSYS;
-	}
-
-	/* Index is fine - add IOMMU to the array */
-	amd_iommus[iommu->index] = iommu;
-
-	/*
-	 * Copy data from ACPI table entry to the iommu struct
-	 */
-	iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
-	if (!iommu->dev)
-		return 1;
-
-	iommu->cap_ptr = h->cap_ptr;
-	iommu->pci_seg = h->pci_seg;
-	iommu->mmio_phys = h->mmio_phys;
-	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
-	if (!iommu->mmio_base)
-		return -ENOMEM;
-
-	iommu->cmd_buf = alloc_command_buffer(iommu);
-	if (!iommu->cmd_buf)
-		return -ENOMEM;
-
-	iommu->evt_buf = alloc_event_buffer(iommu);
-	if (!iommu->evt_buf)
-		return -ENOMEM;
-
-	iommu->int_enabled = false;
-
-	init_iommu_from_pci(iommu);
-	init_iommu_from_acpi(iommu, h);
-	init_iommu_devices(iommu);
-
-	if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
-		amd_iommu_np_cache = true;
-
-	return pci_enable_device(iommu->dev);
-}
-
-/*
- * Iterates over all IOMMU entries in the ACPI table, allocates the
- * IOMMU structure and initializes it with init_iommu_one()
- */
-static int __init init_iommu_all(struct acpi_table_header *table)
-{
-	u8 *p = (u8 *)table, *end = (u8 *)table;
-	struct ivhd_header *h;
-	struct amd_iommu *iommu;
-	int ret;
-
-	end += table->length;
-	p += IVRS_HEADER_LENGTH;
-
-	while (p < end) {
-		h = (struct ivhd_header *)p;
-		switch (*p) {
-		case ACPI_IVHD_TYPE:
-
-			DUMP_printk("device: %02x:%02x.%01x cap: %04x "
-				    "seg: %d flags: %01x info %04x\n",
-				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
-				    PCI_FUNC(h->devid), h->cap_ptr,
-				    h->pci_seg, h->flags, h->info);
-			DUMP_printk("       mmio-addr: %016llx\n",
-				    h->mmio_phys);
-
-			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
-			if (iommu == NULL) {
-				amd_iommu_init_err = -ENOMEM;
-				return 0;
-			}
-
-			ret = init_iommu_one(iommu, h);
-			if (ret) {
-				amd_iommu_init_err = ret;
-				return 0;
-			}
-			break;
-		default:
-			break;
-		}
-		p += h->length;
-
-	}
-	WARN_ON(p != end);
-
-	return 0;
-}
-
-/****************************************************************************
- *
- * The following functions initialize the MSI interrupts for all IOMMUs
- * in the system. Its a bit challenging because there could be multiple
- * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
- * pci_dev.
- *
- ****************************************************************************/
-
-static int iommu_setup_msi(struct amd_iommu *iommu)
-{
-	int r;
-
-	if (pci_enable_msi(iommu->dev))
-		return 1;
-
-	r = request_threaded_irq(iommu->dev->irq,
-				 amd_iommu_int_handler,
-				 amd_iommu_int_thread,
-				 0, "AMD-Vi",
-				 iommu->dev);
-
-	if (r) {
-		pci_disable_msi(iommu->dev);
-		return 1;
-	}
-
-	iommu->int_enabled = true;
-	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-
-	return 0;
-}
-
-static int iommu_init_msi(struct amd_iommu *iommu)
-{
-	if (iommu->int_enabled)
-		return 0;
-
-	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
-		return iommu_setup_msi(iommu);
-
-	return 1;
-}
-
-/****************************************************************************
- *
- * The next functions belong to the third pass of parsing the ACPI
- * table. In this last pass the memory mapping requirements are
- * gathered (like exclusion and unity mapping reanges).
- *
- ****************************************************************************/
-
-static void __init free_unity_maps(void)
-{
-	struct unity_map_entry *entry, *next;
-
-	list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) {
-		list_del(&entry->list);
-		kfree(entry);
-	}
-}
-
-/* called when we find an exclusion range definition in ACPI */
-static int __init init_exclusion_range(struct ivmd_header *m)
-{
-	int i;
-
-	switch (m->type) {
-	case ACPI_IVMD_TYPE:
-		set_device_exclusion_range(m->devid, m);
-		break;
-	case ACPI_IVMD_TYPE_ALL:
-		for (i = 0; i <= amd_iommu_last_bdf; ++i)
-			set_device_exclusion_range(i, m);
-		break;
-	case ACPI_IVMD_TYPE_RANGE:
-		for (i = m->devid; i <= m->aux; ++i)
-			set_device_exclusion_range(i, m);
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-/* called for unity map ACPI definition */
-static int __init init_unity_map_range(struct ivmd_header *m)
-{
-	struct unity_map_entry *e = 0;
-	char *s;
-
-	e = kzalloc(sizeof(*e), GFP_KERNEL);
-	if (e == NULL)
-		return -ENOMEM;
-
-	switch (m->type) {
-	default:
-		kfree(e);
-		return 0;
-	case ACPI_IVMD_TYPE:
-		s = "IVMD_TYPEi\t\t\t";
-		e->devid_start = e->devid_end = m->devid;
-		break;
-	case ACPI_IVMD_TYPE_ALL:
-		s = "IVMD_TYPE_ALL\t\t";
-		e->devid_start = 0;
-		e->devid_end = amd_iommu_last_bdf;
-		break;
-	case ACPI_IVMD_TYPE_RANGE:
-		s = "IVMD_TYPE_RANGE\t\t";
-		e->devid_start = m->devid;
-		e->devid_end = m->aux;
-		break;
-	}
-	e->address_start = PAGE_ALIGN(m->range_start);
-	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
-	e->prot = m->flags >> 1;
-
-	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
-		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
-		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
-		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
-		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
-		    e->address_start, e->address_end, m->flags);
-
-	list_add_tail(&e->list, &amd_iommu_unity_map);
-
-	return 0;
-}
-
-/* iterates over all memory definitions we find in the ACPI table */
-static int __init init_memory_definitions(struct acpi_table_header *table)
-{
-	u8 *p = (u8 *)table, *end = (u8 *)table;
-	struct ivmd_header *m;
-
-	end += table->length;
-	p += IVRS_HEADER_LENGTH;
-
-	while (p < end) {
-		m = (struct ivmd_header *)p;
-		if (m->flags & IVMD_FLAG_EXCL_RANGE)
-			init_exclusion_range(m);
-		else if (m->flags & IVMD_FLAG_UNITY_MAP)
-			init_unity_map_range(m);
-
-		p += m->length;
-	}
-
-	return 0;
-}
-
-/*
- * Init the device table to not allow DMA access for devices and
- * suppress all page faults
- */
-static void init_device_table(void)
-{
-	u32 devid;
-
-	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
-		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
-		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
-	}
-}
-
-static void iommu_init_flags(struct amd_iommu *iommu)
-{
-	iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) :
-		iommu_feature_disable(iommu, CONTROL_HT_TUN_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_PASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_PASSPW_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) :
-		iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN);
-
-	iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ?
-		iommu_feature_enable(iommu, CONTROL_ISOC_EN) :
-		iommu_feature_disable(iommu, CONTROL_ISOC_EN);
-
-	/*
-	 * make IOMMU memory accesses cache coherent
-	 */
-	iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
-}
-
-static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
-{
-	int i, j;
-	u32 ioc_feature_control;
-	struct pci_dev *pdev = NULL;
-
-	/* RD890 BIOSes may not have completely reconfigured the iommu */
-	if (!is_rd890_iommu(iommu->dev))
-		return;
-
-	/*
-	 * First, we need to ensure that the iommu is enabled. This is
-	 * controlled by a register in the northbridge
-	 */
-	pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
-
-	if (!pdev)
-		return;
-
-	/* Select Northbridge indirect register 0x75 and enable writing */
-	pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
-	pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
-
-	/* Enable the iommu */
-	if (!(ioc_feature_control & 0x1))
-		pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
-
-	pci_dev_put(pdev);
-
-	/* Restore the iommu BAR */
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			       iommu->stored_addr_lo);
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
-			       iommu->stored_addr_hi);
-
-	/* Restore the l1 indirect regs for each of the 6 l1s */
-	for (i = 0; i < 6; i++)
-		for (j = 0; j < 0x12; j++)
-			iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
-
-	/* Restore the l2 indirect regs */
-	for (i = 0; i < 0x83; i++)
-		iommu_write_l2(iommu, i, iommu->stored_l2[i]);
-
-	/* Lock PCI setup registers */
-	pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
-			       iommu->stored_addr_lo | 1);
-}
-
-/*
- * This function finally enables all IOMMUs found in the system after
- * they have been initialized
- */
-static void enable_iommus(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu) {
-		iommu_disable(iommu);
-		iommu_init_flags(iommu);
-		iommu_set_device_table(iommu);
-		iommu_enable_command_buffer(iommu);
-		iommu_enable_event_buffer(iommu);
-		iommu_set_exclusion_range(iommu);
-		iommu_init_msi(iommu);
-		iommu_enable(iommu);
-		iommu_flush_all_caches(iommu);
-	}
-}
-
-static void disable_iommus(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_disable(iommu);
-}
-
-/*
- * Suspend/Resume support
- * disable suspend until real resume implemented
- */
-
-static void amd_iommu_resume(void)
-{
-	struct amd_iommu *iommu;
-
-	for_each_iommu(iommu)
-		iommu_apply_resume_quirks(iommu);
-
-	/* re-load the hardware */
-	enable_iommus();
-
-	/*
-	 * we have to flush after the IOMMUs are enabled because a
-	 * disabled IOMMU will never execute the commands we send
-	 */
-	for_each_iommu(iommu)
-		iommu_flush_all_caches(iommu);
-}
-
-static int amd_iommu_suspend(void)
-{
-	/* disable IOMMUs to go out of the way for BIOS */
-	disable_iommus();
-
-	return 0;
-}
-
-static struct syscore_ops amd_iommu_syscore_ops = {
-	.suspend = amd_iommu_suspend,
-	.resume = amd_iommu_resume,
-};
-
-/*
- * This is the core init function for AMD IOMMU hardware in the system.
- * This function is called from the generic x86 DMA layer initialization
- * code.
- *
- * This function basically parses the ACPI table for AMD IOMMU (IVRS)
- * three times:
- *
- *	1 pass) Find the highest PCI device id the driver has to handle.
- *		Upon this information the size of the data structures is
- *		determined that needs to be allocated.
- *
- *	2 pass) Initialize the data structures just allocated with the
- *		information in the ACPI table about available AMD IOMMUs
- *		in the system. It also maps the PCI devices in the
- *		system to specific IOMMUs
- *
- *	3 pass) After the basic data structures are allocated and
- *		initialized we update them with information about memory
- *		remapping requirements parsed out of the ACPI table in
- *		this last pass.
- *
- * After that the hardware is initialized and ready to go. In the last
- * step we do some Linux specific things like registering the driver in
- * the dma_ops interface and initializing the suspend/resume support
- * functions. Finally it prints some information about AMD IOMMUs and
- * the driver state and enables the hardware.
- */
-static int __init amd_iommu_init(void)
-{
-	int i, ret = 0;
-
-	/*
-	 * First parse ACPI tables to find the largest Bus/Dev/Func
-	 * we need to handle. Upon this information the shared data
-	 * structures for the IOMMUs in the system will be allocated
-	 */
-	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
-		return -ENODEV;
-
-	ret = amd_iommu_init_err;
-	if (ret)
-		goto out;
-
-	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
-	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
-	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
-
-	ret = -ENOMEM;
-
-	/* Device table - directly used by all IOMMUs */
-	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
-				      get_order(dev_table_size));
-	if (amd_iommu_dev_table == NULL)
-		goto out;
-
-	/*
-	 * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the
-	 * IOMMU see for that device
-	 */
-	amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL,
-			get_order(alias_table_size));
-	if (amd_iommu_alias_table == NULL)
-		goto free;
-
-	/* IOMMU rlookup table - find the IOMMU for a specific device */
-	amd_iommu_rlookup_table = (void *)__get_free_pages(
-			GFP_KERNEL | __GFP_ZERO,
-			get_order(rlookup_table_size));
-	if (amd_iommu_rlookup_table == NULL)
-		goto free;
-
-	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
-					    GFP_KERNEL | __GFP_ZERO,
-					    get_order(MAX_DOMAIN_ID/8));
-	if (amd_iommu_pd_alloc_bitmap == NULL)
-		goto free;
-
-	/* init the device table */
-	init_device_table();
-
-	/*
-	 * let all alias entries point to itself
-	 */
-	for (i = 0; i <= amd_iommu_last_bdf; ++i)
-		amd_iommu_alias_table[i] = i;
-
-	/*
-	 * never allocate domain 0 because its used as the non-allocated and
-	 * error value placeholder
-	 */
-	amd_iommu_pd_alloc_bitmap[0] = 1;
-
-	spin_lock_init(&amd_iommu_pd_lock);
-
-	/*
-	 * now the data structures are allocated and basically initialized
-	 * start the real acpi table scan
-	 */
-	ret = -ENODEV;
-	if (acpi_table_parse("IVRS", init_iommu_all) != 0)
-		goto free;
-
-	if (amd_iommu_init_err) {
-		ret = amd_iommu_init_err;
-		goto free;
-	}
-
-	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
-		goto free;
-
-	if (amd_iommu_init_err) {
-		ret = amd_iommu_init_err;
-		goto free;
-	}
-
-	ret = amd_iommu_init_devices();
-	if (ret)
-		goto free;
-
-	enable_iommus();
-
-	if (iommu_pass_through)
-		ret = amd_iommu_init_passthrough();
-	else
-		ret = amd_iommu_init_dma_ops();
-
-	if (ret)
-		goto free_disable;
-
-	amd_iommu_init_api();
-
-	amd_iommu_init_notifier();
-
-	register_syscore_ops(&amd_iommu_syscore_ops);
-
-	if (iommu_pass_through)
-		goto out;
-
-	if (amd_iommu_unmap_flush)
-		printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
-	else
-		printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
-
-	x86_platform.iommu_shutdown = disable_iommus;
-out:
-	return ret;
-
-free_disable:
-	disable_iommus();
-
-free:
-	amd_iommu_uninit_devices();
-
-	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
-		   get_order(MAX_DOMAIN_ID/8));
-
-	free_pages((unsigned long)amd_iommu_rlookup_table,
-		   get_order(rlookup_table_size));
-
-	free_pages((unsigned long)amd_iommu_alias_table,
-		   get_order(alias_table_size));
-
-	free_pages((unsigned long)amd_iommu_dev_table,
-		   get_order(dev_table_size));
-
-	free_iommu_all();
-
-	free_unity_maps();
-
-#ifdef CONFIG_GART_IOMMU
-	/*
-	 * We failed to initialize the AMD IOMMU - try fallback to GART
-	 * if possible.
-	 */
-	gart_iommu_init();
-
-#endif
-
-	goto out;
-}
-
-/****************************************************************************
- *
- * Early detect code. This code runs at IOMMU detection time in the DMA
- * layer. It just looks if there is an IVRS ACPI table to detect AMD
- * IOMMUs
- *
- ****************************************************************************/
-static int __init early_amd_iommu_detect(struct acpi_table_header *table)
-{
-	return 0;
-}
-
-int __init amd_iommu_detect(void)
-{
-	if (no_iommu || (iommu_detected && !gart_iommu_aperture))
-		return -ENODEV;
-
-	if (amd_iommu_disabled)
-		return -ENODEV;
-
-	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
-		iommu_detected = 1;
-		amd_iommu_detected = 1;
-		x86_init.iommu.iommu_init = amd_iommu_init;
-
-		/* Make sure ACS will be enabled */
-		pci_request_acs();
-		return 1;
-	}
-	return -ENODEV;
-}
-
-/****************************************************************************
- *
- * Parsing functions for the AMD IOMMU specific kernel command line
- * options.
- *
- ****************************************************************************/
-
-static int __init parse_amd_iommu_dump(char *str)
-{
-	amd_iommu_dump = true;
-
-	return 1;
-}
-
-static int __init parse_amd_iommu_options(char *str)
-{
-	for (; *str; ++str) {
-		if (strncmp(str, "fullflush", 9) == 0)
-			amd_iommu_unmap_flush = true;
-		if (strncmp(str, "off", 3) == 0)
-			amd_iommu_disabled = true;
-	}
-
-	return 1;
-}
-
-__setup("amd_iommu_dump", parse_amd_iommu_dump);
-__setup("amd_iommu=", parse_amd_iommu_options);
-
-IOMMU_INIT_FINISH(amd_iommu_detect,
-		  gart_iommu_hole_init,
-		  0,
-		  0);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 289e92862fd..afdc3f756de 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -27,15 +27,12 @@
  * timer, but by default APB timer has higher rating than local APIC timers.
  */
 
-#include <linux/clocksource.h>
-#include <linux/clockchips.h>
 #include <linux/delay.h>
+#include <linux/dw_apb_timer.h>
 #include <linux/errno.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
 #include <linux/slab.h>
 #include <linux/pm.h>
-#include <linux/pci.h>
 #include <linux/sfi.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
@@ -44,76 +41,48 @@
 #include <asm/fixmap.h>
 #include <asm/apb_timer.h>
 #include <asm/mrst.h>
+#include <asm/time.h>
 
-#define APBT_MASK			CLOCKSOURCE_MASK(32)
-#define APBT_SHIFT			22
 #define APBT_CLOCKEVENT_RATING		110
 #define APBT_CLOCKSOURCE_RATING		250
-#define APBT_MIN_DELTA_USEC		200
 
-#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
 #define APBT_CLOCKEVENT0_NUM   (0)
-#define APBT_CLOCKEVENT1_NUM   (1)
 #define APBT_CLOCKSOURCE_NUM   (2)
 
-static unsigned long apbt_address;
+static phys_addr_t apbt_address;
 static int apb_timer_block_enabled;
 static void __iomem *apbt_virt_address;
-static int phy_cs_timer_id;
 
 /*
  * Common DW APB timer info
  */
-static uint64_t apbt_freq;
-
-static void apbt_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt);
-static int apbt_next_event(unsigned long delta,
-			   struct clock_event_device *evt);
-static cycle_t apbt_read_clocksource(struct clocksource *cs);
-static void apbt_restart_clocksource(struct clocksource *cs);
+static unsigned long apbt_freq;
 
 struct apbt_dev {
-	struct clock_event_device evt;
-	unsigned int num;
-	int cpu;
-	unsigned int irq;
-	unsigned int tick;
-	unsigned int count;
-	unsigned int flags;
-	char name[10];
+	struct dw_apb_clock_event_device	*timer;
+	unsigned int				num;
+	int					cpu;
+	unsigned int				irq;
+	char					name[10];
 };
 
-static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
+static struct dw_apb_clocksource *clocksource_apbt;
 
-#ifdef CONFIG_SMP
-static unsigned int apbt_num_timers_used;
-static struct apbt_dev *apbt_devs;
-#endif
-
-static	inline unsigned long apbt_readl_reg(unsigned long a)
+static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
 {
-	return readl(apbt_virt_address + a);
+	return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
 }
 
-static inline void apbt_writel_reg(unsigned long d, unsigned long a)
-{
-	writel(d, apbt_virt_address + a);
-}
-
-static inline unsigned long apbt_readl(int n, unsigned long a)
-{
-	return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
 
-static inline void apbt_writel(int n, unsigned long d, unsigned long a)
-{
-	writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
-}
+#ifdef CONFIG_SMP
+static unsigned int apbt_num_timers_used;
+#endif
 
 static inline void apbt_set_mapping(void)
 {
 	struct sfi_timer_table_entry *mtmr;
+	int phy_cs_timer_id = 0;
 
 	if (apbt_virt_address) {
 		pr_debug("APBT base already mapped\n");
@@ -125,21 +94,18 @@ static inline void apbt_set_mapping(void)
 		       APBT_CLOCKEVENT0_NUM);
 		return;
 	}
-	apbt_address = (unsigned long)mtmr->phys_addr;
+	apbt_address = (phys_addr_t)mtmr->phys_addr;
 	if (!apbt_address) {
 		printk(KERN_WARNING "No timer base from SFI, use default\n");
 		apbt_address = APBT_DEFAULT_BASE;
 	}
 	apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
-	if (apbt_virt_address) {
-		pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
-			 (void *)apbt_address, (void *)apbt_virt_address);
-	} else {
-		pr_debug("Failed mapping APBT phy address at %p\n",\
-			 (void *)apbt_address);
+	if (!apbt_virt_address) {
+		pr_debug("Failed mapping APBT phy address at %lu\n",\
+			 (unsigned long)apbt_address);
 		goto panic_noapbt;
 	}
-	apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
+	apbt_freq = mtmr->freq_hz;
 	sfi_free_mtmr(mtmr);
 
 	/* Now figure out the physical timer id for clocksource device */
@@ -148,9 +114,14 @@ static inline void apbt_set_mapping(void)
 		goto panic_noapbt;
 
 	/* Now figure out the physical timer id */
-	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
-		/ APBTMRS_REG_SIZE;
-	pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
+	pr_debug("Use timer %d for clocksource\n",
+		 (int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
+	phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
+		APBTMRS_REG_SIZE;
+
+	clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
+		"apbt0", apbt_virt_address + phy_cs_timer_id *
+		APBTMRS_REG_SIZE, apbt_freq);
 	return;
 
 panic_noapbt:
@@ -172,82 +143,6 @@ static inline int is_apbt_capable(void)
 	return apbt_virt_address ? 1 : 0;
 }
 
-static struct clocksource clocksource_apbt = {
-	.name		= "apbt",
-	.rating		= APBT_CLOCKSOURCE_RATING,
-	.read		= apbt_read_clocksource,
-	.mask		= APBT_MASK,
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-	.resume		= apbt_restart_clocksource,
-};
-
-/* boot APB clock event device */
-static struct clock_event_device apbt_clockevent = {
-	.name		= "apbt0",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= apbt_set_mode,
-	.set_next_event = apbt_next_event,
-	.shift		= APBT_SHIFT,
-	.irq		= 0,
-	.rating		= APBT_CLOCKEVENT_RATING,
-};
-
-/*
- * start count down from 0xffff_ffff. this is done by toggling the enable bit
- * then load initial load count to ~0.
- */
-static void apbt_start_counter(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
-	ctrl &= ~APBTMR_CONTROL_ENABLE;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
-	/* enable, mask interrupt */
-	ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-	ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	/* read it once to get cached counter value initialized */
-	apbt_read_clocksource(&clocksource_apbt);
-}
-
-static irqreturn_t apbt_interrupt_handler(int irq, void *data)
-{
-	struct apbt_dev *dev = (struct apbt_dev *)data;
-	struct clock_event_device *aevt = &dev->evt;
-
-	if (!aevt->event_handler) {
-		printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
-		       dev->num);
-		return IRQ_NONE;
-	}
-	aevt->event_handler(aevt);
-	return IRQ_HANDLED;
-}
-
-static void apbt_restart_clocksource(struct clocksource *cs)
-{
-	apbt_start_counter(phy_cs_timer_id);
-}
-
-static void apbt_enable_int(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-	/* clear pending intr */
-	apbt_readl(n, APBTMR_N_EOI);
-	ctrl &= ~APBTMR_CONTROL_INT;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-static void apbt_disable_int(int n)
-{
-	unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
-
-	ctrl |= APBTMR_CONTROL_INT;
-	apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-}
-
-
 static int __init apbt_clockevent_register(void)
 {
 	struct sfi_timer_table_entry *mtmr;
@@ -260,45 +155,21 @@ static int __init apbt_clockevent_register(void)
 		return -ENODEV;
 	}
 
-	/*
-	 * We need to calculate the scaled math multiplication factor for
-	 * nanosecond to apbt tick conversion.
-	 * mult = (nsec/cycle)*2^APBT_SHIFT
-	 */
-	apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
-				      , NSEC_PER_SEC, APBT_SHIFT);
-
-	/* Calculate the min / max delta */
-	apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
-							   &apbt_clockevent);
-	apbt_clockevent.min_delta_ns = clockevent_delta2ns(
-		APBT_MIN_DELTA_USEC*apbt_freq,
-		&apbt_clockevent);
-	/*
-	 * Start apbt with the boot cpu mask and make it
-	 * global if not used for per cpu timer.
-	 */
-	apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
 	adev->num = smp_processor_id();
-	memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
+	adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
+		mrst_timer_options == MRST_TIMER_LAPIC_APBT ?
+		APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
+		adev_virt_addr(adev), 0, apbt_freq);
+	/* Firmware does EOI handling for us. */
+	adev->timer->eoi = NULL;
 
 	if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
-		adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
-		global_clock_event = &adev->evt;
+		global_clock_event = &adev->timer->ced;
 		printk(KERN_DEBUG "%s clockevent registered as global\n",
 		       global_clock_event->name);
 	}
 
-	if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
-			IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
-			apbt_clockevent.name, adev)) {
-		printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-		       apbt_clockevent.irq);
-	}
-
-	clockevents_register_device(&adev->evt);
-	/* Start APBT 0 interrupts */
-	apbt_enable_int(APBT_CLOCKEVENT0_NUM);
+	dw_apb_clockevent_register(adev->timer);
 
 	sfi_free_mtmr(mtmr);
 	return 0;
@@ -316,52 +187,34 @@ static void apbt_setup_irq(struct apbt_dev *adev)
 	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 	/* APB timer irqs are set up as mp_irqs, timer is edge type */
 	__irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
-
-	if (system_state == SYSTEM_BOOTING) {
-		if (request_irq(adev->irq, apbt_interrupt_handler,
-					IRQF_TIMER | IRQF_DISABLED |
-					IRQF_NOBALANCING,
-					adev->name, adev)) {
-			printk(KERN_ERR "Failed request IRQ for APBT%d\n",
-			       adev->num);
-		}
-	} else
-		enable_irq(adev->irq);
 }
 
 /* Should be called with per cpu */
 void apbt_setup_secondary_clock(void)
 {
 	struct apbt_dev *adev;
-	struct clock_event_device *aevt;
 	int cpu;
 
 	/* Don't register boot CPU clockevent */
 	cpu = smp_processor_id();
 	if (!cpu)
 		return;
-	/*
-	 * We need to calculate the scaled math multiplication factor for
-	 * nanosecond to apbt tick conversion.
-	 * mult = (nsec/cycle)*2^APBT_SHIFT
-	 */
-	printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
-	adev = &per_cpu(cpu_apbt_dev, cpu);
-	aevt = &adev->evt;
 
-	memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
-	aevt->cpumask = cpumask_of(cpu);
-	aevt->name = adev->name;
-	aevt->mode = CLOCK_EVT_MODE_UNUSED;
+	adev = &__get_cpu_var(cpu_apbt_dev);
+	if (!adev->timer) {
+		adev->timer = dw_apb_clockevent_init(cpu, adev->name,
+			APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
+			adev->irq, apbt_freq);
+		adev->timer->eoi = NULL;
+	} else {
+		dw_apb_clockevent_resume(adev->timer);
+	}
 
-	printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
-	       cpu, aevt->name, *(u32 *)aevt->cpumask);
+	printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
+	       cpu, adev->name, adev->cpu);
 
 	apbt_setup_irq(adev);
-
-	clockevents_register_device(aevt);
-
-	apbt_enable_int(cpu);
+	dw_apb_clockevent_register(adev->timer);
 
 	return;
 }
@@ -384,13 +237,12 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
 
 	switch (action & 0xf) {
 	case CPU_DEAD:
-		disable_irq(adev->irq);
-		apbt_disable_int(cpu);
+		dw_apb_clockevent_pause(adev->timer);
 		if (system_state == SYSTEM_RUNNING) {
 			pr_debug("skipping APBT CPU %lu offline\n", cpu);
 		} else if (adev) {
 			pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
-			free_irq(adev->irq, adev);
+			dw_apb_clockevent_stop(adev->timer);
 		}
 		break;
 	default:
@@ -415,116 +267,16 @@ void apbt_setup_secondary_clock(void) {}
 
 #endif /* CONFIG_SMP */
 
-static void apbt_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt)
-{
-	unsigned long ctrl;
-	uint64_t delta;
-	int timer_num;
-	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
-	BUG_ON(!apbt_virt_address);
-
-	timer_num = adev->num;
-	pr_debug("%s CPU %d timer %d mode=%d\n",
-		 __func__, first_cpu(*evt->cpumask), timer_num, mode);
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
-		delta >>= apbt_clockevent.shift;
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		/*
-		 * DW APB p. 46, have to disable timer before load counter,
-		 * may cause sync problem.
-		 */
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		udelay(1);
-		pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
-		apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
-		ctrl |= APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-		/* APB timer does not have one-shot mode, use free running mode */
-	case CLOCK_EVT_MODE_ONESHOT:
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		/*
-		 * set free running mode, this mode will let timer reload max
-		 * timeout which will give time (3min on 25MHz clock) to rearm
-		 * the next event, therefore emulate the one-shot mode.
-		 */
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
-
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		/* write again to set free running mode */
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-
-		/*
-		 * DW APB p. 46, load counter with all 1s before starting free
-		 * running mode.
-		 */
-		apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
-		ctrl &= ~APBTMR_CONTROL_INT;
-		ctrl |= APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		apbt_disable_int(timer_num);
-		ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-		break;
-
-	case CLOCK_EVT_MODE_RESUME:
-		apbt_enable_int(timer_num);
-		break;
-	}
-}
-
-static int apbt_next_event(unsigned long delta,
-			   struct clock_event_device *evt)
-{
-	unsigned long ctrl;
-	int timer_num;
-
-	struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
-
-	timer_num = adev->num;
-	/* Disable timer */
-	ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
-	ctrl &= ~APBTMR_CONTROL_ENABLE;
-	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-	/* write new count */
-	apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
-	ctrl |= APBTMR_CONTROL_ENABLE;
-	apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
-	return 0;
-}
-
-static cycle_t apbt_read_clocksource(struct clocksource *cs)
-{
-	unsigned long current_count;
-
-	current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
-	return (cycle_t)~current_count;
-}
-
 static int apbt_clocksource_register(void)
 {
 	u64 start, now;
 	cycle_t t1;
 
 	/* Start the counter, use timer 2 as source, timer 0/1 for event */
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
 	/* Verify whether apbt counter works */
-	t1 = apbt_read_clocksource(&clocksource_apbt);
+	t1 = dw_apb_clocksource_read(clocksource_apbt);
 	rdtscll(start);
 
 	/*
@@ -539,10 +291,10 @@ static int apbt_clocksource_register(void)
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
-	if (t1 == apbt_read_clocksource(&clocksource_apbt))
+	if (t1 == dw_apb_clocksource_read(clocksource_apbt))
 		panic("APBT counter not counting. APBT disabled\n");
 
-	clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
+	dw_apb_clocksource_register(clocksource_apbt);
 
 	return 0;
 }
@@ -566,10 +318,7 @@ void __init apbt_time_init(void)
 	if (apb_timer_block_enabled)
 		return;
 	apbt_set_mapping();
-	if (apbt_virt_address) {
-		pr_debug("Found APBT version 0x%lx\n",\
-			 apbt_readl_reg(APBTMRS_COMP_VERSION));
-	} else
+	if (!apbt_virt_address)
 		goto out_noapbt;
 	/*
 	 * Read the frequency and check for a sane value, for ESL model
@@ -577,7 +326,7 @@ void __init apbt_time_init(void)
 	 */
 
 	if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
-		pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
+		pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
 		goto out_noapbt;
 	}
 	if (apbt_clocksource_register()) {
@@ -603,30 +352,20 @@ void __init apbt_time_init(void)
 	} else {
 		percpu_timer = 0;
 		apbt_num_timers_used = 1;
-		adev = &per_cpu(cpu_apbt_dev, 0);
-		adev->flags &= ~APBT_DEV_USED;
 	}
 	pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
 
 	/* here we set up per CPU timer data structure */
-	apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
-			    GFP_KERNEL);
-	if (!apbt_devs) {
-		printk(KERN_ERR "Failed to allocate APB timer devices\n");
-		return;
-	}
 	for (i = 0; i < apbt_num_timers_used; i++) {
 		adev = &per_cpu(cpu_apbt_dev, i);
 		adev->num = i;
 		adev->cpu = i;
 		p_mtmr = sfi_get_mtmr(i);
-		if (p_mtmr) {
-			adev->tick = p_mtmr->freq_hz;
+		if (p_mtmr)
 			adev->irq = p_mtmr->irq;
-		} else
+		else
 			printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
-		adev->count = 0;
-		sprintf(adev->name, "apbt%d", i);
+		snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
 	}
 #endif
 
@@ -638,17 +377,8 @@ out_noapbt:
 	panic("failed to enable APB timer\n");
 }
 
-static inline void apbt_disable(int n)
-{
-	if (is_apbt_capable()) {
-		unsigned long ctrl =  apbt_readl(n, APBTMR_N_CONTROL);
-		ctrl &= ~APBTMR_CONTROL_ENABLE;
-		apbt_writel(n, ctrl, APBTMR_N_CONTROL);
-	}
-}
-
 /* called before apb_timer_enable, use early map */
-unsigned long apbt_quick_calibrate()
+unsigned long apbt_quick_calibrate(void)
 {
 	int i, scale;
 	u64 old, new;
@@ -657,31 +387,31 @@ unsigned long apbt_quick_calibrate()
 	u32 loop, shift;
 
 	apbt_set_mapping();
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
 	/* check if the timer can count down, otherwise return */
-	old = apbt_read_clocksource(&clocksource_apbt);
+	old = dw_apb_clocksource_read(clocksource_apbt);
 	i = 10000;
 	while (--i) {
-		if (old != apbt_read_clocksource(&clocksource_apbt))
+		if (old != dw_apb_clocksource_read(clocksource_apbt))
 			break;
 	}
 	if (!i)
 		goto failed;
 
 	/* count 16 ms */
-	loop = (apbt_freq * 1000) << 4;
+	loop = (apbt_freq / 1000) << 4;
 
 	/* restart the timer to ensure it won't get to 0 in the calibration */
-	apbt_start_counter(phy_cs_timer_id);
+	dw_apb_clocksource_start(clocksource_apbt);
 
-	old = apbt_read_clocksource(&clocksource_apbt);
+	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
 	t1 = __native_read_tsc();
 
 	do {
-		new = apbt_read_clocksource(&clocksource_apbt);
+		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
 	t2 = __native_read_tsc();
@@ -693,7 +423,7 @@ unsigned long apbt_quick_calibrate()
 		return 0;
 	}
 	scale = (int)div_u64((t2 - t1), loop >> shift);
-	khz = (scale * apbt_freq * 1000) >> shift;
+	khz = (scale * (apbt_freq / 1000)) >> shift;
 	printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
 	return khz;
 failed:
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b9338b8cf42..52fa56399a5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -27,6 +27,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/delay.h>
 #include <linux/timex.h>
+#include <linux/i8253.h>
 #include <linux/dmar.h>
 #include <linux/init.h>
 #include <linux/cpu.h>
@@ -37,9 +38,8 @@
 #include <asm/perf_event.h>
 #include <asm/x86_init.h>
 #include <asm/pgalloc.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/mpspec.h>
-#include <asm/i8253.h>
 #include <asm/i8259.h>
 #include <asm/proto.h>
 #include <asm/apic.h>
@@ -48,6 +48,7 @@
 #include <asm/hpet.h>
 #include <asm/idle.h>
 #include <asm/mtrr.h>
+#include <asm/time.h>
 #include <asm/smp.h>
 #include <asm/mce.h>
 #include <asm/tsc.h>
@@ -1429,7 +1430,7 @@ void enable_x2apic(void)
 	rdmsr(MSR_IA32_APICBASE, msr, msr2);
 	if (!(msr & X2APIC_ENABLE)) {
 		printk_once(KERN_INFO "Enabling x2apic\n");
-		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
 	}
 }
 #endif /* CONFIG_X86_X2APIC */
@@ -1943,10 +1944,28 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 
 void __cpuinit generic_processor_info(int apicid, int version)
 {
-	int cpu;
+	int cpu, max = nr_cpu_ids;
+	bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
+				phys_cpu_present_map);
+
+	/*
+	 * If boot cpu has not been detected yet, then only allow upto
+	 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
+	 */
+	if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 &&
+	    apicid != boot_cpu_physical_apicid) {
+		int thiscpu = max + disabled_cpus - 1;
+
+		pr_warning(
+			"ACPI: NR_CPUS/possible_cpus limit of %i almost"
+			" reached. Keeping one slot for boot cpu."
+			"  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+
+		disabled_cpus++;
+		return;
+	}
 
 	if (num_processors >= nr_cpu_ids) {
-		int max = nr_cpu_ids;
 		int thiscpu = max + disabled_cpus;
 
 		pr_warning(
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 9536b3fe43f..5d513bc47b6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -48,7 +48,7 @@
 #include <linux/io.h>
 
 #include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e5293394b54..8eb863e27ea 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1295,6 +1295,16 @@ static int setup_ioapic_entry(int apic_id, int irq,
 		 * irq handler will do the explicit EOI to the io-apic.
 		 */
 		ir_entry->vector = pin;
+
+		apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
+			"Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
+			"Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
+			"Avail:%X Vector:%02X Dest:%08X "
+			"SID:%04X SQ:%X SVT:%X)\n",
+			apic_id, irte.present, irte.fpd, irte.dst_mode,
+			irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
+			irte.avail, irte.vector, irte.dest_id,
+			irte.sid, irte.sq, irte.svt);
 	} else {
 		entry->delivery_mode = apic->irq_delivery_mode;
 		entry->dest_mode = apic->irq_dest_mode;
@@ -1337,9 +1347,9 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
 
 	apic_printk(APIC_VERBOSE,KERN_DEBUG
 		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-		    "IRQ %d Mode:%i Active:%i)\n",
+		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
 		    apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
-		    irq, trigger, polarity);
+		    irq, trigger, polarity, dest);
 
 
 	if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
@@ -1522,10 +1532,12 @@ __apicdebuginit(void) print_IO_APIC(void)
 	printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
 
 	printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
-	printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
+	printk(KERN_DEBUG ".......     : max redirection entries: %02X\n",
+		reg_01.bits.entries);
 
 	printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
-	printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
+	printk(KERN_DEBUG ".......     : IO APIC version: %02X\n",
+		reg_01.bits.version);
 
 	/*
 	 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
@@ -1550,31 +1562,60 @@ __apicdebuginit(void) print_IO_APIC(void)
 
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
 
-	printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
-			  " Stat Dmod Deli Vect:\n");
+	if (intr_remapping_enabled) {
+		printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
+			" Pol Stat Indx2 Zero Vect:\n");
+	} else {
+		printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
+			" Stat Dmod Deli Vect:\n");
+	}
 
 	for (i = 0; i <= reg_01.bits.entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		printk(KERN_DEBUG " %02x %03X ",
-			i,
-			entry.dest
-		);
+		if (intr_remapping_enabled) {
+			struct IO_APIC_route_entry entry;
+			struct IR_IO_APIC_route_entry *ir_entry;
+
+			entry = ioapic_read_entry(apic, i);
+			ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
+			printk(KERN_DEBUG " %02x %04X ",
+				i,
+				ir_entry->index
+			);
+			printk("%1d   %1d    %1d    %1d   %1d   "
+				"%1d    %1d     %X    %02X\n",
+				ir_entry->format,
+				ir_entry->mask,
+				ir_entry->trigger,
+				ir_entry->irr,
+				ir_entry->polarity,
+				ir_entry->delivery_status,
+				ir_entry->index2,
+				ir_entry->zero,
+				ir_entry->vector
+			);
+		} else {
+			struct IO_APIC_route_entry entry;
 
-		printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector
-		);
+			entry = ioapic_read_entry(apic, i);
+			printk(KERN_DEBUG " %02x %02X  ",
+				i,
+				entry.dest
+			);
+			printk("%1d    %1d    %1d   %1d   %1d    "
+				"%1d    %1d    %02X\n",
+				entry.mask,
+				entry.trigger,
+				entry.irr,
+				entry.polarity,
+				entry.delivery_status,
+				entry.dest_mode,
+				entry.delivery_mode,
+				entry.vector
+			);
+		}
 	}
 	}
+
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
@@ -1792,7 +1833,7 @@ __apicdebuginit(int) print_ICs(void)
 	return 0;
 }
 
-fs_initcall(print_ICs);
+late_initcall(print_ICs);
 
 
 /* Where if anywhere is the i8259 connect in external int mode */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 965a7666c28..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -229,11 +229,11 @@
 #include <linux/jiffies.h>
 #include <linux/acpi.h>
 #include <linux/syscore_ops.h>
+#include <linux/i8253.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
-#include <asm/i8253.h>
 #include <asm/olpc.h>
 #include <asm/paravirt.h>
 #include <asm/reboot.h>
@@ -1220,11 +1220,11 @@ static void reinit_timer(void)
 
 	raw_spin_lock_irqsave(&i8253_lock, flags);
 	/* set the clock to HZ */
-	outb_pit(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
+	outb_p(0x34, PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
-	outb_pit(LATCH & 0xff, PIT_CH0);	/* LSB */
+	outb_p(LATCH & 0xff, PIT_CH0);	/* LSB */
 	udelay(10);
-	outb_pit(LATCH >> 8, PIT_CH0);	/* MSB */
+	outb_p(LATCH >> 8, PIT_CH0);	/* MSB */
 	udelay(10);
 	raw_spin_unlock_irqrestore(&i8253_lock, flags);
 #endif
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index c29d631af6f..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -63,7 +63,6 @@ void foo(void)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
 	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
-	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 
 	BLANK();
 	OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 525514cf33c..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -62,6 +62,8 @@ static void __init check_fpu(void)
 		return;
 	}
 
+	kernel_fpu_begin();
+
 	/*
 	 * trap_init() enabled FXSR and company _before_ testing for FP
 	 * problems here.
@@ -80,6 +82,8 @@ static void __init check_fpu(void)
 		: "=m" (*&fdiv_bug)
 		: "m" (*&x), "m" (*&y));
 
+	kernel_fpu_end();
+
 	boot_cpu_data.fdiv_bug = fdiv_bug;
 	if (boot_cpu_data.fdiv_bug)
 		printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 22a073d7fbf..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -21,7 +21,7 @@
 #include <linux/topology.h>
 #include <linux/cpumask.h>
 #include <asm/pgtable.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 8095f8611f8..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -32,11 +32,11 @@
  */
 static const __initconst struct hypervisor_x86 * const hypervisors[] =
 {
-	&x86_hyper_vmware,
-	&x86_hyper_ms_hyperv,
 #ifdef CONFIG_XEN_PVHVM
 	&x86_hyper_xen_hvm,
 #endif
+	&x86_hyper_vmware,
+	&x86_hyper_ms_hyperv,
 };
 
 const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1edf5ba4fb2..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -456,6 +456,24 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
+
+	/*
+	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
+	 * x86_energy_perf_policy(8) is available to change it at run-time
+	 */
+	if (cpu_has(c, X86_FEATURE_EPB)) {
+		u64 epb;
+
+		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+		if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
+			printk_once(KERN_WARNING "ENERGY_PERF_BIAS:"
+				" Set to 'normal', was 'performance'\n"
+				"ENERGY_PERF_BIAS: View and update with"
+				" x86_energy_perf_policy(8)\n");
+			epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+		}
+	}
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 1e8d66c1336..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -43,61 +43,105 @@ static struct severity {
 	unsigned char covered;
 	char *msg;
 } severities[] = {
-#define KERNEL .context = IN_KERNEL
-#define USER .context = IN_USER
-#define SER .ser = SER_REQUIRED
-#define NOSER .ser = NO_SER
-#define SEV(s) .sev = MCE_ ## s ## _SEVERITY
-#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r }
-#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r }
-#define MCGMASK(x, res, s, m, r...) \
-	{ .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r }
-#define MASK(x, y, s, m, r...) \
-	{ .mask = x, .result = y, SEV(s), .msg = m, ## r }
+#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define  KERNEL		.context = IN_KERNEL
+#define  USER		.context = IN_USER
+#define  SER		.ser = SER_REQUIRED
+#define  NOSER		.ser = NO_SER
+#define  BITCLR(x)	.mask = x, .result = 0
+#define  BITSET(x)	.mask = x, .result = x
+#define  MCGMASK(x, y)	.mcgmask = x, .mcgres = y
+#define  MASK(x, y)	.mask = x, .result = y
 #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
 #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
 #define MCACOD 0xffff
 
-	BITCLR(MCI_STATUS_VAL, NO, "Invalid"),
-	BITCLR(MCI_STATUS_EN, NO, "Not enabled"),
-	BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"),
+	MCESEV(
+		NO, "Invalid",
+		BITCLR(MCI_STATUS_VAL)
+		),
+	MCESEV(
+		NO, "Not enabled",
+		BITCLR(MCI_STATUS_EN)
+		),
+	MCESEV(
+		PANIC, "Processor context corrupt",
+		BITSET(MCI_STATUS_PCC)
+		),
 	/* When MCIP is not set something is very confused */
-	MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"),
+	MCESEV(
+		PANIC, "MCIP not set in MCA handler",
+		MCGMASK(MCG_STATUS_MCIP, 0)
+		),
 	/* Neither return not error IP -- no chance to recover -> PANIC */
-	MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC,
-		"Neither restart nor error IP"),
-	MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP",
-		KERNEL),
-	BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER),
-	MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME,
-	     "Spurious not enabled", SER),
+	MCESEV(
+		PANIC, "Neither restart nor error IP",
+		MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0)
+		),
+	MCESEV(
+		PANIC, "In kernel and no restart IP",
+		KERNEL, MCGMASK(MCG_STATUS_RIPV, 0)
+		),
+	MCESEV(
+		KEEP, "Corrected error",
+		NOSER, BITCLR(MCI_STATUS_UC)
+		),
 
 	/* ignore OVER for UCNA */
-	MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP,
-	     "Uncorrected no action required", SER),
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC,
-	     "Illegal combination (UCNA with AR=1)", SER),
-	MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER),
+	MCESEV(
+		KEEP, "Uncorrected no action required",
+		SER, MASK(MCI_UC_SAR, MCI_STATUS_UC)
+		),
+	MCESEV(
+		PANIC, "Illegal combination (UCNA with AR=1)",
+		SER,
+		MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR)
+		),
+	MCESEV(
+		KEEP, "Non signalled machine check",
+		SER, BITCLR(MCI_STATUS_S)
+		),
 
 	/* AR add known MCACODs here */
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC,
-	     "Action required with lost events", SER),
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC,
-	     "Action required; unknown MCACOD", SER),
+	MCESEV(
+		PANIC, "Action required with lost events",
+		SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
+		),
+	MCESEV(
+		PANIC, "Action required: unknown MCACOD",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
+		),
 
 	/* known AO MCACODs: */
-	MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO,
-	     "Action optional: memory scrubbing error", SER),
-	MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO,
-	     "Action optional: last level cache writeback error", SER),
-
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME,
-	     "Action optional unknown MCACOD", SER),
-	MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME,
-	     "Action optional with lost events", SER),
-	BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"),
-	BITSET(MCI_STATUS_UC, UC, "Uncorrected"),
-	BITSET(0, SOME, "No match")	/* always matches. keep at end */
+	MCESEV(
+		AO, "Action optional: memory scrubbing error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
+		),
+	MCESEV(
+		AO, "Action optional: last level cache writeback error",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
+		),
+	MCESEV(
+		SOME, "Action optional: unknown MCACOD",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S)
+		),
+	MCESEV(
+		SOME, "Action optional with lost events",
+		SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S)
+		),
+
+	MCESEV(
+		PANIC, "Overflowed uncorrected",
+		BITSET(MCI_STATUS_OVER|MCI_STATUS_UC)
+		),
+	MCESEV(
+		UC, "Uncorrected",
+		BITSET(MCI_STATUS_UC)
+		),
+	MCESEV(
+		SOME, "No match",
+		BITSET(0)
+		)	/* always matches. keep at end */
 };
 
 /*
@@ -112,15 +156,15 @@ static int error_context(struct mce *m)
 	return IN_KERNEL;
 }
 
-int mce_severity(struct mce *a, int tolerant, char **msg)
+int mce_severity(struct mce *m, int tolerant, char **msg)
 {
-	enum context ctx = error_context(a);
+	enum context ctx = error_context(m);
 	struct severity *s;
 
 	for (s = severities;; s++) {
-		if ((a->status & s->mask) != s->result)
+		if ((m->status & s->mask) != s->result)
 			continue;
-		if ((a->mcgstatus & s->mcgmask) != s->mcgres)
+		if ((m->mcgstatus & s->mcgmask) != s->mcgres)
 			continue;
 		if (s->ser == SER_REQUIRED && !mce_ser)
 			continue;
@@ -197,15 +241,15 @@ static const struct file_operations severities_coverage_fops = {
 
 static int __init severities_debugfs_init(void)
 {
-	struct dentry *dmce = NULL, *fseverities_coverage = NULL;
+	struct dentry *dmce, *fsev;
 
 	dmce = mce_get_debugfs_dir();
-	if (dmce == NULL)
+	if (!dmce)
 		goto err_out;
-	fseverities_coverage = debugfs_create_file("severities-coverage",
-						   0444, dmce, NULL,
-						   &severities_coverage_fops);
-	if (fseverities_coverage == NULL)
+
+	fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL,
+				   &severities_coverage_fops);
+	if (!fsev)
 		goto err_out;
 
 	return 0;
@@ -214,4 +258,4 @@ err_out:
 	return -ENOMEM;
 }
 late_initcall(severities_debugfs_init);
-#endif
+#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ff1ae9b6464..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -10,7 +10,6 @@
 #include <linux/thread_info.h>
 #include <linux/capability.h>
 #include <linux/miscdevice.h>
-#include <linux/interrupt.h>
 #include <linux/ratelimit.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
@@ -38,23 +37,20 @@
 #include <linux/mm.h>
 #include <linux/debugfs.h>
 #include <linux/edac_mce.h>
+#include <linux/irq_work.h>
 
 #include <asm/processor.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/idle.h>
-#include <asm/ipi.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 
 #include "mce-internal.h"
 
-static DEFINE_MUTEX(mce_read_mutex);
+static DEFINE_MUTEX(mce_chrdev_read_mutex);
 
 #define rcu_dereference_check_mce(p) \
 	rcu_dereference_index_check((p), \
 			      rcu_read_lock_sched_held() || \
-			      lockdep_is_held(&mce_read_mutex))
+			      lockdep_is_held(&mce_chrdev_read_mutex))
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
@@ -94,7 +90,8 @@ static unsigned long		mce_need_notify;
 static char			mce_helper[128];
 static char			*mce_helper_argv[2] = { mce_helper, NULL };
 
-static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
+static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
@@ -373,6 +370,31 @@ static void mce_wrmsrl(u32 msr, u64 v)
 }
 
 /*
+ * Collect all global (w.r.t. this processor) status about this machine
+ * check into our "mce" struct so that we can use it later to assess
+ * the severity of the problem as we read per-bank specific details.
+ */
+static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
+{
+	mce_setup(m);
+
+	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+	if (regs) {
+		/*
+		 * Get the address of the instruction at the time of
+		 * the machine check error.
+		 */
+		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
+			m->ip = regs->ip;
+			m->cs = regs->cs;
+		}
+		/* Use accurate RIP reporting if available. */
+		if (rip_msr)
+			m->ip = mce_rdmsrl(rip_msr);
+	}
+}
+
+/*
  * Simple lockless ring to communicate PFNs from the exception handler with the
  * process context work function. This is vastly simplified because there's
  * only a single reader and a single writer.
@@ -443,40 +465,13 @@ static void mce_schedule_work(void)
 	}
 }
 
-/*
- * Get the address of the instruction at the time of the machine check
- * error.
- */
-static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
-{
-
-	if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) {
-		m->ip = regs->ip;
-		m->cs = regs->cs;
-	} else {
-		m->ip = 0;
-		m->cs = 0;
-	}
-	if (rip_msr)
-		m->ip = mce_rdmsrl(rip_msr);
-}
+DEFINE_PER_CPU(struct irq_work, mce_irq_work);
 
-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Called after interrupts have been reenabled again
- * when a MCE happened during an interrupts off region
- * in the kernel.
- */
-asmlinkage void smp_mce_self_interrupt(struct pt_regs *regs)
+static void mce_irq_work_cb(struct irq_work *entry)
 {
-	ack_APIC_irq();
-	exit_idle();
-	irq_enter();
 	mce_notify_irq();
 	mce_schedule_work();
-	irq_exit();
 }
-#endif
 
 static void mce_report_event(struct pt_regs *regs)
 {
@@ -492,29 +487,7 @@ static void mce_report_event(struct pt_regs *regs)
 		return;
 	}
 
-#ifdef CONFIG_X86_LOCAL_APIC
-	/*
-	 * Without APIC do not notify. The event will be picked
-	 * up eventually.
-	 */
-	if (!cpu_has_apic)
-		return;
-
-	/*
-	 * When interrupts are disabled we cannot use
-	 * kernel services safely. Trigger an self interrupt
-	 * through the APIC to instead do the notification
-	 * after interrupts are reenabled again.
-	 */
-	apic->send_IPI_self(MCE_SELF_VECTOR);
-
-	/*
-	 * Wait for idle afterwards again so that we don't leave the
-	 * APIC in a non idle state because the normal APIC writes
-	 * cannot exclude us.
-	 */
-	apic_wait_icr_idle();
-#endif
+	irq_work_queue(&__get_cpu_var(mce_irq_work));
 }
 
 DEFINE_PER_CPU(unsigned, mce_poll_count);
@@ -541,9 +514,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 
 	percpu_inc(mce_poll_count);
 
-	mce_setup(&m);
+	mce_gather_info(&m, NULL);
 
-	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	for (i = 0; i < banks; i++) {
 		if (!mce_banks[i].ctl || !test_bit(i, *b))
 			continue;
@@ -879,9 +851,9 @@ static int mce_usable_address(struct mce *m)
 {
 	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
 		return 0;
-	if ((m->misc & 0x3f) > PAGE_SHIFT)
+	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
 		return 0;
-	if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS)
+	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
 		return 0;
 	return 1;
 }
@@ -942,9 +914,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	if (!banks)
 		goto out;
 
-	mce_setup(&m);
+	mce_gather_info(&m, regs);
 
-	m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
 	final = &__get_cpu_var(mces_seen);
 	*final = m;
 
@@ -1028,7 +999,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
 			mce_ring_add(m.addr >> PAGE_SHIFT);
 
-		mce_get_rip(&m, regs);
 		mce_log(&m);
 
 		if (severity > worst) {
@@ -1190,7 +1160,8 @@ int mce_notify_irq(void)
 	clear_thread_flag(TIF_MCE_NOTIFY);
 
 	if (test_and_clear_bit(0, &mce_need_notify)) {
-		wake_up_interruptible(&mce_wait);
+		/* wake processes polling /dev/mcelog */
+		wake_up_interruptible(&mce_chrdev_wait);
 
 		/*
 		 * There is no risk of missing notifications because
@@ -1363,18 +1334,23 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 	return 0;
 }
 
-static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
+static int __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
 {
 	if (c->x86 != 5)
-		return;
+		return 0;
+
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		intel_p5_mcheck_init(c);
+		return 1;
 		break;
 	case X86_VENDOR_CENTAUR:
 		winchip_mcheck_init(c);
+		return 1;
 		break;
 	}
+
+	return 0;
 }
 
 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
@@ -1428,7 +1404,8 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 	if (mce_disabled)
 		return;
 
-	__mcheck_cpu_ancient_init(c);
+	if (__mcheck_cpu_ancient_init(c))
+		return;
 
 	if (!mce_available(c))
 		return;
@@ -1444,44 +1421,45 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_timer();
 	INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
-
+	init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
 }
 
 /*
- * Character device to read and clear the MCE log.
+ * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
  */
 
-static DEFINE_SPINLOCK(mce_state_lock);
-static int		open_count;		/* #times opened */
-static int		open_exclu;		/* already open exclusive? */
+static DEFINE_SPINLOCK(mce_chrdev_state_lock);
+static int mce_chrdev_open_count;	/* #times opened */
+static int mce_chrdev_open_exclu;	/* already open exclusive? */
 
-static int mce_open(struct inode *inode, struct file *file)
+static int mce_chrdev_open(struct inode *inode, struct file *file)
 {
-	spin_lock(&mce_state_lock);
+	spin_lock(&mce_chrdev_state_lock);
 
-	if (open_exclu || (open_count && (file->f_flags & O_EXCL))) {
-		spin_unlock(&mce_state_lock);
+	if (mce_chrdev_open_exclu ||
+	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
+		spin_unlock(&mce_chrdev_state_lock);
 
 		return -EBUSY;
 	}
 
 	if (file->f_flags & O_EXCL)
-		open_exclu = 1;
-	open_count++;
+		mce_chrdev_open_exclu = 1;
+	mce_chrdev_open_count++;
 
-	spin_unlock(&mce_state_lock);
+	spin_unlock(&mce_chrdev_state_lock);
 
 	return nonseekable_open(inode, file);
 }
 
-static int mce_release(struct inode *inode, struct file *file)
+static int mce_chrdev_release(struct inode *inode, struct file *file)
 {
-	spin_lock(&mce_state_lock);
+	spin_lock(&mce_chrdev_state_lock);
 
-	open_count--;
-	open_exclu = 0;
+	mce_chrdev_open_count--;
+	mce_chrdev_open_exclu = 0;
 
-	spin_unlock(&mce_state_lock);
+	spin_unlock(&mce_chrdev_state_lock);
 
 	return 0;
 }
@@ -1530,8 +1508,8 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
 	return 0;
 }
 
-static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
-			loff_t *off)
+static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
+				size_t usize, loff_t *off)
 {
 	char __user *buf = ubuf;
 	unsigned long *cpu_tsc;
@@ -1542,7 +1520,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	if (!cpu_tsc)
 		return -ENOMEM;
 
-	mutex_lock(&mce_read_mutex);
+	mutex_lock(&mce_chrdev_read_mutex);
 
 	if (!mce_apei_read_done) {
 		err = __mce_read_apei(&buf, usize);
@@ -1562,19 +1540,18 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	do {
 		for (i = prev; i < next; i++) {
 			unsigned long start = jiffies;
+			struct mce *m = &mcelog.entry[i];
 
-			while (!mcelog.entry[i].finished) {
+			while (!m->finished) {
 				if (time_after_eq(jiffies, start + 2)) {
-					memset(mcelog.entry + i, 0,
-					       sizeof(struct mce));
+					memset(m, 0, sizeof(*m));
 					goto timeout;
 				}
 				cpu_relax();
 			}
 			smp_rmb();
-			err |= copy_to_user(buf, mcelog.entry + i,
-					    sizeof(struct mce));
-			buf += sizeof(struct mce);
+			err |= copy_to_user(buf, m, sizeof(*m));
+			buf += sizeof(*m);
 timeout:
 			;
 		}
@@ -1594,13 +1571,13 @@ timeout:
 	on_each_cpu(collect_tscs, cpu_tsc, 1);
 
 	for (i = next; i < MCE_LOG_LEN; i++) {
-		if (mcelog.entry[i].finished &&
-		    mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) {
-			err |= copy_to_user(buf, mcelog.entry+i,
-					    sizeof(struct mce));
+		struct mce *m = &mcelog.entry[i];
+
+		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
+			err |= copy_to_user(buf, m, sizeof(*m));
 			smp_rmb();
-			buf += sizeof(struct mce);
-			memset(&mcelog.entry[i], 0, sizeof(struct mce));
+			buf += sizeof(*m);
+			memset(m, 0, sizeof(*m));
 		}
 	}
 
@@ -1608,15 +1585,15 @@ timeout:
 		err = -EFAULT;
 
 out:
-	mutex_unlock(&mce_read_mutex);
+	mutex_unlock(&mce_chrdev_read_mutex);
 	kfree(cpu_tsc);
 
 	return err ? err : buf - ubuf;
 }
 
-static unsigned int mce_poll(struct file *file, poll_table *wait)
+static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, &mce_wait, wait);
+	poll_wait(file, &mce_chrdev_wait, wait);
 	if (rcu_access_index(mcelog.next))
 		return POLLIN | POLLRDNORM;
 	if (!mce_apei_read_done && apei_check_mce())
@@ -1624,7 +1601,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
 	return 0;
 }
 
-static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
+				unsigned long arg)
 {
 	int __user *p = (int __user *)arg;
 
@@ -1652,16 +1630,16 @@ static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
 
 /* Modified in mce-inject.c, so not static or const */
 struct file_operations mce_chrdev_ops = {
-	.open			= mce_open,
-	.release		= mce_release,
-	.read			= mce_read,
-	.poll			= mce_poll,
-	.unlocked_ioctl		= mce_ioctl,
-	.llseek		= no_llseek,
+	.open			= mce_chrdev_open,
+	.release		= mce_chrdev_release,
+	.read			= mce_chrdev_read,
+	.poll			= mce_chrdev_poll,
+	.unlocked_ioctl		= mce_chrdev_ioctl,
+	.llseek			= no_llseek,
 };
 EXPORT_SYMBOL_GPL(mce_chrdev_ops);
 
-static struct miscdevice mce_log_device = {
+static struct miscdevice mce_chrdev_device = {
 	MISC_MCELOG_MINOR,
 	"mcelog",
 	&mce_chrdev_ops,
@@ -1719,7 +1697,7 @@ int __init mcheck_init(void)
 }
 
 /*
- * Sysfs support
+ * mce_syscore: PM support
  */
 
 /*
@@ -1739,12 +1717,12 @@ static int mce_disable_error_reporting(void)
 	return 0;
 }
 
-static int mce_suspend(void)
+static int mce_syscore_suspend(void)
 {
 	return mce_disable_error_reporting();
 }
 
-static void mce_shutdown(void)
+static void mce_syscore_shutdown(void)
 {
 	mce_disable_error_reporting();
 }
@@ -1754,18 +1732,22 @@ static void mce_shutdown(void)
  * Only one CPU is active at this time, the others get re-added later using
  * CPU hotplug:
  */
-static void mce_resume(void)
+static void mce_syscore_resume(void)
 {
 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
 }
 
 static struct syscore_ops mce_syscore_ops = {
-	.suspend	= mce_suspend,
-	.shutdown	= mce_shutdown,
-	.resume		= mce_resume,
+	.suspend	= mce_syscore_suspend,
+	.shutdown	= mce_syscore_shutdown,
+	.resume		= mce_syscore_resume,
 };
 
+/*
+ * mce_sysdev: Sysfs support
+ */
+
 static void mce_cpu_restart(void *data)
 {
 	del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1801,11 +1783,11 @@ static void mce_enable_ce(void *all)
 		__mcheck_cpu_init_timer();
 }
 
-static struct sysdev_class mce_sysclass = {
+static struct sysdev_class mce_sysdev_class = {
 	.name		= "machinecheck",
 };
 
-DEFINE_PER_CPU(struct sys_device, mce_dev);
+DEFINE_PER_CPU(struct sys_device, mce_sysdev);
 
 __cpuinitdata
 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
@@ -1934,7 +1916,7 @@ static struct sysdev_ext_attribute attr_cmci_disabled = {
 	&mce_cmci_disabled
 };
 
-static struct sysdev_attribute *mce_attrs[] = {
+static struct sysdev_attribute *mce_sysdev_attrs[] = {
 	&attr_tolerant.attr,
 	&attr_check_interval.attr,
 	&attr_trigger,
@@ -1945,66 +1927,67 @@ static struct sysdev_attribute *mce_attrs[] = {
 	NULL
 };
 
-static cpumask_var_t mce_dev_initialized;
+static cpumask_var_t mce_sysdev_initialized;
 
 /* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
-static __cpuinit int mce_create_device(unsigned int cpu)
+static __cpuinit int mce_sysdev_create(unsigned int cpu)
 {
+	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
 	int err;
 	int i, j;
 
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	memset(&per_cpu(mce_dev, cpu).kobj, 0, sizeof(struct kobject));
-	per_cpu(mce_dev, cpu).id	= cpu;
-	per_cpu(mce_dev, cpu).cls	= &mce_sysclass;
+	memset(&sysdev->kobj, 0, sizeof(struct kobject));
+	sysdev->id  = cpu;
+	sysdev->cls = &mce_sysdev_class;
 
-	err = sysdev_register(&per_cpu(mce_dev, cpu));
+	err = sysdev_register(sysdev);
 	if (err)
 		return err;
 
-	for (i = 0; mce_attrs[i]; i++) {
-		err = sysdev_create_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+	for (i = 0; mce_sysdev_attrs[i]; i++) {
+		err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
 		if (err)
 			goto error;
 	}
 	for (j = 0; j < banks; j++) {
-		err = sysdev_create_file(&per_cpu(mce_dev, cpu),
-					&mce_banks[j].attr);
+		err = sysdev_create_file(sysdev, &mce_banks[j].attr);
 		if (err)
 			goto error2;
 	}
-	cpumask_set_cpu(cpu, mce_dev_initialized);
+	cpumask_set_cpu(cpu, mce_sysdev_initialized);
 
 	return 0;
 error2:
 	while (--j >= 0)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
+		sysdev_remove_file(sysdev, &mce_banks[j].attr);
 error:
 	while (--i >= 0)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
 
-	sysdev_unregister(&per_cpu(mce_dev, cpu));
+	sysdev_unregister(sysdev);
 
 	return err;
 }
 
-static __cpuinit void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_sysdev_remove(unsigned int cpu)
 {
+	struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
 	int i;
 
-	if (!cpumask_test_cpu(cpu, mce_dev_initialized))
+	if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
 		return;
 
-	for (i = 0; mce_attrs[i]; i++)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
+	for (i = 0; mce_sysdev_attrs[i]; i++)
+		sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
 
 	for (i = 0; i < banks; i++)
-		sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
+		sysdev_remove_file(sysdev, &mce_banks[i].attr);
 
-	sysdev_unregister(&per_cpu(mce_dev, cpu));
-	cpumask_clear_cpu(cpu, mce_dev_initialized);
+	sysdev_unregister(sysdev);
+	cpumask_clear_cpu(cpu, mce_sysdev_initialized);
 }
 
 /* Make sure there are no machine checks on offlined CPUs. */
@@ -2054,7 +2037,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		mce_create_device(cpu);
+		mce_sysdev_create(cpu);
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
 		break;
@@ -2062,7 +2045,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_DEAD_FROZEN:
 		if (threshold_cpu_callback)
 			threshold_cpu_callback(action, cpu);
-		mce_remove_device(cpu);
+		mce_sysdev_remove(cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -2116,27 +2099,28 @@ static __init int mcheck_init_device(void)
 	if (!mce_available(&boot_cpu_data))
 		return -EIO;
 
-	zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
+	zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
 
 	mce_init_banks();
 
-	err = sysdev_class_register(&mce_sysclass);
+	err = sysdev_class_register(&mce_sysdev_class);
 	if (err)
 		return err;
 
 	for_each_online_cpu(i) {
-		err = mce_create_device(i);
+		err = mce_sysdev_create(i);
 		if (err)
 			return err;
 	}
 
 	register_syscore_ops(&mce_syscore_ops);
 	register_hotcpu_notifier(&mce_cpu_notifier);
-	misc_register(&mce_log_device);
+
+	/* register character device /dev/mcelog */
+	misc_register(&mce_chrdev_device);
 
 	return err;
 }
-
 device_initcall(mcheck_init_device);
 
 /*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index bb0adad3514..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -548,7 +548,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (!b)
 			goto out;
 
-		err = sysfs_create_link(&per_cpu(mce_dev, cpu).kobj,
+		err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -571,7 +571,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		goto out;
 	}
 
-	b->kobj = kobject_create_and_add(name, &per_cpu(mce_dev, cpu).kobj);
+	b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
@@ -591,7 +591,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (i == cpu)
 			continue;
 
-		err = sysfs_create_link(&per_cpu(mce_dev, i).kobj,
+		err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
 					b->kobj, name);
 		if (err)
 			goto out;
@@ -669,7 +669,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #ifdef CONFIG_SMP
 	/* sibling symlink */
 	if (shared_bank[bank] && b->blocks->cpu != cpu) {
-		sysfs_remove_link(&per_cpu(mce_dev, cpu).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
 		per_cpu(threshold_banks, cpu)[bank] = NULL;
 
 		return;
@@ -681,7 +681,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 		if (i == cpu)
 			continue;
 
-		sysfs_remove_link(&per_cpu(mce_dev, i).kobj, name);
+		sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
 		per_cpu(threshold_banks, i)[bank] = NULL;
 	}
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 929739a653d..08119a37e53 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -79,7 +79,6 @@ void set_mtrr_ops(const struct mtrr_ops *ops)
 static int have_wrcomb(void)
 {
 	struct pci_dev *dev;
-	u8 rev;
 
 	dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL);
 	if (dev != NULL) {
@@ -89,13 +88,11 @@ static int have_wrcomb(void)
 		 * chipsets to be tagged
 		 */
 		if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
-		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
-			pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
-			if (rev <= 5) {
-				pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
-				pci_dev_put(dev);
-				return 0;
-			}
+		    dev->device == PCI_DEVICE_ID_SERVERWORKS_LE &&
+		    dev->revision <= 5) {
+			pr_info("mtrr: Serverworks LE rev < 6 detected. Write-combining disabled.\n");
+			pci_dev_put(dev);
+			return 0;
 		}
 		/*
 		 * Intel 450NX errata # 23. Non ascending cacheline evictions to
@@ -137,55 +134,43 @@ static void __init init_table(void)
 }
 
 struct set_mtrr_data {
-	atomic_t	count;
-	atomic_t	gate;
 	unsigned long	smp_base;
 	unsigned long	smp_size;
 	unsigned int	smp_reg;
 	mtrr_type	smp_type;
 };
 
-static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
-
 /**
- * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed
+ * by all the CPUs.
  * @info: pointer to mtrr configuration data
  *
  * Returns nothing.
  */
-static int mtrr_work_handler(void *info)
+static int mtrr_rendezvous_handler(void *info)
 {
 #ifdef CONFIG_SMP
 	struct set_mtrr_data *data = info;
-	unsigned long flags;
-
-	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
-		cpu_relax();
-
-	local_irq_save(flags);
-
-	atomic_dec(&data->count);
-	while (atomic_read(&data->gate))
-		cpu_relax();
 
-	/*  The master has cleared me to execute  */
+	/*
+	 * We use this same function to initialize the mtrrs during boot,
+	 * resume, runtime cpu online and on an explicit request to set a
+	 * specific MTRR.
+	 *
+	 * During boot or suspend, the state of the boot cpu's mtrrs has been
+	 * saved, and we want to replicate that across all the cpus that come
+	 * online (either at the end of boot or resume or during a runtime cpu
+	 * online). If we're doing that, @reg is set to something special and on
+	 * all the cpu's we do mtrr_if->set_all() (On the logical cpu that
+	 * started the boot/resume sequence, this might be a duplicate
+	 * set_all()).
+	 */
 	if (data->smp_reg != ~0U) {
 		mtrr_if->set(data->smp_reg, data->smp_base,
 			     data->smp_size, data->smp_type);
-	} else if (mtrr_aps_delayed_init) {
-		/*
-		 * Initialize the MTRRs inaddition to the synchronisation.
-		 */
+	} else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) {
 		mtrr_if->set_all();
 	}
-
-	atomic_dec(&data->count);
-	while (!atomic_read(&data->gate))
-		cpu_relax();
-
-	atomic_dec(&data->count);
-	local_irq_restore(flags);
 #endif
 	return 0;
 }
@@ -223,20 +208,11 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
  * 14. Wait for buddies to catch up
  * 15. Enable interrupts.
  *
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU announces that it started the rendezvous handler by
- * decrementing the count, We reset data.count and set the data.gate flag
- * allowing all the cpu's to proceed with the work. As each cpu disables
- * interrupts, it'll decrement data.count once. We wait until it hits 0 and
- * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
- * are waiting for that flag to be cleared. Once it's cleared, each
- * CPU goes through the transition of updating MTRRs.
- * The CPU vendors may each do it differently,
- * so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate
- * to be set.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
- * Everyone then enables interrupts and we all continue on.
+ * What does that mean for us? Well, stop_machine() will ensure that
+ * the rendezvous handler is started on each CPU. And in lockstep they
+ * do the state transition of disabling interrupts, updating MTRR's
+ * (the CPU vendors may each do it differently, so we call mtrr_if->set()
+ * callback and let them take care of it.) and enabling interrupts.
  *
  * Note that the mechanism is the same for UP systems, too; all the SMP stuff
  * becomes nops.
@@ -244,92 +220,26 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
 static void
 set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type)
 {
-	struct set_mtrr_data data;
-	unsigned long flags;
-	int cpu;
-
-	preempt_disable();
-
-	data.smp_reg = reg;
-	data.smp_base = base;
-	data.smp_size = size;
-	data.smp_type = type;
-	atomic_set(&data.count, num_booting_cpus() - 1);
-
-	/* Make sure data.count is visible before unleashing other CPUs */
-	smp_wmb();
-	atomic_set(&data.gate, 0);
-
-	/* Start the ball rolling on other CPUs */
-	for_each_online_cpu(cpu) {
-		struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
-
-		if (cpu == smp_processor_id())
-			continue;
-
-		stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
-	}
-
-
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	/* Ok, reset count and toggle gate */
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 1);
-
-	local_irq_save(flags);
-
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	/* Ok, reset count and toggle gate */
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 0);
-
-	/* Do our MTRR business */
-
-	/*
-	 * HACK!
-	 *
-	 * We use this same function to initialize the mtrrs during boot,
-	 * resume, runtime cpu online and on an explicit request to set a
-	 * specific MTRR.
-	 *
-	 * During boot or suspend, the state of the boot cpu's mtrrs has been
-	 * saved, and we want to replicate that across all the cpus that come
-	 * online (either at the end of boot or resume or during a runtime cpu
-	 * online). If we're doing that, @reg is set to something special and on
-	 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
-	 * is unnecessary if at this point we are still on the cpu that started
-	 * the boot/resume sequence. But there is no guarantee that we are still
-	 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
-	 * sure that we are in sync with everyone else.
-	 */
-	if (reg != ~0U)
-		mtrr_if->set(reg, base, size, type);
-	else
-		mtrr_if->set_all();
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
 
-	/* Wait for the others */
-	while (atomic_read(&data.count))
-		cpu_relax();
-
-	atomic_set(&data.count, num_booting_cpus() - 1);
-	smp_wmb();
-	atomic_set(&data.gate, 1);
-
-	/*
-	 * Wait here for everyone to have seen the gate change
-	 * So we're the last ones to touch 'data'
-	 */
-	while (atomic_read(&data.count))
-		cpu_relax();
+	stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask);
+}
 
-	local_irq_restore(flags);
-	preempt_enable();
+static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base,
+				      unsigned long size, mtrr_type type)
+{
+	struct set_mtrr_data data = { .smp_reg = reg,
+				      .smp_base = base,
+				      .smp_size = size,
+				      .smp_type = type
+				    };
+
+	stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data,
+				       cpu_callout_mask);
 }
 
 /**
@@ -783,7 +693,7 @@ void mtrr_ap_init(void)
 	 *   2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
 	 *      lock to prevent mtrr entry changes
 	 */
-	set_mtrr(~0U, 0, 0, 0);
+	set_mtrr_from_inactive_cpu(~0U, 0, 0, 0);
 }
 
 /**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3a0338b4b17..4ee3abf20ed 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -22,7 +22,6 @@
 #include <linux/sched.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
-#include <linux/highmem.h>
 #include <linux/cpu.h>
 #include <linux/bitops.h>
 
@@ -45,38 +44,27 @@ do {								\
 #endif
 
 /*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ *          |   NHM/WSM    |      SNB     |
+ * register -------------------------------
+ *          |  HT  | no HT |  HT  | no HT |
+ *-----------------------------------------
+ * offcore  | core | core  | cpu  | core  |
+ * lbr_sel  | core | core  | cpu  | core  |
+ * ld_lat   | cpu  | core  | cpu  | core  |
+ *-----------------------------------------
+ *
+ * Given that there is a small number of shared regs,
+ * we can pre-allocate their slot in the per-cpu
+ * per-core reg tables.
  */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
-	unsigned long offset, addr = (unsigned long)from;
-	unsigned long size, len = 0;
-	struct page *page;
-	void *map;
-	int ret;
-
-	do {
-		ret = __get_user_pages_fast(addr, 1, 0, &page);
-		if (!ret)
-			break;
-
-		offset = addr & (PAGE_SIZE - 1);
-		size = min(PAGE_SIZE - offset, n - len);
-
-		map = kmap_atomic(page);
-		memcpy(to, map+offset, size);
-		kunmap_atomic(map);
-		put_page(page);
+enum extra_reg_type {
+	EXTRA_REG_NONE  = -1,	/* not used */
 
-		len  += size;
-		to   += size;
-		addr += size;
+	EXTRA_REG_RSP_0 = 0,	/* offcore_response_0 */
+	EXTRA_REG_RSP_1 = 1,	/* offcore_response_1 */
 
-	} while (len < n);
-
-	return len;
-}
+	EXTRA_REG_MAX		/* number of entries needed */
+};
 
 struct event_constraint {
 	union {
@@ -132,11 +120,10 @@ struct cpu_hw_events {
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 
 	/*
-	 * Intel percore register state.
-	 * Coordinate shared resources between HT threads.
+	 * manage shared (per-core, per-cpu) registers
+	 * used on Intel NHM/WSM/SNB
 	 */
-	int				percore_used; /* Used by this CPU? */
-	struct intel_percore		*per_core;
+	struct intel_shared_regs	*shared_regs;
 
 	/*
 	 * AMD specific bits
@@ -187,26 +174,45 @@ struct cpu_hw_events {
 	for ((e) = (c); (e)->weight; (e)++)
 
 /*
+ * Per register state.
+ */
+struct er_account {
+	raw_spinlock_t		lock;	/* per-core: protect structure */
+	u64			config;	/* extra MSR config */
+	u64			reg;	/* extra MSR number */
+	atomic_t		ref;	/* reference count */
+};
+
+/*
  * Extra registers for specific events.
+ *
  * Some events need large masks and require external MSRs.
- * Define a mapping to these extra registers.
+ * Those extra MSRs end up being shared for all events on
+ * a PMU and sometimes between PMU of sibling HT threads.
+ * In either case, the kernel needs to handle conflicting
+ * accesses to those extra, shared, regs. The data structure
+ * to manage those registers is stored in cpu_hw_event.
  */
 struct extra_reg {
 	unsigned int		event;
 	unsigned int		msr;
 	u64			config_mask;
 	u64			valid_mask;
+	int			idx;  /* per_xxx->regs[] reg index */
 };
 
-#define EVENT_EXTRA_REG(e, ms, m, vm) {	\
+#define EVENT_EXTRA_REG(e, ms, m, vm, i) {	\
 	.event = (e),		\
 	.msr = (ms),		\
 	.config_mask = (m),	\
 	.valid_mask = (vm),	\
+	.idx = EXTRA_REG_##i	\
 	}
-#define INTEL_EVENT_EXTRA_REG(event, msr, vm)	\
-	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
-#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
+
+#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)	\
+	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
+
+#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
 
 union perf_capabilities {
 	struct {
@@ -252,7 +258,6 @@ struct x86_pmu {
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
 	struct event_constraint *event_constraints;
-	struct event_constraint *percore_constraints;
 	void		(*quirks)(void);
 	int		perfctr_second_write;
 
@@ -286,8 +291,12 @@ struct x86_pmu {
 	 * Extra registers for events
 	 */
 	struct extra_reg *extra_regs;
+	unsigned int er_flags;
 };
 
+#define ERF_NO_HT_SHARING	1
+#define ERF_HAS_RSP_1		2
+
 static struct x86_pmu x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
@@ -393,10 +402,10 @@ static inline unsigned int x86_pmu_event_addr(int index)
  */
 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 {
+	struct hw_perf_event_extra *reg;
 	struct extra_reg *er;
 
-	event->hw.extra_reg = 0;
-	event->hw.extra_config = 0;
+	reg = &event->hw.extra_reg;
 
 	if (!x86_pmu.extra_regs)
 		return 0;
@@ -406,8 +415,10 @@ static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 			continue;
 		if (event->attr.config1 & ~er->valid_mask)
 			return -EINVAL;
-		event->hw.extra_reg = er->msr;
-		event->hw.extra_config = event->attr.config1;
+
+		reg->idx = er->idx;
+		reg->config = event->attr.config1;
+		reg->reg = er->msr;
 		break;
 	}
 	return 0;
@@ -706,6 +717,9 @@ static int __x86_pmu_event_init(struct perf_event *event)
 	event->hw.last_cpu = -1;
 	event->hw.last_tag = ~0ULL;
 
+	/* mark unused */
+	event->hw.extra_reg.idx = EXTRA_REG_NONE;
+
 	return x86_pmu.hw_config(event);
 }
 
@@ -747,8 +761,8 @@ static void x86_pmu_disable(struct pmu *pmu)
 static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
 					  u64 enable_mask)
 {
-	if (hwc->extra_reg)
-		wrmsrl(hwc->extra_reg, hwc->extra_config);
+	if (hwc->extra_reg.reg)
+		wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
 	wrmsrl(hwc->config_base, hwc->config | enable_mask);
 }
 
@@ -1332,7 +1346,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_event_set_period(event))
 			continue;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
@@ -1637,6 +1651,40 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
 	perf_pmu_enable(pmu);
 	return 0;
 }
+/*
+ * a fake_cpuc is used to validate event groups. Due to
+ * the extra reg logic, we need to also allocate a fake
+ * per_core and per_cpu structure. Otherwise, group events
+ * using extra reg may conflict without the kernel being
+ * able to catch this when the last event gets added to
+ * the group.
+ */
+static void free_fake_cpuc(struct cpu_hw_events *cpuc)
+{
+	kfree(cpuc->shared_regs);
+	kfree(cpuc);
+}
+
+static struct cpu_hw_events *allocate_fake_cpuc(void)
+{
+	struct cpu_hw_events *cpuc;
+	int cpu = raw_smp_processor_id();
+
+	cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL);
+	if (!cpuc)
+		return ERR_PTR(-ENOMEM);
+
+	/* only needed, if we have extra_regs */
+	if (x86_pmu.extra_regs) {
+		cpuc->shared_regs = allocate_shared_regs(cpu);
+		if (!cpuc->shared_regs)
+			goto error;
+	}
+	return cpuc;
+error:
+	free_fake_cpuc(cpuc);
+	return ERR_PTR(-ENOMEM);
+}
 
 /*
  * validate that we can schedule this event
@@ -1647,9 +1695,9 @@ static int validate_event(struct perf_event *event)
 	struct event_constraint *c;
 	int ret = 0;
 
-	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
-	if (!fake_cpuc)
-		return -ENOMEM;
+	fake_cpuc = allocate_fake_cpuc();
+	if (IS_ERR(fake_cpuc))
+		return PTR_ERR(fake_cpuc);
 
 	c = x86_pmu.get_event_constraints(fake_cpuc, event);
 
@@ -1659,7 +1707,7 @@ static int validate_event(struct perf_event *event)
 	if (x86_pmu.put_event_constraints)
 		x86_pmu.put_event_constraints(fake_cpuc, event);
 
-	kfree(fake_cpuc);
+	free_fake_cpuc(fake_cpuc);
 
 	return ret;
 }
@@ -1679,36 +1727,32 @@ static int validate_group(struct perf_event *event)
 {
 	struct perf_event *leader = event->group_leader;
 	struct cpu_hw_events *fake_cpuc;
-	int ret, n;
-
-	ret = -ENOMEM;
-	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
-	if (!fake_cpuc)
-		goto out;
+	int ret = -ENOSPC, n;
 
+	fake_cpuc = allocate_fake_cpuc();
+	if (IS_ERR(fake_cpuc))
+		return PTR_ERR(fake_cpuc);
 	/*
 	 * the event is not yet connected with its
 	 * siblings therefore we must first collect
 	 * existing siblings, then add the new event
 	 * before we can simulate the scheduling
 	 */
-	ret = -ENOSPC;
 	n = collect_events(fake_cpuc, leader, true);
 	if (n < 0)
-		goto out_free;
+		goto out;
 
 	fake_cpuc->n_events = n;
 	n = collect_events(fake_cpuc, event, false);
 	if (n < 0)
-		goto out_free;
+		goto out;
 
 	fake_cpuc->n_events = n;
 
 	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
 
-out_free:
-	kfree(fake_cpuc);
 out:
+	free_fake_cpuc(fake_cpuc);
 	return ret;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index fe29c1d2219..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -89,6 +89,20 @@ static __initconst const u64 amd_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */
+		[ C(RESULT_MISS)   ] = 0x98e9, /* CPU Request to Memory, r   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 41178c826c4..45fbb8f7f54 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,25 +1,15 @@
 #ifdef CONFIG_CPU_SUP_INTEL
 
-#define MAX_EXTRA_REGS 2
-
-/*
- * Per register state.
- */
-struct er_account {
-	int			ref;		/* reference count */
-	unsigned int		extra_reg;	/* extra MSR number */
-	u64			extra_config;	/* extra MSR config */
-};
-
 /*
- * Per core state
- * This used to coordinate shared registers for HT threads.
+ * Per core/cpu state
+ *
+ * Used to coordinate shared registers between HT threads or
+ * among events on a single PMU.
  */
-struct intel_percore {
-	raw_spinlock_t		lock;		/* protect structure */
-	struct er_account	regs[MAX_EXTRA_REGS];
-	int			refcnt;		/* number of threads */
-	unsigned		core_id;
+struct intel_shared_regs {
+	struct er_account       regs[EXTRA_REG_MAX];
+	int                     refcnt;		/* per-core: #HT threads */
+	unsigned                core_id;	/* per-core: core id */
 };
 
 /*
@@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
 
 static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
-{
-	INTEL_EVENT_CONSTRAINT(0xb7, 0),
-	EVENT_CONSTRAINT_END
-};
-
 static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
 {
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
@@ -116,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
 	/* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
 	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
-	INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
-	INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 	EVENT_CONSTRAINT_END
@@ -125,15 +107,13 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 
 static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
 {
-	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
-	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0),
+	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1),
 	EVENT_EXTRA_END
 };
 
-static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
+static struct event_constraint intel_v1_event_constraints[] __read_mostly =
 {
-	INTEL_EVENT_CONSTRAINT(0xb7, 0),
-	INTEL_EVENT_CONSTRAINT(0xbb, 0),
 	EVENT_CONSTRAINT_END
 };
 
@@ -145,6 +125,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
 	EVENT_CONSTRAINT_END
 };
 
+static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
+	INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+	INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+	EVENT_EXTRA_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -245,6 +231,21 @@ static __initconst const u64 snb_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+
 };
 
 static __initconst const u64 westmere_hw_cache_event_ids
@@ -346,6 +347,20 @@ static __initconst const u64 westmere_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+ },
 };
 
 /*
@@ -398,7 +413,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
 		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
 		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
 	},
- }
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
+		[ C(RESULT_MISS)   ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
+	},
+ },
 };
 
 static __initconst const u64 nehalem_hw_cache_event_ids
@@ -500,6 +529,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01b7,
+		[ C(RESULT_MISS)   ] = 0x01b7,
+	},
+ },
 };
 
 static __initconst const u64 core2_hw_cache_event_ids
@@ -1003,7 +1046,7 @@ again:
 
 		data.period = event->hw.last_period;
 
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
@@ -1037,65 +1080,121 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }
 
+static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
+{
+	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+		return false;
+
+	if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
+		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+		event->hw.config |= 0x01bb;
+		event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
+		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
+	} else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
+		event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
+		event->hw.config |= 0x01b7;
+		event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
+		event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
+	}
+
+	if (event->hw.extra_reg.idx == orig_idx)
+		return false;
+
+	return true;
+}
+
+/*
+ * manage allocation of shared extra msr for certain events
+ *
+ * sharing can be:
+ * per-cpu: to be shared between the various events on a single PMU
+ * per-core: per-cpu + shared by HT threads
+ */
 static struct event_constraint *
-intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
+				   struct perf_event *event)
 {
-	struct hw_perf_event *hwc = &event->hw;
-	unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
-	struct event_constraint *c;
-	struct intel_percore *pc;
+	struct event_constraint *c = &emptyconstraint;
+	struct hw_perf_event_extra *reg = &event->hw.extra_reg;
 	struct er_account *era;
-	int i;
-	int free_slot;
-	int found;
+	unsigned long flags;
+	int orig_idx = reg->idx;
 
-	if (!x86_pmu.percore_constraints || hwc->extra_alloc)
-		return NULL;
+	/* already allocated shared msr */
+	if (reg->alloc)
+		return &unconstrained;
 
-	for (c = x86_pmu.percore_constraints; c->cmask; c++) {
-		if (e != c->code)
-			continue;
+again:
+	era = &cpuc->shared_regs->regs[reg->idx];
+	/*
+	 * we use spin_lock_irqsave() to avoid lockdep issues when
+	 * passing a fake cpuc
+	 */
+	raw_spin_lock_irqsave(&era->lock, flags);
+
+	if (!atomic_read(&era->ref) || era->config == reg->config) {
+
+		/* lock in msr value */
+		era->config = reg->config;
+		era->reg = reg->reg;
+
+		/* one more user */
+		atomic_inc(&era->ref);
+
+		/* no need to reallocate during incremental event scheduling */
+		reg->alloc = 1;
 
 		/*
-		 * Allocate resource per core.
+		 * All events using extra_reg are unconstrained.
+		 * Avoids calling x86_get_event_constraints()
+		 *
+		 * Must revisit if extra_reg controlling events
+		 * ever have constraints. Worst case we go through
+		 * the regular event constraint table.
 		 */
-		pc = cpuc->per_core;
-		if (!pc)
-			break;
-		c = &emptyconstraint;
-		raw_spin_lock(&pc->lock);
-		free_slot = -1;
-		found = 0;
-		for (i = 0; i < MAX_EXTRA_REGS; i++) {
-			era = &pc->regs[i];
-			if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
-				/* Allow sharing same config */
-				if (hwc->extra_config == era->extra_config) {
-					era->ref++;
-					cpuc->percore_used = 1;
-					hwc->extra_alloc = 1;
-					c = NULL;
-				}
-				/* else conflict */
-				found = 1;
-				break;
-			} else if (era->ref == 0 && free_slot == -1)
-				free_slot = i;
-		}
-		if (!found && free_slot != -1) {
-			era = &pc->regs[free_slot];
-			era->ref = 1;
-			era->extra_reg = hwc->extra_reg;
-			era->extra_config = hwc->extra_config;
-			cpuc->percore_used = 1;
-			hwc->extra_alloc = 1;
-			c = NULL;
-		}
-		raw_spin_unlock(&pc->lock);
-		return c;
+		c = &unconstrained;
+	} else if (intel_try_alt_er(event, orig_idx)) {
+		raw_spin_unlock(&era->lock);
+		goto again;
 	}
+	raw_spin_unlock_irqrestore(&era->lock, flags);
 
-	return NULL;
+	return c;
+}
+
+static void
+__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
+				   struct hw_perf_event_extra *reg)
+{
+	struct er_account *era;
+
+	/*
+	 * only put constraint if extra reg was actually
+	 * allocated. Also takes care of event which do
+	 * not use an extra shared reg
+	 */
+	if (!reg->alloc)
+		return;
+
+	era = &cpuc->shared_regs->regs[reg->idx];
+
+	/* one fewer user */
+	atomic_dec(&era->ref);
+
+	/* allocate again next time */
+	reg->alloc = 0;
+}
+
+static struct event_constraint *
+intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
+			      struct perf_event *event)
+{
+	struct event_constraint *c = NULL;
+
+	if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
+		c = __intel_shared_reg_get_constraints(cpuc, event);
+
+	return c;
 }
 
 static struct event_constraint *
@@ -1111,49 +1210,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	c = intel_percore_constraints(cpuc, event);
+	c = intel_shared_regs_constraints(cpuc, event);
 	if (c)
 		return c;
 
 	return x86_get_event_constraints(cpuc, event);
 }
 
-static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+static void
+intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
-	struct extra_reg *er;
-	struct intel_percore *pc;
-	struct er_account *era;
-	struct hw_perf_event *hwc = &event->hw;
-	int i, allref;
+	struct hw_perf_event_extra *reg;
 
-	if (!cpuc->percore_used)
-		return;
-
-	for (er = x86_pmu.extra_regs; er->msr; er++) {
-		if (er->event != (hwc->config & er->config_mask))
-			continue;
+	reg = &event->hw.extra_reg;
+	if (reg->idx != EXTRA_REG_NONE)
+		__intel_shared_reg_put_constraints(cpuc, reg);
+}
 
-		pc = cpuc->per_core;
-		raw_spin_lock(&pc->lock);
-		for (i = 0; i < MAX_EXTRA_REGS; i++) {
-			era = &pc->regs[i];
-			if (era->ref > 0 &&
-			    era->extra_config == hwc->extra_config &&
-			    era->extra_reg == er->msr) {
-				era->ref--;
-				hwc->extra_alloc = 0;
-				break;
-			}
-		}
-		allref = 0;
-		for (i = 0; i < MAX_EXTRA_REGS; i++)
-			allref += pc->regs[i].ref;
-		if (allref == 0)
-			cpuc->percore_used = 0;
-		raw_spin_unlock(&pc->lock);
-		break;
-	}
+static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
+					struct perf_event *event)
+{
+	intel_put_shared_regs_event_constraints(cpuc, event);
 }
 
 static int intel_pmu_hw_config(struct perf_event *event)
@@ -1231,20 +1309,36 @@ static __initconst const struct x86_pmu core_pmu = {
 	.event_constraints	= intel_core_event_constraints,
 };
 
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	struct intel_shared_regs *regs;
+	int i;
+
+	regs = kzalloc_node(sizeof(struct intel_shared_regs),
+			    GFP_KERNEL, cpu_to_node(cpu));
+	if (regs) {
+		/*
+		 * initialize the locks to keep lockdep happy
+		 */
+		for (i = 0; i < EXTRA_REG_MAX; i++)
+			raw_spin_lock_init(&regs->regs[i].lock);
+
+		regs->core_id = -1;
+	}
+	return regs;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!cpu_has_ht_siblings())
+	if (!x86_pmu.extra_regs)
 		return NOTIFY_OK;
 
-	cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
-				      GFP_KERNEL, cpu_to_node(cpu));
-	if (!cpuc->per_core)
+	cpuc->shared_regs = allocate_shared_regs(cpu);
+	if (!cpuc->shared_regs)
 		return NOTIFY_BAD;
 
-	raw_spin_lock_init(&cpuc->per_core->lock);
-	cpuc->per_core->core_id = -1;
 	return NOTIFY_OK;
 }
 
@@ -1260,32 +1354,34 @@ static void intel_pmu_cpu_starting(int cpu)
 	 */
 	intel_pmu_lbr_reset();
 
-	if (!cpu_has_ht_siblings())
+	if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
 		return;
 
 	for_each_cpu(i, topology_thread_cpumask(cpu)) {
-		struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
+		struct intel_shared_regs *pc;
 
+		pc = per_cpu(cpu_hw_events, i).shared_regs;
 		if (pc && pc->core_id == core_id) {
-			kfree(cpuc->per_core);
-			cpuc->per_core = pc;
+			kfree(cpuc->shared_regs);
+			cpuc->shared_regs = pc;
 			break;
 		}
 	}
 
-	cpuc->per_core->core_id = core_id;
-	cpuc->per_core->refcnt++;
+	cpuc->shared_regs->core_id = core_id;
+	cpuc->shared_regs->refcnt++;
 }
 
 static void intel_pmu_cpu_dying(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	struct intel_percore *pc = cpuc->per_core;
+	struct intel_shared_regs *pc;
 
+	pc = cpuc->shared_regs;
 	if (pc) {
 		if (pc->core_id == -1 || --pc->refcnt == 0)
 			kfree(pc);
-		cpuc->per_core = NULL;
+		cpuc->shared_regs = NULL;
 	}
 
 	fini_debug_store_on_cpu(cpu);
@@ -1436,7 +1532,6 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_nehalem_event_constraints;
 		x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
-		x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.extra_regs = intel_nehalem_extra_regs;
 
@@ -1481,10 +1576,10 @@ static __init int intel_pmu_init(void)
 		intel_pmu_lbr_init_nhm();
 
 		x86_pmu.event_constraints = intel_westmere_event_constraints;
-		x86_pmu.percore_constraints = intel_westmere_percore_constraints;
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_westmere_extra_regs;
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
 
 		/* UOPS_ISSUED.STALLED_CYCLES */
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1502,6 +1597,10 @@ static __init int intel_pmu_init(void)
 
 		x86_pmu.event_constraints = intel_snb_event_constraints;
 		x86_pmu.pebs_constraints = intel_snb_pebs_events;
+		x86_pmu.extra_regs = intel_snb_extra_regs;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
 
 		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
 		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
@@ -1512,11 +1611,19 @@ static __init int intel_pmu_init(void)
 		break;
 
 	default:
-		/*
-		 * default constraints for v2 and up
-		 */
-		x86_pmu.event_constraints = intel_gen_event_constraints;
-		pr_cont("generic architected perfmon, ");
+		switch (x86_pmu.version) {
+		case 1:
+			x86_pmu.event_constraints = intel_v1_event_constraints;
+			pr_cont("generic architected perfmon v1, ");
+			break;
+		default:
+			/*
+			 * default constraints for v2 and up
+			 */
+			x86_pmu.event_constraints = intel_gen_event_constraints;
+			pr_cont("generic architected perfmon, ");
+			break;
+		}
 	}
 	return 0;
 }
@@ -1528,4 +1635,8 @@ static int intel_pmu_init(void)
 	return 0;
 }
 
+static struct intel_shared_regs *allocate_shared_regs(int cpu)
+{
+	return NULL;
+}
 #endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index bab491b8ee2..1b1ef3addcf 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -340,7 +340,7 @@ static int intel_pmu_drain_bts_buffer(void)
 	 */
 	perf_prepare_sample(&header, &data, event, &regs);
 
-	if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+	if (perf_output_begin(&handle, event, header.size * (top - at)))
 		return 1;
 
 	for (; at < top; at++) {
@@ -616,7 +616,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
 	else
 		regs.flags &= ~PERF_EFLAGS_EXACT;
 
-	if (perf_event_overflow(event, 1, &data, &regs))
+	if (perf_event_overflow(event, &data, &regs))
 		x86_pmu_stop(event, 0);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index ead584fb6a7..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -554,13 +554,102 @@ static __initconst const u64 p4_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = -1,
 	},
  },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
+/*
+ * Because of Netburst being quite restricted in how many
+ * identical events may run simultaneously, we introduce event aliases,
+ * ie the different events which have the same functionality but
+ * utilize non-intersected resources (ESCR/CCCR/counter registers).
+ *
+ * This allow us to relax restrictions a bit and run two or more
+ * identical events together.
+ *
+ * Never set any custom internal bits such as P4_CONFIG_HT,
+ * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are
+ * either up to date automatically or not applicable at all.
+ */
+struct p4_event_alias {
+	u64 original;
+	u64 alternative;
+} p4_event_aliases[] = {
+	{
+		/*
+		 * Non-halted cycles can be substituted with non-sleeping cycles (see
+		 * Intel SDM Vol3b for details). We need this alias to be able
+		 * to run nmi-watchdog and 'perf top' (or any other user space tool
+		 * which is interested in running PERF_COUNT_HW_CPU_CYCLES)
+		 * simultaneously.
+		 */
+	.original	=
+		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+	.alternative	=
+		p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT)		|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0)	|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1)	|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2)	|
+				    P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))|
+		p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT		|
+				    P4_CCCR_COMPARE),
+	},
+};
+
+static u64 p4_get_alias_event(u64 config)
+{
+	u64 config_match;
+	int i;
+
+	/*
+	 * Only event with special mark is allowed,
+	 * we're to be sure it didn't come as malformed
+	 * RAW event.
+	 */
+	if (!(config & P4_CONFIG_ALIASABLE))
+		return 0;
+
+	config_match = config & P4_CONFIG_EVENT_ALIAS_MASK;
+
+	for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) {
+		if (config_match == p4_event_aliases[i].original) {
+			config_match = p4_event_aliases[i].alternative;
+			break;
+		} else if (config_match == p4_event_aliases[i].alternative) {
+			config_match = p4_event_aliases[i].original;
+			break;
+		}
+	}
+
+	if (i >= ARRAY_SIZE(p4_event_aliases))
+		return 0;
+
+	return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS);
+}
+
 static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
   /* non-halted CPU clocks */
   [PERF_COUNT_HW_CPU_CYCLES] =
 	p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS)		|
-		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+		P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING))	|
+		P4_CONFIG_ALIASABLE,
 
   /*
    * retired instructions
@@ -945,7 +1034,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 		if (!x86_perf_event_set_period(event))
 			continue;
-		if (perf_event_overflow(event, 1, &data, regs))
+		if (perf_event_overflow(event, &data, regs))
 			x86_pmu_stop(event, 0);
 	}
 
@@ -1120,6 +1209,8 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 	struct p4_event_bind *bind;
 	unsigned int i, thread, num;
 	int cntr_idx, escr_idx;
+	u64 config_alias;
+	int pass;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 	bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
@@ -1128,6 +1219,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 
 		hwc = &cpuc->event_list[i]->hw;
 		thread = p4_ht_thread(cpu);
+		pass = 0;
+
+again:
+		/*
+		 * It's possible to hit a circular lock
+		 * between original and alternative events
+		 * if both are scheduled already.
+		 */
+		if (pass > 2)
+			goto done;
+
 		bind = p4_config_get_bind(hwc->config);
 		escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
 		if (unlikely(escr_idx == -1))
@@ -1141,8 +1243,17 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 		}
 
 		cntr_idx = p4_next_cntr(thread, used_mask, bind);
-		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
-			goto done;
+		if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) {
+			/*
+			 * Check whether an event alias is still available.
+			 */
+			config_alias = p4_get_alias_event(hwc->config);
+			if (!config_alias)
+				goto done;
+			hwc->config = config_alias;
+			pass++;
+			goto again;
+		}
 
 		p4_pmu_swap_config_ts(hwc, cpu);
 		if (assign)
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 9aeb78a23de..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -134,6 +134,24 @@ static int __init add_bus_probe(void)
 module_init(add_bus_probe);
 
 #ifdef CONFIG_PCI
+struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus)
+{
+	struct device_node *np;
+
+	for_each_node_by_type(np, "pci") {
+		const void *prop;
+		unsigned int bus_min;
+
+		prop = of_get_property(np, "bus-range", NULL);
+		if (!prop)
+			continue;
+		bus_min = be32_to_cpup(prop);
+		if (bus->number == bus_min)
+			return np;
+	}
+	return NULL;
+}
+
 static int x86_of_pci_irq_enable(struct pci_dev *dev)
 {
 	struct of_irq oirq;
@@ -165,50 +183,8 @@ static void x86_of_pci_irq_disable(struct pci_dev *dev)
 
 void __cpuinit x86_of_pci_init(void)
 {
-	struct device_node *np;
-
 	pcibios_enable_irq = x86_of_pci_irq_enable;
 	pcibios_disable_irq = x86_of_pci_irq_disable;
-
-	for_each_node_by_type(np, "pci") {
-		const void *prop;
-		struct pci_bus *bus;
-		unsigned int bus_min;
-		struct device_node *child;
-
-		prop = of_get_property(np, "bus-range", NULL);
-		if (!prop)
-			continue;
-		bus_min = be32_to_cpup(prop);
-
-		bus = pci_find_bus(0, bus_min);
-		if (!bus) {
-			printk(KERN_ERR "Can't find a node for bus %s.\n",
-					np->full_name);
-			continue;
-		}
-
-		if (bus->self)
-			bus->self->dev.of_node = np;
-		else
-			bus->dev.of_node = np;
-
-		for_each_child_of_node(np, child) {
-			struct pci_dev *dev;
-			u32 devfn;
-
-			prop = of_get_property(child, "reg", NULL);
-			if (!prop)
-				continue;
-
-			devfn = (be32_to_cpup(prop) >> 8) & 0xff;
-			dev = pci_get_slot(bus, devfn);
-			if (!dev)
-				continue;
-			dev->dev.of_node = child;
-			pci_dev_put(dev);
-		}
-	}
 }
 #endif
 
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index e71c98d3c0d..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -105,34 +105,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
 }
 
 /*
- * We are returning from the irq stack and go to the previous one.
- * If the previous stack is also in the irq stack, then bp in the first
- * frame of the irq stack points to the previous, interrupted one.
- * Otherwise we have another level of indirection: We first save
- * the bp of the previous stack, then we switch the stack to the irq one
- * and save a new bp that links to the previous one.
- * (See save_args())
- */
-static inline unsigned long
-fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
-		  unsigned long *irq_stack, unsigned long *irq_stack_end)
-{
-#ifdef CONFIG_FRAME_POINTER
-	struct stack_frame *frame = (struct stack_frame *)bp;
-	unsigned long next;
-
-	if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
-		if (!probe_kernel_address(&frame->next_frame, next))
-			return next;
-		else
-			WARN_ONCE(1, "Perf: bad frame pointer = %p in "
-				  "callchain\n", &frame->next_frame);
-	}
-#endif
-	return bp;
-}
-
-/*
  * x86-64 can have up to three kernel stacks:
  * process stack
  * interrupt stack
@@ -155,9 +127,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		task = current;
 
 	if (!stack) {
-		stack = &dummy;
-		if (task && task != current)
+		if (regs)
+			stack = (unsigned long *)regs->sp;
+		else if (task && task != current)
 			stack = (unsigned long *)task->thread.sp;
+		else
+			stack = &dummy;
 	}
 
 	if (!bp)
@@ -205,8 +180,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 				 * pointer (index -1 to end) in the IRQ stack:
 				 */
 				stack = (unsigned long *) (irq_stack_end[-1]);
-				bp = fixup_bp_irq_link(bp, stack, irq_stack,
-						       irq_stack_end);
 				irq_stack_end = NULL;
 				ops->stack(data, "EOI");
 				continue;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0c989..e13329d800c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -9,6 +9,8 @@
 /*
  * entry.S contains the system-call and fault low-level handling routines.
  *
+ * Some of this is documented in Documentation/x86/entry_64.txt
+ *
  * NOTE: This code handles signal-recognition, which happens every time
  * after an interrupt and after each system call.
  *
@@ -297,27 +299,26 @@ ENDPROC(native_usergs_sysret64)
 	.endm
 
 /* save partial stack frame */
-	.pushsection .kprobes.text, "ax"
-ENTRY(save_args)
-	XCPT_FRAME
+	.macro SAVE_ARGS_IRQ
 	cld
-	/*
-	 * start from rbp in pt_regs and jump over
-	 * return address.
-	 */
-	movq_cfi rdi, RDI+8-RBP
-	movq_cfi rsi, RSI+8-RBP
-	movq_cfi rdx, RDX+8-RBP
-	movq_cfi rcx, RCX+8-RBP
-	movq_cfi rax, RAX+8-RBP
-	movq_cfi  r8,  R8+8-RBP
-	movq_cfi  r9,  R9+8-RBP
-	movq_cfi r10, R10+8-RBP
-	movq_cfi r11, R11+8-RBP
-
-	leaq -RBP+8(%rsp),%rdi	/* arg1 for handler */
-	movq_cfi rbp, 8		/* push %rbp */
-	leaq 8(%rsp), %rbp		/* mov %rsp, %ebp */
+	/* start from rbp in pt_regs and jump over */
+	movq_cfi rdi, RDI-RBP
+	movq_cfi rsi, RSI-RBP
+	movq_cfi rdx, RDX-RBP
+	movq_cfi rcx, RCX-RBP
+	movq_cfi rax, RAX-RBP
+	movq_cfi  r8,  R8-RBP
+	movq_cfi  r9,  R9-RBP
+	movq_cfi r10, R10-RBP
+	movq_cfi r11, R11-RBP
+
+	/* Save rbp so that we can unwind from get_irq_regs() */
+	movq_cfi rbp, 0
+
+	/* Save previous stack value */
+	movq %rsp, %rsi
+
+	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
 	testl $3, CS(%rdi)
 	je 1f
 	SWAPGS
@@ -329,19 +330,14 @@ ENTRY(save_args)
 	 */
 1:	incl PER_CPU_VAR(irq_count)
 	jne 2f
-	popq_cfi %rax			/* move return address... */
 	mov PER_CPU_VAR(irq_stack_ptr),%rsp
 	EMPTY_FRAME 0
-	pushq_cfi %rbp			/* backlink for unwinder */
-	pushq_cfi %rax			/* ... to the new stack */
-	/*
-	 * We entered an interrupt context - irqs are off:
-	 */
-2:	TRACE_IRQS_OFF
-	ret
-	CFI_ENDPROC
-END(save_args)
-	.popsection
+
+2:	/* Store previous stack value */
+	pushq %rsi
+	/* We entered an interrupt context - irqs are off: */
+	TRACE_IRQS_OFF
+	.endm
 
 ENTRY(save_rest)
 	PARTIAL_FRAME 1 REST_SKIP+8
@@ -473,7 +469,7 @@ ENTRY(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,1
+	SAVE_ARGS 8,0
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
@@ -508,7 +504,7 @@ sysret_check:
 	TRACE_IRQS_ON
 	movq RIP-ARGOFFSET(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
-	RESTORE_ARGS 0,-ARG_SKIP,1
+	RESTORE_ARGS 1,-ARG_SKIP,0
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
 	USERGS_SYSRET64
@@ -791,7 +787,7 @@ END(interrupt)
 	/* reserve pt_regs for scratch regs and rbp */
 	subq $ORIG_RAX-RBP, %rsp
 	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
-	call save_args
+	SAVE_ARGS_IRQ
 	PARTIAL_FRAME 0
 	call \func
 	.endm
@@ -814,15 +810,14 @@ ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	decl PER_CPU_VAR(irq_count)
-	leaveq
 
-	CFI_RESTORE		rbp
+	/* Restore saved previous stack */
+	popq %rsi
+	leaq 16(%rsi), %rsp
+
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	-8
+	CFI_ADJUST_CFA_OFFSET	-16
 
-	/* we did not save rbx, restore only from ARGOFFSET */
-	addq $8, %rsp
-	CFI_ADJUST_CFA_OFFSET	-8
 exit_intr:
 	GET_THREAD_INFO(%rcx)
 	testl $3,CS-ARGOFFSET(%rsp)
@@ -858,7 +853,7 @@ retint_restore_args:	/* return to kernel space */
 	 */
 	TRACE_IRQS_IRETQ
 restore_args:
-	RESTORE_ARGS 0,8,0
+	RESTORE_ARGS 1,8,1
 
 irq_return:
 	INTERRUPT_RETURN
@@ -991,11 +986,6 @@ apicinterrupt THRESHOLD_APIC_VECTOR \
 apicinterrupt THERMAL_APIC_VECTOR \
 	thermal_interrupt smp_thermal_interrupt
 
-#ifdef CONFIG_X86_MCE
-apicinterrupt MCE_SELF_VECTOR \
-	mce_self_interrupt smp_mce_self_interrupt
-#endif
-
 #ifdef CONFIG_SMP
 apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
 	call_function_single_interrupt smp_call_function_single_interrupt
@@ -1121,6 +1111,8 @@ zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
 zeroentry coprocessor_error do_coprocessor_error
 errorentry alignment_check do_alignment_check
 zeroentry simd_coprocessor_error do_simd_coprocessor_error
+zeroentry emulate_vsyscall do_emulate_vsyscall
+
 
 	/* Reload gs selector with exception handling */
 	/* edi:  new selector */
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 6781765b3a0..4aecc54236a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
 #include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
+#include <linux/i8253.h>
 #include <linux/slab.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
@@ -12,8 +13,8 @@
 #include <linux/io.h>
 
 #include <asm/fixmap.h>
-#include <asm/i8253.h>
 #include <asm/hpet.h>
+#include <asm/time.h>
 
 #define HPET_MASK			CLOCKSOURCE_MASK(32)
 
@@ -71,7 +72,7 @@ static inline void hpet_set_mapping(void)
 {
 	hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
 #ifdef CONFIG_X86_64
-	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE);
+	__set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE);
 #endif
 }
 
@@ -738,13 +739,6 @@ static cycle_t read_hpet(struct clocksource *cs)
 	return (cycle_t)hpet_readl(HPET_COUNTER);
 }
 
-#ifdef CONFIG_X86_64
-static cycle_t __vsyscall_fn vread_hpet(void)
-{
-	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
-}
-#endif
-
 static struct clocksource clocksource_hpet = {
 	.name		= "hpet",
 	.rating		= 250,
@@ -753,7 +747,7 @@ static struct clocksource clocksource_hpet = {
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
 	.resume		= hpet_resume_counter,
 #ifdef CONFIG_X86_64
-	.vread		= vread_hpet,
+	.archdata	= { .vclock_mode = VCLOCK_HPET },
 #endif
 };
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 12aff253768..739d8598f78 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -321,7 +321,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
 	return tmp;
 }
 
-#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16);
+#define FPREG_ADDR(f, n)	((void *)&(f)->st_space + (n) * 16)
 #define FP_EXP_TAG_VALID	0
 #define FP_EXP_TAG_ZERO		1
 #define FP_EXP_TAG_SPECIAL	2
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index fb66dc9e36c..f2b96de3c7c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -3,113 +3,24 @@
  *
  */
 #include <linux/clockchips.h>
-#include <linux/interrupt.h>
-#include <linux/spinlock.h>
-#include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/timex.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/io.h>
+#include <linux/i8253.h>
 
-#include <asm/i8253.h>
 #include <asm/hpet.h>
+#include <asm/time.h>
 #include <asm/smp.h>
 
-DEFINE_RAW_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
 /*
  * HPET replaces the PIT, when enabled. So we need to know, which of
  * the two timers is used
  */
 struct clock_event_device *global_clock_event;
 
-/*
- * Initialize the PIT timer.
- *
- * This is also called after resume to bring the PIT into operation again.
- */
-static void init_pit_timer(enum clock_event_mode mode,
-			   struct clock_event_device *evt)
-{
-	raw_spin_lock(&i8253_lock);
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		/* binary, mode 2, LSB/MSB, ch 0 */
-		outb_pit(0x34, PIT_MODE);
-		outb_pit(LATCH & 0xff , PIT_CH0);	/* LSB */
-		outb_pit(LATCH >> 8 , PIT_CH0);		/* MSB */
-		break;
-
-	case CLOCK_EVT_MODE_SHUTDOWN:
-	case CLOCK_EVT_MODE_UNUSED:
-		if (evt->mode == CLOCK_EVT_MODE_PERIODIC ||
-		    evt->mode == CLOCK_EVT_MODE_ONESHOT) {
-			outb_pit(0x30, PIT_MODE);
-			outb_pit(0, PIT_CH0);
-			outb_pit(0, PIT_CH0);
-		}
-		break;
-
-	case CLOCK_EVT_MODE_ONESHOT:
-		/* One shot setup */
-		outb_pit(0x38, PIT_MODE);
-		break;
-
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
-	raw_spin_unlock(&i8253_lock);
-}
-
-/*
- * Program the next event in oneshot mode
- *
- * Delta is given in PIT ticks
- */
-static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
-{
-	raw_spin_lock(&i8253_lock);
-	outb_pit(delta & 0xff , PIT_CH0);	/* LSB */
-	outb_pit(delta >> 8 , PIT_CH0);		/* MSB */
-	raw_spin_unlock(&i8253_lock);
-
-	return 0;
-}
-
-/*
- * On UP the PIT can serve all of the possible timer functions. On SMP systems
- * it can be solely used for the global tick.
- *
- * The profiling and update capabilities are switched off once the local apic is
- * registered. This mechanism replaces the previous #ifdef LOCAL_APIC -
- * !using_apic_timer decisions in do_timer_interrupt_hook()
- */
-static struct clock_event_device pit_ce = {
-	.name		= "pit",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= init_pit_timer,
-	.set_next_event = pit_next_event,
-	.irq		= 0,
-};
-
-/*
- * Initialize the conversion factor and the min/max deltas of the clock event
- * structure and register the clock event source with the framework.
- */
 void __init setup_pit_timer(void)
 {
-	/*
-	 * Start pit with the boot cpu mask and make it global after the
-	 * IO_APIC has been initialized.
-	 */
-	pit_ce.cpumask = cpumask_of(smp_processor_id());
-
-	clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
-	global_clock_event = &pit_ce;
+	clockevent_i8253_init(true);
+	global_clock_event = &i8253_clockevent;
 }
 
 #ifndef CONFIG_X86_64
@@ -123,7 +34,7 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
+	    i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
 		return 0;
 
 	return clocksource_i8253_init();
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 65b8f5c2eeb..610485223bd 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -14,7 +14,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4ef993..b3300e6bace 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -15,7 +15,7 @@
 #include <linux/io.h>
 #include <linux/delay.h>
 
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
 #include <asm/hw_irq.h>
@@ -272,9 +272,6 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
-	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
-#endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5f9ecff328b..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -608,7 +608,7 @@ int kgdb_arch_init(void)
 	return register_die_notifier(&kgdb_notifier);
 }
 
-static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+static void kgdb_hw_overflow_handler(struct perf_event *event,
 		struct perf_sample_data *data, struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
@@ -638,7 +638,7 @@ void kgdb_arch_late(void)
 	for (i = 0; i < HBP_NUM; i++) {
 		if (breakinfo[i].pev)
 			continue;
-		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
+		breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL, NULL);
 		if (IS_ERR((void * __force)breakinfo[i].pev)) {
 			printk(KERN_ERR "kgdb: Could not allocate hw"
 			       "breakpoints\nDisabling the kernel debugger\n");
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33c07b0b122..a9c2116001d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -51,6 +51,15 @@ static int parse_no_kvmapf(char *arg)
 
 early_param("no-kvmapf", parse_no_kvmapf);
 
+static int steal_acc = 1;
+static int parse_no_stealacc(char *arg)
+{
+        steal_acc = 0;
+        return 0;
+}
+
+early_param("no-steal-acc", parse_no_stealacc);
+
 struct kvm_para_state {
 	u8 mmu_queue[MMU_QUEUE_SIZE];
 	int mmu_queue_len;
@@ -58,6 +67,8 @@ struct kvm_para_state {
 
 static DEFINE_PER_CPU(struct kvm_para_state, para_state);
 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock = 0;
 
 static struct kvm_para_state *kvm_para_state(void)
 {
@@ -441,6 +452,21 @@ static void __init paravirt_ops_setup(void)
 #endif
 }
 
+static void kvm_register_steal_time(void)
+{
+	int cpu = smp_processor_id();
+	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);
+
+	if (!has_steal_clock)
+		return;
+
+	memset(st, 0, sizeof(*st));
+
+	wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED));
+	printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n",
+		cpu, __pa(st));
+}
+
 void __cpuinit kvm_guest_cpu_init(void)
 {
 	if (!kvm_para_available())
@@ -457,6 +483,9 @@ void __cpuinit kvm_guest_cpu_init(void)
 		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
 		       smp_processor_id());
 	}
+
+	if (has_steal_clock)
+		kvm_register_steal_time();
 }
 
 static void kvm_pv_disable_apf(void *unused)
@@ -483,6 +512,31 @@ static struct notifier_block kvm_pv_reboot_nb = {
 	.notifier_call = kvm_pv_reboot_notify,
 };
 
+static u64 kvm_steal_clock(int cpu)
+{
+	u64 steal;
+	struct kvm_steal_time *src;
+	int version;
+
+	src = &per_cpu(steal_time, cpu);
+	do {
+		version = src->version;
+		rmb();
+		steal = src->steal;
+		rmb();
+	} while ((version & 1) || (version != src->version));
+
+	return steal;
+}
+
+void kvm_disable_steal_time(void)
+{
+	if (!has_steal_clock)
+		return;
+
+	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
@@ -500,6 +554,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
 
 static void kvm_guest_cpu_offline(void *dummy)
 {
+	kvm_disable_steal_time();
 	kvm_pv_disable_apf(NULL);
 	apf_task_wake_all();
 }
@@ -548,6 +603,11 @@ void __init kvm_guest_init(void)
 	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
 		x86_init.irqs.trap_init = kvm_apf_trap_init;
 
+	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
+		has_steal_clock = 1;
+		pv_time_ops.steal_clock = kvm_steal_clock;
+	}
+
 #ifdef CONFIG_SMP
 	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 	register_cpu_notifier(&kvm_cpu_notifier);
@@ -555,3 +615,15 @@ void __init kvm_guest_init(void)
 	kvm_guest_cpu_init();
 #endif
 }
+
+static __init int activate_jump_labels(void)
+{
+	if (has_steal_clock) {
+		jump_label_inc(&paravirt_steal_enabled);
+		if (steal_acc)
+			jump_label_inc(&paravirt_steal_rq_enabled);
+	}
+
+	return 0;
+}
+arch_initcall(activate_jump_labels);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 6389a6bca11..c1a0188e29a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -160,6 +160,7 @@ static void __cpuinit kvm_setup_secondary_clock(void)
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_crash_shutdown(regs);
 }
 #endif
@@ -167,6 +168,7 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
 static void kvm_shutdown(void)
 {
 	native_write_msr(msr_kvm_system_time, 0, 0);
+	kvm_disable_steal_time();
 	native_machine_shutdown();
 }
 
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c5610384ab1..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,8 +66,8 @@ struct microcode_amd {
 	unsigned int			mpb[0];
 };
 
-#define UCODE_CONTAINER_SECTION_HDR	8
-#define UCODE_CONTAINER_HEADER_SIZE	12
+#define SECTION_HDR_SIZE	8
+#define CONTAINER_HDR_SZ	12
 
 static struct equiv_cpu_entry *equiv_cpu_table;
 
@@ -157,7 +157,7 @@ static int apply_microcode_amd(int cpu)
 static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	unsigned int max_size, actual_size;
+	u32 max_size, actual_size;
 
 #define F1XH_MPB_MAX_SIZE 2048
 #define F14H_MPB_MAX_SIZE 1824
@@ -175,9 +175,9 @@ static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
 		break;
 	}
 
-	actual_size = buf[4] + (buf[5] << 8);
+	actual_size = *(u32 *)(buf + 4);
 
-	if (actual_size > size || actual_size > max_size) {
+	if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
 		pr_err("section size mismatch\n");
 		return 0;
 	}
@@ -191,7 +191,7 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
 	struct microcode_header_amd *mc = NULL;
 	unsigned int actual_size = 0;
 
-	if (buf[0] != UCODE_UCODE_TYPE) {
+	if (*(u32 *)buf != UCODE_UCODE_TYPE) {
 		pr_err("invalid type field in container file section header\n");
 		goto out;
 	}
@@ -204,8 +204,8 @@ get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
 	if (!mc)
 		goto out;
 
-	get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
-	*mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
+	get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
+	*mc_size = actual_size + SECTION_HDR_SIZE;
 
 out:
 	return mc;
@@ -229,9 +229,10 @@ static int install_equiv_cpu_table(const u8 *buf)
 		return -ENOMEM;
 	}
 
-	get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
+	get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
 
-	return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+	/* add header length */
+	return size + CONTAINER_HDR_SZ;
 }
 
 static void free_equiv_cpu_table(void)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 52f256f2cc8..925179f871d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -45,21 +45,6 @@ void *module_alloc(unsigned long size)
 				-1, __builtin_return_address(0));
 }
 
-/* Free memory returned from module_alloc */
-void module_free(struct module *mod, void *module_region)
-{
-	vfree(module_region);
-}
-
-/* We don't need anything special. */
-int module_frob_arch_sections(Elf_Ehdr *hdr,
-			      Elf_Shdr *sechdrs,
-			      char *secstrings,
-			      struct module *mod)
-{
-	return 0;
-}
-
 #ifdef CONFIG_X86_32
 int apply_relocate(Elf32_Shdr *sechdrs,
 		   const char *strtab,
@@ -100,17 +85,6 @@ int apply_relocate(Elf32_Shdr *sechdrs,
 	}
 	return 0;
 }
-
-int apply_relocate_add(Elf32_Shdr *sechdrs,
-		       const char *strtab,
-		       unsigned int symindex,
-		       unsigned int relsec,
-		       struct module *me)
-{
-	printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n",
-	       me->name);
-	return -ENOEXEC;
-}
 #else /*X86_64*/
 int apply_relocate_add(Elf64_Shdr *sechdrs,
 		   const char *strtab,
@@ -181,17 +155,6 @@ overflow:
 	       me->name);
 	return -ENOEXEC;
 }
-
-int apply_relocate(Elf_Shdr *sechdrs,
-		   const char *strtab,
-		   unsigned int symindex,
-		   unsigned int relsec,
-		   struct module *me)
-{
-	printk(KERN_ERR "non add relocation not supported\n");
-	return -ENOSYS;
-}
-
 #endif
 
 int module_finalize(const Elf_Ehdr *hdr,
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 869e1aeeb71..613a7931ecc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -202,6 +202,14 @@ static void native_flush_tlb_single(unsigned long addr)
 	__native_flush_tlb_single(addr);
 }
 
+struct jump_label_key paravirt_steal_enabled;
+struct jump_label_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+	return 0;
+}
+
 /* These are in entry.S */
 extern void native_iret(void);
 extern void native_irq_enable_sysexit(void);
@@ -307,6 +315,7 @@ struct pv_init_ops pv_init_ops = {
 
 struct pv_time_ops pv_time_ops = {
 	.sched_clock = native_sched_clock,
+	.steal_clock = native_steal_clock,
 };
 
 struct pv_irq_ops pv_irq_ops = {
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index e8c33a30200..726494b5834 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -1553,7 +1553,7 @@ static void __init calgary_fixup_one_tce_space(struct pci_dev *dev)
 			continue;
 
 		/* cover the whole region */
-		npages = (r->end - r->start) >> PAGE_SHIFT;
+		npages = resource_size(r) >> PAGE_SHIFT;
 		npages++;
 
 		iommu_range_reserve(tbl, r->start, npages);
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index ba0a4cce53b..63228035f9d 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -234,7 +234,7 @@ void __init probe_roms(void)
 	/* check for extension rom (ignore length byte!) */
 	rom = isa_bus_to_virt(extension_rom_resource.start);
 	if (romsignature(rom)) {
-		length = extension_rom_resource.end - extension_rom_resource.start + 1;
+		length = resource_size(&extension_rom_resource);
 		if (romchecksum(rom, length)) {
 			request_resource(&iomem_resource, &extension_rom_resource);
 			upper = extension_rom_resource.start;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e1ba8cb24e4..e7e3b019c43 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -438,29 +438,6 @@ void cpu_idle_wait(void)
 }
 EXPORT_SYMBOL_GPL(cpu_idle_wait);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3d0dc59067..7a3b65107a2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -38,6 +38,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/kdebug.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -109,7 +110,8 @@ void cpu_idle(void)
 			local_irq_disable();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_idle_call())
+				pm_idle();
 			start_critical_timings();
 		}
 		tick_nohz_restart_sched_tick();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ca6f7ab8df3..f693e44e1bf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,6 +37,7 @@
 #include <linux/uaccess.h>
 #include <linux/io.h>
 #include <linux/ftrace.h>
+#include <linux/cpuidle.h>
 
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -136,7 +137,8 @@ void cpu_idle(void)
 			enter_idle();
 			/* Don't trace irqs off for idle */
 			stop_critical_timings();
-			pm_idle();
+			if (cpuidle_idle_call())
+				pm_idle();
 			start_critical_timings();
 
 			/* In many cases the interrupt that ended idle
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 807c2a2b80f..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -528,7 +528,7 @@ static int genregs_set(struct task_struct *target,
 	return ret;
 }
 
-static void ptrace_triggered(struct perf_event *bp, int nmi,
+static void ptrace_triggered(struct perf_event *bp,
 			     struct perf_sample_data *data,
 			     struct pt_regs *regs)
 {
@@ -715,7 +715,8 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
 		attr.bp_type = HW_BREAKPOINT_W;
 		attr.disabled = 1;
 
-		bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
+		bp = register_user_hw_breakpoint(&attr, ptrace_triggered,
+						 NULL, tsk);
 
 		/*
 		 * CHECKME: the previous code returned -EIO if the addr wasn't
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 8bbe8c56916..b78643d0f9a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -10,7 +10,7 @@
 
 static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 {
-	u8 config, rev;
+	u8 config;
 	u16 word;
 
 	/* BIOS may enable hardware IRQ balancing for
@@ -18,8 +18,7 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	 * based platforms.
 	 * Disable SW irqbalance/affinity on those platforms.
 	 */
-	pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
-	if (rev > 0x9)
+	if (dev->revision > 0x9)
 		return;
 
 	/* enable access to config space*/
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4f0d46fefa7..9242436e993 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -419,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
 		},
 	},
+	{	/* Handle problems with rebooting on the Latitude E6320. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E6320",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E5420. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E5420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
+		},
+	},
+	{	/* Handle problems with rebooting on the Latitude E6420. */
+		.callback = set_pci_reboot,
+		.ident = "Dell Latitude E6420",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
+		},
+	},
 	{ }
 };
 
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index 41235531b11..36818f8ec2b 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -97,6 +97,8 @@ relocate_kernel:
 	ret
 
 identity_mapped:
+	/* set return address to 0 if not preserving context */
+	pushl	$0
 	/* store the start address on the stack */
 	pushl   %edx
 
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 4de8f5b3d47..7a6f3b3be3c 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -100,6 +100,8 @@ relocate_kernel:
 	ret
 
 identity_mapped:
+	/* set return address to 0 if not preserving context */
+	pushq	$0
 	/* store the start address on the stack */
 	pushq   %rdx
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 40a24932a8a..54ddaeb221c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -485,17 +485,18 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 asmlinkage int
 sys_sigsuspend(int history0, int history1, old_sigset_t mask)
 {
-	mask &= _BLOCKABLE;
-	spin_lock_irq(&current->sighand->siglock);
+	sigset_t blocked;
+
 	current->saved_sigmask = current->blocked;
-	siginitset(&current->blocked, mask);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+
+	mask &= _BLOCKABLE;
+	siginitset(&blocked, mask);
+	set_current_blocked(&blocked);
 
 	current->state = TASK_INTERRUPTIBLE;
 	schedule();
-	set_restore_sigmask();
 
+	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
 
@@ -572,10 +573,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
 		goto badframe;
 
 	sigdelsetmask(&set, ~_BLOCKABLE);
-	spin_lock_irq(&current->sighand->siglock);
-	current->blocked = set;
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
+	set_current_blocked(&set);
 
 	if (restore_sigcontext(regs, &frame->sc, &ax))
 		goto badframe;
@@ -653,11 +651,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
 
 static int
 setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-	       sigset_t *set, struct pt_regs *regs)
+		struct pt_regs *regs)
 {
 	int usig = signr_convert(sig);
+	sigset_t *set = &current->blocked;
 	int ret;
 
+	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
+		set = &current->saved_sigmask;
+
 	/* Set up the stack frame */
 	if (is_ia32) {
 		if (ka->sa.sa_flags & SA_SIGINFO)
@@ -672,12 +674,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		return -EFAULT;
 	}
 
+	current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
 	return ret;
 }
 
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
-	      sigset_t *oldset, struct pt_regs *regs)
+		struct pt_regs *regs)
 {
 	sigset_t blocked;
 	int ret;
@@ -712,20 +715,11 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-	ret = setup_rt_frame(sig, ka, info, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, regs);
 
 	if (ret)
 		return ret;
 
-#ifdef CONFIG_X86_64
-	/*
-	 * This has nothing to do with segment registers,
-	 * despite the name.  This magic affects uaccess.h
-	 * macros' behavior.  Reset it to the normal setting.
-	 */
-	set_fs(USER_DS);
-#endif
-
 	/*
 	 * Clear the direction flag as per the ABI for function entry.
 	 */
@@ -767,7 +761,6 @@ static void do_signal(struct pt_regs *regs)
 	struct k_sigaction ka;
 	siginfo_t info;
 	int signr;
-	sigset_t *oldset;
 
 	/*
 	 * We want the common case to go fast, which is why we may in certain
@@ -779,23 +772,10 @@ static void do_signal(struct pt_regs *regs)
 	if (!user_mode(regs))
 		return;
 
-	if (current_thread_info()->status & TS_RESTORE_SIGMASK)
-		oldset = &current->saved_sigmask;
-	else
-		oldset = &current->blocked;
-
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
 		/* Whee! Actually deliver the signal.  */
-		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
-			/*
-			 * A signal was successfully delivered; the saved
-			 * sigmask will have been stored in the signal frame,
-			 * and will be restored by sigreturn, so we can simply
-			 * clear the TS_RESTORE_SIGMASK flag.
-			 */
-			current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		}
+		handle_signal(signr, &info, &ka, regs);
 		return;
 	}
 
@@ -823,7 +803,7 @@ static void do_signal(struct pt_regs *regs)
 	 */
 	if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
 		current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
-		sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
+		set_current_blocked(&current->saved_sigmask);
 	}
 }
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9fd3137230d..9f548cb4a95 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -438,7 +438,7 @@ static void impress_friends(void)
 void __inquire_remote_apic(int apicid)
 {
 	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
-	char *names[] = { "ID", "VERSION", "SPIV" };
+	const char * const names[] = { "ID", "VERSION", "SPIV" };
 	int timeout;
 	u32 status;
 
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 55d9bc03f69..fdd0c6430e5 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -66,7 +66,7 @@ void save_stack_trace(struct stack_trace *trace)
 }
 EXPORT_SYMBOL_GPL(save_stack_trace);
 
-void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
+void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 {
 	dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
 	if (trace->nr_entries < trace->max_entries)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 30ac65df7d4..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -36,6 +36,7 @@
 #include <asm/bootparam.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
+#include <asm/swiotlb.h>
 #include <asm/fixmap.h>
 #include <asm/proto.h>
 #include <asm/setup.h>
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 00cbb272627..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -11,13 +11,13 @@
 
 #include <linux/clockchips.h>
 #include <linux/interrupt.h>
+#include <linux/i8253.h>
 #include <linux/time.h>
 #include <linux/mca.h>
 
 #include <asm/vsyscall.h>
 #include <asm/x86_init.h>
 #include <asm/i8259.h>
-#include <asm/i8253.h>
 #include <asm/timer.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b9b67166f9d..9682ec50180 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -49,7 +49,7 @@
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <asm/system.h>
 #include <asm/traps.h>
 #include <asm/desc.h>
@@ -872,6 +872,12 @@ void __init trap_init(void)
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
 
+#ifdef CONFIG_X86_64
+	BUG_ON(test_bit(VSYSCALL_EMU_VECTOR, used_vectors));
+	set_system_intr_gate(VSYSCALL_EMU_VECTOR, &emulate_vsyscall);
+	set_bit(VSYSCALL_EMU_VECTOR, used_vectors);
+#endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6cc6922262a..db483369f10 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -5,7 +5,6 @@
 #include <linux/timer.h>
 #include <linux/acpi_pmtmr.h>
 #include <linux/cpufreq.h>
-#include <linux/dmi.h>
 #include <linux/delay.h>
 #include <linux/clocksource.h>
 #include <linux/percpu.h>
@@ -777,7 +776,7 @@ static struct clocksource clocksource_tsc = {
 	.flags                  = CLOCK_SOURCE_IS_CONTINUOUS |
 				  CLOCK_SOURCE_MUST_VERIFY,
 #ifdef CONFIG_X86_64
-	.vread                  = vread_tsc,
+	.archdata               = { .vclock_mode = VCLOCK_TSC },
 #endif
 };
 
@@ -800,27 +799,6 @@ void mark_tsc_unstable(char *reason)
 
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
-static int __init dmi_mark_tsc_unstable(const struct dmi_system_id *d)
-{
-	printk(KERN_NOTICE "%s detected: marking TSC unstable.\n",
-			d->ident);
-	tsc_unstable = 1;
-	return 0;
-}
-
-/* List of systems that have known TSC problems */
-static struct dmi_system_id __initdata bad_tsc_dmi_table[] = {
-	{
-		.callback = dmi_mark_tsc_unstable,
-		.ident = "IBM Thinkpad 380XD",
-		.matches = {
-			DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
-			DMI_MATCH(DMI_BOARD_NAME, "2635FA0"),
-		},
-	},
-	{}
-};
-
 static void __init check_system_tsc_reliable(void)
 {
 #ifdef CONFIG_MGEODE_LX
@@ -1010,8 +988,6 @@ void __init tsc_init(void)
 	lpj_fine = lpj;
 
 	use_tsc_delay();
-	/* Check and install the TSC clocksource */
-	dmi_check_system(bad_tsc_dmi_table);
 
 	if (unsynchronized_tsc())
 		mark_tsc_unstable("TSCs unsynchronized");
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 89aed99aafc..4aa9c54a9b7 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,50 +161,47 @@ SECTIONS
 
 #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
 #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x	\
-	ADDR(.vsyscall_0) + offset		 	\
-	: AT(VLOAD(.vsyscall_var_ ## x)) {     		\
-		*(.vsyscall_var_ ## x)			\
-	}						\
-	x = VVIRT(.vsyscall_var_ ## x);
 
 	. = ALIGN(4096);
 	__vsyscall_0 = .;
 
 	. = VSYSCALL_ADDR;
-	.vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
+	.vsyscall : AT(VLOAD(.vsyscall)) {
 		*(.vsyscall_0)
-	} :user
 
-	. = ALIGN(L1_CACHE_BYTES);
-	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
-		*(.vsyscall_fn)
-	}
-
-	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+		. = 1024;
 		*(.vsyscall_1)
-	}
-	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
-		*(.vsyscall_2)
-	}
 
-	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
-		*(.vsyscall_3)
-	}
-
-#define __VVAR_KERNEL_LDS
-#include <asm/vvar.h>
-#undef __VVAR_KERNEL_LDS
+		. = 2048;
+		*(.vsyscall_2)
 
-	. = __vsyscall_0 + PAGE_SIZE;
+		. = 4096;  /* Pad the whole page. */
+	} :user =0xcc
+	. = ALIGN(__vsyscall_0 + PAGE_SIZE, PAGE_SIZE);
 
 #undef VSYSCALL_ADDR
 #undef VLOAD_OFFSET
 #undef VLOAD
 #undef VVIRT_OFFSET
 #undef VVIRT
+
+	__vvar_page = .;
+
+	.vvar : AT(ADDR(.vvar) - LOAD_OFFSET) {
+
+	      /* Place all vvars at the offsets in asm/vvar.h. */
+#define EMIT_VVAR(name, offset) 		\
+		. = offset;		\
+		*(.vvar_ ## name)
+#define __VVAR_KERNEL_LDS
+#include <asm/vvar.h>
+#undef __VVAR_KERNEL_LDS
 #undef EMIT_VVAR
 
+	} :data
+
+       . = ALIGN(__vvar_page + PAGE_SIZE, PAGE_SIZE);
+
 #endif /* CONFIG_X86_64 */
 
 	/* Init code and data - will be freed after init */
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
deleted file mode 100644
index a81aa9e9894..00000000000
--- a/arch/x86/kernel/vread_tsc_64.c
+++ /dev/null
@@ -1,36 +0,0 @@
-/* This code runs in userspace. */
-
-#define DISABLE_BRANCH_PROFILING
-#include <asm/vgtod.h>
-
-notrace cycle_t __vsyscall_fn vread_tsc(void)
-{
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)vget_cycles();
-
-	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
-
-	if (likely(ret >= last))
-		return ret;
-
-	/*
-	 * GCC likes to generate cmov here, but this branch is extremely
-	 * predictable (it's just a funciton of time and the likely is
-	 * very likely) and there's a data dependence, so force GCC
-	 * to generate a branch instead.  I don't barrier() because
-	 * we don't actually need a barrier, and if this function
-	 * ever gets inlined it will generate worse code.
-	 */
-	asm volatile ("");
-	return last;
-}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3e682184d76..dda7dff9cef 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -2,6 +2,8 @@
  *  Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright 2003 Andi Kleen, SuSE Labs.
  *
+ *  [ NOTE: this mechanism is now deprecated in favor of the vDSO. ]
+ *
  *  Thanks to hpa@transmeta.com for some useful hint.
  *  Special thanks to Ingo Molnar for his early experience with
  *  a different vsyscall implementation for Linux/IA32 and for the name.
@@ -11,10 +13,9 @@
  *  vsyscalls. One vsyscall can reserve more than 1 slot to avoid
  *  jumping out of line if necessary. We cannot add more with this
  *  mechanism because older kernels won't return -ENOSYS.
- *  If we want more than four we need a vDSO.
  *
- *  Note: the concept clashes with user mode linux. If you use UML and
- *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
+ *  Note: the concept clashes with user mode linux.  UML users should
+ *  use the vDSO.
  */
 
 /* Disable profiling for userspace code: */
@@ -32,9 +33,12 @@
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/ratelimit.h>
 
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
+#include <asm/compat.h>
 #include <asm/page.h>
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
@@ -44,16 +48,12 @@
 #include <asm/desc.h>
 #include <asm/topology.h>
 #include <asm/vgtod.h>
-
-#define __vsyscall(nr) \
-		__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
-#define __syscall_clobber "r11","cx","memory"
+#include <asm/traps.h>
 
 DEFINE_VVAR(int, vgetcpu_mode);
 DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
 {
 	.lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
-	.sysctl_enabled = 1,
 };
 
 void update_vsyscall_tz(void)
@@ -72,179 +72,149 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
 	unsigned long flags;
 
 	write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+
 	/* copy vsyscall data */
-	vsyscall_gtod_data.clock.vread = clock->vread;
-	vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
-	vsyscall_gtod_data.clock.mask = clock->mask;
-	vsyscall_gtod_data.clock.mult = mult;
-	vsyscall_gtod_data.clock.shift = clock->shift;
-	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
-	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
-	vsyscall_gtod_data.wall_to_monotonic = *wtm;
-	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+	vsyscall_gtod_data.clock.vclock_mode	= clock->archdata.vclock_mode;
+	vsyscall_gtod_data.clock.cycle_last	= clock->cycle_last;
+	vsyscall_gtod_data.clock.mask		= clock->mask;
+	vsyscall_gtod_data.clock.mult		= mult;
+	vsyscall_gtod_data.clock.shift		= clock->shift;
+	vsyscall_gtod_data.wall_time_sec	= wall_time->tv_sec;
+	vsyscall_gtod_data.wall_time_nsec	= wall_time->tv_nsec;
+	vsyscall_gtod_data.wall_to_monotonic	= *wtm;
+	vsyscall_gtod_data.wall_time_coarse	= __current_kernel_time();
+
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
-/* RED-PEN may want to readd seq locking, but then the variable should be
- * write-once.
- */
-static __always_inline void do_get_tz(struct timezone * tz)
+static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
+			      const char *message)
 {
-	*tz = VVAR(vsyscall_gtod_data).sys_tz;
-}
+	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
+	struct task_struct *tsk;
 
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-	int ret;
-	asm volatile("syscall"
-		: "=a" (ret)
-		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
-		: __syscall_clobber );
-	return ret;
-}
+	if (!show_unhandled_signals || !__ratelimit(&rs))
+		return;
 
-static __always_inline long time_syscall(long *t)
-{
-	long secs;
-	asm volatile("syscall"
-		: "=a" (secs)
-		: "0" (__NR_time),"D" (t) : __syscall_clobber);
-	return secs;
-}
+	tsk = current;
 
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
-	cycle_t now, base, mask, cycle_delta;
-	unsigned seq;
-	unsigned long mult, shift, nsec;
-	cycle_t (*vread)(void);
-	do {
-		seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
-
-		vread = VVAR(vsyscall_gtod_data).clock.vread;
-		if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
-			     !vread)) {
-			gettimeofday(tv,NULL);
-			return;
-		}
-
-		now = vread();
-		base = VVAR(vsyscall_gtod_data).clock.cycle_last;
-		mask = VVAR(vsyscall_gtod_data).clock.mask;
-		mult = VVAR(vsyscall_gtod_data).clock.mult;
-		shift = VVAR(vsyscall_gtod_data).clock.shift;
-
-		tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
-		nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
-	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
-
-	/* calculate interval: */
-	cycle_delta = (now - base) & mask;
-	/* convert to nsecs: */
-	nsec += (cycle_delta * mult) >> shift;
-
-	while (nsec >= NSEC_PER_SEC) {
-		tv->tv_sec += 1;
-		nsec -= NSEC_PER_SEC;
-	}
-	tv->tv_usec = nsec / NSEC_PER_USEC;
+	printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
+	       level, tsk->comm, task_pid_nr(tsk),
+	       message, regs->ip - 2, regs->cs,
+	       regs->sp, regs->ax, regs->si, regs->di);
 }
 
-int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
+static int addr_to_vsyscall_nr(unsigned long addr)
 {
-	if (tv)
-		do_vgettimeofday(tv);
-	if (tz)
-		do_get_tz(tz);
-	return 0;
-}
+	int nr;
 
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-time_t __vsyscall(1) vtime(time_t *t)
-{
-	unsigned seq;
-	time_t result;
-	if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
-		return time_syscall(t);
+	if ((addr & ~0xC00UL) != VSYSCALL_START)
+		return -EINVAL;
 
-	do {
-		seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
+	nr = (addr & 0xC00UL) >> 10;
+	if (nr >= 3)
+		return -EINVAL;
 
-		result = VVAR(vsyscall_gtod_data).wall_time_sec;
+	return nr;
+}
 
-	} while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
+void dotraplinkage do_emulate_vsyscall(struct pt_regs *regs, long error_code)
+{
+	struct task_struct *tsk;
+	unsigned long caller;
+	int vsyscall_nr;
+	long ret;
+
+	local_irq_enable();
+
+	/*
+	 * Real 64-bit user mode code has cs == __USER_CS.  Anything else
+	 * is bogus.
+	 */
+	if (regs->cs != __USER_CS) {
+		/*
+		 * If we trapped from kernel mode, we might as well OOPS now
+		 * instead of returning to some random address and OOPSing
+		 * then.
+		 */
+		BUG_ON(!user_mode(regs));
+
+		/* Compat mode and non-compat 32-bit CS should both segfault. */
+		warn_bad_vsyscall(KERN_WARNING, regs,
+				  "illegal int 0xcc from 32-bit mode");
+		goto sigsegv;
+	}
 
-	if (t)
-		*t = result;
-	return result;
-}
+	/*
+	 * x86-ism here: regs->ip points to the instruction after the int 0xcc,
+	 * and int 0xcc is two bytes long.
+	 */
+	vsyscall_nr = addr_to_vsyscall_nr(regs->ip - 2);
+	if (vsyscall_nr < 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs,
+				  "illegal int 0xcc (exploit attempt?)");
+		goto sigsegv;
+	}
 
-/* Fast way to get current CPU and node.
-   This helps to do per node and per CPU caches in user space.
-   The result is not guaranteed without CPU affinity, but usually
-   works out because the scheduler tries to keep a thread on the same
-   CPU.
+	if (get_user(caller, (unsigned long __user *)regs->sp) != 0) {
+		warn_bad_vsyscall(KERN_WARNING, regs, "int 0xcc with bad stack (exploit attempt?)");
+		goto sigsegv;
+	}
 
-   tcache must point to a two element sized long array.
-   All arguments can be NULL. */
-long __vsyscall(2)
-vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
-{
-	unsigned int p;
-	unsigned long j = 0;
-
-	/* Fast cache - only recompute value once per jiffies and avoid
-	   relatively costly rdtscp/cpuid otherwise.
-	   This works because the scheduler usually keeps the process
-	   on the same CPU and this syscall doesn't guarantee its
-	   results anyways.
-	   We do this here because otherwise user space would do it on
-	   its own in a likely inferior way (no access to jiffies).
-	   If you don't like it pass NULL. */
-	if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
-		p = tcache->blob[1];
-	} else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
-		/* Load per CPU data from RDTSCP */
-		native_read_tscp(&p);
-	} else {
-		/* Load per CPU data from GDT */
-		asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+	tsk = current;
+	if (seccomp_mode(&tsk->seccomp))
+		do_exit(SIGKILL);
+
+	switch (vsyscall_nr) {
+	case 0:
+		ret = sys_gettimeofday(
+			(struct timeval __user *)regs->di,
+			(struct timezone __user *)regs->si);
+		break;
+
+	case 1:
+		ret = sys_time((time_t __user *)regs->di);
+		break;
+
+	case 2:
+		ret = sys_getcpu((unsigned __user *)regs->di,
+				 (unsigned __user *)regs->si,
+				 0);
+		break;
 	}
-	if (tcache) {
-		tcache->blob[0] = j;
-		tcache->blob[1] = p;
+
+	if (ret == -EFAULT) {
+		/*
+		 * Bad news -- userspace fed a bad pointer to a vsyscall.
+		 *
+		 * With a real vsyscall, that would have caused SIGSEGV.
+		 * To make writing reliable exploits using the emulated
+		 * vsyscalls harder, generate SIGSEGV here as well.
+		 */
+		warn_bad_vsyscall(KERN_INFO, regs,
+				  "vsyscall fault (exploit attempt?)");
+		goto sigsegv;
 	}
-	if (cpu)
-		*cpu = p & 0xfff;
-	if (node)
-		*node = p >> 12;
-	return 0;
-}
 
-static long __vsyscall(3) venosys_1(void)
-{
-	return -ENOSYS;
-}
+	regs->ax = ret;
 
-#ifdef CONFIG_SYSCTL
-static ctl_table kernel_table2[] = {
-	{ .procname = "vsyscall64",
-	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
-	  .mode = 0644,
-	  .proc_handler = proc_dointvec },
-	{}
-};
+	/* Emulate a ret instruction. */
+	regs->ip = caller;
+	regs->sp += 8;
 
-static ctl_table kernel_root_table2[] = {
-	{ .procname = "kernel", .mode = 0555,
-	  .child = kernel_table2 },
-	{}
-};
-#endif
+	local_irq_disable();
+	return;
+
+sigsegv:
+	regs->ip -= 2;  /* The faulting instruction should be the int 0xcc. */
+	force_sig(SIGSEGV, current);
+	local_irq_disable();
+}
 
-/* Assume __initcall executes before all user space. Hopefully kmod
-   doesn't violate that. We'll find out if it does. */
+/*
+ * Assume __initcall executes before all user space. Hopefully kmod
+ * doesn't violate that. We'll find out if it does.
+ */
 static void __cpuinit vsyscall_set_cpu(int cpu)
 {
 	unsigned long d;
@@ -255,13 +225,15 @@ static void __cpuinit vsyscall_set_cpu(int cpu)
 	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
 		write_rdtscp_aux((node << 12) | cpu);
 
-	/* Store cpu number in limit so that it can be loaded quickly
-	   in user space in vgetcpu.
-	   12 bits for the CPU and 8 bits for the node. */
+	/*
+	 * Store cpu number in limit so that it can be loaded quickly
+	 * in user space in vgetcpu. (12 bits for the CPU and 8 bits for the node)
+	 */
 	d = 0x0f40000000000ULL;
 	d |= cpu;
 	d |= (node & 0xf) << 12;
 	d |= (node >> 4) << 48;
+
 	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
 }
 
@@ -275,8 +247,10 @@ static int __cpuinit
 cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg)
 {
 	long cpu = (long)arg;
+
 	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
 		smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1);
+
 	return NOTIFY_DONE;
 }
 
@@ -284,25 +258,23 @@ void __init map_vsyscall(void)
 {
 	extern char __vsyscall_0;
 	unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0);
+	extern char __vvar_page;
+	unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page);
 
 	/* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */
 	__set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL);
+	__set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR);
+	BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != (unsigned long)VVAR_ADDRESS);
 }
 
 static int __init vsyscall_init(void)
 {
-	BUG_ON(((unsigned long) &vgettimeofday !=
-			VSYSCALL_ADDR(__NR_vgettimeofday)));
-	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
-	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
-	BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu));
-#ifdef CONFIG_SYSCTL
-	register_sysctl_table(kernel_root_table2);
-#endif
+	BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
+
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
 	/* notifier priority > KVM */
 	hotcpu_notifier(cpu_vsyscall_notifier, 30);
+
 	return 0;
 }
-
 __initcall(vsyscall_init);
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
new file mode 100644
index 00000000000..ffa845eae5c
--- /dev/null
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -0,0 +1,27 @@
+/*
+ * vsyscall_emu_64.S: Vsyscall emulation page
+ *
+ * Copyright (c) 2011 Andy Lutomirski
+ *
+ * Subject to the GNU General Public License, version 2
+ */
+
+#include <linux/linkage.h>
+#include <asm/irq_vectors.h>
+
+/* The unused parts of the page are filled with 0xcc by the linker script. */
+
+.section .vsyscall_0, "a"
+ENTRY(vsyscall_0)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_0)
+
+.section .vsyscall_1, "a"
+ENTRY(vsyscall_1)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_1)
+
+.section .vsyscall_2, "a"
+ENTRY(vsyscall_2)
+	int $VSYSCALL_EMU_VECTOR
+END(vsyscall_2)
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 50f63648ce1..988724b236b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -31,6 +31,7 @@ config KVM
 	select KVM_ASYNC_PF
 	select USER_RETURN_NOTIFIER
 	select KVM_MMIO
+	select TASK_DELAY_ACCT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
@@ -76,6 +77,5 @@ config KVM_MMU_AUDIT
 # the virtualization menu.
 source drivers/vhost/Kconfig
 source drivers/lguest/Kconfig
-source drivers/virtio/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index adc98675cda..6f08bc940fa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -407,76 +407,59 @@ struct gprefix {
 		}							\
 	} while (0)
 
-/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip)                                  \
-({	unsigned long _x;						\
-	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
-	if (rc != X86EMUL_CONTINUE)					\
-		goto done;						\
-	(_eip) += (_size);						\
-	(_type)_x;							\
-})
-
-#define insn_fetch_arr(_arr, _size, _eip)				\
-({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\
-	if (rc != X86EMUL_CONTINUE)					\
-		goto done;						\
-	(_eip) += (_size);						\
-})
-
 static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
 				    enum x86_intercept intercept,
 				    enum x86_intercept_stage stage)
 {
 	struct x86_instruction_info info = {
 		.intercept  = intercept,
-		.rep_prefix = ctxt->decode.rep_prefix,
-		.modrm_mod  = ctxt->decode.modrm_mod,
-		.modrm_reg  = ctxt->decode.modrm_reg,
-		.modrm_rm   = ctxt->decode.modrm_rm,
-		.src_val    = ctxt->decode.src.val64,
-		.src_bytes  = ctxt->decode.src.bytes,
-		.dst_bytes  = ctxt->decode.dst.bytes,
-		.ad_bytes   = ctxt->decode.ad_bytes,
+		.rep_prefix = ctxt->rep_prefix,
+		.modrm_mod  = ctxt->modrm_mod,
+		.modrm_reg  = ctxt->modrm_reg,
+		.modrm_rm   = ctxt->modrm_rm,
+		.src_val    = ctxt->src.val64,
+		.src_bytes  = ctxt->src.bytes,
+		.dst_bytes  = ctxt->dst.bytes,
+		.ad_bytes   = ctxt->ad_bytes,
 		.next_rip   = ctxt->eip,
 	};
 
 	return ctxt->ops->intercept(ctxt, &info, stage);
 }
 
-static inline unsigned long ad_mask(struct decode_cache *c)
+static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
-	return (1UL << (c->ad_bytes << 3)) - 1;
+	return (1UL << (ctxt->ad_bytes << 3)) - 1;
 }
 
 /* Access/update address held in a register, based on addressing mode. */
 static inline unsigned long
-address_mask(struct decode_cache *c, unsigned long reg)
+address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-	if (c->ad_bytes == sizeof(unsigned long))
+	if (ctxt->ad_bytes == sizeof(unsigned long))
 		return reg;
 	else
-		return reg & ad_mask(c);
+		return reg & ad_mask(ctxt);
 }
 
 static inline unsigned long
-register_address(struct decode_cache *c, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
 {
-	return address_mask(c, reg);
+	return address_mask(ctxt, reg);
 }
 
 static inline void
-register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
 {
-	if (c->ad_bytes == sizeof(unsigned long))
+	if (ctxt->ad_bytes == sizeof(unsigned long))
 		*reg += inc;
 	else
-		*reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+		*reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt));
 }
 
-static inline void jmp_rel(struct decode_cache *c, int rel)
+static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
 {
-	register_address_increment(c, &c->eip, rel);
+	register_address_increment(ctxt, &ctxt->_eip, rel);
 }
 
 static u32 desc_limit_scaled(struct desc_struct *desc)
@@ -486,28 +469,26 @@ static u32 desc_limit_scaled(struct desc_struct *desc)
 	return desc->g ? (limit << 12) | 0xfff : limit;
 }
 
-static void set_seg_override(struct decode_cache *c, int seg)
+static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	c->has_seg_override = true;
-	c->seg_override = seg;
+	ctxt->has_seg_override = true;
+	ctxt->seg_override = seg;
 }
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
 {
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
 		return 0;
 
-	return ops->get_cached_segment_base(ctxt, seg);
+	return ctxt->ops->get_cached_segment_base(ctxt, seg);
 }
 
-static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
-			     struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt)
 {
-	if (!c->has_seg_override)
+	if (!ctxt->has_seg_override)
 		return 0;
 
-	return c->seg_override;
+	return ctxt->seg_override;
 }
 
 static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
@@ -579,7 +560,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		     unsigned size, bool write, bool fetch,
 		     ulong *linear)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_struct desc;
 	bool usable;
 	ulong la;
@@ -587,7 +567,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 	u16 sel;
 	unsigned cpl, rpl;
 
-	la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+	la = seg_base(ctxt, addr.seg) + addr.ea;
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_REAL:
 		break;
@@ -637,7 +617,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
 		}
 		break;
 	}
-	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
+	if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
 		la &= (u32)-1;
 	*linear = la;
 	return X86EMUL_CONTINUE;
@@ -671,11 +651,10 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 	return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
-static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops,
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
 			      unsigned long eip, u8 *dest)
 {
-	struct fetch_cache *fc = &ctxt->decode.fetch;
+	struct fetch_cache *fc = &ctxt->fetch;
 	int rc;
 	int size, cur_size;
 
@@ -687,8 +666,8 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 		rc = __linearize(ctxt, addr, size, false, true, &linear);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
-		rc = ops->fetch(ctxt, linear, fc->data + cur_size,
-				size, &ctxt->exception);
+		rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
+				      size, &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		fc->end += size;
@@ -698,7 +677,6 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 unsigned long eip, void *dest, unsigned size)
 {
 	int rc;
@@ -707,13 +685,30 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
 	while (size--) {
-		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
+		rc = do_insn_fetch_byte(ctxt, eip++, dest++);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
 	return X86EMUL_CONTINUE;
 }
 
+/* Fetch next part of the instruction being emulated. */
+#define insn_fetch(_type, _size, _eip)					\
+({	unsigned long _x;						\
+	rc = do_insn_fetch(ctxt, (_eip), &_x, (_size));			\
+	if (rc != X86EMUL_CONTINUE)					\
+		goto done;						\
+	(_eip) += (_size);						\
+	(_type)_x;							\
+})
+
+#define insn_fetch_arr(_arr, _size, _eip)				\
+({	rc = do_insn_fetch(ctxt, (_eip), _arr, (_size));		\
+	if (rc != X86EMUL_CONTINUE)					\
+		goto done;						\
+	(_eip) += (_size);						\
+})
+
 /*
  * Given the 'reg' portion of a ModRM byte, and a register block, return a
  * pointer into the block that addresses the relevant register.
@@ -857,16 +852,15 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
 
 static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 				    struct operand *op,
-				    struct decode_cache *c,
 				    int inhibit_bytereg)
 {
-	unsigned reg = c->modrm_reg;
-	int highbyte_regs = c->rex_prefix == 0;
+	unsigned reg = ctxt->modrm_reg;
+	int highbyte_regs = ctxt->rex_prefix == 0;
 
-	if (!(c->d & ModRM))
-		reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
+	if (!(ctxt->d & ModRM))
+		reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
 
-	if (c->d & Sse) {
+	if (ctxt->d & Sse) {
 		op->type = OP_XMM;
 		op->bytes = 16;
 		op->addr.xmm = reg;
@@ -875,49 +869,47 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
 	}
 
 	op->type = OP_REG;
-	if ((c->d & ByteOp) && !inhibit_bytereg) {
-		op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
+	if ((ctxt->d & ByteOp) && !inhibit_bytereg) {
+		op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs);
 		op->bytes = 1;
 	} else {
-		op->addr.reg = decode_register(reg, c->regs, 0);
-		op->bytes = c->op_bytes;
+		op->addr.reg = decode_register(reg, ctxt->regs, 0);
+		op->bytes = ctxt->op_bytes;
 	}
 	fetch_register_operand(op);
 	op->orig_val = op->val;
 }
 
 static int decode_modrm(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops,
 			struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
 	int rc = X86EMUL_CONTINUE;
 	ulong modrm_ea = 0;
 
-	if (c->rex_prefix) {
-		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
-		index_reg = (c->rex_prefix & 2) << 2; /* REX.X */
-		c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */
+	if (ctxt->rex_prefix) {
+		ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1;	/* REX.R */
+		index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */
+		ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
 	}
 
-	c->modrm = insn_fetch(u8, 1, c->eip);
-	c->modrm_mod |= (c->modrm & 0xc0) >> 6;
-	c->modrm_reg |= (c->modrm & 0x38) >> 3;
-	c->modrm_rm |= (c->modrm & 0x07);
-	c->modrm_seg = VCPU_SREG_DS;
+	ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+	ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
+	ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
+	ctxt->modrm_rm |= (ctxt->modrm & 0x07);
+	ctxt->modrm_seg = VCPU_SREG_DS;
 
-	if (c->modrm_mod == 3) {
+	if (ctxt->modrm_mod == 3) {
 		op->type = OP_REG;
-		op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		op->addr.reg = decode_register(c->modrm_rm,
-					       c->regs, c->d & ByteOp);
-		if (c->d & Sse) {
+		op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		op->addr.reg = decode_register(ctxt->modrm_rm,
+					       ctxt->regs, ctxt->d & ByteOp);
+		if (ctxt->d & Sse) {
 			op->type = OP_XMM;
 			op->bytes = 16;
-			op->addr.xmm = c->modrm_rm;
-			read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
+			op->addr.xmm = ctxt->modrm_rm;
+			read_sse_reg(ctxt, &op->vec_val, ctxt->modrm_rm);
 			return rc;
 		}
 		fetch_register_operand(op);
@@ -926,26 +918,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 
 	op->type = OP_MEM;
 
-	if (c->ad_bytes == 2) {
-		unsigned bx = c->regs[VCPU_REGS_RBX];
-		unsigned bp = c->regs[VCPU_REGS_RBP];
-		unsigned si = c->regs[VCPU_REGS_RSI];
-		unsigned di = c->regs[VCPU_REGS_RDI];
+	if (ctxt->ad_bytes == 2) {
+		unsigned bx = ctxt->regs[VCPU_REGS_RBX];
+		unsigned bp = ctxt->regs[VCPU_REGS_RBP];
+		unsigned si = ctxt->regs[VCPU_REGS_RSI];
+		unsigned di = ctxt->regs[VCPU_REGS_RDI];
 
 		/* 16-bit ModR/M decode. */
-		switch (c->modrm_mod) {
+		switch (ctxt->modrm_mod) {
 		case 0:
-			if (c->modrm_rm == 6)
-				modrm_ea += insn_fetch(u16, 2, c->eip);
+			if (ctxt->modrm_rm == 6)
+				modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(u16, 2, c->eip);
+			modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
 			break;
 		}
-		switch (c->modrm_rm) {
+		switch (ctxt->modrm_rm) {
 		case 0:
 			modrm_ea += bx + si;
 			break;
@@ -965,46 +957,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 			modrm_ea += di;
 			break;
 		case 6:
-			if (c->modrm_mod != 0)
+			if (ctxt->modrm_mod != 0)
 				modrm_ea += bp;
 			break;
 		case 7:
 			modrm_ea += bx;
 			break;
 		}
-		if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
-		    (c->modrm_rm == 6 && c->modrm_mod != 0))
-			c->modrm_seg = VCPU_SREG_SS;
+		if (ctxt->modrm_rm == 2 || ctxt->modrm_rm == 3 ||
+		    (ctxt->modrm_rm == 6 && ctxt->modrm_mod != 0))
+			ctxt->modrm_seg = VCPU_SREG_SS;
 		modrm_ea = (u16)modrm_ea;
 	} else {
 		/* 32/64-bit ModR/M decode. */
-		if ((c->modrm_rm & 7) == 4) {
-			sib = insn_fetch(u8, 1, c->eip);
+		if ((ctxt->modrm_rm & 7) == 4) {
+			sib = insn_fetch(u8, 1, ctxt->_eip);
 			index_reg |= (sib >> 3) & 7;
 			base_reg |= sib & 7;
 			scale = sib >> 6;
 
-			if ((base_reg & 7) == 5 && c->modrm_mod == 0)
-				modrm_ea += insn_fetch(s32, 4, c->eip);
+			if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
+				modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			else
-				modrm_ea += c->regs[base_reg];
+				modrm_ea += ctxt->regs[base_reg];
 			if (index_reg != 4)
-				modrm_ea += c->regs[index_reg] << scale;
-		} else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
+				modrm_ea += ctxt->regs[index_reg] << scale;
+		} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
 			if (ctxt->mode == X86EMUL_MODE_PROT64)
-				c->rip_relative = 1;
+				ctxt->rip_relative = 1;
 		} else
-			modrm_ea += c->regs[c->modrm_rm];
-		switch (c->modrm_mod) {
+			modrm_ea += ctxt->regs[ctxt->modrm_rm];
+		switch (ctxt->modrm_mod) {
 		case 0:
-			if (c->modrm_rm == 5)
-				modrm_ea += insn_fetch(s32, 4, c->eip);
+			if (ctxt->modrm_rm == 5)
+				modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			break;
 		case 1:
-			modrm_ea += insn_fetch(s8, 1, c->eip);
+			modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
 			break;
 		case 2:
-			modrm_ea += insn_fetch(s32, 4, c->eip);
+			modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
 			break;
 		}
 	}
@@ -1014,53 +1006,50 @@ done:
 }
 
 static int decode_abs(struct x86_emulate_ctxt *ctxt,
-		      struct x86_emulate_ops *ops,
 		      struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
 	op->type = OP_MEM;
-	switch (c->ad_bytes) {
+	switch (ctxt->ad_bytes) {
 	case 2:
-		op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
+		op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
 		break;
 	case 4:
-		op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
+		op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
 		break;
 	case 8:
-		op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
+		op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
 		break;
 	}
 done:
 	return rc;
 }
 
-static void fetch_bit_operand(struct decode_cache *c)
+static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
 {
 	long sv = 0, mask;
 
-	if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
-		mask = ~(c->dst.bytes * 8 - 1);
+	if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) {
+		mask = ~(ctxt->dst.bytes * 8 - 1);
 
-		if (c->src.bytes == 2)
-			sv = (s16)c->src.val & (s16)mask;
-		else if (c->src.bytes == 4)
-			sv = (s32)c->src.val & (s32)mask;
+		if (ctxt->src.bytes == 2)
+			sv = (s16)ctxt->src.val & (s16)mask;
+		else if (ctxt->src.bytes == 4)
+			sv = (s32)ctxt->src.val & (s32)mask;
 
-		c->dst.addr.mem.ea += (sv >> 3);
+		ctxt->dst.addr.mem.ea += (sv >> 3);
 	}
 
 	/* only subword offset */
-	c->src.val &= (c->dst.bytes << 3) - 1;
+	ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
 }
 
 static int read_emulated(struct x86_emulate_ctxt *ctxt,
-			 struct x86_emulate_ops *ops,
 			 unsigned long addr, void *dest, unsigned size)
 {
 	int rc;
-	struct read_cache *mc = &ctxt->decode.mem_read;
+	struct read_cache *mc = &ctxt->mem_read;
 
 	while (size) {
 		int n = min(size, 8u);
@@ -1068,8 +1057,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		if (mc->pos < mc->end)
 			goto read_cached;
 
-		rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
-					&ctxt->exception);
+		rc = ctxt->ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
+					      &ctxt->exception);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -1094,7 +1083,7 @@ static int segmented_read(struct x86_emulate_ctxt *ctxt,
 	rc = linearize(ctxt, addr, size, false, &linear);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	return read_emulated(ctxt, ctxt->ops, linear, data, size);
+	return read_emulated(ctxt, linear, data, size);
 }
 
 static int segmented_write(struct x86_emulate_ctxt *ctxt,
@@ -1128,26 +1117,24 @@ static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
 }
 
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops,
 			   unsigned int size, unsigned short port,
 			   void *dest)
 {
-	struct read_cache *rc = &ctxt->decode.io_read;
+	struct read_cache *rc = &ctxt->io_read;
 
 	if (rc->pos == rc->end) { /* refill pio read ahead */
-		struct decode_cache *c = &ctxt->decode;
 		unsigned int in_page, n;
-		unsigned int count = c->rep_prefix ?
-			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		unsigned int count = ctxt->rep_prefix ?
+			address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) : 1;
 		in_page = (ctxt->eflags & EFLG_DF) ?
-			offset_in_page(c->regs[VCPU_REGS_RDI]) :
-			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+			offset_in_page(ctxt->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(ctxt->regs[VCPU_REGS_RDI]);
 		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
 			count);
 		if (n == 0)
 			n = 1;
 		rc->pos = rc->end = 0;
-		if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
+		if (!ctxt->ops->pio_in_emulated(ctxt, size, port, rc->data, n))
 			return 0;
 		rc->end = n * size;
 	}
@@ -1158,9 +1145,10 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 }
 
 static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
-				     struct x86_emulate_ops *ops,
 				     u16 selector, struct desc_ptr *dt)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
+
 	if (selector & 1 << 2) {
 		struct desc_struct desc;
 		u16 sel;
@@ -1177,48 +1165,42 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
 
 /* allowed just for 8 bytes segments */
 static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 selector, struct desc_struct *desc)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
-	int ret;
 	ulong addr;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	get_descriptor_table_ptr(ctxt, selector, &dt);
 
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
-	addr = dt.address + index * 8;
-	ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
 
-       return ret;
+	addr = dt.address + index * 8;
+	return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc,
+				   &ctxt->exception);
 }
 
 /* allowed just for 8 bytes segments */
 static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops *ops,
 				    u16 selector, struct desc_struct *desc)
 {
 	struct desc_ptr dt;
 	u16 index = selector >> 3;
 	ulong addr;
-	int ret;
 
-	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+	get_descriptor_table_ptr(ctxt, selector, &dt);
 
 	if (dt.size < index * 8 + 7)
 		return emulate_gp(ctxt, selector & 0xfffc);
 
 	addr = dt.address + index * 8;
-	ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
-
-	return ret;
+	return ctxt->ops->write_std(ctxt, addr, desc, sizeof *desc,
+				    &ctxt->exception);
 }
 
 /* Does not support long mode */
 static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 selector, int seg)
 {
 	struct desc_struct seg_desc;
@@ -1253,7 +1235,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (null_selector) /* for NULL selector skip all following checks */
 		goto load;
 
-	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	ret = read_segment_descriptor(ctxt, selector, &seg_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -1271,7 +1253,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	rpl = selector & 3;
 	dpl = seg_desc.dpl;
-	cpl = ops->cpl(ctxt);
+	cpl = ctxt->ops->cpl(ctxt);
 
 	switch (seg) {
 	case VCPU_SREG_SS:
@@ -1322,12 +1304,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	if (seg_desc.s) {
 		/* mark segment as accessed */
 		seg_desc.type |= 1;
-		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		ret = write_segment_descriptor(ctxt, selector, &seg_desc);
 		if (ret != X86EMUL_CONTINUE)
 			return ret;
 	}
 load:
-	ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
+	ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
 	return X86EMUL_CONTINUE;
 exception:
 	emulate_exception(ctxt, err_vec, err_code, true);
@@ -1356,29 +1338,28 @@ static void write_register_operand(struct operand *op)
 static int writeback(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
-	struct decode_cache *c = &ctxt->decode;
 
-	switch (c->dst.type) {
+	switch (ctxt->dst.type) {
 	case OP_REG:
-		write_register_operand(&c->dst);
+		write_register_operand(&ctxt->dst);
 		break;
 	case OP_MEM:
-		if (c->lock_prefix)
+		if (ctxt->lock_prefix)
 			rc = segmented_cmpxchg(ctxt,
-					       c->dst.addr.mem,
-					       &c->dst.orig_val,
-					       &c->dst.val,
-					       c->dst.bytes);
+					       ctxt->dst.addr.mem,
+					       &ctxt->dst.orig_val,
+					       &ctxt->dst.val,
+					       ctxt->dst.bytes);
 		else
 			rc = segmented_write(ctxt,
-					     c->dst.addr.mem,
-					     &c->dst.val,
-					     c->dst.bytes);
+					     ctxt->dst.addr.mem,
+					     &ctxt->dst.val,
+					     ctxt->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
 	case OP_XMM:
-		write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
+		write_sse_reg(ctxt, &ctxt->dst.vec_val, ctxt->dst.addr.xmm);
 		break;
 	case OP_NONE:
 		/* no writeback */
@@ -1391,50 +1372,45 @@ static int writeback(struct x86_emulate_ctxt *ctxt)
 
 static int em_push(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct segmented_address addr;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes);
+	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
 	addr.seg = VCPU_SREG_SS;
 
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
-	return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
+	ctxt->dst.type = OP_NONE;
+	return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes);
 }
 
 static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 		       void *dest, int len)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	struct segmented_address addr;
 
-	addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+	addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]);
 	addr.seg = VCPU_SREG_SS;
 	rc = segmented_read(ctxt, addr, dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len);
 	return rc;
 }
 
 static int em_pop(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
 static int emulate_popf(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops,
-		       void *dest, int len)
+			void *dest, int len)
 {
 	int rc;
 	unsigned long val, change_mask;
 	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = ops->cpl(ctxt);
+	int cpl = ctxt->ops->cpl(ctxt);
 
 	rc = emulate_pop(ctxt, &val, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1470,49 +1446,41 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 
 static int em_popf(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.type = OP_REG;
-	c->dst.addr.reg = &ctxt->eflags;
-	c->dst.bytes = c->op_bytes;
-	return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->eflags;
+	ctxt->dst.bytes = ctxt->op_bytes;
+	return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 }
 
-static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops, int seg)
+static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val = get_segment_selector(ctxt, seg);
+	ctxt->src.val = get_segment_selector(ctxt, seg);
 
 	return em_push(ctxt);
 }
 
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops, int seg)
+static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
 	unsigned long selector;
 	int rc;
 
-	rc = emulate_pop(ctxt, &selector, c->op_bytes);
+	rc = emulate_pop(ctxt, &selector, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
+	rc = load_segment_descriptor(ctxt, (u16)selector, seg);
 	return rc;
 }
 
 static int em_pusha(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+	unsigned long old_esp = ctxt->regs[VCPU_REGS_RSP];
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RAX;
 
 	while (reg <= VCPU_REGS_RDI) {
 		(reg == VCPU_REGS_RSP) ?
-		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
+		(ctxt->src.val = old_esp) : (ctxt->src.val = ctxt->regs[reg]);
 
 		rc = em_push(ctxt);
 		if (rc != X86EMUL_CONTINUE)
@@ -1526,26 +1494,23 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.val =  (unsigned long)ctxt->eflags;
+	ctxt->src.val =  (unsigned long)ctxt->eflags;
 	return em_push(ctxt);
 }
 
 static int em_popa(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
 
 	while (reg >= VCPU_REGS_RAX) {
 		if (reg == VCPU_REGS_RSP) {
-			register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-							c->op_bytes);
+			register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP],
+							ctxt->op_bytes);
 			--reg;
 		}
 
-		rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
+		rc = emulate_pop(ctxt, &ctxt->regs[reg], ctxt->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
@@ -1553,10 +1518,9 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 	return rc;
 }
 
-int emulate_int_real(struct x86_emulate_ctxt *ctxt,
-			       struct x86_emulate_ops *ops, int irq)
+int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 {
-	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	int rc;
 	struct desc_ptr dt;
 	gva_t cs_addr;
@@ -1564,19 +1528,19 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	u16 cs, eip;
 
 	/* TODO: Add limit checks */
-	c->src.val = ctxt->eflags;
+	ctxt->src.val = ctxt->eflags;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
 
-	c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
+	ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->src.val = c->eip;
+	ctxt->src.val = ctxt->_eip;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1594,21 +1558,20 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, cs, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = eip;
+	ctxt->_eip = eip;
 
 	return rc;
 }
 
-static int emulate_int(struct x86_emulate_ctxt *ctxt,
-		       struct x86_emulate_ops *ops, int irq)
+static int emulate_int(struct x86_emulate_ctxt *ctxt, int irq)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_int_real(ctxt, ops, irq);
+		return emulate_int_real(ctxt, irq);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1619,10 +1582,8 @@ static int emulate_int(struct x86_emulate_ctxt *ctxt,
 	}
 }
 
-static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
-			     struct x86_emulate_ops *ops)
+static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	unsigned long temp_eip = 0;
 	unsigned long temp_eflags = 0;
@@ -1634,7 +1595,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 
 	/* TODO: Add stack limit check */
 
-	rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
+	rc = emulate_pop(ctxt, &temp_eip, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
@@ -1642,27 +1603,27 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	if (temp_eip & ~0xffff)
 		return emulate_gp(ctxt, 0);
 
-	rc = emulate_pop(ctxt, &cs, c->op_bytes);
+	rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
+	rc = emulate_pop(ctxt, &temp_eflags, ctxt->op_bytes);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
 
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = temp_eip;
+	ctxt->_eip = temp_eip;
 
 
-	if (c->op_bytes == 4)
+	if (ctxt->op_bytes == 4)
 		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
-	else if (c->op_bytes == 2) {
+	else if (ctxt->op_bytes == 2) {
 		ctxt->eflags &= ~0xffff;
 		ctxt->eflags |= temp_eflags;
 	}
@@ -1673,12 +1634,11 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops* ops)
+static int em_iret(struct x86_emulate_ctxt *ctxt)
 {
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_REAL:
-		return emulate_iret_real(ctxt, ops);
+		return emulate_iret_real(ctxt);
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_PROT16:
 	case X86EMUL_MODE_PROT32:
@@ -1691,53 +1651,49 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
 
 static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	unsigned short sel;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-	rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->eip = 0;
-	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+	ctxt->_eip = 0;
+	memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_grp1a(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
+	return emulate_pop(ctxt, &ctxt->dst.val, ctxt->dst.bytes);
 }
 
 static int em_grp2(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0:	/* rol */
-		emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 1:	/* ror */
-		emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 2:	/* rcl */
-		emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 3:	/* rcr */
-		emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 4:	/* sal/shl */
 	case 6:	/* sal/shl */
-		emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 5:	/* shr */
-		emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 7:	/* sar */
-		emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	}
 	return X86EMUL_CONTINUE;
@@ -1745,33 +1701,32 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp3(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	unsigned long *rax = &c->regs[VCPU_REGS_RAX];
-	unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
+	unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
+	unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
 	u8 de = 0;
 
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0 ... 1:	/* test */
-		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 2:	/* not */
-		c->dst.val = ~c->dst.val;
+		ctxt->dst.val = ~ctxt->dst.val;
 		break;
 	case 3:	/* neg */
-		emulate_1op("neg", c->dst, ctxt->eflags);
+		emulate_1op("neg", ctxt->dst, ctxt->eflags);
 		break;
 	case 4: /* mul */
-		emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 5: /* imul */
-		emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
+		emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
 		break;
 	case 6: /* div */
-		emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
+		emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
 				       ctxt->eflags, de);
 		break;
 	case 7: /* idiv */
-		emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
+		emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
 				       ctxt->eflags, de);
 		break;
 	default:
@@ -1784,26 +1739,25 @@ static int em_grp3(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp45(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
-	switch (c->modrm_reg) {
+	switch (ctxt->modrm_reg) {
 	case 0:	/* inc */
-		emulate_1op("inc", c->dst, ctxt->eflags);
+		emulate_1op("inc", ctxt->dst, ctxt->eflags);
 		break;
 	case 1:	/* dec */
-		emulate_1op("dec", c->dst, ctxt->eflags);
+		emulate_1op("dec", ctxt->dst, ctxt->eflags);
 		break;
 	case 2: /* call near abs */ {
 		long int old_eip;
-		old_eip = c->eip;
-		c->eip = c->src.val;
-		c->src.val = old_eip;
+		old_eip = ctxt->_eip;
+		ctxt->_eip = ctxt->src.val;
+		ctxt->src.val = old_eip;
 		rc = em_push(ctxt);
 		break;
 	}
 	case 4: /* jmp abs */
-		c->eip = c->src.val;
+		ctxt->_eip = ctxt->src.val;
 		break;
 	case 5: /* jmp far */
 		rc = em_jmp_far(ctxt);
@@ -1817,68 +1771,70 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
 
 static int em_grp9(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u64 old = c->dst.orig_val64;
+	u64 old = ctxt->dst.orig_val64;
 
-	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
-	    ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
-		c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
-		c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
+	if (((u32) (old >> 0) != (u32) ctxt->regs[VCPU_REGS_RAX]) ||
+	    ((u32) (old >> 32) != (u32) ctxt->regs[VCPU_REGS_RDX])) {
+		ctxt->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
+		ctxt->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
 		ctxt->eflags &= ~EFLG_ZF;
 	} else {
-		c->dst.val64 = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
-			(u32) c->regs[VCPU_REGS_RBX];
+		ctxt->dst.val64 = ((u64)ctxt->regs[VCPU_REGS_RCX] << 32) |
+			(u32) ctxt->regs[VCPU_REGS_RBX];
 
 		ctxt->eflags |= EFLG_ZF;
 	}
 	return X86EMUL_CONTINUE;
 }
 
-static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops)
+static int em_ret(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->_eip;
+	ctxt->dst.bytes = ctxt->op_bytes;
+	return em_pop(ctxt);
+}
+
+static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	unsigned long cs;
 
-	rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
+	rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	if (c->op_bytes == 4)
-		c->eip = (u32)c->eip;
-	rc = emulate_pop(ctxt, &cs, c->op_bytes);
+	if (ctxt->op_bytes == 4)
+		ctxt->_eip = (u32)ctxt->_eip;
+	rc = emulate_pop(ctxt, &cs, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS);
 	return rc;
 }
 
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
-			   struct x86_emulate_ops *ops, int seg)
+static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
 {
-	struct decode_cache *c = &ctxt->decode;
 	unsigned short sel;
 	int rc;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
-	rc = load_segment_descriptor(ctxt, ops, sel, seg);
+	rc = load_segment_descriptor(ctxt, sel, seg);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->dst.val = c->src.val;
+	ctxt->dst.val = ctxt->src.val;
 	return rc;
 }
 
-static inline void
+static void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
-			struct x86_emulate_ops *ops, struct desc_struct *cs,
-			struct desc_struct *ss)
+			struct desc_struct *cs, struct desc_struct *ss)
 {
 	u16 selector;
 
 	memset(cs, 0, sizeof(struct desc_struct));
-	ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
+	ctxt->ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
 	memset(ss, 0, sizeof(struct desc_struct));
 
 	cs->l = 0;		/* will be adjusted later */
@@ -1901,10 +1857,9 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 	ss->p = 1;
 }
 
-static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_syscall(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	u16 cs_sel, ss_sel;
@@ -1916,7 +1871,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		return emulate_ud(ctxt);
 
 	ops->get_msr(ctxt, MSR_EFER, &efer);
-	setup_syscalls_segments(ctxt, ops, &cs, &ss);
+	setup_syscalls_segments(ctxt, &cs, &ss);
 
 	ops->get_msr(ctxt, MSR_STAR, &msr_data);
 	msr_data >>= 32;
@@ -1930,15 +1885,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	c->regs[VCPU_REGS_RCX] = c->eip;
+	ctxt->regs[VCPU_REGS_RCX] = ctxt->_eip;
 	if (efer & EFER_LMA) {
 #ifdef CONFIG_X86_64
-		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
+		ctxt->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 
 		ops->get_msr(ctxt,
 			     ctxt->mode == X86EMUL_MODE_PROT64 ?
 			     MSR_LSTAR : MSR_CSTAR, &msr_data);
-		c->eip = msr_data;
+		ctxt->_eip = msr_data;
 
 		ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~(msr_data | EFLG_RF);
@@ -1946,7 +1901,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	} else {
 		/* legacy mode */
 		ops->get_msr(ctxt, MSR_STAR, &msr_data);
-		c->eip = (u32)msr_data;
+		ctxt->_eip = (u32)msr_data;
 
 		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
 	}
@@ -1954,16 +1909,15 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	return X86EMUL_CONTINUE;
 }
 
-static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	u16 cs_sel, ss_sel;
 	u64 efer = 0;
 
-	ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+	ops->get_msr(ctxt, MSR_EFER, &efer);
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL)
 		return emulate_gp(ctxt, 0);
@@ -1974,7 +1928,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	if (ctxt->mode == X86EMUL_MODE_PROT64)
 		return emulate_ud(ctxt);
 
-	setup_syscalls_segments(ctxt, ops, &cs, &ss);
+	setup_syscalls_segments(ctxt, &cs, &ss);
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
@@ -2002,31 +1956,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
-	c->eip = msr_data;
+	ctxt->_eip = msr_data;
 
 	ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
-	c->regs[VCPU_REGS_RSP] = msr_data;
+	ctxt->regs[VCPU_REGS_RSP] = msr_data;
 
 	return X86EMUL_CONTINUE;
 }
 
-static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
+static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct cs, ss;
 	u64 msr_data;
 	int usermode;
-	u16 cs_sel, ss_sel;
+	u16 cs_sel = 0, ss_sel = 0;
 
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86)
 		return emulate_gp(ctxt, 0);
 
-	setup_syscalls_segments(ctxt, ops, &cs, &ss);
+	setup_syscalls_segments(ctxt, &cs, &ss);
 
-	if ((c->rex_prefix & 0x8) != 0x0)
+	if ((ctxt->rex_prefix & 0x8) != 0x0)
 		usermode = X86EMUL_MODE_PROT64;
 	else
 		usermode = X86EMUL_MODE_PROT32;
@@ -2056,14 +2009,13 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
 
-	c->eip = c->regs[VCPU_REGS_RDX];
-	c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
+	ctxt->_eip = ctxt->regs[VCPU_REGS_RDX];
+	ctxt->regs[VCPU_REGS_RSP] = ctxt->regs[VCPU_REGS_RCX];
 
 	return X86EMUL_CONTINUE;
 }
 
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
-			      struct x86_emulate_ops *ops)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 {
 	int iopl;
 	if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -2071,13 +2023,13 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
 	if (ctxt->mode == X86EMUL_MODE_VM86)
 		return true;
 	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	return ops->cpl(ctxt) > iopl;
+	return ctxt->ops->cpl(ctxt) > iopl;
 }
 
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
-					    struct x86_emulate_ops *ops,
 					    u16 port, u16 len)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct tr_seg;
 	u32 base3;
 	int r;
@@ -2108,14 +2060,13 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 }
 
 static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
-				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
 	if (ctxt->perm_ok)
 		return true;
 
-	if (emulator_bad_iopl(ctxt, ops))
-		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
+	if (emulator_bad_iopl(ctxt))
+		if (!emulator_io_port_access_allowed(ctxt, port, len))
 			return false;
 
 	ctxt->perm_ok = true;
@@ -2124,21 +2075,18 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 }
 
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops,
 				struct tss_segment_16 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	tss->ip = c->eip;
+	tss->ip = ctxt->_eip;
 	tss->flag = ctxt->eflags;
-	tss->ax = c->regs[VCPU_REGS_RAX];
-	tss->cx = c->regs[VCPU_REGS_RCX];
-	tss->dx = c->regs[VCPU_REGS_RDX];
-	tss->bx = c->regs[VCPU_REGS_RBX];
-	tss->sp = c->regs[VCPU_REGS_RSP];
-	tss->bp = c->regs[VCPU_REGS_RBP];
-	tss->si = c->regs[VCPU_REGS_RSI];
-	tss->di = c->regs[VCPU_REGS_RDI];
+	tss->ax = ctxt->regs[VCPU_REGS_RAX];
+	tss->cx = ctxt->regs[VCPU_REGS_RCX];
+	tss->dx = ctxt->regs[VCPU_REGS_RDX];
+	tss->bx = ctxt->regs[VCPU_REGS_RBX];
+	tss->sp = ctxt->regs[VCPU_REGS_RSP];
+	tss->bp = ctxt->regs[VCPU_REGS_RBP];
+	tss->si = ctxt->regs[VCPU_REGS_RSI];
+	tss->di = ctxt->regs[VCPU_REGS_RDI];
 
 	tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2148,22 +2096,20 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 }
 
 static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
-				 struct x86_emulate_ops *ops,
 				 struct tss_segment_16 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	c->eip = tss->ip;
+	ctxt->_eip = tss->ip;
 	ctxt->eflags = tss->flag | 2;
-	c->regs[VCPU_REGS_RAX] = tss->ax;
-	c->regs[VCPU_REGS_RCX] = tss->cx;
-	c->regs[VCPU_REGS_RDX] = tss->dx;
-	c->regs[VCPU_REGS_RBX] = tss->bx;
-	c->regs[VCPU_REGS_RSP] = tss->sp;
-	c->regs[VCPU_REGS_RBP] = tss->bp;
-	c->regs[VCPU_REGS_RSI] = tss->si;
-	c->regs[VCPU_REGS_RDI] = tss->di;
+	ctxt->regs[VCPU_REGS_RAX] = tss->ax;
+	ctxt->regs[VCPU_REGS_RCX] = tss->cx;
+	ctxt->regs[VCPU_REGS_RDX] = tss->dx;
+	ctxt->regs[VCPU_REGS_RBX] = tss->bx;
+	ctxt->regs[VCPU_REGS_RSP] = tss->sp;
+	ctxt->regs[VCPU_REGS_RBP] = tss->bp;
+	ctxt->regs[VCPU_REGS_RSI] = tss->si;
+	ctxt->regs[VCPU_REGS_RDI] = tss->di;
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
@@ -2179,19 +2125,19 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
-	ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+	ret = load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2199,10 +2145,10 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
 }
 
 static int task_switch_16(struct x86_emulate_ctxt *ctxt,
-			  struct x86_emulate_ops *ops,
 			  u16 tss_selector, u16 old_tss_sel,
 			  ulong old_tss_base, struct desc_struct *new_desc)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct tss_segment_16 tss_seg;
 	int ret;
 	u32 new_tss_base = get_desc_base(new_desc);
@@ -2213,7 +2159,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
-	save_state_to_tss16(ctxt, ops, &tss_seg);
+	save_state_to_tss16(ctxt, &tss_seg);
 
 	ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			     &ctxt->exception);
@@ -2239,26 +2185,23 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			return ret;
 	}
 
-	return load_state_from_tss16(ctxt, ops, &tss_seg);
+	return load_state_from_tss16(ctxt, &tss_seg);
 }
 
 static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops,
 				struct tss_segment_32 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	tss->cr3 = ops->get_cr(ctxt, 3);
-	tss->eip = c->eip;
+	tss->cr3 = ctxt->ops->get_cr(ctxt, 3);
+	tss->eip = ctxt->_eip;
 	tss->eflags = ctxt->eflags;
-	tss->eax = c->regs[VCPU_REGS_RAX];
-	tss->ecx = c->regs[VCPU_REGS_RCX];
-	tss->edx = c->regs[VCPU_REGS_RDX];
-	tss->ebx = c->regs[VCPU_REGS_RBX];
-	tss->esp = c->regs[VCPU_REGS_RSP];
-	tss->ebp = c->regs[VCPU_REGS_RBP];
-	tss->esi = c->regs[VCPU_REGS_RSI];
-	tss->edi = c->regs[VCPU_REGS_RDI];
+	tss->eax = ctxt->regs[VCPU_REGS_RAX];
+	tss->ecx = ctxt->regs[VCPU_REGS_RCX];
+	tss->edx = ctxt->regs[VCPU_REGS_RDX];
+	tss->ebx = ctxt->regs[VCPU_REGS_RBX];
+	tss->esp = ctxt->regs[VCPU_REGS_RSP];
+	tss->ebp = ctxt->regs[VCPU_REGS_RBP];
+	tss->esi = ctxt->regs[VCPU_REGS_RSI];
+	tss->edi = ctxt->regs[VCPU_REGS_RDI];
 
 	tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
 	tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
@@ -2270,24 +2213,22 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
-				 struct x86_emulate_ops *ops,
 				 struct tss_segment_32 *tss)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	if (ops->set_cr(ctxt, 3, tss->cr3))
+	if (ctxt->ops->set_cr(ctxt, 3, tss->cr3))
 		return emulate_gp(ctxt, 0);
-	c->eip = tss->eip;
+	ctxt->_eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
-	c->regs[VCPU_REGS_RAX] = tss->eax;
-	c->regs[VCPU_REGS_RCX] = tss->ecx;
-	c->regs[VCPU_REGS_RDX] = tss->edx;
-	c->regs[VCPU_REGS_RBX] = tss->ebx;
-	c->regs[VCPU_REGS_RSP] = tss->esp;
-	c->regs[VCPU_REGS_RBP] = tss->ebp;
-	c->regs[VCPU_REGS_RSI] = tss->esi;
-	c->regs[VCPU_REGS_RDI] = tss->edi;
+	ctxt->regs[VCPU_REGS_RAX] = tss->eax;
+	ctxt->regs[VCPU_REGS_RCX] = tss->ecx;
+	ctxt->regs[VCPU_REGS_RDX] = tss->edx;
+	ctxt->regs[VCPU_REGS_RBX] = tss->ebx;
+	ctxt->regs[VCPU_REGS_RSP] = tss->esp;
+	ctxt->regs[VCPU_REGS_RBP] = tss->ebp;
+	ctxt->regs[VCPU_REGS_RSI] = tss->esi;
+	ctxt->regs[VCPU_REGS_RDI] = tss->edi;
 
 	/*
 	 * SDM says that segment selectors are loaded before segment
@@ -2305,25 +2246,25 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	 * Now load segment descriptors. If fault happenes at this stage
 	 * it is handled in a context of new task
 	 */
-	ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+	ret = load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	ret = load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	ret = load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	ret = load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	ret = load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+	ret = load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+	ret = load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2331,10 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int task_switch_32(struct x86_emulate_ctxt *ctxt,
-			  struct x86_emulate_ops *ops,
 			  u16 tss_selector, u16 old_tss_sel,
 			  ulong old_tss_base, struct desc_struct *new_desc)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct tss_segment_32 tss_seg;
 	int ret;
 	u32 new_tss_base = get_desc_base(new_desc);
@@ -2345,7 +2286,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 		/* FIXME: need to provide precise fault address */
 		return ret;
 
-	save_state_to_tss32(ctxt, ops, &tss_seg);
+	save_state_to_tss32(ctxt, &tss_seg);
 
 	ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
 			     &ctxt->exception);
@@ -2371,14 +2312,14 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			return ret;
 	}
 
-	return load_state_from_tss32(ctxt, ops, &tss_seg);
+	return load_state_from_tss32(ctxt, &tss_seg);
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				   struct x86_emulate_ops *ops,
 				   u16 tss_selector, int reason,
 				   bool has_error_code, u32 error_code)
 {
+	struct x86_emulate_ops *ops = ctxt->ops;
 	struct desc_struct curr_tss_desc, next_tss_desc;
 	int ret;
 	u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
@@ -2388,10 +2329,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	/* FIXME: old_tss_base == ~0 ? */
 
-	ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+	ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
-	ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+	ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
 
@@ -2413,8 +2354,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
 		curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
-		write_segment_descriptor(ctxt, ops, old_tss_sel,
-					 &curr_tss_desc);
+		write_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc);
 	}
 
 	if (reason == TASK_SWITCH_IRET)
@@ -2426,10 +2366,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		old_tss_sel = 0xffff;
 
 	if (next_tss_desc.type & 8)
-		ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+		ret = task_switch_32(ctxt, tss_selector, old_tss_sel,
 				     old_tss_base, &next_tss_desc);
 	else
-		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+		ret = task_switch_16(ctxt, tss_selector, old_tss_sel,
 				     old_tss_base, &next_tss_desc);
 	if (ret != X86EMUL_CONTINUE)
 		return ret;
@@ -2439,19 +2379,16 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (reason != TASK_SWITCH_IRET) {
 		next_tss_desc.type |= (1 << 1); /* set busy flag */
-		write_segment_descriptor(ctxt, ops, tss_selector,
-					 &next_tss_desc);
+		write_segment_descriptor(ctxt, tss_selector, &next_tss_desc);
 	}
 
 	ops->set_cr(ctxt, 0,  ops->get_cr(ctxt, 0) | X86_CR0_TS);
 	ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
 
 	if (has_error_code) {
-		struct decode_cache *c = &ctxt->decode;
-
-		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
-		c->lock_prefix = 0;
-		c->src.val = (unsigned long) error_code;
+		ctxt->op_bytes = ctxt->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+		ctxt->lock_prefix = 0;
+		ctxt->src.val = (unsigned long) error_code;
 		ret = em_push(ctxt);
 	}
 
@@ -2462,18 +2399,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 u16 tss_selector, int reason,
 			 bool has_error_code, u32 error_code)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	c->eip = ctxt->eip;
-	c->dst.type = OP_NONE;
+	ctxt->_eip = ctxt->eip;
+	ctxt->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+	rc = emulator_do_task_switch(ctxt, tss_selector, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE)
-		ctxt->eip = c->eip;
+		ctxt->eip = ctxt->_eip;
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
@@ -2481,22 +2416,20 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
 			    int reg, struct operand *op)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
-	register_address_increment(c, &c->regs[reg], df * op->bytes);
-	op->addr.mem.ea = register_address(c, c->regs[reg]);
+	register_address_increment(ctxt, &ctxt->regs[reg], df * op->bytes);
+	op->addr.mem.ea = register_address(ctxt, ctxt->regs[reg]);
 	op->addr.mem.seg = seg;
 }
 
 static int em_das(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u8 al, old_al;
 	bool af, cf, old_cf;
 
 	cf = ctxt->eflags & X86_EFLAGS_CF;
-	al = c->dst.val;
+	al = ctxt->dst.val;
 
 	old_al = al;
 	old_cf = cf;
@@ -2514,12 +2447,12 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 		cf = true;
 	}
 
-	c->dst.val = al;
+	ctxt->dst.val = al;
 	/* Set PF, ZF, SF */
-	c->src.type = OP_IMM;
-	c->src.val = 0;
-	c->src.bytes = 1;
-	emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+	ctxt->src.type = OP_IMM;
+	ctxt->src.val = 0;
+	ctxt->src.bytes = 1;
+	emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
 	ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
 	if (cf)
 		ctxt->eflags |= X86_EFLAGS_CF;
@@ -2530,175 +2463,189 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
 
 static int em_call_far(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u16 sel, old_cs;
 	ulong old_eip;
 	int rc;
 
 	old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
-	old_eip = c->eip;
+	old_eip = ctxt->_eip;
 
-	memcpy(&sel, c->src.valptr + c->op_bytes, 2);
-	if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
+	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
+	if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS))
 		return X86EMUL_CONTINUE;
 
-	c->eip = 0;
-	memcpy(&c->eip, c->src.valptr, c->op_bytes);
+	ctxt->_eip = 0;
+	memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes);
 
-	c->src.val = old_cs;
+	ctxt->src.val = old_cs;
 	rc = em_push(ctxt);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	c->src.val = old_eip;
+	ctxt->src.val = old_eip;
 	return em_push(ctxt);
 }
 
 static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	c->dst.type = OP_REG;
-	c->dst.addr.reg = &c->eip;
-	c->dst.bytes = c->op_bytes;
-	rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.addr.reg = &ctxt->_eip;
+	ctxt->dst.bytes = ctxt->op_bytes;
+	rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_add(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_or(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_adc(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_sbb(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_and(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_sub(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_xor(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_cmp(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+	emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
-static int em_imul(struct x86_emulate_ctxt *ctxt)
+static int em_test(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
+	return X86EMUL_CONTINUE;
+}
+
+static int em_xchg(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
+	/* Write back the register source. */
+	ctxt->src.val = ctxt->dst.val;
+	write_register_operand(&ctxt->src);
 
-	emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
+	/* Write back the memory destination with implicit LOCK prefix. */
+	ctxt->dst.val = ctxt->src.orig_val;
+	ctxt->lock_prefix = 1;
 	return X86EMUL_CONTINUE;
 }
 
-static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+static int em_imul(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
+	emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
+	return X86EMUL_CONTINUE;
+}
 
-	c->dst.val = c->src2.val;
+static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
+{
+	ctxt->dst.val = ctxt->src2.val;
 	return em_imul(ctxt);
 }
 
 static int em_cwd(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.type = OP_REG;
-	c->dst.bytes = c->src.bytes;
-	c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
-	c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
+	ctxt->dst.type = OP_REG;
+	ctxt->dst.bytes = ctxt->src.bytes;
+	ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+	ctxt->dst.val = ~((ctxt->src.val >> (ctxt->src.bytes * 8 - 1)) - 1);
 
 	return X86EMUL_CONTINUE;
 }
 
 static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	u64 tsc = 0;
 
 	ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
-	c->regs[VCPU_REGS_RAX] = (u32)tsc;
-	c->regs[VCPU_REGS_RDX] = tsc >> 32;
+	ctxt->regs[VCPU_REGS_RAX] = (u32)tsc;
+	ctxt->regs[VCPU_REGS_RDX] = tsc >> 32;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_mov(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	c->dst.val = c->src.val;
+	ctxt->dst.val = ctxt->src.val;
 	return X86EMUL_CONTINUE;
 }
 
+static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
+{
+	if (ctxt->modrm_reg > VCPU_SREG_GS)
+		return emulate_ud(ctxt);
+
+	ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+	return X86EMUL_CONTINUE;
+}
+
+static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt)
+{
+	u16 sel = ctxt->src.val;
+
+	if (ctxt->modrm_reg == VCPU_SREG_CS || ctxt->modrm_reg > VCPU_SREG_GS)
+		return emulate_ud(ctxt);
+
+	if (ctxt->modrm_reg == VCPU_SREG_SS)
+		ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
+
+	/* Disable writeback. */
+	ctxt->dst.type = OP_NONE;
+	return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg);
+}
+
 static int em_movdqu(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
+	memcpy(&ctxt->dst.vec_val, &ctxt->src.vec_val, ctxt->op_bytes);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_invlpg(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	ulong linear;
 
-	rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
+	rc = linearize(ctxt, ctxt->src.addr.mem, 1, false, &linear);
 	if (rc == X86EMUL_CONTINUE)
 		ctxt->ops->invlpg(ctxt, linear);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
@@ -2714,10 +2661,9 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
 
 static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	if (c->modrm_mod != 3 || c->modrm_rm != 1)
+	if (ctxt->modrm_mod != 3 || ctxt->modrm_rm != 1)
 		return X86EMUL_UNHANDLEABLE;
 
 	rc = ctxt->ops->fix_hypercall(ctxt);
@@ -2725,73 +2671,104 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt)
 		return rc;
 
 	/* Let the processor re-execute the fixed hypercall */
-	c->eip = ctxt->eip;
+	ctxt->_eip = ctxt->eip;
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_lgdt(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_ptr desc_ptr;
 	int rc;
 
-	rc = read_descriptor(ctxt, c->src.addr.mem,
+	rc = read_descriptor(ctxt, ctxt->src.addr.mem,
 			     &desc_ptr.size, &desc_ptr.address,
-			     c->op_bytes);
+			     ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	ctxt->ops->set_gdt(ctxt, &desc_ptr);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
 	rc = ctxt->ops->fix_hypercall(ctxt);
 
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return rc;
 }
 
 static int em_lidt(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	struct desc_ptr desc_ptr;
 	int rc;
 
-	rc = read_descriptor(ctxt, c->src.addr.mem,
+	rc = read_descriptor(ctxt, ctxt->src.addr.mem,
 			     &desc_ptr.size, &desc_ptr.address,
-			     c->op_bytes);
+			     ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	ctxt->ops->set_idt(ctxt, &desc_ptr);
 	/* Disable writeback. */
-	c->dst.type = OP_NONE;
+	ctxt->dst.type = OP_NONE;
 	return X86EMUL_CONTINUE;
 }
 
 static int em_smsw(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.bytes = 2;
-	c->dst.val = ctxt->ops->get_cr(ctxt, 0);
+	ctxt->dst.bytes = 2;
+	ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0);
 	return X86EMUL_CONTINUE;
 }
 
 static int em_lmsw(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
 	ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
-			  | (c->src.val & 0x0f));
-	c->dst.type = OP_NONE;
+			  | (ctxt->src.val & 0x0f));
+	ctxt->dst.type = OP_NONE;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_loop(struct x86_emulate_ctxt *ctxt)
+{
+	register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
+	if ((address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) != 0) &&
+	    (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
+		jmp_rel(ctxt, ctxt->src.val);
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_jcxz(struct x86_emulate_ctxt *ctxt)
+{
+	if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0)
+		jmp_rel(ctxt, ctxt->src.val);
+
+	return X86EMUL_CONTINUE;
+}
+
+static int em_cli(struct x86_emulate_ctxt *ctxt)
+{
+	if (emulator_bad_iopl(ctxt))
+		return emulate_gp(ctxt, 0);
+
+	ctxt->eflags &= ~X86_EFLAGS_IF;
+	return X86EMUL_CONTINUE;
+}
+
+static int em_sti(struct x86_emulate_ctxt *ctxt)
+{
+	if (emulator_bad_iopl(ctxt))
+		return emulate_gp(ctxt, 0);
+
+	ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
+	ctxt->eflags |= X86_EFLAGS_IF;
 	return X86EMUL_CONTINUE;
 }
 
@@ -2809,9 +2786,7 @@ static bool valid_cr(int nr)
 
 static int check_cr_read(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	if (!valid_cr(c->modrm_reg))
+	if (!valid_cr(ctxt->modrm_reg))
 		return emulate_ud(ctxt);
 
 	return X86EMUL_CONTINUE;
@@ -2819,9 +2794,8 @@ static int check_cr_read(struct x86_emulate_ctxt *ctxt)
 
 static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u64 new_val = c->src.val64;
-	int cr = c->modrm_reg;
+	u64 new_val = ctxt->src.val64;
+	int cr = ctxt->modrm_reg;
 	u64 efer = 0;
 
 	static u64 cr_reserved_bits[] = {
@@ -2898,8 +2872,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
 
 static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	int dr = c->modrm_reg;
+	int dr = ctxt->modrm_reg;
 	u64 cr4;
 
 	if (dr > 7)
@@ -2917,9 +2890,8 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
 
 static int check_dr_write(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-	u64 new_val = c->src.val64;
-	int dr = c->modrm_reg;
+	u64 new_val = ctxt->src.val64;
+	int dr = ctxt->modrm_reg;
 
 	if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
 		return emulate_gp(ctxt, 0);
@@ -2941,7 +2913,7 @@ static int check_svme(struct x86_emulate_ctxt *ctxt)
 
 static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
 {
-	u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
+	u64 rax = ctxt->regs[VCPU_REGS_RAX];
 
 	/* Valid physical address? */
 	if (rax & 0xffff000000000000ULL)
@@ -2963,7 +2935,7 @@ static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
 static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 {
 	u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
-	u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
+	u64 rcx = ctxt->regs[VCPU_REGS_RCX];
 
 	if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
 	    (rcx > 3))
@@ -2974,10 +2946,8 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
 
 static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->dst.bytes = min(c->dst.bytes, 4u);
-	if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
+	ctxt->dst.bytes = min(ctxt->dst.bytes, 4u);
+	if (!emulator_io_permited(ctxt, ctxt->src.val, ctxt->dst.bytes))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -2985,10 +2955,8 @@ static int check_perm_in(struct x86_emulate_ctxt *ctxt)
 
 static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
-	c->src.bytes = min(c->src.bytes, 4u);
-	if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
+	ctxt->src.bytes = min(ctxt->src.bytes, 4u);
+	if (!emulator_io_permited(ctxt, ctxt->dst.val, ctxt->src.bytes))
 		return emulate_gp(ctxt, 0);
 
 	return X86EMUL_CONTINUE;
@@ -3165,12 +3133,15 @@ static struct opcode opcode_table[256] = {
 	G(DstMem | SrcImm | ModRM | Group, group1),
 	G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
 	G(DstMem | SrcImmByte | ModRM | Group, group1),
-	D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
+	I2bv(DstMem | SrcReg | ModRM, em_test),
+	I2bv(DstMem | SrcReg | ModRM | Lock, em_xchg),
 	/* 0x88 - 0x8F */
 	I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
 	I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
-	D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
-	D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
+	I(DstMem | SrcNone | ModRM | Mov, em_mov_rm_sreg),
+	D(ModRM | SrcMem | NoAccess | DstReg),
+	I(ImplicitOps | SrcMem16 | ModRM, em_mov_sreg_rm),
+	G(0, group1A),
 	/* 0x90 - 0x97 */
 	DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
 	/* 0x98 - 0x9F */
@@ -3184,7 +3155,7 @@ static struct opcode opcode_table[256] = {
 	I2bv(SrcSI | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstDI | String, em_cmp),
 	/* 0xA8 - 0xAF */
-	D2bv(DstAcc | SrcImm),
+	I2bv(DstAcc | SrcImm, em_test),
 	I2bv(SrcAcc | DstDI | Mov | String, em_mov),
 	I2bv(SrcSI | DstAcc | Mov | String, em_mov),
 	I2bv(SrcAcc | DstDI | String, em_cmp),
@@ -3195,25 +3166,26 @@ static struct opcode opcode_table[256] = {
 	/* 0xC0 - 0xC7 */
 	D2bv(DstMem | SrcImmByte | ModRM),
 	I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
-	D(ImplicitOps | Stack),
+	I(ImplicitOps | Stack, em_ret),
 	D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
 	G(ByteOp, group11), G(0, group11),
 	/* 0xC8 - 0xCF */
-	N, N, N, D(ImplicitOps | Stack),
+	N, N, N, I(ImplicitOps | Stack, em_ret_far),
 	D(ImplicitOps), DI(SrcImmByte, intn),
-	D(ImplicitOps | No64), DI(ImplicitOps, iret),
+	D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret),
 	/* 0xD0 - 0xD7 */
 	D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
 	N, N, N, N,
 	/* 0xD8 - 0xDF */
 	N, N, N, N, N, N, N, N,
 	/* 0xE0 - 0xE7 */
-	X4(D(SrcImmByte)),
+	X3(I(SrcImmByte, em_loop)),
+	I(SrcImmByte, em_jcxz),
 	D2bvIP(SrcImmUByte | DstAcc, in,  check_perm_in),
 	D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
-	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
+	I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
 	D2bvIP(SrcDX | DstAcc, in,  check_perm_in),
 	D2bvIP(SrcAcc | DstDX, out, check_perm_out),
 	/* 0xF0 - 0xF7 */
@@ -3221,14 +3193,16 @@ static struct opcode opcode_table[256] = {
 	DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
 	G(ByteOp, group3), G(0, group3),
 	/* 0xF8 - 0xFF */
-	D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
+	D(ImplicitOps), D(ImplicitOps),
+	I(ImplicitOps, em_cli), I(ImplicitOps, em_sti),
 	D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
 };
 
 static struct opcode twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	G(0, group6), GD(0, &group7), N, N,
-	N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
+	N, I(ImplicitOps | VendorSpecific, em_syscall),
+	II(ImplicitOps | Priv, em_clts, clts), N,
 	DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
 	N, D(ImplicitOps | ModRM), N, N,
 	/* 0x10 - 0x1F */
@@ -3245,7 +3219,8 @@ static struct opcode twobyte_table[256] = {
 	IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
 	DI(ImplicitOps | Priv, rdmsr),
 	DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
-	D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
+	I(ImplicitOps | VendorSpecific, em_sysenter),
+	I(ImplicitOps | Priv | VendorSpecific, em_sysexit),
 	N, N,
 	N, N, N, N, N, N, N, N,
 	/* 0x40 - 0x4F */
@@ -3313,11 +3288,11 @@ static struct opcode twobyte_table[256] = {
 #undef I2bv
 #undef I6ALU
 
-static unsigned imm_size(struct decode_cache *c)
+static unsigned imm_size(struct x86_emulate_ctxt *ctxt)
 {
 	unsigned size;
 
-	size = (c->d & ByteOp) ? 1 : c->op_bytes;
+	size = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
 	if (size == 8)
 		size = 4;
 	return size;
@@ -3326,23 +3301,21 @@ static unsigned imm_size(struct decode_cache *c)
 static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
 		      unsigned size, bool sign_extension)
 {
-	struct decode_cache *c = &ctxt->decode;
-	struct x86_emulate_ops *ops = ctxt->ops;
 	int rc = X86EMUL_CONTINUE;
 
 	op->type = OP_IMM;
 	op->bytes = size;
-	op->addr.mem.ea = c->eip;
+	op->addr.mem.ea = ctxt->_eip;
 	/* NB. Immediates are sign-extended as necessary. */
 	switch (op->bytes) {
 	case 1:
-		op->val = insn_fetch(s8, 1, c->eip);
+		op->val = insn_fetch(s8, 1, ctxt->_eip);
 		break;
 	case 2:
-		op->val = insn_fetch(s16, 2, c->eip);
+		op->val = insn_fetch(s16, 2, ctxt->_eip);
 		break;
 	case 4:
-		op->val = insn_fetch(s32, 4, c->eip);
+		op->val = insn_fetch(s32, 4, ctxt->_eip);
 		break;
 	}
 	if (!sign_extension) {
@@ -3362,11 +3335,8 @@ done:
 	return rc;
 }
 
-int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 {
-	struct x86_emulate_ops *ops = ctxt->ops;
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
@@ -3374,11 +3344,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 	struct opcode opcode;
 	struct operand memop = { .type = OP_NONE }, *memopp = NULL;
 
-	c->eip = ctxt->eip;
-	c->fetch.start = c->eip;
-	c->fetch.end = c->fetch.start + insn_len;
+	ctxt->_eip = ctxt->eip;
+	ctxt->fetch.start = ctxt->_eip;
+	ctxt->fetch.end = ctxt->fetch.start + insn_len;
 	if (insn_len > 0)
-		memcpy(c->fetch.data, insn, insn_len);
+		memcpy(ctxt->fetch.data, insn, insn_len);
 
 	switch (mode) {
 	case X86EMUL_MODE_REAL:
@@ -3399,46 +3369,46 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 		return -1;
 	}
 
-	c->op_bytes = def_op_bytes;
-	c->ad_bytes = def_ad_bytes;
+	ctxt->op_bytes = def_op_bytes;
+	ctxt->ad_bytes = def_ad_bytes;
 
 	/* Legacy prefixes. */
 	for (;;) {
-		switch (c->b = insn_fetch(u8, 1, c->eip)) {
+		switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
 		case 0x66:	/* operand-size override */
 			op_prefix = true;
 			/* switch between 2/4 bytes */
-			c->op_bytes = def_op_bytes ^ 6;
+			ctxt->op_bytes = def_op_bytes ^ 6;
 			break;
 		case 0x67:	/* address-size override */
 			if (mode == X86EMUL_MODE_PROT64)
 				/* switch between 4/8 bytes */
-				c->ad_bytes = def_ad_bytes ^ 12;
+				ctxt->ad_bytes = def_ad_bytes ^ 12;
 			else
 				/* switch between 2/4 bytes */
-				c->ad_bytes = def_ad_bytes ^ 6;
+				ctxt->ad_bytes = def_ad_bytes ^ 6;
 			break;
 		case 0x26:	/* ES override */
 		case 0x2e:	/* CS override */
 		case 0x36:	/* SS override */
 		case 0x3e:	/* DS override */
-			set_seg_override(c, (c->b >> 3) & 3);
+			set_seg_override(ctxt, (ctxt->b >> 3) & 3);
 			break;
 		case 0x64:	/* FS override */
 		case 0x65:	/* GS override */
-			set_seg_override(c, c->b & 7);
+			set_seg_override(ctxt, ctxt->b & 7);
 			break;
 		case 0x40 ... 0x4f: /* REX */
 			if (mode != X86EMUL_MODE_PROT64)
 				goto done_prefixes;
-			c->rex_prefix = c->b;
+			ctxt->rex_prefix = ctxt->b;
 			continue;
 		case 0xf0:	/* LOCK */
-			c->lock_prefix = 1;
+			ctxt->lock_prefix = 1;
 			break;
 		case 0xf2:	/* REPNE/REPNZ */
 		case 0xf3:	/* REP/REPE/REPZ */
-			c->rep_prefix = c->b;
+			ctxt->rep_prefix = ctxt->b;
 			break;
 		default:
 			goto done_prefixes;
@@ -3446,50 +3416,50 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 
 		/* Any legacy prefix after a REX prefix nullifies its effect. */
 
-		c->rex_prefix = 0;
+		ctxt->rex_prefix = 0;
 	}
 
 done_prefixes:
 
 	/* REX prefix. */
-	if (c->rex_prefix & 8)
-		c->op_bytes = 8;	/* REX.W */
+	if (ctxt->rex_prefix & 8)
+		ctxt->op_bytes = 8;	/* REX.W */
 
 	/* Opcode byte(s). */
-	opcode = opcode_table[c->b];
+	opcode = opcode_table[ctxt->b];
 	/* Two-byte opcode? */
-	if (c->b == 0x0f) {
-		c->twobyte = 1;
-		c->b = insn_fetch(u8, 1, c->eip);
-		opcode = twobyte_table[c->b];
+	if (ctxt->b == 0x0f) {
+		ctxt->twobyte = 1;
+		ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
+		opcode = twobyte_table[ctxt->b];
 	}
-	c->d = opcode.flags;
+	ctxt->d = opcode.flags;
 
-	while (c->d & GroupMask) {
-		switch (c->d & GroupMask) {
+	while (ctxt->d & GroupMask) {
+		switch (ctxt->d & GroupMask) {
 		case Group:
-			c->modrm = insn_fetch(u8, 1, c->eip);
-			--c->eip;
-			goffset = (c->modrm >> 3) & 7;
+			ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+			--ctxt->_eip;
+			goffset = (ctxt->modrm >> 3) & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case GroupDual:
-			c->modrm = insn_fetch(u8, 1, c->eip);
-			--c->eip;
-			goffset = (c->modrm >> 3) & 7;
-			if ((c->modrm >> 6) == 3)
+			ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+			--ctxt->_eip;
+			goffset = (ctxt->modrm >> 3) & 7;
+			if ((ctxt->modrm >> 6) == 3)
 				opcode = opcode.u.gdual->mod3[goffset];
 			else
 				opcode = opcode.u.gdual->mod012[goffset];
 			break;
 		case RMExt:
-			goffset = c->modrm & 7;
+			goffset = ctxt->modrm & 7;
 			opcode = opcode.u.group[goffset];
 			break;
 		case Prefix:
-			if (c->rep_prefix && op_prefix)
+			if (ctxt->rep_prefix && op_prefix)
 				return X86EMUL_UNHANDLEABLE;
-			simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
+			simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
 			switch (simd_prefix) {
 			case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
 			case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
@@ -3501,61 +3471,61 @@ done_prefixes:
 			return X86EMUL_UNHANDLEABLE;
 		}
 
-		c->d &= ~GroupMask;
-		c->d |= opcode.flags;
+		ctxt->d &= ~GroupMask;
+		ctxt->d |= opcode.flags;
 	}
 
-	c->execute = opcode.u.execute;
-	c->check_perm = opcode.check_perm;
-	c->intercept = opcode.intercept;
+	ctxt->execute = opcode.u.execute;
+	ctxt->check_perm = opcode.check_perm;
+	ctxt->intercept = opcode.intercept;
 
 	/* Unrecognised? */
-	if (c->d == 0 || (c->d & Undefined))
+	if (ctxt->d == 0 || (ctxt->d & Undefined))
 		return -1;
 
-	if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
+	if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
 		return -1;
 
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
-		c->op_bytes = 8;
+	if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
+		ctxt->op_bytes = 8;
 
-	if (c->d & Op3264) {
+	if (ctxt->d & Op3264) {
 		if (mode == X86EMUL_MODE_PROT64)
-			c->op_bytes = 8;
+			ctxt->op_bytes = 8;
 		else
-			c->op_bytes = 4;
+			ctxt->op_bytes = 4;
 	}
 
-	if (c->d & Sse)
-		c->op_bytes = 16;
+	if (ctxt->d & Sse)
+		ctxt->op_bytes = 16;
 
 	/* ModRM and SIB bytes. */
-	if (c->d & ModRM) {
-		rc = decode_modrm(ctxt, ops, &memop);
-		if (!c->has_seg_override)
-			set_seg_override(c, c->modrm_seg);
-	} else if (c->d & MemAbs)
-		rc = decode_abs(ctxt, ops, &memop);
+	if (ctxt->d & ModRM) {
+		rc = decode_modrm(ctxt, &memop);
+		if (!ctxt->has_seg_override)
+			set_seg_override(ctxt, ctxt->modrm_seg);
+	} else if (ctxt->d & MemAbs)
+		rc = decode_abs(ctxt, &memop);
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
-	if (!c->has_seg_override)
-		set_seg_override(c, VCPU_SREG_DS);
+	if (!ctxt->has_seg_override)
+		set_seg_override(ctxt, VCPU_SREG_DS);
 
-	memop.addr.mem.seg = seg_override(ctxt, c);
+	memop.addr.mem.seg = seg_override(ctxt);
 
-	if (memop.type == OP_MEM && c->ad_bytes != 8)
+	if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
 		memop.addr.mem.ea = (u32)memop.addr.mem.ea;
 
 	/*
 	 * Decode and fetch the source operand: register, memory
 	 * or immediate.
 	 */
-	switch (c->d & SrcMask) {
+	switch (ctxt->d & SrcMask) {
 	case SrcNone:
 		break;
 	case SrcReg:
-		decode_register_operand(ctxt, &c->src, c, 0);
+		decode_register_operand(ctxt, &ctxt->src, 0);
 		break;
 	case SrcMem16:
 		memop.bytes = 2;
@@ -3564,60 +3534,60 @@ done_prefixes:
 		memop.bytes = 4;
 		goto srcmem_common;
 	case SrcMem:
-		memop.bytes = (c->d & ByteOp) ? 1 :
-							   c->op_bytes;
+		memop.bytes = (ctxt->d & ByteOp) ? 1 :
+							   ctxt->op_bytes;
 	srcmem_common:
-		c->src = memop;
-		memopp = &c->src;
+		ctxt->src = memop;
+		memopp = &ctxt->src;
 		break;
 	case SrcImmU16:
-		rc = decode_imm(ctxt, &c->src, 2, false);
+		rc = decode_imm(ctxt, &ctxt->src, 2, false);
 		break;
 	case SrcImm:
-		rc = decode_imm(ctxt, &c->src, imm_size(c), true);
+		rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
 		break;
 	case SrcImmU:
-		rc = decode_imm(ctxt, &c->src, imm_size(c), false);
+		rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
 		break;
 	case SrcImmByte:
-		rc = decode_imm(ctxt, &c->src, 1, true);
+		rc = decode_imm(ctxt, &ctxt->src, 1, true);
 		break;
 	case SrcImmUByte:
-		rc = decode_imm(ctxt, &c->src, 1, false);
+		rc = decode_imm(ctxt, &ctxt->src, 1, false);
 		break;
 	case SrcAcc:
-		c->src.type = OP_REG;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
-		fetch_register_operand(&c->src);
+		ctxt->src.type = OP_REG;
+		ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+		fetch_register_operand(&ctxt->src);
 		break;
 	case SrcOne:
-		c->src.bytes = 1;
-		c->src.val = 1;
+		ctxt->src.bytes = 1;
+		ctxt->src.val = 1;
 		break;
 	case SrcSI:
-		c->src.type = OP_MEM;
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.addr.mem.ea =
-			register_address(c, c->regs[VCPU_REGS_RSI]);
-		c->src.addr.mem.seg = seg_override(ctxt, c);
-		c->src.val = 0;
+		ctxt->src.type = OP_MEM;
+		ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->src.addr.mem.ea =
+			register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+		ctxt->src.addr.mem.seg = seg_override(ctxt);
+		ctxt->src.val = 0;
 		break;
 	case SrcImmFAddr:
-		c->src.type = OP_IMM;
-		c->src.addr.mem.ea = c->eip;
-		c->src.bytes = c->op_bytes + 2;
-		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+		ctxt->src.type = OP_IMM;
+		ctxt->src.addr.mem.ea = ctxt->_eip;
+		ctxt->src.bytes = ctxt->op_bytes + 2;
+		insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
 		break;
 	case SrcMemFAddr:
-		memop.bytes = c->op_bytes + 2;
+		memop.bytes = ctxt->op_bytes + 2;
 		goto srcmem_common;
 		break;
 	case SrcDX:
-		c->src.type = OP_REG;
-		c->src.bytes = 2;
-		c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
-		fetch_register_operand(&c->src);
+		ctxt->src.type = OP_REG;
+		ctxt->src.bytes = 2;
+		ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+		fetch_register_operand(&ctxt->src);
 		break;
 	}
 
@@ -3628,22 +3598,22 @@ done_prefixes:
 	 * Decode and fetch the second source operand: register, memory
 	 * or immediate.
 	 */
-	switch (c->d & Src2Mask) {
+	switch (ctxt->d & Src2Mask) {
 	case Src2None:
 		break;
 	case Src2CL:
-		c->src2.bytes = 1;
-		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+		ctxt->src2.bytes = 1;
+		ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
 		break;
 	case Src2ImmByte:
-		rc = decode_imm(ctxt, &c->src2, 1, true);
+		rc = decode_imm(ctxt, &ctxt->src2, 1, true);
 		break;
 	case Src2One:
-		c->src2.bytes = 1;
-		c->src2.val = 1;
+		ctxt->src2.bytes = 1;
+		ctxt->src2.val = 1;
 		break;
 	case Src2Imm:
-		rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
+		rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
 		break;
 	}
 
@@ -3651,68 +3621,66 @@ done_prefixes:
 		goto done;
 
 	/* Decode and fetch the destination operand: register or memory. */
-	switch (c->d & DstMask) {
+	switch (ctxt->d & DstMask) {
 	case DstReg:
-		decode_register_operand(ctxt, &c->dst, c,
-			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
+		decode_register_operand(ctxt, &ctxt->dst,
+			 ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
 		break;
 	case DstImmUByte:
-		c->dst.type = OP_IMM;
-		c->dst.addr.mem.ea = c->eip;
-		c->dst.bytes = 1;
-		c->dst.val = insn_fetch(u8, 1, c->eip);
+		ctxt->dst.type = OP_IMM;
+		ctxt->dst.addr.mem.ea = ctxt->_eip;
+		ctxt->dst.bytes = 1;
+		ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
 		break;
 	case DstMem:
 	case DstMem64:
-		c->dst = memop;
-		memopp = &c->dst;
-		if ((c->d & DstMask) == DstMem64)
-			c->dst.bytes = 8;
+		ctxt->dst = memop;
+		memopp = &ctxt->dst;
+		if ((ctxt->d & DstMask) == DstMem64)
+			ctxt->dst.bytes = 8;
 		else
-			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		if (c->d & BitOp)
-			fetch_bit_operand(c);
-		c->dst.orig_val = c->dst.val;
+			ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		if (ctxt->d & BitOp)
+			fetch_bit_operand(ctxt);
+		ctxt->dst.orig_val = ctxt->dst.val;
 		break;
 	case DstAcc:
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
-		fetch_register_operand(&c->dst);
-		c->dst.orig_val = c->dst.val;
+		ctxt->dst.type = OP_REG;
+		ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+		fetch_register_operand(&ctxt->dst);
+		ctxt->dst.orig_val = ctxt->dst.val;
 		break;
 	case DstDI:
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.addr.mem.ea =
-			register_address(c, c->regs[VCPU_REGS_RDI]);
-		c->dst.addr.mem.seg = VCPU_SREG_ES;
-		c->dst.val = 0;
+		ctxt->dst.type = OP_MEM;
+		ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+		ctxt->dst.addr.mem.ea =
+			register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+		ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
+		ctxt->dst.val = 0;
 		break;
 	case DstDX:
-		c->dst.type = OP_REG;
-		c->dst.bytes = 2;
-		c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
-		fetch_register_operand(&c->dst);
+		ctxt->dst.type = OP_REG;
+		ctxt->dst.bytes = 2;
+		ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+		fetch_register_operand(&ctxt->dst);
 		break;
 	case ImplicitOps:
 		/* Special instructions do their own operand decoding. */
 	default:
-		c->dst.type = OP_NONE; /* Disable writeback. */
+		ctxt->dst.type = OP_NONE; /* Disable writeback. */
 		break;
 	}
 
 done:
-	if (memopp && memopp->type == OP_MEM && c->rip_relative)
-		memopp->addr.mem.ea += c->eip;
+	if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
+		memopp->addr.mem.ea += ctxt->_eip;
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
 static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 {
-	struct decode_cache *c = &ctxt->decode;
-
 	/* The second termination condition only applies for REPE
 	 * and REPNE. Test if the repeat string operation prefix is
 	 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
@@ -3720,304 +3688,232 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	 * 	- if REPE/REPZ and ZF = 0 then done
 	 * 	- if REPNE/REPNZ and ZF = 1 then done
 	 */
-	if (((c->b == 0xa6) || (c->b == 0xa7) ||
-	     (c->b == 0xae) || (c->b == 0xaf))
-	    && (((c->rep_prefix == REPE_PREFIX) &&
+	if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
+	     (ctxt->b == 0xae) || (ctxt->b == 0xaf))
+	    && (((ctxt->rep_prefix == REPE_PREFIX) &&
 		 ((ctxt->eflags & EFLG_ZF) == 0))
-		|| ((c->rep_prefix == REPNE_PREFIX) &&
+		|| ((ctxt->rep_prefix == REPNE_PREFIX) &&
 		    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
 		return true;
 
 	return false;
 }
 
-int
-x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
+int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 {
 	struct x86_emulate_ops *ops = ctxt->ops;
 	u64 msr_data;
-	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
-	int saved_dst_type = c->dst.type;
-	int irq; /* Used for int 3, int, and into */
+	int saved_dst_type = ctxt->dst.type;
 
-	ctxt->decode.mem_read.pos = 0;
+	ctxt->mem_read.pos = 0;
 
-	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+	if (ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
-	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
+	if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
+	if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((c->d & Sse)
+	if ((ctxt->d & Sse)
 	    && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
 		|| !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
-	if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
+	if ((ctxt->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
 		rc = emulate_nm(ctxt);
 		goto done;
 	}
 
-	if (unlikely(ctxt->guest_mode) && c->intercept) {
-		rc = emulator_check_intercept(ctxt, c->intercept,
+	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_PRE_EXCEPT);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
-	if ((c->d & Priv) && ops->cpl(ctxt)) {
+	if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
 		rc = emulate_gp(ctxt, 0);
 		goto done;
 	}
 
 	/* Instruction can only be executed in protected mode */
-	if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
+	if ((ctxt->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
 		rc = emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* Do instruction specific permission checks */
-	if (c->check_perm) {
-		rc = c->check_perm(ctxt);
+	if (ctxt->check_perm) {
+		rc = ctxt->check_perm(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if (unlikely(ctxt->guest_mode) && c->intercept) {
-		rc = emulator_check_intercept(ctxt, c->intercept,
+	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_POST_EXCEPT);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if (c->rep_prefix && (c->d & String)) {
+	if (ctxt->rep_prefix && (ctxt->d & String)) {
 		/* All REP prefixes have the same first termination condition */
-		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
-			ctxt->eip = c->eip;
+		if (address_mask(ctxt, ctxt->regs[VCPU_REGS_RCX]) == 0) {
+			ctxt->eip = ctxt->_eip;
 			goto done;
 		}
 	}
 
-	if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
-		rc = segmented_read(ctxt, c->src.addr.mem,
-				    c->src.valptr, c->src.bytes);
+	if ((ctxt->src.type == OP_MEM) && !(ctxt->d & NoAccess)) {
+		rc = segmented_read(ctxt, ctxt->src.addr.mem,
+				    ctxt->src.valptr, ctxt->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
-		c->src.orig_val64 = c->src.val64;
+		ctxt->src.orig_val64 = ctxt->src.val64;
 	}
 
-	if (c->src2.type == OP_MEM) {
-		rc = segmented_read(ctxt, c->src2.addr.mem,
-				    &c->src2.val, c->src2.bytes);
+	if (ctxt->src2.type == OP_MEM) {
+		rc = segmented_read(ctxt, ctxt->src2.addr.mem,
+				    &ctxt->src2.val, ctxt->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if ((c->d & DstMask) == ImplicitOps)
+	if ((ctxt->d & DstMask) == ImplicitOps)
 		goto special_insn;
 
 
-	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+	if ((ctxt->dst.type == OP_MEM) && !(ctxt->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = segmented_read(ctxt, c->dst.addr.mem,
-				   &c->dst.val, c->dst.bytes);
+		rc = segmented_read(ctxt, ctxt->dst.addr.mem,
+				   &ctxt->dst.val, ctxt->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
-	c->dst.orig_val = c->dst.val;
+	ctxt->dst.orig_val = ctxt->dst.val;
 
 special_insn:
 
-	if (unlikely(ctxt->guest_mode) && c->intercept) {
-		rc = emulator_check_intercept(ctxt, c->intercept,
+	if (unlikely(ctxt->guest_mode) && ctxt->intercept) {
+		rc = emulator_check_intercept(ctxt, ctxt->intercept,
 					      X86_ICPT_POST_MEMACCESS);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
 
-	if (c->execute) {
-		rc = c->execute(ctxt);
+	if (ctxt->execute) {
+		rc = ctxt->execute(ctxt);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		goto writeback;
 	}
 
-	if (c->twobyte)
+	if (ctxt->twobyte)
 		goto twobyte_insn;
 
-	switch (c->b) {
+	switch (ctxt->b) {
 	case 0x06:		/* push es */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
 		break;
 	case 0x07:		/* pop es */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
 		break;
 	case 0x0e:		/* push cs */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
 		break;
 	case 0x16:		/* push ss */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
 		break;
 	case 0x17:		/* pop ss */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
 		break;
 	case 0x1e:		/* push ds */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
 		break;
 	case 0x1f:		/* pop ds */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
 		break;
 	case 0x40 ... 0x47: /* inc r16/r32 */
-		emulate_1op("inc", c->dst, ctxt->eflags);
+		emulate_1op("inc", ctxt->dst, ctxt->eflags);
 		break;
 	case 0x48 ... 0x4f: /* dec r16/r32 */
-		emulate_1op("dec", c->dst, ctxt->eflags);
+		emulate_1op("dec", ctxt->dst, ctxt->eflags);
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)
 			goto cannot_emulate;
-		c->dst.val = (s32) c->src.val;
+		ctxt->dst.val = (s32) ctxt->src.val;
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
-		c->src.val = c->regs[VCPU_REGS_RDX];
+		ctxt->src.val = ctxt->regs[VCPU_REGS_RDX];
 		goto do_io_in;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
-		c->dst.val = c->regs[VCPU_REGS_RDX];
+		ctxt->dst.val = ctxt->regs[VCPU_REGS_RDX];
 		goto do_io_out;
 		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
-		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, c->src.val);
-		break;
-	case 0x84 ... 0x85:
-	test:
-		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
-		break;
-	case 0x86 ... 0x87:	/* xchg */
-	xchg:
-		/* Write back the register source. */
-		c->src.val = c->dst.val;
-		write_register_operand(&c->src);
-		/*
-		 * Write back the memory destination with implicit LOCK
-		 * prefix.
-		 */
-		c->dst.val = c->src.orig_val;
-		c->lock_prefix = 1;
-		break;
-	case 0x8c:  /* mov r/m, sreg */
-		if (c->modrm_reg > VCPU_SREG_GS) {
-			rc = emulate_ud(ctxt);
-			goto done;
-		}
-		c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
+		if (test_cc(ctxt->b, ctxt->eflags))
+			jmp_rel(ctxt, ctxt->src.val);
 		break;
 	case 0x8d: /* lea r16/r32, m */
-		c->dst.val = c->src.addr.mem.ea;
+		ctxt->dst.val = ctxt->src.addr.mem.ea;
 		break;
-	case 0x8e: { /* mov seg, r/m16 */
-		uint16_t sel;
-
-		sel = c->src.val;
-
-		if (c->modrm_reg == VCPU_SREG_CS ||
-		    c->modrm_reg > VCPU_SREG_GS) {
-			rc = emulate_ud(ctxt);
-			goto done;
-		}
-
-		if (c->modrm_reg == VCPU_SREG_SS)
-			ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
-
-		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
-
-		c->dst.type = OP_NONE;  /* Disable writeback. */
-		break;
-	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = em_grp1a(ctxt);
 		break;
 	case 0x90 ... 0x97: /* nop / xchg reg, rax */
-		if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
+		if (ctxt->dst.addr.reg == &ctxt->regs[VCPU_REGS_RAX])
 			break;
-		goto xchg;
+		rc = em_xchg(ctxt);
+		break;
 	case 0x98: /* cbw/cwde/cdqe */
-		switch (c->op_bytes) {
-		case 2: c->dst.val = (s8)c->dst.val; break;
-		case 4: c->dst.val = (s16)c->dst.val; break;
-		case 8: c->dst.val = (s32)c->dst.val; break;
+		switch (ctxt->op_bytes) {
+		case 2: ctxt->dst.val = (s8)ctxt->dst.val; break;
+		case 4: ctxt->dst.val = (s16)ctxt->dst.val; break;
+		case 8: ctxt->dst.val = (s32)ctxt->dst.val; break;
 		}
 		break;
-	case 0xa8 ... 0xa9:	/* test ax, imm */
-		goto test;
 	case 0xc0 ... 0xc1:
 		rc = em_grp2(ctxt);
 		break;
-	case 0xc3: /* ret */
-		c->dst.type = OP_REG;
-		c->dst.addr.reg = &c->eip;
-		c->dst.bytes = c->op_bytes;
-		rc = em_pop(ctxt);
-		break;
 	case 0xc4:		/* les */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
 		break;
 	case 0xc5:		/* lds */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
-		break;
-	case 0xcb:		/* ret far */
-		rc = emulate_ret_far(ctxt, ops);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
 		break;
 	case 0xcc:		/* int3 */
-		irq = 3;
-		goto do_interrupt;
+		rc = emulate_int(ctxt, 3);
+		break;
 	case 0xcd:		/* int n */
-		irq = c->src.val;
-	do_interrupt:
-		rc = emulate_int(ctxt, ops, irq);
+		rc = emulate_int(ctxt, ctxt->src.val);
 		break;
 	case 0xce:		/* into */
-		if (ctxt->eflags & EFLG_OF) {
-			irq = 4;
-			goto do_interrupt;
-		}
-		break;
-	case 0xcf:		/* iret */
-		rc = emulate_iret(ctxt, ops);
+		if (ctxt->eflags & EFLG_OF)
+			rc = emulate_int(ctxt, 4);
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
 		rc = em_grp2(ctxt);
 		break;
 	case 0xd2 ... 0xd3:	/* Grp2 */
-		c->src.val = c->regs[VCPU_REGS_RCX];
+		ctxt->src.val = ctxt->regs[VCPU_REGS_RCX];
 		rc = em_grp2(ctxt);
 		break;
-	case 0xe0 ... 0xe2:	/* loop/loopz/loopnz */
-		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
-		    (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
-			jmp_rel(c, c->src.val);
-		break;
-	case 0xe3:	/* jcxz/jecxz/jrcxz */
-		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
-			jmp_rel(c, c->src.val);
-		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
 		goto do_io_in;
@@ -4025,35 +3921,30 @@ special_insn:
 	case 0xe7: /* out */
 		goto do_io_out;
 	case 0xe8: /* call (near) */ {
-		long int rel = c->src.val;
-		c->src.val = (unsigned long) c->eip;
-		jmp_rel(c, rel);
+		long int rel = ctxt->src.val;
+		ctxt->src.val = (unsigned long) ctxt->_eip;
+		jmp_rel(ctxt, rel);
 		rc = em_push(ctxt);
 		break;
 	}
 	case 0xe9: /* jmp rel */
-		goto jmp;
-	case 0xea: /* jmp far */
-		rc = em_jmp_far(ctxt);
-		break;
-	case 0xeb:
-	      jmp:		/* jmp rel short */
-		jmp_rel(c, c->src.val);
-		c->dst.type = OP_NONE; /* Disable writeback. */
+	case 0xeb: /* jmp rel short */
+		jmp_rel(ctxt, ctxt->src.val);
+		ctxt->dst.type = OP_NONE; /* Disable writeback. */
 		break;
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
 	do_io_in:
-		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
-				     &c->dst.val))
+		if (!pio_in_emulated(ctxt, ctxt->dst.bytes, ctxt->src.val,
+				     &ctxt->dst.val))
 			goto done; /* IO is needed */
 		break;
 	case 0xee: /* out dx,al */
 	case 0xef: /* out dx,(e/r)ax */
 	do_io_out:
-		ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
-				      &c->src.val, 1);
-		c->dst.type = OP_NONE;	/* Disable writeback. */
+		ops->pio_out_emulated(ctxt, ctxt->src.bytes, ctxt->dst.val,
+				      &ctxt->src.val, 1);
+		ctxt->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
 		ctxt->ops->halt(ctxt);
@@ -4071,22 +3962,6 @@ special_insn:
 	case 0xf9: /* stc */
 		ctxt->eflags |= EFLG_CF;
 		break;
-	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt, ops)) {
-			rc = emulate_gp(ctxt, 0);
-			goto done;
-		} else
-			ctxt->eflags &= ~X86_EFLAGS_IF;
-		break;
-	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt, ops)) {
-			rc = emulate_gp(ctxt, 0);
-			goto done;
-		} else {
-			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
-			ctxt->eflags |= X86_EFLAGS_IF;
-		}
-		break;
 	case 0xfc: /* cld */
 		ctxt->eflags &= ~EFLG_DF;
 		break;
@@ -4115,40 +3990,40 @@ writeback:
 	 * restore dst type in case the decoding will be reused
 	 * (happens for string instruction )
 	 */
-	c->dst.type = saved_dst_type;
+	ctxt->dst.type = saved_dst_type;
 
-	if ((c->d & SrcMask) == SrcSI)
-		string_addr_inc(ctxt, seg_override(ctxt, c),
-				VCPU_REGS_RSI, &c->src);
+	if ((ctxt->d & SrcMask) == SrcSI)
+		string_addr_inc(ctxt, seg_override(ctxt),
+				VCPU_REGS_RSI, &ctxt->src);
 
-	if ((c->d & DstMask) == DstDI)
+	if ((ctxt->d & DstMask) == DstDI)
 		string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
-				&c->dst);
+				&ctxt->dst);
 
-	if (c->rep_prefix && (c->d & String)) {
-		struct read_cache *r = &ctxt->decode.io_read;
-		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+	if (ctxt->rep_prefix && (ctxt->d & String)) {
+		struct read_cache *r = &ctxt->io_read;
+		register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RCX], -1);
 
 		if (!string_insn_completed(ctxt)) {
 			/*
 			 * Re-enter guest when pio read ahead buffer is empty
 			 * or, if it is not used, after each 1024 iteration.
 			 */
-			if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
+			if ((r->end != 0 || ctxt->regs[VCPU_REGS_RCX] & 0x3ff) &&
 			    (r->end == 0 || r->end != r->pos)) {
 				/*
 				 * Reset read cache. Usually happens before
 				 * decode, but since instruction is restarted
 				 * we have to do it here.
 				 */
-				ctxt->decode.mem_read.end = 0;
+				ctxt->mem_read.end = 0;
 				return EMULATION_RESTART;
 			}
 			goto done; /* skip rip writeback */
 		}
 	}
 
-	ctxt->eip = c->eip;
+	ctxt->eip = ctxt->_eip;
 
 done:
 	if (rc == X86EMUL_PROPAGATE_FAULT)
@@ -4159,13 +4034,7 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 
 twobyte_insn:
-	switch (c->b) {
-	case 0x05: 		/* syscall */
-		rc = emulate_syscall(ctxt, ops);
-		break;
-	case 0x06:
-		rc = em_clts(ctxt);
-		break;
+	switch (ctxt->b) {
 	case 0x09:		/* wbinvd */
 		(ctxt->ops->wbinvd)(ctxt);
 		break;
@@ -4174,21 +4043,21 @@ twobyte_insn:
 	case 0x18:		/* Grp16 (prefetch/nop) */
 		break;
 	case 0x20: /* mov cr, reg */
-		c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
+		ctxt->dst.val = ops->get_cr(ctxt, ctxt->modrm_reg);
 		break;
 	case 0x21: /* mov from dr to reg */
-		ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
+		ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val);
 		break;
 	case 0x22: /* mov reg, cr */
-		if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
+		if (ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val)) {
 			emulate_gp(ctxt, 0);
 			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		}
-		c->dst.type = OP_NONE;
+		ctxt->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
+		if (ops->set_dr(ctxt, ctxt->modrm_reg, ctxt->src.val &
 				((ctxt->mode == X86EMUL_MODE_PROT64) ?
 				 ~0ULL : ~0U)) < 0) {
 			/* #UD condition is already handled by the code above */
@@ -4197,13 +4066,13 @@ twobyte_insn:
 			goto done;
 		}
 
-		c->dst.type = OP_NONE;	/* no writeback */
+		ctxt->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
 		/* wrmsr */
-		msr_data = (u32)c->regs[VCPU_REGS_RAX]
-			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
+		msr_data = (u32)ctxt->regs[VCPU_REGS_RAX]
+			| ((u64)ctxt->regs[VCPU_REGS_RDX] << 32);
+		if (ops->set_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], msr_data)) {
 			emulate_gp(ctxt, 0);
 			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
@@ -4212,64 +4081,58 @@ twobyte_insn:
 		break;
 	case 0x32:
 		/* rdmsr */
-		if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
+		if (ops->get_msr(ctxt, ctxt->regs[VCPU_REGS_RCX], &msr_data)) {
 			emulate_gp(ctxt, 0);
 			rc = X86EMUL_PROPAGATE_FAULT;
 			goto done;
 		} else {
-			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
-			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
+			ctxt->regs[VCPU_REGS_RAX] = (u32)msr_data;
+			ctxt->regs[VCPU_REGS_RDX] = msr_data >> 32;
 		}
 		rc = X86EMUL_CONTINUE;
 		break;
-	case 0x34:		/* sysenter */
-		rc = emulate_sysenter(ctxt, ops);
-		break;
-	case 0x35:		/* sysexit */
-		rc = emulate_sysexit(ctxt, ops);
-		break;
 	case 0x40 ... 0x4f:	/* cmov */
-		c->dst.val = c->dst.orig_val = c->src.val;
-		if (!test_cc(c->b, ctxt->eflags))
-			c->dst.type = OP_NONE; /* no writeback */
+		ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val;
+		if (!test_cc(ctxt->b, ctxt->eflags))
+			ctxt->dst.type = OP_NONE; /* no writeback */
 		break;
 	case 0x80 ... 0x8f: /* jnz rel, etc*/
-		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, c->src.val);
+		if (test_cc(ctxt->b, ctxt->eflags))
+			jmp_rel(ctxt, ctxt->src.val);
 		break;
 	case 0x90 ... 0x9f:     /* setcc r/m8 */
-		c->dst.val = test_cc(c->b, ctxt->eflags);
+		ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
 		break;
 	case 0xa0:	  /* push fs */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
 		break;
 	case 0xa1:	 /* pop fs */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
 		break;
 	case 0xa3:
 	      bt:		/* bt */
-		c->dst.type = OP_NONE;
+		ctxt->dst.type = OP_NONE;
 		/* only subword offset */
-		c->src.val &= (c->dst.bytes << 3) - 1;
-		emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
+		ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
+		emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xa4: /* shld imm8, r, r/m */
 	case 0xa5: /* shld cl, r, r/m */
-		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+		emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xa8:	/* push gs */
-		rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
+		rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xa9:	/* pop gs */
-		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
+		rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xab:
 	      bts:		/* bts */
-		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xac: /* shrd imm8, r, r/m */
 	case 0xad: /* shrd cl, r, r/m */
-		emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+		emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xae:              /* clflush */
 		break;
@@ -4278,38 +4141,38 @@ twobyte_insn:
 		 * Save real source value, then compare EAX against
 		 * destination.
 		 */
-		c->src.orig_val = c->src.val;
-		c->src.val = c->regs[VCPU_REGS_RAX];
-		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
+		ctxt->src.orig_val = ctxt->src.val;
+		ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
+		emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
 		if (ctxt->eflags & EFLG_ZF) {
 			/* Success: write back to memory. */
-			c->dst.val = c->src.orig_val;
+			ctxt->dst.val = ctxt->src.orig_val;
 		} else {
 			/* Failure: write the value we saw to EAX. */
-			c->dst.type = OP_REG;
-			c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
+			ctxt->dst.type = OP_REG;
+			ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
 		}
 		break;
 	case 0xb2:		/* lss */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
 		break;
 	case 0xb3:
 	      btr:		/* btr */
-		emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xb4:		/* lfs */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
 		break;
 	case 0xb5:		/* lgs */
-		rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
+		rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
 		break;
 	case 0xb6 ... 0xb7:	/* movzx */
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
-						       : (u16) c->src.val;
+		ctxt->dst.bytes = ctxt->op_bytes;
+		ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val
+						       : (u16) ctxt->src.val;
 		break;
 	case 0xba:		/* Grp8 */
-		switch (c->modrm_reg & 3) {
+		switch (ctxt->modrm_reg & 3) {
 		case 0:
 			goto bt;
 		case 1:
@@ -4322,47 +4185,47 @@ twobyte_insn:
 		break;
 	case 0xbb:
 	      btc:		/* btc */
-		emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
 		break;
 	case 0xbc: {		/* bsf */
 		u8 zf;
 		__asm__ ("bsf %2, %0; setz %1"
-			 : "=r"(c->dst.val), "=q"(zf)
-			 : "r"(c->src.val));
+			 : "=r"(ctxt->dst.val), "=q"(zf)
+			 : "r"(ctxt->src.val));
 		ctxt->eflags &= ~X86_EFLAGS_ZF;
 		if (zf) {
 			ctxt->eflags |= X86_EFLAGS_ZF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
+			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	}
 	case 0xbd: {		/* bsr */
 		u8 zf;
 		__asm__ ("bsr %2, %0; setz %1"
-			 : "=r"(c->dst.val), "=q"(zf)
-			 : "r"(c->src.val));
+			 : "=r"(ctxt->dst.val), "=q"(zf)
+			 : "r"(ctxt->src.val));
 		ctxt->eflags &= ~X86_EFLAGS_ZF;
 		if (zf) {
 			ctxt->eflags |= X86_EFLAGS_ZF;
-			c->dst.type = OP_NONE;	/* Disable writeback. */
+			ctxt->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	}
 	case 0xbe ... 0xbf:	/* movsx */
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
-							(s16) c->src.val;
+		ctxt->dst.bytes = ctxt->op_bytes;
+		ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val :
+							(s16) ctxt->src.val;
 		break;
 	case 0xc0 ... 0xc1:	/* xadd */
-		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
+		emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
 		/* Write back the register source. */
-		c->src.val = c->dst.orig_val;
-		write_register_operand(&c->src);
+		ctxt->src.val = ctxt->dst.orig_val;
+		write_register_operand(&ctxt->src);
 		break;
 	case 0xc3:		/* movnti */
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
-							(u64) c->src.val;
+		ctxt->dst.bytes = ctxt->op_bytes;
+		ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val :
+							(u64) ctxt->src.val;
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		rc = em_grp9(ctxt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2b2255b1f04..57dcbd4308f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -33,7 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "kvm_cache_regs.h"
 #include "irq.h"
 #include "trace.h"
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aee38623b76..1c5b69373a0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -22,7 +22,6 @@
 #include "mmu.h"
 #include "x86.h"
 #include "kvm_cache_regs.h"
-#include "x86.h"
 
 #include <linux/kvm_host.h>
 #include <linux/types.h>
@@ -148,7 +147,7 @@ module_param(oos_shadow, bool, 0644);
 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
 			| PT64_NX_MASK)
 
-#define RMAP_EXT 4
+#define PTE_LIST_EXT 4
 
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
@@ -164,16 +163,16 @@ module_param(oos_shadow, bool, 0644);
 
 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 
-struct kvm_rmap_desc {
-	u64 *sptes[RMAP_EXT];
-	struct kvm_rmap_desc *more;
+struct pte_list_desc {
+	u64 *sptes[PTE_LIST_EXT];
+	struct pte_list_desc *more;
 };
 
 struct kvm_shadow_walk_iterator {
 	u64 addr;
 	hpa_t shadow_addr;
-	int level;
 	u64 *sptep;
+	int level;
 	unsigned index;
 };
 
@@ -182,32 +181,68 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
+	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
+	     shadow_walk_okay(&(_walker)) &&				\
+		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
+	     __shadow_walk_next(&(_walker), spte))
 
-static struct kmem_cache *pte_chain_cache;
-static struct kmem_cache *rmap_desc_cache;
+static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
 
-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
 static u64 __read_mostly shadow_nx_mask;
 static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
 
-static inline u64 rsvd_bits(int s, int e)
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
 {
-	return ((1ULL << (e - s + 1)) - 1) << s;
+	shadow_mmio_mask = mmio_mask;
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
 {
-	shadow_trap_nonpresent_pte = trap_pte;
-	shadow_notrap_nonpresent_pte = notrap_pte;
+	access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+	trace_mark_mmio_spte(sptep, gfn, access);
+	mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+	return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+	return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+	if (unlikely(is_noslot_pfn(pfn))) {
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
+
+static inline u64 rsvd_bits(int s, int e)
+{
+	return ((1ULL << (e - s + 1)) - 1) << s;
 }
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask)
@@ -220,11 +255,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
-	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
 static int is_cpuid_PSE36(void)
 {
 	return 1;
@@ -237,8 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
 
 static int is_shadow_present_pte(u64 pte)
 {
-	return pte != shadow_trap_nonpresent_pte
-		&& pte != shadow_notrap_nonpresent_pte;
+	return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
 }
 
 static int is_large_pte(u64 pte)
@@ -246,11 +275,6 @@ static int is_large_pte(u64 pte)
 	return pte & PT_PAGE_SIZE_MASK;
 }
 
-static int is_writable_pte(unsigned long pte)
-{
-	return pte & PT_WRITABLE_MASK;
-}
-
 static int is_dirty_gpte(unsigned long pte)
 {
 	return pte & PT_DIRTY_MASK;
@@ -282,26 +306,154 @@ static gfn_t pse36_gfn_delta(u32 gpte)
 	return (gpte & PT32_DIR_PSE36_MASK) << shift;
 }
 
+#ifdef CONFIG_X86_64
 static void __set_spte(u64 *sptep, u64 spte)
 {
-	set_64bit(sptep, spte);
+	*sptep = spte;
 }
 
-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 {
-#ifdef CONFIG_X86_64
-	return xchg(sptep, new_spte);
+	*sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+	return xchg(sptep, spte);
+}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	return ACCESS_ONCE(*sptep);
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	/* It is valid if the spte is zapped. */
+	return spte == 0ull;
+}
 #else
-	u64 old_spte;
+union split_spte {
+	struct {
+		u32 spte_low;
+		u32 spte_high;
+	};
+	u64 spte;
+};
 
-	do {
-		old_spte = *sptep;
-	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 
-	return old_spte;
-#endif
+	if (is_shadow_present_pte(spte))
+		return;
+
+	/* Ensure the spte is completely set before we increase the count */
+	smp_wmb();
+	sp->clear_spte_count++;
+}
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+	union split_spte *ssptep, sspte;
+
+	ssptep = (union split_spte *)sptep;
+	sspte = (union split_spte)spte;
+
+	ssptep->spte_high = sspte.spte_high;
+
+	/*
+	 * If we map the spte from nonpresent to present, We should store
+	 * the high bits firstly, then set present bit, so cpu can not
+	 * fetch this spte while we are setting the spte.
+	 */
+	smp_wmb();
+
+	ssptep->spte_low = sspte.spte_low;
 }
 
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+	union split_spte *ssptep, sspte;
+
+	ssptep = (union split_spte *)sptep;
+	sspte = (union split_spte)spte;
+
+	ssptep->spte_low = sspte.spte_low;
+
+	/*
+	 * If we map the spte from present to nonpresent, we should clear
+	 * present bit firstly to avoid vcpu fetch the old high bits.
+	 */
+	smp_wmb();
+
+	ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+	union split_spte *ssptep, sspte, orig;
+
+	ssptep = (union split_spte *)sptep;
+	sspte = (union split_spte)spte;
+
+	/* xchg acts as a barrier before the setting of the high bits */
+	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+	orig.spte_high = ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
+
+	return orig.spte;
+}
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increase clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	struct kvm_mmu_page *sp =  page_header(__pa(sptep));
+	union split_spte spte, *orig = (union split_spte *)sptep;
+	int count;
+
+retry:
+	count = sp->clear_spte_count;
+	smp_rmb();
+
+	spte.spte_low = orig->spte_low;
+	smp_rmb();
+
+	spte.spte_high = orig->spte_high;
+	smp_rmb();
+
+	if (unlikely(spte.spte_low != orig->spte_low ||
+	      count != sp->clear_spte_count))
+		goto retry;
+
+	return spte.spte;
+}
+
+static bool __check_direct_spte_mmio_pf(u64 spte)
+{
+	union split_spte sspte = (union split_spte)spte;
+	u32 high_mmio_mask = shadow_mmio_mask >> 32;
+
+	/* It is valid if the spte is zapped. */
+	if (spte == 0ull)
+		return true;
+
+	/* It is valid if the spte is being zapped. */
+	if (sspte.spte_low == 0ull &&
+	    (sspte.spte_high & high_mmio_mask) == high_mmio_mask)
+		return true;
+
+	return false;
+}
+#endif
+
 static bool spte_has_volatile_bits(u64 spte)
 {
 	if (!shadow_accessed_mask)
@@ -322,12 +474,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
 	return (old_spte & bit_mask) && !(new_spte & bit_mask);
 }
 
-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+	WARN_ON(is_shadow_present_pte(*sptep));
+	__set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changged.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
 {
 	u64 mask, old_spte = *sptep;
 
 	WARN_ON(!is_rmap_spte(new_spte));
 
+	if (!is_shadow_present_pte(old_spte))
+		return mmu_spte_set(sptep, new_spte);
+
 	new_spte |= old_spte & shadow_dirty_mask;
 
 	mask = shadow_accessed_mask;
@@ -335,9 +505,9 @@ static void update_spte(u64 *sptep, u64 new_spte)
 		mask |= shadow_dirty_mask;
 
 	if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
-		__set_spte(sptep, new_spte);
+		__update_clear_spte_fast(sptep, new_spte);
 	else
-		old_spte = __xchg_spte(sptep, new_spte);
+		old_spte = __update_clear_spte_slow(sptep, new_spte);
 
 	if (!shadow_accessed_mask)
 		return;
@@ -348,6 +518,64 @@ static void update_spte(u64 *sptep, u64 new_spte)
 		kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 }
 
+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+	pfn_t pfn;
+	u64 old_spte = *sptep;
+
+	if (!spte_has_volatile_bits(old_spte))
+		__update_clear_spte_fast(sptep, 0ull);
+	else
+		old_spte = __update_clear_spte_slow(sptep, 0ull);
+
+	if (!is_rmap_spte(old_spte))
+		return 0;
+
+	pfn = spte_to_pfn(old_spte);
+	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+		kvm_set_pfn_accessed(pfn);
+	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+		kvm_set_pfn_dirty(pfn);
+	return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+	__update_clear_spte_fast(sptep, 0ull);
+}
+
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+	return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+	rcu_read_lock();
+	atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+	/* Increase the counter before walking shadow page table */
+	smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+	/* Decrease the counter after walking shadow page table finished */
+	smp_mb__before_atomic_dec();
+	atomic_dec(&vcpu->kvm->arch.reader_counter);
+	rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -397,12 +625,8 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_chain_cache,
-				   pte_chain_cache, 4);
-	if (r)
-		goto out;
-	r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
-				   rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
+	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+				   pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
 	if (r)
 		goto out;
 	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -416,8 +640,8 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
-	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
+				pte_list_desc_cache);
 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
 				mmu_page_header_cache);
@@ -433,26 +657,15 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 	return p;
 }
 
-static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
 {
-	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_chain_cache,
-				      sizeof(struct kvm_pte_chain));
+	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache,
+				      sizeof(struct pte_list_desc));
 }
 
-static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
+static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
 {
-	kmem_cache_free(pte_chain_cache, pc);
-}
-
-static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
-{
-	return mmu_memory_cache_alloc(&vcpu->arch.mmu_rmap_desc_cache,
-				      sizeof(struct kvm_rmap_desc));
-}
-
-static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
-{
-	kmem_cache_free(rmap_desc_cache, rd);
+	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
 }
 
 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
@@ -498,6 +711,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 		linfo = lpage_info_slot(gfn, slot, i);
 		linfo->write_count += 1;
 	}
+	kvm->arch.indirect_shadow_pages++;
 }
 
 static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
@@ -513,6 +727,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 		linfo->write_count -= 1;
 		WARN_ON(linfo->write_count < 0);
 	}
+	kvm->arch.indirect_shadow_pages--;
 }
 
 static int has_wrprotected_page(struct kvm *kvm,
@@ -588,67 +803,42 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 }
 
 /*
- * Take gfn and return the reverse mapping to it.
- */
-
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
-{
-	struct kvm_memory_slot *slot;
-	struct kvm_lpage_info *linfo;
-
-	slot = gfn_to_memslot(kvm, gfn);
-	if (likely(level == PT_PAGE_TABLE_LEVEL))
-		return &slot->rmap[gfn - slot->base_gfn];
-
-	linfo = lpage_info_slot(gfn, slot, level);
-
-	return &linfo->rmap_pde;
-}
-
-/*
- * Reverse mapping data structures:
+ * Pte mapping structures:
  *
- * If rmapp bit zero is zero, then rmapp point to the shadw page table entry
- * that points to page_address(page).
+ * If pte_list bit zero is zero, then pte_list point to the spte.
  *
- * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
- * containing more mappings.
+ * If pte_list bit zero is one, (then pte_list & ~1) points to a struct
+ * pte_list_desc containing more mappings.
  *
- * Returns the number of rmap entries before the spte was added or zero if
+ * Returns the number of pte entries before the spte was added or zero if
  * the spte was not added.
  *
  */
-static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
+			unsigned long *pte_list)
 {
-	struct kvm_mmu_page *sp;
-	struct kvm_rmap_desc *desc;
-	unsigned long *rmapp;
+	struct pte_list_desc *desc;
 	int i, count = 0;
 
-	if (!is_rmap_spte(*spte))
-		return count;
-	sp = page_header(__pa(spte));
-	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
-	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
-	if (!*rmapp) {
-		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
-		*rmapp = (unsigned long)spte;
-	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
-		desc = mmu_alloc_rmap_desc(vcpu);
-		desc->sptes[0] = (u64 *)*rmapp;
+	if (!*pte_list) {
+		rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
+		*pte_list = (unsigned long)spte;
+	} else if (!(*pte_list & 1)) {
+		rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
+		desc = mmu_alloc_pte_list_desc(vcpu);
+		desc->sptes[0] = (u64 *)*pte_list;
 		desc->sptes[1] = spte;
-		*rmapp = (unsigned long)desc | 1;
+		*pte_list = (unsigned long)desc | 1;
 		++count;
 	} else {
-		rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
-		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-		while (desc->sptes[RMAP_EXT-1] && desc->more) {
+		rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
+		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+		while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
 			desc = desc->more;
-			count += RMAP_EXT;
+			count += PTE_LIST_EXT;
 		}
-		if (desc->sptes[RMAP_EXT-1]) {
-			desc->more = mmu_alloc_rmap_desc(vcpu);
+		if (desc->sptes[PTE_LIST_EXT-1]) {
+			desc->more = mmu_alloc_pte_list_desc(vcpu);
 			desc = desc->more;
 		}
 		for (i = 0; desc->sptes[i]; ++i)
@@ -658,59 +848,78 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return count;
 }
 
-static void rmap_desc_remove_entry(unsigned long *rmapp,
-				   struct kvm_rmap_desc *desc,
-				   int i,
-				   struct kvm_rmap_desc *prev_desc)
+static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
+{
+	struct pte_list_desc *desc;
+	u64 *prev_spte;
+	int i;
+
+	if (!*pte_list)
+		return NULL;
+	else if (!(*pte_list & 1)) {
+		if (!spte)
+			return (u64 *)*pte_list;
+		return NULL;
+	}
+	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+	prev_spte = NULL;
+	while (desc) {
+		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
+			if (prev_spte == spte)
+				return desc->sptes[i];
+			prev_spte = desc->sptes[i];
+		}
+		desc = desc->more;
+	}
+	return NULL;
+}
+
+static void
+pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
+			   int i, struct pte_list_desc *prev_desc)
 {
 	int j;
 
-	for (j = RMAP_EXT - 1; !desc->sptes[j] && j > i; --j)
+	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
 		;
 	desc->sptes[i] = desc->sptes[j];
 	desc->sptes[j] = NULL;
 	if (j != 0)
 		return;
 	if (!prev_desc && !desc->more)
-		*rmapp = (unsigned long)desc->sptes[0];
+		*pte_list = (unsigned long)desc->sptes[0];
 	else
 		if (prev_desc)
 			prev_desc->more = desc->more;
 		else
-			*rmapp = (unsigned long)desc->more | 1;
-	mmu_free_rmap_desc(desc);
+			*pte_list = (unsigned long)desc->more | 1;
+	mmu_free_pte_list_desc(desc);
 }
 
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void pte_list_remove(u64 *spte, unsigned long *pte_list)
 {
-	struct kvm_rmap_desc *desc;
-	struct kvm_rmap_desc *prev_desc;
-	struct kvm_mmu_page *sp;
-	gfn_t gfn;
-	unsigned long *rmapp;
+	struct pte_list_desc *desc;
+	struct pte_list_desc *prev_desc;
 	int i;
 
-	sp = page_header(__pa(spte));
-	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
-	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
-	if (!*rmapp) {
-		printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
+	if (!*pte_list) {
+		printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
 		BUG();
-	} else if (!(*rmapp & 1)) {
-		rmap_printk("rmap_remove:  %p 1->0\n", spte);
-		if ((u64 *)*rmapp != spte) {
-			printk(KERN_ERR "rmap_remove:  %p 1->BUG\n", spte);
+	} else if (!(*pte_list & 1)) {
+		rmap_printk("pte_list_remove:  %p 1->0\n", spte);
+		if ((u64 *)*pte_list != spte) {
+			printk(KERN_ERR "pte_list_remove:  %p 1->BUG\n", spte);
 			BUG();
 		}
-		*rmapp = 0;
+		*pte_list = 0;
 	} else {
-		rmap_printk("rmap_remove:  %p many->many\n", spte);
-		desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
+		rmap_printk("pte_list_remove:  %p many->many\n", spte);
+		desc = (struct pte_list_desc *)(*pte_list & ~1ul);
 		prev_desc = NULL;
 		while (desc) {
-			for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i)
+			for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
 				if (desc->sptes[i] == spte) {
-					rmap_desc_remove_entry(rmapp,
+					pte_list_desc_remove_entry(pte_list,
 							       desc, i,
 							       prev_desc);
 					return;
@@ -718,62 +927,80 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 			prev_desc = desc;
 			desc = desc->more;
 		}
-		pr_err("rmap_remove: %p many->many\n", spte);
+		pr_err("pte_list_remove: %p many->many\n", spte);
 		BUG();
 	}
 }
 
-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
+typedef void (*pte_list_walk_fn) (u64 *spte);
+static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn)
 {
-	pfn_t pfn;
-	u64 old_spte = *sptep;
+	struct pte_list_desc *desc;
+	int i;
 
-	if (!spte_has_volatile_bits(old_spte))
-		__set_spte(sptep, new_spte);
-	else
-		old_spte = __xchg_spte(sptep, new_spte);
+	if (!*pte_list)
+		return;
 
-	if (!is_rmap_spte(old_spte))
-		return 0;
+	if (!(*pte_list & 1))
+		return fn((u64 *)*pte_list);
 
-	pfn = spte_to_pfn(old_spte);
-	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
-	if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
-		kvm_set_pfn_dirty(pfn);
-	return 1;
+	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
+	while (desc) {
+		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i)
+			fn(desc->sptes[i]);
+		desc = desc->more;
+	}
 }
 
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+/*
+ * Take gfn and return the reverse mapping to it.
+ */
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 {
-	if (set_spte_track_bits(sptep, new_spte))
-		rmap_remove(kvm, sptep);
+	struct kvm_memory_slot *slot;
+	struct kvm_lpage_info *linfo;
+
+	slot = gfn_to_memslot(kvm, gfn);
+	if (likely(level == PT_PAGE_TABLE_LEVEL))
+		return &slot->rmap[gfn - slot->base_gfn];
+
+	linfo = lpage_info_slot(gfn, slot, level);
+
+	return &linfo->rmap_pde;
+}
+
+static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+{
+	struct kvm_mmu_page *sp;
+	unsigned long *rmapp;
+
+	sp = page_header(__pa(spte));
+	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
+	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+	return pte_list_add(vcpu, spte, rmapp);
 }
 
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
-	struct kvm_rmap_desc *desc;
-	u64 *prev_spte;
-	int i;
+	return pte_list_next(rmapp, spte);
+}
 
-	if (!*rmapp)
-		return NULL;
-	else if (!(*rmapp & 1)) {
-		if (!spte)
-			return (u64 *)*rmapp;
-		return NULL;
-	}
-	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+	struct kvm_mmu_page *sp;
+	gfn_t gfn;
+	unsigned long *rmapp;
+
+	sp = page_header(__pa(spte));
+	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+	pte_list_remove(spte, rmapp);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep)
+{
+	if (mmu_spte_clear_track_bits(sptep))
+		rmap_remove(kvm, sptep);
 }
 
 static int rmap_write_protect(struct kvm *kvm, u64 gfn)
@@ -790,7 +1017,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
-			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
 		spte = rmap_next(kvm, rmapp, spte);
@@ -807,8 +1034,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
-				drop_spte(kvm, spte,
-					  shadow_trap_nonpresent_pte);
+				drop_spte(kvm, spte);
 				--kvm->stat.lpages;
 				spte = NULL;
 				write_protected = 1;
@@ -829,7 +1055,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte);
 		need_tlb_flush = 1;
 	}
 	return need_tlb_flush;
@@ -851,7 +1077,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -860,7 +1086,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			set_spte_track_bits(spte, new_spte);
+			mmu_spte_clear_track_bits(spte);
+			mmu_spte_set(spte, new_spte);
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
@@ -1032,151 +1259,89 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
 	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from shadow page cache, after call it,
+ * we can not find this sp from the cache, and the shadow
+ * page table is still valid.
+ * It should be under the protection of mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
 	hlist_del(&sp->hash_link);
-	list_del(&sp->link);
-	free_page((unsigned long)sp->spt);
 	if (!sp->role.direct)
 		free_page((unsigned long)sp->gfns);
-	kmem_cache_free(mmu_page_header_cache, sp);
-	kvm_mod_used_mmu_pages(kvm, -1);
 }
 
-static unsigned kvm_page_table_hashfn(gfn_t gfn)
+/*
+ * Free the shadow page table and the sp, we can do it
+ * out of the protection of mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
-	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+	list_del(&sp->link);
+	free_page((unsigned long)sp->spt);
+	kmem_cache_free(mmu_page_header_cache, sp);
 }
 
-static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-					       u64 *parent_pte, int direct)
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
 {
-	struct kvm_mmu_page *sp;
-
-	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
-	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-	if (!direct)
-		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
-						  PAGE_SIZE);
-	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	sp->multimapped = 0;
-	sp->parent_pte = parent_pte;
-	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
-	return sp;
+	return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
 }
 
 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp, u64 *parent_pte)
 {
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
-
 	if (!parent_pte)
 		return;
-	if (!sp->multimapped) {
-		u64 *old = sp->parent_pte;
 
-		if (!old) {
-			sp->parent_pte = parent_pte;
-			return;
-		}
-		sp->multimapped = 1;
-		pte_chain = mmu_alloc_pte_chain(vcpu);
-		INIT_HLIST_HEAD(&sp->parent_ptes);
-		hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-		pte_chain->parent_ptes[0] = old;
-	}
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) {
-		if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
-			continue;
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
-			if (!pte_chain->parent_ptes[i]) {
-				pte_chain->parent_ptes[i] = parent_pte;
-				return;
-			}
-	}
-	pte_chain = mmu_alloc_pte_chain(vcpu);
-	BUG_ON(!pte_chain);
-	hlist_add_head(&pte_chain->link, &sp->parent_ptes);
-	pte_chain->parent_ptes[0] = parent_pte;
+	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
 }
 
 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 				       u64 *parent_pte)
 {
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
-
-	if (!sp->multimapped) {
-		BUG_ON(sp->parent_pte != parent_pte);
-		sp->parent_pte = NULL;
-		return;
-	}
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
-				break;
-			if (pte_chain->parent_ptes[i] != parent_pte)
-				continue;
-			while (i + 1 < NR_PTE_CHAIN_ENTRIES
-				&& pte_chain->parent_ptes[i + 1]) {
-				pte_chain->parent_ptes[i]
-					= pte_chain->parent_ptes[i + 1];
-				++i;
-			}
-			pte_chain->parent_ptes[i] = NULL;
-			if (i == 0) {
-				hlist_del(&pte_chain->link);
-				mmu_free_pte_chain(pte_chain);
-				if (hlist_empty(&sp->parent_ptes)) {
-					sp->multimapped = 0;
-					sp->parent_pte = NULL;
-				}
-			}
-			return;
-		}
-	BUG();
+	pte_list_remove(parent_pte, &sp->parent_ptes);
 }
 
-static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
+static void drop_parent_pte(struct kvm_mmu_page *sp,
+			    u64 *parent_pte)
 {
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	struct kvm_mmu_page *parent_sp;
-	int i;
-
-	if (!sp->multimapped && sp->parent_pte) {
-		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(parent_sp, sp->parent_pte);
-		return;
-	}
-
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			u64 *spte = pte_chain->parent_ptes[i];
+	mmu_page_remove_parent_pte(sp, parent_pte);
+	mmu_spte_clear_no_track(parent_pte);
+}
 
-			if (!spte)
-				break;
-			parent_sp = page_header(__pa(spte));
-			fn(parent_sp, spte);
-		}
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+					       u64 *parent_pte, int direct)
+{
+	struct kvm_mmu_page *sp;
+	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache,
+					sizeof *sp);
+	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	if (!direct)
+		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+						  PAGE_SIZE);
+	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
+	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
+	sp->parent_ptes = 0;
+	mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+	return sp;
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void mark_unsync(u64 *spte);
 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	mmu_parent_walk(sp, mark_unsync);
+	pte_list_walk(&sp->parent_ptes, mark_unsync);
 }
 
-static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
+static void mark_unsync(u64 *spte)
 {
+	struct kvm_mmu_page *sp;
 	unsigned int index;
 
+	sp = page_header(__pa(spte));
 	index = spte - sp->spt;
 	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
 		return;
@@ -1185,15 +1350,6 @@ static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
 	kvm_mmu_mark_parents_unsync(sp);
 }
 
-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-				    struct kvm_mmu_page *sp)
-{
-	int i;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
-		sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 			       struct kvm_mmu_page *sp)
 {
@@ -1475,6 +1631,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	}
 }
 
+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+	int i;
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		sp->spt[i] = 0ull;
+}
+
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     gfn_t gfn,
 					     gva_t gaddr,
@@ -1537,10 +1701,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		account_shadowed(vcpu->kvm, gfn);
 	}
-	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
-		vcpu->arch.mmu.prefetch_page(vcpu, sp);
-	else
-		nonpaging_prefetch_page(vcpu, sp);
+	init_shadow_page_table(sp);
 	trace_kvm_mmu_get_page(sp, true);
 	return sp;
 }
@@ -1572,21 +1733,28 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 	if (iterator->level < PT_PAGE_TABLE_LEVEL)
 		return false;
 
-	if (iterator->level == PT_PAGE_TABLE_LEVEL)
-		if (is_large_pte(*iterator->sptep))
-			return false;
-
 	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
 	iterator->sptep	= ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
 	return true;
 }
 
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+			       u64 spte)
 {
-	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+	if (is_last_spte(spte, iterator->level)) {
+		iterator->level = 0;
+		return;
+	}
+
+	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
 	--iterator->level;
 }
 
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+	return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
 	u64 spte;
@@ -1594,13 +1762,13 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 	spte = __pa(sp->spt)
 		| PT_PRESENT_MASK | PT_ACCESSED_MASK
 		| PT_WRITABLE_MASK | PT_USER_MASK;
-	__set_spte(sptep, spte);
+	mmu_spte_set(sptep, spte);
 }
 
 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 {
 	if (is_large_pte(*sptep)) {
-		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+		drop_spte(vcpu->kvm, sptep);
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
@@ -1622,38 +1790,39 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (child->role.access == direct_access)
 			return;
 
-		mmu_page_remove_parent_pte(child, sptep);
-		__set_spte(sptep, shadow_trap_nonpresent_pte);
+		drop_parent_pte(child, sptep);
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	}
 }
 
+static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
+			     u64 *spte)
+{
+	u64 pte;
+	struct kvm_mmu_page *child;
+
+	pte = *spte;
+	if (is_shadow_present_pte(pte)) {
+		if (is_last_spte(pte, sp->role.level))
+			drop_spte(kvm, spte);
+		else {
+			child = page_header(pte & PT64_BASE_ADDR_MASK);
+			drop_parent_pte(child, spte);
+		}
+	} else if (is_mmio_spte(pte))
+		mmu_spte_clear_no_track(spte);
+
+	if (is_large_pte(pte))
+		--kvm->stat.lpages;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
 	unsigned i;
-	u64 *pt;
-	u64 ent;
-
-	pt = sp->spt;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		ent = pt[i];
-
-		if (is_shadow_present_pte(ent)) {
-			if (!is_last_spte(ent, sp->role.level)) {
-				ent &= PT64_BASE_ADDR_MASK;
-				mmu_page_remove_parent_pte(page_header(ent),
-							   &pt[i]);
-			} else {
-				if (is_large_pte(ent))
-					--kvm->stat.lpages;
-				drop_spte(kvm, &pt[i],
-					  shadow_trap_nonpresent_pte);
-			}
-		}
-		pt[i] = shadow_trap_nonpresent_pte;
-	}
+
+	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+		mmu_page_zap_pte(kvm, sp, sp->spt + i);
 }
 
 static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -1674,20 +1843,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	u64 *parent_pte;
 
-	while (sp->multimapped || sp->parent_pte) {
-		if (!sp->multimapped)
-			parent_pte = sp->parent_pte;
-		else {
-			struct kvm_pte_chain *chain;
-
-			chain = container_of(sp->parent_ptes.first,
-					     struct kvm_pte_chain, link);
-			parent_pte = chain->parent_ptes[0];
-		}
-		BUG_ON(!parent_pte);
-		kvm_mmu_put_page(sp, parent_pte);
-		__set_spte(parent_pte, shadow_trap_nonpresent_pte);
-	}
+	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
+		drop_parent_pte(sp, parent_pte);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
@@ -1734,6 +1891,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 		/* Count self */
 		ret++;
 		list_move(&sp->link, invalid_list);
+		kvm_mod_used_mmu_pages(kvm, -1);
 	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
@@ -1744,6 +1902,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, invalid_list, link)
+		kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+	struct kvm_mmu_page *next, *sp;
+
+	sp = container_of(head, struct kvm_mmu_page, rcu);
+	while (sp) {
+		if (!list_empty(&sp->link))
+			next = list_first_entry(&sp->link,
+				      struct kvm_mmu_page, link);
+		else
+			next = NULL;
+		kvm_mmu_free_page(sp);
+		sp = next;
+	}
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -1754,10 +1936,21 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 	kvm_flush_remote_tlbs(kvm);
 
+	if (atomic_read(&kvm->arch.reader_counter)) {
+		kvm_mmu_isolate_pages(invalid_list);
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		list_del_init(invalid_list);
+
+		trace_kvm_mmu_delay_free_pages(sp);
+		call_rcu(&sp->rcu, free_pages_rcu);
+		return;
+	}
+
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
-		kvm_mmu_free_page(kvm, sp);
+		kvm_mmu_isolate_page(sp);
+		kvm_mmu_free_page(sp);
 	} while (!list_empty(invalid_list));
 
 }
@@ -1783,8 +1976,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
 			kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
-			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		}
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
 	}
 
@@ -1833,20 +2026,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	__set_bit(slot, sp->slot_bitmap);
 }
 
-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
-	int i;
-	u64 *pt = sp->spt;
-
-	if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
-		return;
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-		if (pt[i] == shadow_notrap_nonpresent_pte)
-			__set_spte(&pt[i], shadow_trap_nonpresent_pte);
-	}
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1959,7 +2138,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	sp->unsync = 1;
 
 	kvm_mmu_mark_parents_unsync(sp);
-	mmu_convert_notrap(sp);
 }
 
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
@@ -2002,13 +2180,16 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    unsigned pte_access, int user_fault,
-		    int write_fault, int dirty, int level,
+		    int write_fault, int level,
 		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync, bool host_writable)
 {
 	u64 spte, entry = *sptep;
 	int ret = 0;
 
+	if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+		return 0;
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -2017,8 +2198,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte = PT_PRESENT_MASK;
 	if (!speculative)
 		spte |= shadow_accessed_mask;
-	if (!dirty)
-		pte_access &= ~ACC_WRITE_MASK;
+
 	if (pte_access & ACC_EXEC_MASK)
 		spte |= shadow_x_mask;
 	else
@@ -2045,15 +2225,24 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
-			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep);
 			goto done;
 		}
 
 		spte |= PT_WRITABLE_MASK;
 
 		if (!vcpu->arch.mmu.direct_map
-		    && !(pte_access & ACC_WRITE_MASK))
+		    && !(pte_access & ACC_WRITE_MASK)) {
 			spte &= ~PT_USER_MASK;
+			/*
+			 * If we converted a user page to a kernel page,
+			 * so that the kernel can write to it when cr0.wp=0,
+			 * then we should prevent the kernel from executing it
+			 * if SMEP is enabled.
+			 */
+			if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+				spte |= PT64_NX_MASK;
+		}
 
 		/*
 		 * Optimization: for pte sync, if spte was writable the hash
@@ -2078,7 +2267,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	update_spte(sptep, spte);
+	mmu_spte_update(sptep, spte);
 	/*
 	 * If we overwrite a writable spte with a read-only one we
 	 * should flush remote TLBs. Otherwise rmap_write_protect
@@ -2093,8 +2282,8 @@ done:
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 unsigned pt_access, unsigned pte_access,
-			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int level, gfn_t gfn,
+			 int user_fault, int write_fault,
+			 int *emulate, int level, gfn_t gfn,
 			 pfn_t pfn, bool speculative,
 			 bool host_writable)
 {
@@ -2117,26 +2306,28 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			u64 pte = *sptep;
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_parent_pte(child, sptep);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
-			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 			was_rmapped = 1;
 	}
 
 	if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-		      dirty, level, gfn, pfn, speculative, true,
+		      level, gfn, pfn, speculative, true,
 		      host_writable)) {
 		if (write_fault)
-			*ptwrite = 1;
+			*emulate = 1;
 		kvm_mmu_flush_tlb(vcpu);
 	}
 
+	if (unlikely(is_mmio_spte(*sptep) && emulate))
+		*emulate = 1;
+
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
 		 is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2145,11 +2336,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	if (!was_rmapped && is_large_pte(*sptep))
 		++vcpu->kvm->stat.lpages;
 
-	page_header_update_slot(vcpu->kvm, sptep, gfn);
-	if (!was_rmapped) {
-		rmap_count = rmap_add(vcpu, sptep, gfn);
-		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
-			rmap_recycle(vcpu, sptep, gfn);
+	if (is_shadow_present_pte(*sptep)) {
+		page_header_update_slot(vcpu->kvm, sptep, gfn);
+		if (!was_rmapped) {
+			rmap_count = rmap_add(vcpu, sptep, gfn);
+			if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+				rmap_recycle(vcpu, sptep, gfn);
+		}
 	}
 	kvm_release_pfn_clean(pfn);
 	if (speculative) {
@@ -2170,8 +2363,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
 	if (!slot) {
-		get_page(bad_page);
-		return page_to_pfn(bad_page);
+		get_page(fault_page);
+		return page_to_pfn(fault_page);
 	}
 
 	hva = gfn_to_hva_memslot(slot, gfn);
@@ -2198,7 +2391,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
 
 	for (i = 0; i < ret; i++, gfn++, start++)
 		mmu_set_spte(vcpu, start, ACC_ALL,
-			     access, 0, 0, 1, NULL,
+			     access, 0, 0, NULL,
 			     sp->role.level, gfn,
 			     page_to_pfn(pages[i]), true, true);
 
@@ -2217,7 +2410,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
 	spte = sp->spt + i;
 
 	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
-		if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+		if (is_shadow_present_pte(*spte) || spte == sptep) {
 			if (!start)
 				continue;
 			if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2254,7 +2447,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
-	int pt_write = 0;
+	int emulate = 0;
 	gfn_t pseudo_gfn;
 
 	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2262,14 +2455,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 			unsigned pte_access = ACC_ALL;
 
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-				     0, write, 1, &pt_write,
+				     0, write, &emulate,
 				     level, gfn, pfn, prefault, map_writable);
 			direct_pte_prefetch(vcpu, iterator.sptep);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
 
-		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+		if (!is_shadow_present_pte(*iterator.sptep)) {
 			u64 base_addr = iterator.addr;
 
 			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2283,14 +2476,14 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 				return -ENOMEM;
 			}
 
-			__set_spte(iterator.sptep,
-				   __pa(sp->spt)
-				   | PT_PRESENT_MASK | PT_WRITABLE_MASK
-				   | shadow_user_mask | shadow_x_mask
-				   | shadow_accessed_mask);
+			mmu_spte_set(iterator.sptep,
+				     __pa(sp->spt)
+				     | PT_PRESENT_MASK | PT_WRITABLE_MASK
+				     | shadow_user_mask | shadow_x_mask
+				     | shadow_accessed_mask);
 		}
 	}
-	return pt_write;
+	return emulate;
 }
 
 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
@@ -2306,16 +2499,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
 	send_sig_info(SIGBUS, &info, tsk);
 }
 
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
 {
 	kvm_release_pfn_clean(pfn);
 	if (is_hwpoison_pfn(pfn)) {
-		kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+		kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
 		return 0;
-	} else if (is_fault_pfn(pfn))
-		return -EFAULT;
+	}
 
-	return 1;
+	return -EFAULT;
 }
 
 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2360,6 +2552,30 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	}
 }
 
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+	return unlikely(is_invalid_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+				pfn_t pfn, unsigned access, int *ret_val)
+{
+	bool ret = true;
+
+	/* The pfn is invalid, report the error! */
+	if (unlikely(is_invalid_pfn(pfn))) {
+		*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+		goto exit;
+	}
+
+	if (unlikely(is_noslot_pfn(pfn)))
+		vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+
+	ret = false;
+exit:
+	return ret;
+}
+
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 			 gva_t gva, pfn_t *pfn, bool write, bool *writable);
 
@@ -2394,9 +2610,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
 	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
 		return 0;
 
-	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2623,6 +2838,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 
+	vcpu_clear_mmio_info(vcpu, ~0ul);
 	trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
 	if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2667,6 +2883,94 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
 }
 
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	if (direct)
+		return vcpu_match_mmio_gpa(vcpu, addr);
+
+	return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+
+/*
+ * On direct hosts, the last spte is only allows two states
+ * for mmio page fault:
+ *   - It is the mmio spte
+ *   - It is zapped or it is being zapped.
+ *
+ * This function completely checks the spte when the last spte
+ * is not the mmio spte.
+ */
+static bool check_direct_spte_mmio_pf(u64 spte)
+{
+	return __check_direct_spte_mmio_pf(spte);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+	struct kvm_shadow_walk_iterator iterator;
+	u64 spte = 0ull;
+
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+		if (!is_shadow_present_pte(spte))
+			break;
+	walk_shadow_page_lockless_end(vcpu);
+
+	return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulat the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+	u64 spte;
+
+	if (quickly_check_mmio_pf(vcpu, addr, direct))
+		return 1;
+
+	spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+	if (is_mmio_spte(spte)) {
+		gfn_t gfn = get_mmio_spte_gfn(spte);
+		unsigned access = get_mmio_spte_access(spte);
+
+		if (direct)
+			addr = 0;
+
+		trace_handle_mmio_page_fault(addr, gfn, access);
+		vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+		return 1;
+	}
+
+	/*
+	 * It's ok if the gva is remapped by other cpus on shadow guest,
+	 * it's a BUG if the gfn is not a mmio page.
+	 */
+	if (direct && !check_direct_spte_mmio_pf(spte))
+		return -1;
+
+	/*
+	 * If the page table is zapped by other cpus, let CPU fault again on
+	 * the address.
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+				  u32 error_code, bool direct)
+{
+	int ret;
+
+	ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+	WARN_ON(ret < 0);
+	return ret;
+}
+
 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 				u32 error_code, bool prefault)
 {
@@ -2674,6 +2978,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
 	int r;
 
 	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2750,6 +3058,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	ASSERT(vcpu);
 	ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -2767,9 +3078,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
 		return 0;
 
-	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+		return r;
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
@@ -2800,7 +3111,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->page_fault = nonpaging_page_fault;
 	context->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->free = nonpaging_free;
-	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
@@ -2848,6 +3158,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
 	return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
 }
 
+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+			   int *nr_present)
+{
+	if (unlikely(is_mmio_spte(*sptep))) {
+		if (gfn != get_mmio_spte_gfn(*sptep)) {
+			mmu_spte_clear_no_track(sptep);
+			return true;
+		}
+
+		(*nr_present)++;
+		mark_mmio_spte(sptep, gfn, access);
+		return true;
+	}
+
+	return false;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -2930,7 +3257,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging64_page_fault;
 	context->gva_to_gpa = paging64_gva_to_gpa;
-	context->prefetch_page = paging64_prefetch_page;
 	context->sync_page = paging64_sync_page;
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
@@ -2959,7 +3285,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
 	context->free = paging_free;
-	context->prefetch_page = paging32_prefetch_page;
 	context->sync_page = paging32_sync_page;
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
@@ -2984,7 +3309,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	context->new_cr3 = nonpaging_new_cr3;
 	context->page_fault = tdp_page_fault;
 	context->free = nonpaging_free;
-	context->prefetch_page = nonpaging_prefetch_page;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
@@ -3023,6 +3347,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 {
 	int r;
+	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
 	ASSERT(vcpu);
 	ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
@@ -3037,6 +3362,8 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 	vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
+	vcpu->arch.mmu.base_role.smep_andnot_wp
+		= smep && !is_write_protection(vcpu);
 
 	return r;
 }
@@ -3141,27 +3468,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
 
-static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
-				  struct kvm_mmu_page *sp,
-				  u64 *spte)
-{
-	u64 pte;
-	struct kvm_mmu_page *child;
-
-	pte = *spte;
-	if (is_shadow_present_pte(pte)) {
-		if (is_last_spte(pte, sp->role.level))
-			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
-		else {
-			child = page_header(pte & PT64_BASE_ADDR_MASK);
-			mmu_page_remove_parent_pte(child, spte);
-		}
-	}
-	__set_spte(spte, shadow_trap_nonpresent_pte);
-	if (is_large_pte(pte))
-		--vcpu->kvm->stat.lpages;
-}
-
 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  const void *new)
@@ -3233,6 +3539,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int level, npte, invlpg_counter, r, flooded = 0;
 	bool remote_flush, local_flush, zap_page;
 
+	/*
+	 * If we don't have indirect shadow pages, it means no page is
+	 * write-protected, so we can exit simply.
+	 */
+	if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
+		return;
+
 	zap_page = remote_flush = local_flush = false;
 	offset = offset_in_page(gpa);
 
@@ -3336,7 +3649,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
-			mmu_pte_write_zap_pte(vcpu, sp, spte);
+			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
 			      & mask.word))
@@ -3380,9 +3693,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
-		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
@@ -3506,15 +3819,15 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 				continue;
 
 			if (is_large_pte(pt[i])) {
-				drop_spte(kvm, &pt[i],
-					  shadow_trap_nonpresent_pte);
+				drop_spte(kvm, &pt[i]);
 				--kvm->stat.lpages;
 				continue;
 			}
 
 			/* avoid RMW */
 			if (is_writable_pte(pt[i]))
-				update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+				mmu_spte_update(&pt[i],
+						pt[i] & ~PT_WRITABLE_MASK);
 		}
 	}
 	kvm_flush_remote_tlbs(kvm);
@@ -3590,25 +3903,18 @@ static struct shrinker mmu_shrinker = {
 
 static void mmu_destroy_caches(void)
 {
-	if (pte_chain_cache)
-		kmem_cache_destroy(pte_chain_cache);
-	if (rmap_desc_cache)
-		kmem_cache_destroy(rmap_desc_cache);
+	if (pte_list_desc_cache)
+		kmem_cache_destroy(pte_list_desc_cache);
 	if (mmu_page_header_cache)
 		kmem_cache_destroy(mmu_page_header_cache);
 }
 
 int kvm_mmu_module_init(void)
 {
-	pte_chain_cache = kmem_cache_create("kvm_pte_chain",
-					    sizeof(struct kvm_pte_chain),
-					    0, 0, NULL);
-	if (!pte_chain_cache)
-		goto nomem;
-	rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
-					    sizeof(struct kvm_rmap_desc),
+	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
+					    sizeof(struct pte_list_desc),
 					    0, 0, NULL);
-	if (!rmap_desc_cache)
+	if (!pte_list_desc_cache)
 		goto nomem;
 
 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
@@ -3775,16 +4081,17 @@ out:
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
+	u64 spte;
 	int nr_sptes = 0;
 
-	spin_lock(&vcpu->kvm->mmu_lock);
-	for_each_shadow_entry(vcpu, addr, iterator) {
-		sptes[iterator.level-1] = *iterator.sptep;
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+		sptes[iterator.level-1] = spte;
 		nr_sptes++;
-		if (!is_shadow_present_pte(*iterator.sptep))
+		if (!is_shadow_present_pte(spte))
 			break;
 	}
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	walk_shadow_page_lockless_end(vcpu);
 
 	return nr_sptes;
 }
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca85d3e..e374db9af02 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
 #define PFERR_FETCH_MASK (1U << 4)
 
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
@@ -76,4 +78,27 @@ static inline int is_present_gpte(unsigned long pte)
 	return pte & PT_PRESENT_MASK;
 }
 
+static inline int is_writable_pte(unsigned long pte)
+{
+	return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+					   bool write_fault, bool user_fault,
+					   unsigned long pte)
+{
+	if (unlikely(write_fault && !is_writable_pte(pte)
+	      && (user_fault || is_write_protection(vcpu))))
+		return false;
+
+	if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+		return false;
+
+	return true;
+}
 #endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b8bcf..2460a265be2 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
 				     "level = %d\n", sp, level);
 			return;
 		}
-
-		if (*sptep == shadow_notrap_nonpresent_pte) {
-			audit_printk(vcpu->kvm, "notrap spte in unsync "
-				     "sp: %p\n", sp);
-			return;
-		}
-	}
-
-	if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
-		audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
-			     sp);
-		return;
 	}
 
 	if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fdb3ed..eed67f34146 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 	TP_ARGS(sp)
 );
 
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+	TP_PROTO(struct kvm_mmu_page *sp),
+
+	TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+	mark_mmio_spte,
+	TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+	TP_ARGS(sptep, gfn, access),
+
+	TP_STRUCT__entry(
+		__field(void *, sptep)
+		__field(gfn_t, gfn)
+		__field(unsigned, access)
+	),
+
+	TP_fast_assign(
+		__entry->sptep = sptep;
+		__entry->gfn = gfn;
+		__entry->access = access;
+	),
+
+	TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
+		  __entry->access)
+);
+
+TRACE_EVENT(
+	handle_mmio_page_fault,
+	TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+	TP_ARGS(addr, gfn, access),
+
+	TP_STRUCT__entry(
+		__field(u64, addr)
+		__field(gfn_t, gfn)
+		__field(unsigned, access)
+	),
+
+	TP_fast_assign(
+		__entry->addr = addr;
+		__entry->gfn = gfn;
+		__entry->access = access;
+	),
+
+	TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+		  __entry->access)
+);
+
 TRACE_EVENT(
 	kvm_mmu_audit,
 	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9d03ad4dd5e..507e2b844cf 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return (ret != orig_pte);
 }
 
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+				   bool last)
 {
 	unsigned access;
 
 	access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+	if (last && !is_dirty_gpte(gpte))
+		access &= ~ACC_WRITE_MASK;
+
 #if PTTYPE == 64
 	if (vcpu->arch.mmu.nx)
 		access &= ~(gpte >> PT64_NX_SHIFT);
@@ -113,6 +117,24 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
 	return access;
 }
 
+static bool FNAME(is_last_gpte)(struct guest_walker *walker,
+				struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+				pt_element_t gpte)
+{
+	if (walker->level == PT_PAGE_TABLE_LEVEL)
+		return true;
+
+	if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) &&
+	    (PTTYPE == 64 || is_pse(vcpu)))
+		return true;
+
+	if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) &&
+	    (mmu->root_level == PT64_ROOT_LEVEL))
+		return true;
+
+	return false;
+}
+
 /*
  * Fetch a guest pte for a guest virtual address
  */
@@ -125,18 +147,17 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
-	bool eperm, present, rsvd_fault;
-	int offset, write_fault, user_fault, fetch_fault;
-
-	write_fault = access & PFERR_WRITE_MASK;
-	user_fault = access & PFERR_USER_MASK;
-	fetch_fault = access & PFERR_FETCH_MASK;
+	bool eperm;
+	int offset;
+	const int write_fault = access & PFERR_WRITE_MASK;
+	const int user_fault  = access & PFERR_USER_MASK;
+	const int fetch_fault = access & PFERR_FETCH_MASK;
+	u16 errcode = 0;
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
-walk:
-	present = true;
-	eperm = rsvd_fault = false;
+retry_walk:
+	eperm = false;
 	walker->level = mmu->root_level;
 	pte           = mmu->get_cr3(vcpu);
 
@@ -144,10 +165,8 @@ walk:
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte)) {
-			present = false;
+		if (!is_present_gpte(pte))
 			goto error;
-		}
 		--walker->level;
 	}
 #endif
@@ -170,42 +189,31 @@ walk:
 
 		real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
 					      PFERR_USER_MASK|PFERR_WRITE_MASK);
-		if (unlikely(real_gfn == UNMAPPED_GVA)) {
-			present = false;
-			break;
-		}
+		if (unlikely(real_gfn == UNMAPPED_GVA))
+			goto error;
 		real_gfn = gpa_to_gfn(real_gfn);
 
 		host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
-		if (unlikely(kvm_is_error_hva(host_addr))) {
-			present = false;
-			break;
-		}
+		if (unlikely(kvm_is_error_hva(host_addr)))
+			goto error;
 
 		ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
-		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
-			present = false;
-			break;
-		}
+		if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
+			goto error;
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
-		if (unlikely(!is_present_gpte(pte))) {
-			present = false;
-			break;
-		}
+		if (unlikely(!is_present_gpte(pte)))
+			goto error;
 
 		if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
 					      walker->level))) {
-			rsvd_fault = true;
-			break;
+			errcode |= PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
+			goto error;
 		}
 
-		if (unlikely(write_fault && !is_writable_pte(pte)
-			     && (user_fault || is_write_protection(vcpu))))
-			eperm = true;
-
-		if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+		if (!check_write_user_access(vcpu, write_fault, user_fault,
+					  pte))
 			eperm = true;
 
 #if PTTYPE == 64
@@ -213,39 +221,35 @@ walk:
 			eperm = true;
 #endif
 
-		if (!eperm && !rsvd_fault
-		    && unlikely(!(pte & PT_ACCESSED_MASK))) {
+		if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
 			int ret;
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
 						       sizeof(pte));
 			ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
 						  pte, pte|PT_ACCESSED_MASK);
-			if (unlikely(ret < 0)) {
-				present = false;
-				break;
-			} else if (ret)
-				goto walk;
+			if (unlikely(ret < 0))
+				goto error;
+			else if (ret)
+				goto retry_walk;
 
 			mark_page_dirty(vcpu->kvm, table_gfn);
 			pte |= PT_ACCESSED_MASK;
 		}
 
-		pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
 		walker->ptes[walker->level - 1] = pte;
 
-		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
-		    ((walker->level == PT_DIRECTORY_LEVEL) &&
-				is_large_pte(pte) &&
-				(PTTYPE == 64 || is_pse(vcpu))) ||
-		    ((walker->level == PT_PDPE_LEVEL) &&
-				is_large_pte(pte) &&
-				mmu->root_level == PT64_ROOT_LEVEL)) {
+		if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
 			int lvl = walker->level;
 			gpa_t real_gpa;
 			gfn_t gfn;
 			u32 ac;
 
+			/* check if the kernel is fetching from user page */
+			if (unlikely(pte_access & PT_USER_MASK) &&
+			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+				if (fetch_fault && !user_fault)
+					eperm = true;
+
 			gfn = gpte_to_gfn_lvl(pte, lvl);
 			gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
 
@@ -266,12 +270,14 @@ walk:
 			break;
 		}
 
-		pt_access = pte_access;
+		pt_access &= FNAME(gpte_access)(vcpu, pte, false);
 		--walker->level;
 	}
 
-	if (unlikely(!present || eperm || rsvd_fault))
+	if (unlikely(eperm)) {
+		errcode |= PFERR_PRESENT_MASK;
 		goto error;
+	}
 
 	if (write_fault && unlikely(!is_dirty_gpte(pte))) {
 		int ret;
@@ -279,17 +285,17 @@ walk:
 		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
 		ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
 					  pte, pte|PT_DIRTY_MASK);
-		if (unlikely(ret < 0)) {
-			present = false;
+		if (unlikely(ret < 0))
 			goto error;
-		} else if (ret)
-			goto walk;
+		else if (ret)
+			goto retry_walk;
 
 		mark_page_dirty(vcpu->kvm, table_gfn);
 		pte |= PT_DIRTY_MASK;
 		walker->ptes[walker->level - 1] = pte;
 	}
 
+	pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -297,19 +303,14 @@ walk:
 	return 1;
 
 error:
+	errcode |= write_fault | user_fault;
+	if (fetch_fault && (mmu->nx ||
+			    kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
+		errcode |= PFERR_FETCH_MASK;
+
 	walker->fault.vector = PF_VECTOR;
 	walker->fault.error_code_valid = true;
-	walker->fault.error_code = 0;
-	if (present)
-		walker->fault.error_code |= PFERR_PRESENT_MASK;
-
-	walker->fault.error_code |= write_fault | user_fault;
-
-	if (fetch_fault && mmu->nx)
-		walker->fault.error_code |= PFERR_FETCH_MASK;
-	if (rsvd_fault)
-		walker->fault.error_code |= PFERR_RSVD_MASK;
-
+	walker->fault.error_code = errcode;
 	walker->fault.address = addr;
 	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
 
@@ -336,16 +337,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				    struct kvm_mmu_page *sp, u64 *spte,
 				    pt_element_t gpte)
 {
-	u64 nonpresent = shadow_trap_nonpresent_pte;
-
 	if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
 		goto no_present;
 
-	if (!is_present_gpte(gpte)) {
-		if (!sp->unsync)
-			nonpresent = shadow_notrap_nonpresent_pte;
+	if (!is_present_gpte(gpte))
 		goto no_present;
-	}
 
 	if (!(gpte & PT_ACCESSED_MASK))
 		goto no_present;
@@ -353,7 +349,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	return false;
 
 no_present:
-	drop_spte(vcpu->kvm, spte, nonpresent);
+	drop_spte(vcpu->kvm, spte);
 	return true;
 }
 
@@ -369,9 +365,9 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		return;
 
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
 	pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
-	if (is_error_pfn(pfn)) {
+	if (mmu_invalid_pfn(pfn)) {
 		kvm_release_pfn_clean(pfn);
 		return;
 	}
@@ -381,7 +377,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+		     NULL, PT_PAGE_TABLE_LEVEL,
 		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
@@ -432,12 +428,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		unsigned pte_access;
 		gfn_t gfn;
 		pfn_t pfn;
-		bool dirty;
 
 		if (spte == sptep)
 			continue;
 
-		if (*spte != shadow_trap_nonpresent_pte)
+		if (is_shadow_present_pte(*spte))
 			continue;
 
 		gpte = gptep[i];
@@ -445,18 +440,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
 			continue;
 
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+								  true);
 		gfn = gpte_to_gfn(gpte);
-		dirty = is_dirty_gpte(gpte);
 		pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
-				      (pte_access & ACC_WRITE_MASK) && dirty);
-		if (is_error_pfn(pfn)) {
+				      pte_access & ACC_WRITE_MASK);
+		if (mmu_invalid_pfn(pfn)) {
 			kvm_release_pfn_clean(pfn);
 			break;
 		}
 
 		mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-			     dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+			     NULL, PT_PAGE_TABLE_LEVEL, gfn,
 			     pfn, true, true);
 	}
 }
@@ -467,12 +462,11 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
 static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 struct guest_walker *gw,
 			 int user_fault, int write_fault, int hlevel,
-			 int *ptwrite, pfn_t pfn, bool map_writable,
+			 int *emulate, pfn_t pfn, bool map_writable,
 			 bool prefault)
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
 	int top_level;
 	unsigned direct_access;
 	struct kvm_shadow_walk_iterator it;
@@ -480,9 +474,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
 		return NULL;
 
-	direct_access = gw->pt_access & gw->pte_access;
-	if (!dirty)
-		direct_access &= ~ACC_WRITE_MASK;
+	direct_access = gw->pte_access;
 
 	top_level = vcpu->arch.mmu.root_level;
 	if (top_level == PT32E_ROOT_LEVEL)
@@ -540,8 +532,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		link_shadow_page(it.sptep, sp);
 	}
 
-	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
-		     user_fault, write_fault, dirty, ptwrite, it.level,
+	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
+		     user_fault, write_fault, emulate, it.level,
 		     gw->gfn, pfn, prefault, map_writable);
 	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
 
@@ -575,7 +567,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int user_fault = error_code & PFERR_USER_MASK;
 	struct guest_walker walker;
 	u64 *sptep;
-	int write_pt = 0;
+	int emulate = 0;
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
@@ -585,6 +577,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 
 	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
 
+	if (unlikely(error_code & PFERR_RSVD_MASK))
+		return handle_mmio_page_fault(vcpu, addr, error_code,
+					      mmu_is_nested(vcpu));
+
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		return r;
@@ -623,9 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 			 &map_writable))
 		return 0;
 
-	/* mmio */
-	if (is_error_pfn(pfn))
-		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+	if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+				walker.gfn, pfn, walker.pte_access, &r))
+		return r;
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -636,19 +632,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	if (!force_pt_level)
 		transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
-			     level, &write_pt, pfn, map_writable, prefault);
+			     level, &emulate, pfn, map_writable, prefault);
 	(void)sptep;
-	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
-		 sptep, *sptep, write_pt);
+	pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+		 sptep, *sptep, emulate);
 
-	if (!write_pt)
+	if (!emulate)
 		vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
 
 	++vcpu->stat.pf_fixed;
 	trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 
-	return write_pt;
+	return emulate;
 
 out_unlock:
 	spin_unlock(&vcpu->kvm->mmu_lock);
@@ -665,6 +661,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	u64 *sptep;
 	int need_flush = 0;
 
+	vcpu_clear_mmio_info(vcpu, gva);
+
 	spin_lock(&vcpu->kvm->mmu_lock);
 
 	for_each_shadow_entry(vcpu, gva, iterator) {
@@ -688,11 +686,11 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			if (is_shadow_present_pte(*sptep)) {
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
-				drop_spte(vcpu->kvm, sptep,
-					  shadow_trap_nonpresent_pte);
+				drop_spte(vcpu->kvm, sptep);
 				need_flush = 1;
-			} else
-				__set_spte(sptep, shadow_trap_nonpresent_pte);
+			} else if (is_mmio_spte(*sptep))
+				mmu_spte_clear_no_track(sptep);
+
 			break;
 		}
 
@@ -752,36 +750,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
 	return gpa;
 }
 
-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
-				 struct kvm_mmu_page *sp)
-{
-	int i, j, offset, r;
-	pt_element_t pt[256 / sizeof(pt_element_t)];
-	gpa_t pte_gpa;
-
-	if (sp->role.direct
-	    || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
-		nonpaging_prefetch_page(vcpu, sp);
-		return;
-	}
-
-	pte_gpa = gfn_to_gpa(sp->gfn);
-	if (PTTYPE == 32) {
-		offset = sp->role.quadrant << PT64_LEVEL_BITS;
-		pte_gpa += offset * sizeof(pt_element_t);
-	}
-
-	for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
-		r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
-		pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
-		for (j = 0; j < ARRAY_SIZE(pt); ++j)
-			if (r || is_present_gpte(pt[j]))
-				sp->spt[i+j] = shadow_trap_nonpresent_pte;
-			else
-				sp->spt[i+j] = shadow_notrap_nonpresent_pte;
-	}
-}
-
 /*
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -817,7 +785,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		gpa_t pte_gpa;
 		gfn_t gfn;
 
-		if (!is_shadow_present_pte(sp->spt[i]))
+		if (!sp->spt[i])
 			continue;
 
 		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -826,26 +794,30 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		gfn = gpte_to_gfn(gpte);
-
 		if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
+		gfn = gpte_to_gfn(gpte);
+		pte_access = sp->role.access;
+		pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+		if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+			continue;
+
 		if (gfn != sp->gfns[i]) {
-			drop_spte(vcpu->kvm, &sp->spt[i],
-				      shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, &sp->spt[i]);
 			vcpu->kvm->tlbs_dirty++;
 			continue;
 		}
 
 		nr_present++;
-		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+			 PT_PAGE_TABLE_LEVEL, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false,
 			 host_writable);
 	}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 506e4fe23ad..475d1c94850 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1496,11 +1496,14 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	update_cr0_intercept(svm);
 }
 
-static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
 	unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
 
+	if (cr4 & X86_CR4_VMXE)
+		return 1;
+
 	if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
 		svm_flush_tlb(vcpu);
 
@@ -1510,6 +1513,7 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	cr4 |= host_cr4_mce;
 	to_svm(vcpu)->vmcb->save.cr4 = cr4;
 	mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
+	return 0;
 }
 
 static void svm_set_segment(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index abd86e865be..ae432ea1cd8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -15,7 +15,7 @@
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/hrtimer.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include "kvm_timer.h"
 
 static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index db932760ea8..3ff898c104f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -675,12 +675,12 @@ TRACE_EVENT(kvm_emulate_insn,
 		),
 
 	TP_fast_assign(
-		__entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->rip = vcpu->arch.emulate_ctxt.fetch.start;
 		__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
-		__entry->len = vcpu->arch.emulate_ctxt.decode.eip
-			       - vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->len = vcpu->arch.emulate_ctxt._eip
+			       - vcpu->arch.emulate_ctxt.fetch.start;
 		memcpy(__entry->insn,
-		       vcpu->arch.emulate_ctxt.decode.fetch.data,
+		       vcpu->arch.emulate_ctxt.fetch.data,
 		       15);
 		__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
 		__entry->failed = failed;
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
 #define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
 #define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
 
+TRACE_EVENT(
+	vcpu_match_mmio,
+	TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+	TP_ARGS(gva, gpa, write, gpa_match),
+
+	TP_STRUCT__entry(
+		__field(gva_t, gva)
+		__field(gpa_t, gpa)
+		__field(bool, write)
+		__field(bool, gpa_match)
+		),
+
+	TP_fast_assign(
+		__entry->gva = gva;
+		__entry->gpa = gpa;
+		__entry->write = write;
+		__entry->gpa_match = gpa_match
+		),
+
+	TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+		  __entry->write ? "Write" : "Read",
+		  __entry->gpa_match ? "GPA" : "GVA")
+);
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d48ec60ea42..e65a158dee6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -43,13 +43,12 @@
 #include "trace.h"
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
+#define __ex_clear(x, reg) \
+	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
 
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
 static int __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
@@ -72,6 +71,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
 static int __read_mostly yield_on_hlt = 1;
 module_param(yield_on_hlt, bool, S_IRUGO);
 
+/*
+ * If nested=1, nested virtualization is supported, i.e., guests may use
+ * VMX and be a hypervisor for its own guests. If nested=0, guests may not
+ * use VMX instructions.
+ */
+static int __read_mostly nested = 0;
+module_param(nested, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK						\
@@ -109,6 +116,7 @@ static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
 #define NR_AUTOLOAD_MSRS 1
+#define VMCS02_POOL_SIZE 1
 
 struct vmcs {
 	u32 revision_id;
@@ -116,17 +124,237 @@ struct vmcs {
 	char data[0];
 };
 
+/*
+ * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
+ * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
+ * loaded on this CPU (so we can clear them if the CPU goes down).
+ */
+struct loaded_vmcs {
+	struct vmcs *vmcs;
+	int cpu;
+	int launched;
+	struct list_head loaded_vmcss_on_cpu_link;
+};
+
 struct shared_msr_entry {
 	unsigned index;
 	u64 data;
 	u64 mask;
 };
 
+/*
+ * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
+ * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
+ * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
+ * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
+ * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
+ * More than one of these structures may exist, if L1 runs multiple L2 guests.
+ * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * underlying hardware which will be used to run L2.
+ * This structure is packed to ensure that its layout is identical across
+ * machines (necessary for live migration).
+ * If there are changes in this struct, VMCS12_REVISION must be changed.
+ */
+typedef u64 natural_width;
+struct __packed vmcs12 {
+	/* According to the Intel spec, a VMCS region must start with the
+	 * following two fields. Then follow implementation-specific data.
+	 */
+	u32 revision_id;
+	u32 abort;
+
+	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
+	u32 padding[7]; /* room for future expansion */
+
+	u64 io_bitmap_a;
+	u64 io_bitmap_b;
+	u64 msr_bitmap;
+	u64 vm_exit_msr_store_addr;
+	u64 vm_exit_msr_load_addr;
+	u64 vm_entry_msr_load_addr;
+	u64 tsc_offset;
+	u64 virtual_apic_page_addr;
+	u64 apic_access_addr;
+	u64 ept_pointer;
+	u64 guest_physical_address;
+	u64 vmcs_link_pointer;
+	u64 guest_ia32_debugctl;
+	u64 guest_ia32_pat;
+	u64 guest_ia32_efer;
+	u64 guest_ia32_perf_global_ctrl;
+	u64 guest_pdptr0;
+	u64 guest_pdptr1;
+	u64 guest_pdptr2;
+	u64 guest_pdptr3;
+	u64 host_ia32_pat;
+	u64 host_ia32_efer;
+	u64 host_ia32_perf_global_ctrl;
+	u64 padding64[8]; /* room for future expansion */
+	/*
+	 * To allow migration of L1 (complete with its L2 guests) between
+	 * machines of different natural widths (32 or 64 bit), we cannot have
+	 * unsigned long fields with no explict size. We use u64 (aliased
+	 * natural_width) instead. Luckily, x86 is little-endian.
+	 */
+	natural_width cr0_guest_host_mask;
+	natural_width cr4_guest_host_mask;
+	natural_width cr0_read_shadow;
+	natural_width cr4_read_shadow;
+	natural_width cr3_target_value0;
+	natural_width cr3_target_value1;
+	natural_width cr3_target_value2;
+	natural_width cr3_target_value3;
+	natural_width exit_qualification;
+	natural_width guest_linear_address;
+	natural_width guest_cr0;
+	natural_width guest_cr3;
+	natural_width guest_cr4;
+	natural_width guest_es_base;
+	natural_width guest_cs_base;
+	natural_width guest_ss_base;
+	natural_width guest_ds_base;
+	natural_width guest_fs_base;
+	natural_width guest_gs_base;
+	natural_width guest_ldtr_base;
+	natural_width guest_tr_base;
+	natural_width guest_gdtr_base;
+	natural_width guest_idtr_base;
+	natural_width guest_dr7;
+	natural_width guest_rsp;
+	natural_width guest_rip;
+	natural_width guest_rflags;
+	natural_width guest_pending_dbg_exceptions;
+	natural_width guest_sysenter_esp;
+	natural_width guest_sysenter_eip;
+	natural_width host_cr0;
+	natural_width host_cr3;
+	natural_width host_cr4;
+	natural_width host_fs_base;
+	natural_width host_gs_base;
+	natural_width host_tr_base;
+	natural_width host_gdtr_base;
+	natural_width host_idtr_base;
+	natural_width host_ia32_sysenter_esp;
+	natural_width host_ia32_sysenter_eip;
+	natural_width host_rsp;
+	natural_width host_rip;
+	natural_width paddingl[8]; /* room for future expansion */
+	u32 pin_based_vm_exec_control;
+	u32 cpu_based_vm_exec_control;
+	u32 exception_bitmap;
+	u32 page_fault_error_code_mask;
+	u32 page_fault_error_code_match;
+	u32 cr3_target_count;
+	u32 vm_exit_controls;
+	u32 vm_exit_msr_store_count;
+	u32 vm_exit_msr_load_count;
+	u32 vm_entry_controls;
+	u32 vm_entry_msr_load_count;
+	u32 vm_entry_intr_info_field;
+	u32 vm_entry_exception_error_code;
+	u32 vm_entry_instruction_len;
+	u32 tpr_threshold;
+	u32 secondary_vm_exec_control;
+	u32 vm_instruction_error;
+	u32 vm_exit_reason;
+	u32 vm_exit_intr_info;
+	u32 vm_exit_intr_error_code;
+	u32 idt_vectoring_info_field;
+	u32 idt_vectoring_error_code;
+	u32 vm_exit_instruction_len;
+	u32 vmx_instruction_info;
+	u32 guest_es_limit;
+	u32 guest_cs_limit;
+	u32 guest_ss_limit;
+	u32 guest_ds_limit;
+	u32 guest_fs_limit;
+	u32 guest_gs_limit;
+	u32 guest_ldtr_limit;
+	u32 guest_tr_limit;
+	u32 guest_gdtr_limit;
+	u32 guest_idtr_limit;
+	u32 guest_es_ar_bytes;
+	u32 guest_cs_ar_bytes;
+	u32 guest_ss_ar_bytes;
+	u32 guest_ds_ar_bytes;
+	u32 guest_fs_ar_bytes;
+	u32 guest_gs_ar_bytes;
+	u32 guest_ldtr_ar_bytes;
+	u32 guest_tr_ar_bytes;
+	u32 guest_interruptibility_info;
+	u32 guest_activity_state;
+	u32 guest_sysenter_cs;
+	u32 host_ia32_sysenter_cs;
+	u32 padding32[8]; /* room for future expansion */
+	u16 virtual_processor_id;
+	u16 guest_es_selector;
+	u16 guest_cs_selector;
+	u16 guest_ss_selector;
+	u16 guest_ds_selector;
+	u16 guest_fs_selector;
+	u16 guest_gs_selector;
+	u16 guest_ldtr_selector;
+	u16 guest_tr_selector;
+	u16 host_es_selector;
+	u16 host_cs_selector;
+	u16 host_ss_selector;
+	u16 host_ds_selector;
+	u16 host_fs_selector;
+	u16 host_gs_selector;
+	u16 host_tr_selector;
+};
+
+/*
+ * VMCS12_REVISION is an arbitrary id that should be changed if the content or
+ * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
+ * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+
+/*
+ * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
+ * and any VMCS region. Although only sizeof(struct vmcs12) are used by the
+ * current implementation, 4K are reserved to avoid future complications.
+ */
+#define VMCS12_SIZE 0x1000
+
+/* Used to remember the last vmcs02 used for some recently used vmcs12s */
+struct vmcs02_list {
+	struct list_head list;
+	gpa_t vmptr;
+	struct loaded_vmcs vmcs02;
+};
+
+/*
+ * The nested_vmx structure is part of vcpu_vmx, and holds information we need
+ * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
+ */
+struct nested_vmx {
+	/* Has the level1 guest done vmxon? */
+	bool vmxon;
+
+	/* The guest-physical address of the current VMCS L1 keeps for L2 */
+	gpa_t current_vmptr;
+	/* The host-usable pointer to the above */
+	struct page *current_vmcs12_page;
+	struct vmcs12 *current_vmcs12;
+
+	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
+	struct list_head vmcs02_pool;
+	int vmcs02_num;
+	u64 vmcs01_tsc_offset;
+	/* L2 must run next, and mustn't decide to exit to L1. */
+	bool nested_run_pending;
+	/*
+	 * Guest pages referred to in vmcs02 with host-physical pointers, so
+	 * we must keep them pinned while L2 runs.
+	 */
+	struct page *apic_access_page;
+};
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
-	struct list_head      local_vcpus_link;
 	unsigned long         host_rsp;
-	int                   launched;
 	u8                    fail;
 	u8                    cpl;
 	bool                  nmi_known_unmasked;
@@ -140,7 +368,14 @@ struct vcpu_vmx {
 	u64 		      msr_host_kernel_gs_base;
 	u64 		      msr_guest_kernel_gs_base;
 #endif
-	struct vmcs          *vmcs;
+	/*
+	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
+	 * non-nested (L1) guest, it always points to vmcs01. For a nested
+	 * guest (L2), it points to a different VMCS.
+	 */
+	struct loaded_vmcs    vmcs01;
+	struct loaded_vmcs   *loaded_vmcs;
+	bool                  __launched; /* temporary, used in vmx_vcpu_run */
 	struct msr_autoload {
 		unsigned nr;
 		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
@@ -176,6 +411,9 @@ struct vcpu_vmx {
 	u32 exit_reason;
 
 	bool rdtscp_enabled;
+
+	/* Support for a guest hypervisor (nested VMX) */
+	struct nested_vmx nested;
 };
 
 enum segment_cache_field {
@@ -192,6 +430,174 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
+#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
+#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
+				[number##_HIGH] = VMCS12_OFFSET(name)+4
+
+static unsigned short vmcs_field_to_offset_table[] = {
+	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
+	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
+	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
+	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
+	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
+	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
+	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
+	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
+	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
+	FIELD(HOST_ES_SELECTOR, host_es_selector),
+	FIELD(HOST_CS_SELECTOR, host_cs_selector),
+	FIELD(HOST_SS_SELECTOR, host_ss_selector),
+	FIELD(HOST_DS_SELECTOR, host_ds_selector),
+	FIELD(HOST_FS_SELECTOR, host_fs_selector),
+	FIELD(HOST_GS_SELECTOR, host_gs_selector),
+	FIELD(HOST_TR_SELECTOR, host_tr_selector),
+	FIELD64(IO_BITMAP_A, io_bitmap_a),
+	FIELD64(IO_BITMAP_B, io_bitmap_b),
+	FIELD64(MSR_BITMAP, msr_bitmap),
+	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
+	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
+	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
+	FIELD64(TSC_OFFSET, tsc_offset),
+	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
+	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
+	FIELD64(EPT_POINTER, ept_pointer),
+	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
+	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
+	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
+	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
+	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
+	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
+	FIELD64(GUEST_PDPTR0, guest_pdptr0),
+	FIELD64(GUEST_PDPTR1, guest_pdptr1),
+	FIELD64(GUEST_PDPTR2, guest_pdptr2),
+	FIELD64(GUEST_PDPTR3, guest_pdptr3),
+	FIELD64(HOST_IA32_PAT, host_ia32_pat),
+	FIELD64(HOST_IA32_EFER, host_ia32_efer),
+	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
+	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
+	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
+	FIELD(EXCEPTION_BITMAP, exception_bitmap),
+	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
+	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
+	FIELD(CR3_TARGET_COUNT, cr3_target_count),
+	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
+	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
+	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
+	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
+	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
+	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
+	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
+	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
+	FIELD(TPR_THRESHOLD, tpr_threshold),
+	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
+	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
+	FIELD(VM_EXIT_REASON, vm_exit_reason),
+	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
+	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
+	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
+	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
+	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
+	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
+	FIELD(GUEST_ES_LIMIT, guest_es_limit),
+	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
+	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
+	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
+	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
+	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
+	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
+	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
+	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
+	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
+	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
+	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
+	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
+	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
+	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
+	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
+	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
+	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
+	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
+	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
+	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
+	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
+	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
+	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
+	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
+	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
+	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
+	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
+	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
+	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
+	FIELD(EXIT_QUALIFICATION, exit_qualification),
+	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
+	FIELD(GUEST_CR0, guest_cr0),
+	FIELD(GUEST_CR3, guest_cr3),
+	FIELD(GUEST_CR4, guest_cr4),
+	FIELD(GUEST_ES_BASE, guest_es_base),
+	FIELD(GUEST_CS_BASE, guest_cs_base),
+	FIELD(GUEST_SS_BASE, guest_ss_base),
+	FIELD(GUEST_DS_BASE, guest_ds_base),
+	FIELD(GUEST_FS_BASE, guest_fs_base),
+	FIELD(GUEST_GS_BASE, guest_gs_base),
+	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
+	FIELD(GUEST_TR_BASE, guest_tr_base),
+	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
+	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
+	FIELD(GUEST_DR7, guest_dr7),
+	FIELD(GUEST_RSP, guest_rsp),
+	FIELD(GUEST_RIP, guest_rip),
+	FIELD(GUEST_RFLAGS, guest_rflags),
+	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
+	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
+	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
+	FIELD(HOST_CR0, host_cr0),
+	FIELD(HOST_CR3, host_cr3),
+	FIELD(HOST_CR4, host_cr4),
+	FIELD(HOST_FS_BASE, host_fs_base),
+	FIELD(HOST_GS_BASE, host_gs_base),
+	FIELD(HOST_TR_BASE, host_tr_base),
+	FIELD(HOST_GDTR_BASE, host_gdtr_base),
+	FIELD(HOST_IDTR_BASE, host_idtr_base),
+	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
+	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
+	FIELD(HOST_RSP, host_rsp),
+	FIELD(HOST_RIP, host_rip),
+};
+static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
+
+static inline short vmcs_field_to_offset(unsigned long field)
+{
+	if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
+		return -1;
+	return vmcs_field_to_offset_table[field];
+}
+
+static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.current_vmcs12;
+}
+
+static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		return NULL;
+	}
+	return page;
+}
+
+static void nested_release_page(struct page *page)
+{
+	kvm_release_page_dirty(page);
+}
+
+static void nested_release_page_clean(struct page *page)
+{
+	kvm_release_page_clean(page);
+}
+
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
@@ -200,7 +606,11 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
-static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+/*
+ * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
+ * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
+ */
+static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
@@ -442,6 +852,35 @@ static inline bool report_flexpriority(void)
 	return flexpriority_enabled;
 }
 
+static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
+{
+	return vmcs12->cpu_based_vm_exec_control & bit;
+}
+
+static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
+{
+	return (vmcs12->cpu_based_vm_exec_control &
+			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
+		(vmcs12->secondary_vm_exec_control & bit);
+}
+
+static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12,
+	struct kvm_vcpu *vcpu)
+{
+	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
+}
+
+static inline bool is_exception(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
+		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+}
+
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu);
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+			struct vmcs12 *vmcs12,
+			u32 reason, unsigned long qualification);
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -501,6 +940,13 @@ static void vmcs_clear(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
+static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+{
+	vmcs_clear(loaded_vmcs->vmcs);
+	loaded_vmcs->cpu = -1;
+	loaded_vmcs->launched = 0;
+}
+
 static void vmcs_load(struct vmcs *vmcs)
 {
 	u64 phys_addr = __pa(vmcs);
@@ -510,29 +956,28 @@ static void vmcs_load(struct vmcs *vmcs)
 			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
 			: "cc", "memory");
 	if (error)
-		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 		       vmcs, phys_addr);
 }
 
-static void __vcpu_clear(void *arg)
+static void __loaded_vmcs_clear(void *arg)
 {
-	struct vcpu_vmx *vmx = arg;
+	struct loaded_vmcs *loaded_vmcs = arg;
 	int cpu = raw_smp_processor_id();
 
-	if (vmx->vcpu.cpu == cpu)
-		vmcs_clear(vmx->vmcs);
-	if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
+	if (loaded_vmcs->cpu != cpu)
+		return; /* vcpu migration can race with cpu offline */
+	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
 		per_cpu(current_vmcs, cpu) = NULL;
-	list_del(&vmx->local_vcpus_link);
-	vmx->vcpu.cpu = -1;
-	vmx->launched = 0;
+	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+	loaded_vmcs_init(loaded_vmcs);
 }
 
-static void vcpu_clear(struct vcpu_vmx *vmx)
+static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-	if (vmx->vcpu.cpu == -1)
-		return;
-	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
+	if (loaded_vmcs->cpu != -1)
+		smp_call_function_single(
+			loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -585,26 +1030,26 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 	}
 }
 
-static unsigned long vmcs_readl(unsigned long field)
+static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
-	unsigned long value = 0;
+	unsigned long value;
 
-	asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
-		      : "+a"(value) : "d"(field) : "cc");
+	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
+		      : "=a"(value) : "d"(field) : "cc");
 	return value;
 }
 
-static u16 vmcs_read16(unsigned long field)
+static __always_inline u16 vmcs_read16(unsigned long field)
 {
 	return vmcs_readl(field);
 }
 
-static u32 vmcs_read32(unsigned long field)
+static __always_inline u32 vmcs_read32(unsigned long field)
 {
 	return vmcs_readl(field);
 }
 
-static u64 vmcs_read64(unsigned long field)
+static __always_inline u64 vmcs_read64(unsigned long field)
 {
 #ifdef CONFIG_X86_64
 	return vmcs_readl(field);
@@ -731,6 +1176,15 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	if (vcpu->fpu_active)
 		eb &= ~(1u << NM_VECTOR);
+
+	/* When we are running a nested L2 guest and L1 specified for it a
+	 * certain exception bitmap, we must trap the same exceptions and pass
+	 * them to L1. When running L2, we will only handle the exceptions
+	 * specified above if L1 did not want them.
+	 */
+	if (is_guest_mode(vcpu))
+		eb |= get_vmcs12(vcpu)->exception_bitmap;
+
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -971,22 +1425,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (!vmm_exclusive)
 		kvm_cpu_vmxon(phys_addr);
-	else if (vcpu->cpu != cpu)
-		vcpu_clear(vmx);
+	else if (vmx->loaded_vmcs->cpu != cpu)
+		loaded_vmcs_clear(vmx->loaded_vmcs);
 
-	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		vmcs_load(vmx->vmcs);
+	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+		vmcs_load(vmx->loaded_vmcs->vmcs);
 	}
 
-	if (vcpu->cpu != cpu) {
+	if (vmx->loaded_vmcs->cpu != cpu) {
 		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
 		unsigned long sysenter_esp;
 
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
-		list_add(&vmx->local_vcpus_link,
-			 &per_cpu(vcpus_on_cpu, cpu));
+		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
+			 &per_cpu(loaded_vmcss_on_cpu, cpu));
 		local_irq_enable();
 
 		/*
@@ -998,6 +1452,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+		vmx->loaded_vmcs->cpu = cpu;
 	}
 }
 
@@ -1005,7 +1460,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	__vmx_load_host_state(to_vmx(vcpu));
 	if (!vmm_exclusive) {
-		__vcpu_clear(to_vmx(vcpu));
+		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
+		vcpu->cpu = -1;
 		kvm_cpu_vmxoff();
 	}
 }
@@ -1023,19 +1479,55 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 	vmcs_writel(GUEST_CR0, cr0);
 	update_exception_bitmap(vcpu);
 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+	if (is_guest_mode(vcpu))
+		vcpu->arch.cr0_guest_owned_bits &=
+			~get_vmcs12(vcpu)->cr0_guest_host_mask;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
 
+/*
+ * Return the cr0 value that a nested guest would read. This is a combination
+ * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
+ * its hypervisor (cr0_read_shadow).
+ */
+static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
+{
+	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
+		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
+}
+static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
+{
+	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
+		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
+}
+
 static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
+	/* Note that there is no vcpu->fpu_active = 0 here. The caller must
+	 * set this *before* calling this function.
+	 */
 	vmx_decache_cr0_guest_bits(vcpu);
 	vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
 	update_exception_bitmap(vcpu);
 	vcpu->arch.cr0_guest_owned_bits = 0;
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-	vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * L1's specified read shadow might not contain the TS bit,
+		 * so now that we turned on shadowing of this bit, we need to
+		 * set this bit of the shadow. Like in nested_vmx_run we need
+		 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
+		 * up-to-date here because we just decached cr0.TS (and we'll
+		 * only update vmcs12->guest_cr0 on nested exit).
+		 */
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
+			(vcpu->arch.cr0 & X86_CR0_TS);
+		vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+	} else
+		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
 }
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
@@ -1119,6 +1611,25 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 		vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
 }
 
+/*
+ * KVM wants to inject page-faults which it got to the guest. This function
+ * checks whether in a nested guest, we need to inject them to L1 or L2.
+ * This function assumes it is called with the exit reason in vmcs02 being
+ * a #PF exception (this is the only case in which KVM injects a #PF when L2
+ * is running).
+ */
+static int nested_pf_handled(struct kvm_vcpu *vcpu)
+{
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	/* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
+	if (!(vmcs12->exception_bitmap & PF_VECTOR))
+		return 0;
+
+	nested_vmx_vmexit(vcpu);
+	return 1;
+}
+
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code,
 				bool reinject)
@@ -1126,6 +1637,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
+	if (nr == PF_VECTOR && is_guest_mode(vcpu) &&
+		nested_pf_handled(vcpu))
+		return;
+
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
@@ -1248,12 +1763,24 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	vmcs_write64(TSC_OFFSET, offset);
+	if (is_guest_mode(vcpu))
+		/*
+		 * We're here if L1 chose not to trap the TSC MSR. Since
+		 * prepare_vmcs12() does not copy tsc_offset, we need to also
+		 * set the vmcs12 field here.
+		 */
+		get_vmcs12(vcpu)->tsc_offset = offset -
+			to_vmx(vcpu)->nested.vmcs01_tsc_offset;
 }
 
 static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
 {
 	u64 offset = vmcs_read64(TSC_OFFSET);
 	vmcs_write64(TSC_OFFSET, offset + adjustment);
+	if (is_guest_mode(vcpu)) {
+		/* Even when running L2, the adjustment needs to apply to L1 */
+		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
+	}
 }
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
@@ -1261,6 +1788,236 @@ static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 	return target_tsc - native_read_tsc();
 }
 
+static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
+}
+
+/*
+ * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
+ * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
+ * all guests if the "nested" module option is off, and can also be disabled
+ * for a single guest by disabling its VMX cpuid bit.
+ */
+static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
+{
+	return nested && guest_cpuid_has_vmx(vcpu);
+}
+
+/*
+ * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
+ * returned for the various VMX controls MSRs when nested VMX is enabled.
+ * The same values should also be used to verify that vmcs12 control fields are
+ * valid during nested entry from L1 to L2.
+ * Each of these control msrs has a low and high 32-bit half: A low bit is on
+ * if the corresponding bit in the (32-bit) control field *must* be on, and a
+ * bit in the high half is on if the corresponding bit in the control field
+ * may be on. See also vmx_control_verify().
+ * TODO: allow these variables to be modified (downgraded) by module options
+ * or other means.
+ */
+static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high;
+static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high;
+static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high;
+static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high;
+static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high;
+static __init void nested_vmx_setup_ctls_msrs(void)
+{
+	/*
+	 * Note that as a general rule, the high half of the MSRs (bits in
+	 * the control fields which may be 1) should be initialized by the
+	 * intersection of the underlying hardware's MSR (i.e., features which
+	 * can be supported) and the list of features we want to expose -
+	 * because they are known to be properly supported in our code.
+	 * Also, usually, the low half of the MSRs (bits which must be 1) can
+	 * be set to 0, meaning that L1 may turn off any of these bits. The
+	 * reason is that if one of these bits is necessary, it will appear
+	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
+	 * fields of vmcs01 and vmcs02, will turn these bits off - and
+	 * nested_vmx_exit_handled() will not pass related exits to L1.
+	 * These rules have exceptions below.
+	 */
+
+	/* pin-based controls */
+	/*
+	 * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is
+	 * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR.
+	 */
+	nested_vmx_pinbased_ctls_low = 0x16 ;
+	nested_vmx_pinbased_ctls_high = 0x16 |
+		PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+		PIN_BASED_VIRTUAL_NMIS;
+
+	/* exit controls */
+	nested_vmx_exit_ctls_low = 0;
+	/* Note that guest use of VM_EXIT_ACK_INTR_ON_EXIT is not supported. */
+#ifdef CONFIG_X86_64
+	nested_vmx_exit_ctls_high = VM_EXIT_HOST_ADDR_SPACE_SIZE;
+#else
+	nested_vmx_exit_ctls_high = 0;
+#endif
+
+	/* entry controls */
+	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
+		nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high);
+	nested_vmx_entry_ctls_low = 0;
+	nested_vmx_entry_ctls_high &=
+		VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_IA32E_MODE;
+
+	/* cpu-based controls */
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
+		nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high);
+	nested_vmx_procbased_ctls_low = 0;
+	nested_vmx_procbased_ctls_high &=
+		CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
+		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
+		CPU_BASED_CR3_STORE_EXITING |
+#ifdef CONFIG_X86_64
+		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
+#endif
+		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
+		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+	/*
+	 * We can allow some features even when not supported by the
+	 * hardware. For example, L1 can specify an MSR bitmap - and we
+	 * can use it to avoid exits to L1 - even when L0 runs L2
+	 * without MSR bitmaps.
+	 */
+	nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS;
+
+	/* secondary cpu-based controls */
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+		nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high);
+	nested_vmx_secondary_ctls_low = 0;
+	nested_vmx_secondary_ctls_high &=
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
+{
+	/*
+	 * Bits 0 in high must be 0, and bits 1 in low must be 1.
+	 */
+	return ((control & high) | low) == control;
+}
+
+static inline u64 vmx_control_msr(u32 low, u32 high)
+{
+	return low | ((u64)high << 32);
+}
+
+/*
+ * If we allow our guest to use VMX instructions (i.e., nested VMX), we should
+ * also let it use VMX-specific MSRs.
+ * vmx_get_vmx_msr() and vmx_set_vmx_msr() return 1 when we handled a
+ * VMX-specific MSR, or 0 when we haven't (and the caller should handle it
+ * like all other MSRs).
+ */
+static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+{
+	if (!nested_vmx_allowed(vcpu) && msr_index >= MSR_IA32_VMX_BASIC &&
+		     msr_index <= MSR_IA32_VMX_TRUE_ENTRY_CTLS) {
+		/*
+		 * According to the spec, processors which do not support VMX
+		 * should throw a #GP(0) when VMX capability MSRs are read.
+		 */
+		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+		return 1;
+	}
+
+	switch (msr_index) {
+	case MSR_IA32_FEATURE_CONTROL:
+		*pdata = 0;
+		break;
+	case MSR_IA32_VMX_BASIC:
+		/*
+		 * This MSR reports some information about VMX support. We
+		 * should return information about the VMX we emulate for the
+		 * guest, and the VMCS structure we give it - not about the
+		 * VMX support of the underlying hardware.
+		 */
+		*pdata = VMCS12_REVISION |
+			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
+			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
+		break;
+	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+	case MSR_IA32_VMX_PINBASED_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_pinbased_ctls_low,
+					nested_vmx_pinbased_ctls_high);
+		break;
+	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
+	case MSR_IA32_VMX_PROCBASED_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_procbased_ctls_low,
+					nested_vmx_procbased_ctls_high);
+		break;
+	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+	case MSR_IA32_VMX_EXIT_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_exit_ctls_low,
+					nested_vmx_exit_ctls_high);
+		break;
+	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+	case MSR_IA32_VMX_ENTRY_CTLS:
+		*pdata = vmx_control_msr(nested_vmx_entry_ctls_low,
+					nested_vmx_entry_ctls_high);
+		break;
+	case MSR_IA32_VMX_MISC:
+		*pdata = 0;
+		break;
+	/*
+	 * These MSRs specify bits which the guest must keep fixed (on or off)
+	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
+	 * We picked the standard core2 setting.
+	 */
+#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
+#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
+	case MSR_IA32_VMX_CR0_FIXED0:
+		*pdata = VMXON_CR0_ALWAYSON;
+		break;
+	case MSR_IA32_VMX_CR0_FIXED1:
+		*pdata = -1ULL;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED0:
+		*pdata = VMXON_CR4_ALWAYSON;
+		break;
+	case MSR_IA32_VMX_CR4_FIXED1:
+		*pdata = -1ULL;
+		break;
+	case MSR_IA32_VMX_VMCS_ENUM:
+		*pdata = 0x1f;
+		break;
+	case MSR_IA32_VMX_PROCBASED_CTLS2:
+		*pdata = vmx_control_msr(nested_vmx_secondary_ctls_low,
+					nested_vmx_secondary_ctls_high);
+		break;
+	case MSR_IA32_VMX_EPT_VPID_CAP:
+		/* Currently, no nested ept or nested vpid */
+		*pdata = 0;
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+{
+	if (!nested_vmx_allowed(vcpu))
+		return 0;
+
+	if (msr_index == MSR_IA32_FEATURE_CONTROL)
+		/* TODO: the right thing. */
+		return 1;
+	/*
+	 * No need to treat VMX capability MSRs specially: If we don't handle
+	 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
+	 */
+	return 0;
+}
+
 /*
  * Reads an msr value (of 'msr_index') into 'pdata'.
  * Returns 0 on success, non-0 otherwise.
@@ -1309,6 +2066,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		/* Otherwise falls through */
 	default:
 		vmx_load_host_state(to_vmx(vcpu));
+		if (vmx_get_vmx_msr(vcpu, msr_index, pdata))
+			return 0;
 		msr = find_msr_entry(to_vmx(vcpu), msr_index);
 		if (msr) {
 			vmx_load_host_state(to_vmx(vcpu));
@@ -1380,6 +2139,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 			return 1;
 		/* Otherwise falls through */
 	default:
+		if (vmx_set_vmx_msr(vcpu, msr_index, data))
+			break;
 		msr = find_msr_entry(vmx, msr_index);
 		if (msr) {
 			vmx_load_host_state(vmx);
@@ -1469,7 +2230,7 @@ static int hardware_enable(void *garbage)
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
-	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
 	test_bits = FEATURE_CONTROL_LOCKED;
@@ -1493,14 +2254,14 @@ static int hardware_enable(void *garbage)
 	return 0;
 }
 
-static void vmclear_local_vcpus(void)
+static void vmclear_local_loaded_vmcss(void)
 {
 	int cpu = raw_smp_processor_id();
-	struct vcpu_vmx *vmx, *n;
+	struct loaded_vmcs *v, *n;
 
-	list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
-				 local_vcpus_link)
-		__vcpu_clear(vmx);
+	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
+				 loaded_vmcss_on_cpu_link)
+		__loaded_vmcs_clear(v);
 }
 
 
@@ -1515,7 +2276,7 @@ static void kvm_cpu_vmxoff(void)
 static void hardware_disable(void *garbage)
 {
 	if (vmm_exclusive) {
-		vmclear_local_vcpus();
+		vmclear_local_loaded_vmcss();
 		kvm_cpu_vmxoff();
 	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
@@ -1696,6 +2457,18 @@ static void free_vmcs(struct vmcs *vmcs)
 	free_pages((unsigned long)vmcs, vmcs_config.order);
 }
 
+/*
+ * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
+ */
+static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+	if (!loaded_vmcs->vmcs)
+		return;
+	loaded_vmcs_clear(loaded_vmcs);
+	free_vmcs(loaded_vmcs->vmcs);
+	loaded_vmcs->vmcs = NULL;
+}
+
 static void free_kvm_area(void)
 {
 	int cpu;
@@ -1756,6 +2529,9 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_ple())
 		ple_gap = 0;
 
+	if (nested)
+		nested_vmx_setup_ctls_msrs();
+
 	return alloc_kvm_area();
 }
 
@@ -2041,7 +2817,7 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 		  (unsigned long *)&vcpu->arch.regs_dirty);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 
 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 					unsigned long cr0,
@@ -2139,11 +2915,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 
-static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
 		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
+	if (cr4 & X86_CR4_VMXE) {
+		/*
+		 * To use VMXON (and later other VMX instructions), a guest
+		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
+		 * So basically the check on whether to allow nested VMX
+		 * is here.
+		 */
+		if (!nested_vmx_allowed(vcpu))
+			return 1;
+	} else if (to_vmx(vcpu)->nested.vmxon)
+		return 1;
+
 	vcpu->arch.cr4 = cr4;
 	if (enable_ept) {
 		if (!is_paging(vcpu)) {
@@ -2156,6 +2944,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
 	vmcs_writel(CR4_READ_SHADOW, cr4);
 	vmcs_writel(GUEST_CR4, hw_cr4);
+	return 0;
 }
 
 static void vmx_get_segment(struct kvm_vcpu *vcpu,
@@ -2721,18 +3510,110 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
 }
 
 /*
+ * Set up the vmcs's constant host-state fields, i.e., host-state fields that
+ * will not change in the lifetime of the guest.
+ * Note that host-state that does change is set elsewhere. E.g., host-state
+ * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
+ */
+static void vmx_set_constant_host_state(void)
+{
+	u32 low32, high32;
+	unsigned long tmpl;
+	struct desc_ptr dt;
+
+	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
+	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
+
+	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
+
+	native_store_idt(&dt);
+	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
+
+	asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl));
+	vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */
+
+	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
+	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
+	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
+	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
+
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+		rdmsr(MSR_IA32_CR_PAT, low32, high32);
+		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
+	}
+}
+
+static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
+{
+	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
+	if (enable_ept)
+		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+	if (is_guest_mode(&vmx->vcpu))
+		vmx->vcpu.arch.cr4_guest_owned_bits &=
+			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
+	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+}
+
+static u32 vmx_exec_control(struct vcpu_vmx *vmx)
+{
+	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+		exec_control &= ~CPU_BASED_TPR_SHADOW;
+#ifdef CONFIG_X86_64
+		exec_control |= CPU_BASED_CR8_STORE_EXITING |
+				CPU_BASED_CR8_LOAD_EXITING;
+#endif
+	}
+	if (!enable_ept)
+		exec_control |= CPU_BASED_CR3_STORE_EXITING |
+				CPU_BASED_CR3_LOAD_EXITING  |
+				CPU_BASED_INVLPG_EXITING;
+	return exec_control;
+}
+
+static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
+{
+	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
+	if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+	if (vmx->vpid == 0)
+		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
+	if (!enable_ept) {
+		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+		enable_unrestricted_guest = 0;
+	}
+	if (!enable_unrestricted_guest)
+		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+	if (!ple_gap)
+		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
+	return exec_control;
+}
+
+static void ept_set_mmio_spte_mask(void)
+{
+	/*
+	 * EPT Misconfigurations can be generated if the value of bits 2:0
+	 * of an EPT paging-structure entry is 110b (write/execute).
+	 * Also, magic bits (0xffull << 49) is set to quickly identify mmio
+	 * spte.
+	 */
+	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
+
+/*
  * Sets up the vmcs for emulated real mode.
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-	u32 host_sysenter_cs, msr_low, msr_high;
-	u32 junk;
-	u64 host_pat;
+#ifdef CONFIG_X86_64
 	unsigned long a;
-	struct desc_ptr dt;
+#endif
 	int i;
-	unsigned long kvm_vmx_return;
-	u32 exec_control;
 
 	/* I/O */
 	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
@@ -2747,36 +3628,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
 		vmcs_config.pin_based_exec_ctrl);
 
-	exec_control = vmcs_config.cpu_based_exec_ctrl;
-	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
-		exec_control &= ~CPU_BASED_TPR_SHADOW;
-#ifdef CONFIG_X86_64
-		exec_control |= CPU_BASED_CR8_STORE_EXITING |
-				CPU_BASED_CR8_LOAD_EXITING;
-#endif
-	}
-	if (!enable_ept)
-		exec_control |= CPU_BASED_CR3_STORE_EXITING |
-				CPU_BASED_CR3_LOAD_EXITING  |
-				CPU_BASED_INVLPG_EXITING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
 	if (cpu_has_secondary_exec_ctrls()) {
-		exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-		if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
-			exec_control &=
-				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
-		if (vmx->vpid == 0)
-			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-		if (!enable_ept) {
-			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
-			enable_unrestricted_guest = 0;
-		}
-		if (!enable_unrestricted_guest)
-			exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
-		if (!ple_gap)
-			exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+				vmx_secondary_exec_control(vmx));
 	}
 
 	if (ple_gap) {
@@ -2784,20 +3640,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write32(PLE_WINDOW, ple_window);
 	}
 
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
-	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
-	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
-	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
-	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
-
-	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
-	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
-	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
 	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
 	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
-	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
+	vmx_set_constant_host_state();
 #ifdef CONFIG_X86_64
 	rdmsrl(MSR_FS_BASE, a);
 	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
@@ -2808,32 +3657,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 #endif
 
-	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
-
-	native_store_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
-
-	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
-	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
-	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-	rdmsrl(MSR_IA32_SYSENTER_ESP, a);
-	vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
-	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
-	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
-
-	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
-		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
-		host_pat = msr_low | ((u64) msr_high << 32);
-		vmcs_write64(HOST_IA32_PAT, host_pat);
-	}
 	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+		u32 msr_low, msr_high;
+		u64 host_pat;
 		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
 		host_pat = msr_low | ((u64) msr_high << 32);
 		/* Write the default value follow host pat */
@@ -2863,10 +3695,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
 
 	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
-	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
-	if (enable_ept)
-		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
-	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+	set_cr4_guest_host_mask(vmx);
 
 	kvm_write_tsc(&vmx->vcpu, 0);
 
@@ -2990,9 +3819,25 @@ out:
 	return ret;
 }
 
+/*
+ * In nested virtualization, check if L1 asked to exit on external interrupts.
+ * For most existing hypervisors, this will always return true.
+ */
+static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
+{
+	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
+		PIN_BASED_EXT_INTR_MASK;
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+		/* We can get here when nested_run_pending caused
+		 * vmx_interrupt_allowed() to return false. In this case, do
+		 * nothing - the interrupt will be injected later.
+		 */
+		return;
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
 	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
@@ -3049,6 +3894,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (is_guest_mode(vcpu))
+		return;
+
 	if (!cpu_has_virtual_nmis()) {
 		/*
 		 * Tracking the NMI-blocked state in software is built upon
@@ -3115,6 +3963,17 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) {
+		struct vmcs12 *vmcs12;
+		if (to_vmx(vcpu)->nested.nested_run_pending)
+			return 0;
+		nested_vmx_vmexit(vcpu);
+		vmcs12 = get_vmcs12(vcpu);
+		vmcs12->vm_exit_reason = EXIT_REASON_EXTERNAL_INTERRUPT;
+		vmcs12->vm_exit_intr_info = 0;
+		/* fall through to normal code, but now in L1, not L2 */
+	}
+
 	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
 		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
 			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
@@ -3356,6 +4215,58 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
+/* called to set cr0 as approriate for a mov-to-cr0 exit. */
+static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	if (to_vmx(vcpu)->nested.vmxon &&
+	    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
+		return 1;
+
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * We get here when L2 changed cr0 in a way that did not change
+		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
+		 * but did change L0 shadowed bits. This can currently happen
+		 * with the TS bit: L0 may want to leave TS on (for lazy fpu
+		 * loading) while pretending to allow the guest to change it.
+		 */
+		if (kvm_set_cr0(vcpu, (val & vcpu->arch.cr0_guest_owned_bits) |
+			 (vcpu->arch.cr0 & ~vcpu->arch.cr0_guest_owned_bits)))
+			return 1;
+		vmcs_writel(CR0_READ_SHADOW, val);
+		return 0;
+	} else
+		return kvm_set_cr0(vcpu, val);
+}
+
+static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	if (is_guest_mode(vcpu)) {
+		if (kvm_set_cr4(vcpu, (val & vcpu->arch.cr4_guest_owned_bits) |
+			 (vcpu->arch.cr4 & ~vcpu->arch.cr4_guest_owned_bits)))
+			return 1;
+		vmcs_writel(CR4_READ_SHADOW, val);
+		return 0;
+	} else
+		return kvm_set_cr4(vcpu, val);
+}
+
+/* called to set cr0 as approriate for clts instruction exit. */
+static void handle_clts(struct kvm_vcpu *vcpu)
+{
+	if (is_guest_mode(vcpu)) {
+		/*
+		 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
+		 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
+		 * just pretend it's off (also in arch.cr0 for fpu_activate).
+		 */
+		vmcs_writel(CR0_READ_SHADOW,
+			vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
+		vcpu->arch.cr0 &= ~X86_CR0_TS;
+	} else
+		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
@@ -3372,7 +4283,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			err = kvm_set_cr0(vcpu, val);
+			err = handle_set_cr0(vcpu, val);
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
@@ -3380,7 +4291,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 4:
-			err = kvm_set_cr4(vcpu, val);
+			err = handle_set_cr4(vcpu, val);
 			kvm_complete_insn_gp(vcpu, err);
 			return 1;
 		case 8: {
@@ -3398,7 +4309,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		};
 		break;
 	case 2: /* clts */
-		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+		handle_clts(vcpu);
 		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
 		skip_emulated_instruction(vcpu);
 		vmx_fpu_activate(vcpu);
@@ -3574,12 +4485,6 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
-static int handle_vmx_insn(struct kvm_vcpu *vcpu)
-{
-	kvm_queue_exception(vcpu, UD_VECTOR);
-	return 1;
-}
-
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
 	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
@@ -3777,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
 static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 {
 	u64 sptes[4];
-	int nr_sptes, i;
+	int nr_sptes, i, ret;
 	gpa_t gpa;
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 
+	ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+	if (likely(ret == 1))
+		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+					      EMULATE_DONE;
+	if (unlikely(!ret))
+		return 1;
+
+	/* It is the real ept misconfig */
 	printk(KERN_ERR "EPT: Misconfiguration.\n");
 	printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
 
@@ -3866,6 +4779,639 @@ static int handle_invalid_op(struct kvm_vcpu *vcpu)
 }
 
 /*
+ * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+ * We could reuse a single VMCS for all the L2 guests, but we also want the
+ * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+ * allows keeping them loaded on the processor, and in the future will allow
+ * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+ * every entry if they never change.
+ * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+ * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
+ *
+ * The following functions allocate and free a vmcs02 in this pool.
+ */
+
+/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+{
+	struct vmcs02_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+		if (item->vmptr == vmx->nested.current_vmptr) {
+			list_move(&item->list, &vmx->nested.vmcs02_pool);
+			return &item->vmcs02;
+		}
+
+	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+		/* Recycle the least recently used VMCS. */
+		item = list_entry(vmx->nested.vmcs02_pool.prev,
+			struct vmcs02_list, list);
+		item->vmptr = vmx->nested.current_vmptr;
+		list_move(&item->list, &vmx->nested.vmcs02_pool);
+		return &item->vmcs02;
+	}
+
+	/* Create a new VMCS */
+	item = (struct vmcs02_list *)
+		kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+	if (!item)
+		return NULL;
+	item->vmcs02.vmcs = alloc_vmcs();
+	if (!item->vmcs02.vmcs) {
+		kfree(item);
+		return NULL;
+	}
+	loaded_vmcs_init(&item->vmcs02);
+	item->vmptr = vmx->nested.current_vmptr;
+	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num++;
+	return &item->vmcs02;
+}
+
+/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+	struct vmcs02_list *item;
+	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+		if (item->vmptr == vmptr) {
+			free_loaded_vmcs(&item->vmcs02);
+			list_del(&item->list);
+			kfree(item);
+			vmx->nested.vmcs02_num--;
+			return;
+		}
+}
+
+/*
+ * Free all VMCSs saved for this vcpu, except the one pointed by
+ * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one
+ * currently used, if running L2), and vmcs01 when running L2.
+ */
+static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+{
+	struct vmcs02_list *item, *n;
+	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+		if (vmx->loaded_vmcs != &item->vmcs02)
+			free_loaded_vmcs(&item->vmcs02);
+		list_del(&item->list);
+		kfree(item);
+	}
+	vmx->nested.vmcs02_num = 0;
+
+	if (vmx->loaded_vmcs != &vmx->vmcs01)
+		free_loaded_vmcs(&vmx->vmcs01);
+}
+
+/*
+ * Emulate the VMXON instruction.
+ * Currently, we just remember that VMX is active, and do not save or even
+ * inspect the argument to VMXON (the so-called "VMXON pointer") because we
+ * do not currently need to store anything in that guest-allocated memory
+ * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
+ * argument is different from the VMXON pointer (which the spec says they do).
+ */
+static int handle_vmon(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	/* The Intel VMX Instruction Reference lists a bunch of bits that
+	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
+	 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
+	 * Otherwise, we should fail with #UD. We test these now:
+	 */
+	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
+	    !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
+	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	if (is_long_mode(vcpu) && !cs.l) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+	vmx->nested.vmcs02_num = 0;
+
+	vmx->nested.vmxon = true;
+
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/*
+ * Intel's VMX Instruction Reference specifies a common set of prerequisites
+ * for running VMX instructions (except VMXON, whose prerequisites are
+ * slightly different). It also specifies what exception to inject otherwise.
+ */
+static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment cs;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!vmx->nested.vmxon) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
+	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
+	    (is_long_mode(vcpu) && !cs.l)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 0;
+	}
+
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct vcpu_vmx *vmx)
+{
+	if (!vmx->nested.vmxon)
+		return;
+	vmx->nested.vmxon = false;
+	if (vmx->nested.current_vmptr != -1ull) {
+		kunmap(vmx->nested.current_vmcs12_page);
+		nested_release_page(vmx->nested.current_vmcs12_page);
+		vmx->nested.current_vmptr = -1ull;
+		vmx->nested.current_vmcs12 = NULL;
+	}
+	/* Unpin physical memory we referred to in current vmcs02 */
+	if (vmx->nested.apic_access_page) {
+		nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = 0;
+	}
+
+	nested_free_all_saved_vmcss(vmx);
+}
+
+/* Emulate the VMXOFF instruction */
+static int handle_vmoff(struct kvm_vcpu *vcpu)
+{
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+	free_nested(to_vmx(vcpu));
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/*
+ * Decode the memory-address operand of a vmx instruction, as recorded on an
+ * exit caused by such an instruction (run by a guest hypervisor).
+ * On success, returns 0. When the operand is invalid, returns 1 and throws
+ * #UD or #GP.
+ */
+static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
+				 unsigned long exit_qualification,
+				 u32 vmx_instruction_info, gva_t *ret)
+{
+	/*
+	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
+	 * Execution", on an exit, vmx_instruction_info holds most of the
+	 * addressing components of the operand. Only the displacement part
+	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
+	 * For how an actual address is calculated from all these components,
+	 * refer to Vol. 1, "Operand Addressing".
+	 */
+	int  scaling = vmx_instruction_info & 3;
+	int  addr_size = (vmx_instruction_info >> 7) & 7;
+	bool is_reg = vmx_instruction_info & (1u << 10);
+	int  seg_reg = (vmx_instruction_info >> 15) & 7;
+	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
+	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
+	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
+	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
+
+	if (is_reg) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	/* Addr = segment_base + offset */
+	/* offset = base + [index * scale] + displacement */
+	*ret = vmx_get_segment_base(vcpu, seg_reg);
+	if (base_is_valid)
+		*ret += kvm_register_read(vcpu, base_reg);
+	if (index_is_valid)
+		*ret += kvm_register_read(vcpu, index_reg)<<scaling;
+	*ret += exit_qualification; /* holds the displacement */
+
+	if (addr_size == 1) /* 32 bit */
+		*ret &= 0xffffffff;
+
+	/*
+	 * TODO: throw #GP (and return 1) in various cases that the VM*
+	 * instructions require it - e.g., offset beyond segment limit,
+	 * unusable or unreadable/unwritable segment, non-canonical 64-bit
+	 * address, and so on. Currently these are not checked.
+	 */
+	return 0;
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
+ * by Vol 2B, VMX Instruction Reference, "Conventions".
+ */
+static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
+}
+
+static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
+{
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_CF);
+}
+
+static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
+					u32 vm_instruction_error)
+{
+	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
+		/*
+		 * failValid writes the error number to the current VMCS, which
+		 * can't be done there isn't a current VMCS.
+		 */
+		nested_vmx_failInvalid(vcpu);
+		return;
+	}
+	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
+			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			    X86_EFLAGS_SF | X86_EFLAGS_OF))
+			| X86_EFLAGS_ZF);
+	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
+}
+
+/* Emulate the VMCLEAR instruction */
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gva_t gva;
+	gpa_t vmptr;
+	struct vmcs12 *vmcs12;
+	struct page *page;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+		return 1;
+
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+				sizeof(vmptr), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+		nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	if (vmptr == vmx->nested.current_vmptr) {
+		kunmap(vmx->nested.current_vmcs12_page);
+		nested_release_page(vmx->nested.current_vmcs12_page);
+		vmx->nested.current_vmptr = -1ull;
+		vmx->nested.current_vmcs12 = NULL;
+	}
+
+	page = nested_get_page(vcpu, vmptr);
+	if (page == NULL) {
+		/*
+		 * For accurate processor emulation, VMCLEAR beyond available
+		 * physical memory should do nothing at all. However, it is
+		 * possible that a nested vmx bug, not a guest hypervisor bug,
+		 * resulted in this case, so let's shut down before doing any
+		 * more damage:
+		 */
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return 1;
+	}
+	vmcs12 = kmap(page);
+	vmcs12->launch_state = 0;
+	kunmap(page);
+	nested_release_page(page);
+
+	nested_free_vmcs02(vmx, vmptr);
+
+	skip_emulated_instruction(vcpu);
+	nested_vmx_succeed(vcpu);
+	return 1;
+}
+
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
+
+/* Emulate the VMLAUNCH instruction */
+static int handle_vmlaunch(struct kvm_vcpu *vcpu)
+{
+	return nested_vmx_run(vcpu, true);
+}
+
+/* Emulate the VMRESUME instruction */
+static int handle_vmresume(struct kvm_vcpu *vcpu)
+{
+
+	return nested_vmx_run(vcpu, false);
+}
+
+enum vmcs_field_type {
+	VMCS_FIELD_TYPE_U16 = 0,
+	VMCS_FIELD_TYPE_U64 = 1,
+	VMCS_FIELD_TYPE_U32 = 2,
+	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
+};
+
+static inline int vmcs_field_type(unsigned long field)
+{
+	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
+		return VMCS_FIELD_TYPE_U32;
+	return (field >> 13) & 0x3 ;
+}
+
+static inline int vmcs_field_readonly(unsigned long field)
+{
+	return (((field >> 10) & 0x3) == 1);
+}
+
+/*
+ * Read a vmcs12 field. Since these can have varying lengths and we return
+ * one type, we chose the biggest type (u64) and zero-extend the return value
+ * to that size. Note that the caller, handle_vmread, might need to use only
+ * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
+ * 64-bit fields are to be returned).
+ */
+static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
+					unsigned long field, u64 *ret)
+{
+	short offset = vmcs_field_to_offset(field);
+	char *p;
+
+	if (offset < 0)
+		return 0;
+
+	p = ((char *)(get_vmcs12(vcpu))) + offset;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		*ret = *((natural_width *)p);
+		return 1;
+	case VMCS_FIELD_TYPE_U16:
+		*ret = *((u16 *)p);
+		return 1;
+	case VMCS_FIELD_TYPE_U32:
+		*ret = *((u32 *)p);
+		return 1;
+	case VMCS_FIELD_TYPE_U64:
+		*ret = *((u64 *)p);
+		return 1;
+	default:
+		return 0; /* can never happen. */
+	}
+}
+
+/*
+ * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
+ * used before) all generate the same failure when it is missing.
+ */
+static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	if (vmx->nested.current_vmptr == -1ull) {
+		nested_vmx_failInvalid(vcpu);
+		skip_emulated_instruction(vcpu);
+		return 0;
+	}
+	return 1;
+}
+
+static int handle_vmread(struct kvm_vcpu *vcpu)
+{
+	unsigned long field;
+	u64 field_value;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t gva = 0;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	/* Decode instruction info and find the field to read */
+	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+	/* Read the field, zero-extended to a u64 field_value */
+	if (!vmcs12_read_any(vcpu, field, &field_value)) {
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+	/*
+	 * Now copy part of this value to register or memory, as requested.
+	 * Note that the number of bits actually copied is 32 or 64 depending
+	 * on the guest's mode (32 or 64 bit), not on the given field's length.
+	 */
+	if (vmx_instruction_info & (1u << 10)) {
+		kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
+			field_value);
+	} else {
+		if (get_vmx_mem_address(vcpu, exit_qualification,
+				vmx_instruction_info, &gva))
+			return 1;
+		/* _system ok, as nested_vmx_check_permission verified cpl=0 */
+		kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
+			     &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
+	}
+
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+
+static int handle_vmwrite(struct kvm_vcpu *vcpu)
+{
+	unsigned long field;
+	gva_t gva;
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	char *p;
+	short offset;
+	/* The value to write might be 32 or 64 bits, depending on L1's long
+	 * mode, and eventually we need to write that into a field of several
+	 * possible lengths. The code below first zero-extends the value to 64
+	 * bit (field_value), and then copies only the approriate number of
+	 * bits into the vmcs12 field.
+	 */
+	u64 field_value = 0;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	if (vmx_instruction_info & (1u << 10))
+		field_value = kvm_register_read(vcpu,
+			(((vmx_instruction_info) >> 3) & 0xf));
+	else {
+		if (get_vmx_mem_address(vcpu, exit_qualification,
+				vmx_instruction_info, &gva))
+			return 1;
+		if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
+			   &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) {
+			kvm_inject_page_fault(vcpu, &e);
+			return 1;
+		}
+	}
+
+
+	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+	if (vmcs_field_readonly(field)) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	offset = vmcs_field_to_offset(field);
+	if (offset < 0) {
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+	p = ((char *) get_vmcs12(vcpu)) + offset;
+
+	switch (vmcs_field_type(field)) {
+	case VMCS_FIELD_TYPE_U16:
+		*(u16 *)p = field_value;
+		break;
+	case VMCS_FIELD_TYPE_U32:
+		*(u32 *)p = field_value;
+		break;
+	case VMCS_FIELD_TYPE_U64:
+		*(u64 *)p = field_value;
+		break;
+	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
+		*(natural_width *)p = field_value;
+		break;
+	default:
+		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/* Emulate the VMPTRLD instruction */
+static int handle_vmptrld(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	gva_t gva;
+	gpa_t vmptr;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
+		return 1;
+
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
+				sizeof(vmptr), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	if (!IS_ALIGNED(vmptr, PAGE_SIZE)) {
+		nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
+		skip_emulated_instruction(vcpu);
+		return 1;
+	}
+
+	if (vmx->nested.current_vmptr != vmptr) {
+		struct vmcs12 *new_vmcs12;
+		struct page *page;
+		page = nested_get_page(vcpu, vmptr);
+		if (page == NULL) {
+			nested_vmx_failInvalid(vcpu);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		new_vmcs12 = kmap(page);
+		if (new_vmcs12->revision_id != VMCS12_REVISION) {
+			kunmap(page);
+			nested_release_page_clean(page);
+			nested_vmx_failValid(vcpu,
+				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
+			skip_emulated_instruction(vcpu);
+			return 1;
+		}
+		if (vmx->nested.current_vmptr != -1ull) {
+			kunmap(vmx->nested.current_vmcs12_page);
+			nested_release_page(vmx->nested.current_vmcs12_page);
+		}
+
+		vmx->nested.current_vmptr = vmptr;
+		vmx->nested.current_vmcs12 = new_vmcs12;
+		vmx->nested.current_vmcs12_page = page;
+	}
+
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/* Emulate the VMPTRST instruction */
+static int handle_vmptrst(struct kvm_vcpu *vcpu)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	gva_t vmcs_gva;
+	struct x86_exception e;
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	if (get_vmx_mem_address(vcpu, exit_qualification,
+			vmx_instruction_info, &vmcs_gva))
+		return 1;
+	/* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
+	if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
+				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
+				 sizeof(u64), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+	nested_vmx_succeed(vcpu);
+	skip_emulated_instruction(vcpu);
+	return 1;
+}
+
+/*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
  * to be done to userspace and return 0.
@@ -3886,15 +5432,15 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_INVD]		      = handle_invd,
 	[EXIT_REASON_INVLPG]		      = handle_invlpg,
 	[EXIT_REASON_VMCALL]                  = handle_vmcall,
-	[EXIT_REASON_VMCLEAR]	              = handle_vmx_insn,
-	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
-	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
-	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
-	[EXIT_REASON_VMREAD]                  = handle_vmx_insn,
-	[EXIT_REASON_VMRESUME]                = handle_vmx_insn,
-	[EXIT_REASON_VMWRITE]                 = handle_vmx_insn,
-	[EXIT_REASON_VMOFF]                   = handle_vmx_insn,
-	[EXIT_REASON_VMON]                    = handle_vmx_insn,
+	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
+	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
+	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
+	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
+	[EXIT_REASON_VMREAD]                  = handle_vmread,
+	[EXIT_REASON_VMRESUME]                = handle_vmresume,
+	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
+	[EXIT_REASON_VMOFF]                   = handle_vmoff,
+	[EXIT_REASON_VMON]                    = handle_vmon,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
@@ -3911,6 +5457,229 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 static const int kvm_vmx_max_exit_handlers =
 	ARRAY_SIZE(kvm_vmx_exit_handlers);
 
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * rather than handle it ourselves in L0. I.e., check whether L1 expressed
+ * disinterest in the current event (read or write a specific MSR) by using an
+ * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
+ */
+static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
+	struct vmcs12 *vmcs12, u32 exit_reason)
+{
+	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
+	gpa_t bitmap;
+
+	if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS))
+		return 1;
+
+	/*
+	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
+	 * for the four combinations of read/write and low/high MSR numbers.
+	 * First we need to figure out which of the four to use:
+	 */
+	bitmap = vmcs12->msr_bitmap;
+	if (exit_reason == EXIT_REASON_MSR_WRITE)
+		bitmap += 2048;
+	if (msr_index >= 0xc0000000) {
+		msr_index -= 0xc0000000;
+		bitmap += 1024;
+	}
+
+	/* Then read the msr_index'th bit from this bitmap: */
+	if (msr_index < 1024*8) {
+		unsigned char b;
+		kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1);
+		return 1 & (b >> (msr_index & 7));
+	} else
+		return 1; /* let L1 handle the wrong parameter */
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
+ * rather than handle it ourselves in L0. I.e., check if L1 wanted to
+ * intercept (via guest_host_mask etc.) the current event.
+ */
+static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
+	struct vmcs12 *vmcs12)
+{
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+	int cr = exit_qualification & 15;
+	int reg = (exit_qualification >> 8) & 15;
+	unsigned long val = kvm_register_read(vcpu, reg);
+
+	switch ((exit_qualification >> 4) & 3) {
+	case 0: /* mov to cr */
+		switch (cr) {
+		case 0:
+			if (vmcs12->cr0_guest_host_mask &
+			    (val ^ vmcs12->cr0_read_shadow))
+				return 1;
+			break;
+		case 3:
+			if ((vmcs12->cr3_target_count >= 1 &&
+					vmcs12->cr3_target_value0 == val) ||
+				(vmcs12->cr3_target_count >= 2 &&
+					vmcs12->cr3_target_value1 == val) ||
+				(vmcs12->cr3_target_count >= 3 &&
+					vmcs12->cr3_target_value2 == val) ||
+				(vmcs12->cr3_target_count >= 4 &&
+					vmcs12->cr3_target_value3 == val))
+				return 0;
+			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
+				return 1;
+			break;
+		case 4:
+			if (vmcs12->cr4_guest_host_mask &
+			    (vmcs12->cr4_read_shadow ^ val))
+				return 1;
+			break;
+		case 8:
+			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
+				return 1;
+			break;
+		}
+		break;
+	case 2: /* clts */
+		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
+		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
+			return 1;
+		break;
+	case 1: /* mov from cr */
+		switch (cr) {
+		case 3:
+			if (vmcs12->cpu_based_vm_exec_control &
+			    CPU_BASED_CR3_STORE_EXITING)
+				return 1;
+			break;
+		case 8:
+			if (vmcs12->cpu_based_vm_exec_control &
+			    CPU_BASED_CR8_STORE_EXITING)
+				return 1;
+			break;
+		}
+		break;
+	case 3: /* lmsw */
+		/*
+		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
+		 * cr0. Other attempted changes are ignored, with no exit.
+		 */
+		if (vmcs12->cr0_guest_host_mask & 0xe &
+		    (val ^ vmcs12->cr0_read_shadow))
+			return 1;
+		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
+		    !(vmcs12->cr0_read_shadow & 0x1) &&
+		    (val & 0x1))
+			return 1;
+		break;
+	}
+	return 0;
+}
+
+/*
+ * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
+ * should handle it ourselves in L0 (and then continue L2). Only call this
+ * when in is_guest_mode (L2).
+ */
+static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+{
+	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
+	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	if (vmx->nested.nested_run_pending)
+		return 0;
+
+	if (unlikely(vmx->fail)) {
+		printk(KERN_INFO "%s failed vm entry %x\n",
+		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+		return 1;
+	}
+
+	switch (exit_reason) {
+	case EXIT_REASON_EXCEPTION_NMI:
+		if (!is_exception(intr_info))
+			return 0;
+		else if (is_page_fault(intr_info))
+			return enable_ept;
+		return vmcs12->exception_bitmap &
+				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
+	case EXIT_REASON_EXTERNAL_INTERRUPT:
+		return 0;
+	case EXIT_REASON_TRIPLE_FAULT:
+		return 1;
+	case EXIT_REASON_PENDING_INTERRUPT:
+	case EXIT_REASON_NMI_WINDOW:
+		/*
+		 * prepare_vmcs02() set the CPU_BASED_VIRTUAL_INTR_PENDING bit
+		 * (aka Interrupt Window Exiting) only when L1 turned it on,
+		 * so if we got a PENDING_INTERRUPT exit, this must be for L1.
+		 * Same for NMI Window Exiting.
+		 */
+		return 1;
+	case EXIT_REASON_TASK_SWITCH:
+		return 1;
+	case EXIT_REASON_CPUID:
+		return 1;
+	case EXIT_REASON_HLT:
+		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
+	case EXIT_REASON_INVD:
+		return 1;
+	case EXIT_REASON_INVLPG:
+		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
+	case EXIT_REASON_RDPMC:
+		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+	case EXIT_REASON_RDTSC:
+		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
+	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
+	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
+	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
+	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
+		/*
+		 * VMX instructions trap unconditionally. This allows L1 to
+		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
+		 */
+		return 1;
+	case EXIT_REASON_CR_ACCESS:
+		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
+	case EXIT_REASON_DR_ACCESS:
+		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
+	case EXIT_REASON_IO_INSTRUCTION:
+		/* TODO: support IO bitmaps */
+		return 1;
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
+	case EXIT_REASON_INVALID_STATE:
+		return 1;
+	case EXIT_REASON_MWAIT_INSTRUCTION:
+		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
+	case EXIT_REASON_MONITOR_INSTRUCTION:
+		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
+	case EXIT_REASON_PAUSE_INSTRUCTION:
+		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
+			nested_cpu_has2(vmcs12,
+				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
+	case EXIT_REASON_MCE_DURING_VMENTRY:
+		return 0;
+	case EXIT_REASON_TPR_BELOW_THRESHOLD:
+		return 1;
+	case EXIT_REASON_APIC_ACCESS:
+		return nested_cpu_has2(vmcs12,
+			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+	case EXIT_REASON_EPT_VIOLATION:
+	case EXIT_REASON_EPT_MISCONFIG:
+		return 0;
+	case EXIT_REASON_WBINVD:
+		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
+	case EXIT_REASON_XSETBV:
+		return 1;
+	default:
+		return 1;
+	}
+}
+
 static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 {
 	*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -3933,6 +5702,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (vmx->emulation_required && emulate_invalid_guest_state)
 		return handle_invalid_guest_state(vcpu);
 
+	/*
+	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
+	 * we did not inject a still-pending event to L1 now because of
+	 * nested_run_pending, we need to re-enable this bit.
+	 */
+	if (vmx->nested.nested_run_pending)
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+	if (!is_guest_mode(vcpu) && (exit_reason == EXIT_REASON_VMLAUNCH ||
+	    exit_reason == EXIT_REASON_VMRESUME))
+		vmx->nested.nested_run_pending = 1;
+	else
+		vmx->nested.nested_run_pending = 0;
+
+	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
+		nested_vmx_vmexit(vcpu);
+		return 1;
+	}
+
 	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3955,7 +5743,9 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
 
-	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
+	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
+	                                get_vmcs12(vcpu), vcpu)))) {
 		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
@@ -4118,6 +5908,8 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
+	if (is_guest_mode(&vmx->vcpu))
+		return;
 	__vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
 				  VM_EXIT_INSTRUCTION_LEN,
 				  IDT_VECTORING_ERROR_CODE);
@@ -4125,6 +5917,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
 {
+	if (is_guest_mode(vcpu))
+		return;
 	__vmx_complete_interrupts(to_vmx(vcpu),
 				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
 				  VM_ENTRY_INSTRUCTION_LEN,
@@ -4145,6 +5939,21 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		if (vmcs12->idt_vectoring_info_field &
+				VECTORING_INFO_VALID_MASK) {
+			vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+				vmcs12->idt_vectoring_info_field);
+			vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+				vmcs12->vm_exit_instruction_len);
+			if (vmcs12->idt_vectoring_info_field &
+					VECTORING_INFO_DELIVER_CODE_MASK)
+				vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+					vmcs12->idt_vectoring_error_code);
+		}
+	}
+
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
 		vmx->entry_time = ktime_get();
@@ -4167,6 +5976,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
+	vmx->__launched = vmx->loaded_vmcs->launched;
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
@@ -4237,7 +6047,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		"pop  %%"R"bp; pop  %%"R"dx \n\t"
 		"setbe %c[fail](%0) \n\t"
 	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
-		[launched]"i"(offsetof(struct vcpu_vmx, launched)),
+		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
 		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
 		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
 		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
@@ -4276,8 +6086,19 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 
+	if (is_guest_mode(vcpu)) {
+		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+		vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
+		if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
+			vmcs12->idt_vectoring_error_code =
+				vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			vmcs12->vm_exit_instruction_len =
+				vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		}
+	}
+
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-	vmx->launched = 1;
+	vmx->loaded_vmcs->launched = 1;
 
 	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
 
@@ -4289,41 +6110,18 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #undef R
 #undef Q
 
-static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_vmx *vmx = to_vmx(vcpu);
-
-	if (vmx->vmcs) {
-		vcpu_clear(vmx);
-		free_vmcs(vmx->vmcs);
-		vmx->vmcs = NULL;
-	}
-}
-
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	free_vpid(vmx);
-	vmx_free_vmcs(vcpu);
+	free_nested(vmx);
+	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
-static inline void vmcs_init(struct vmcs *vmcs)
-{
-	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
-
-	if (!vmm_exclusive)
-		kvm_cpu_vmxon(phys_addr);
-
-	vmcs_clear(vmcs);
-
-	if (!vmm_exclusive)
-		kvm_cpu_vmxoff();
-}
-
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -4345,11 +6143,15 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto uninit_vcpu;
 	}
 
-	vmx->vmcs = alloc_vmcs();
-	if (!vmx->vmcs)
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx->loaded_vmcs->vmcs = alloc_vmcs();
+	if (!vmx->loaded_vmcs->vmcs)
 		goto free_msrs;
-
-	vmcs_init(vmx->vmcs);
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
+	loaded_vmcs_init(vmx->loaded_vmcs);
+	if (!vmm_exclusive)
+		kvm_cpu_vmxoff();
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4375,10 +6177,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 	}
 
+	vmx->nested.current_vmptr = -1ull;
+	vmx->nested.current_vmcs12 = NULL;
+
 	return &vmx->vcpu;
 
 free_vmcs:
-	free_vmcs(vmx->vmcs);
+	free_vmcs(vmx->loaded_vmcs->vmcs);
 free_msrs:
 	kfree(vmx->guest_msrs);
 uninit_vcpu:
@@ -4512,6 +6317,650 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
+	if (func == 1 && nested)
+		entry->ecx |= bit(X86_FEATURE_VMX);
+}
+
+/*
+ * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
+ * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
+ * with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2
+ * guest in a way that will both be appropriate to L1's requests, and our
+ * needs. In addition to modifying the active vmcs (which is vmcs02), this
+ * function also has additional necessary side-effects, like setting various
+ * vcpu->arch fields.
+ */
+static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exec_control;
+
+	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
+	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
+	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
+	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
+	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
+	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
+	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
+	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
+	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
+	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
+	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
+	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
+	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
+	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
+	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
+	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
+	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
+	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
+	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
+	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
+	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
+	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
+	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
+	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
+	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
+	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
+	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
+	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
+	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
+	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+	vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+		vmcs12->vm_entry_intr_info_field);
+	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+		vmcs12->vm_entry_exception_error_code);
+	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+		vmcs12->vm_entry_instruction_len);
+	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+		vmcs12->guest_interruptibility_info);
+	vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
+	vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
+	vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
+	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+		vmcs12->guest_pending_dbg_exceptions);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+
+	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
+		(vmcs_config.pin_based_exec_ctrl |
+		 vmcs12->pin_based_vm_exec_control));
+
+	/*
+	 * Whether page-faults are trapped is determined by a combination of
+	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
+	 * If enable_ept, L0 doesn't care about page faults and we should
+	 * set all of these to L1's desires. However, if !enable_ept, L0 does
+	 * care about (at least some) page faults, and because it is not easy
+	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
+	 * to exit on each and every L2 page fault. This is done by setting
+	 * MASK=MATCH=0 and (see below) EB.PF=1.
+	 * Note that below we don't need special code to set EB.PF beyond the
+	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
+	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
+	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
+	 *
+	 * A problem with this approach (when !enable_ept) is that L1 may be
+	 * injected with more page faults than it asked for. This could have
+	 * caused problems, but in practice existing hypervisors don't care.
+	 * To fix this, we will need to emulate the PFEC checking (on the L1
+	 * page tables), using walk_addr(), when injecting PFs to L1.
+	 */
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
+		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
+	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
+		enable_ept ? vmcs12->page_fault_error_code_match : 0);
+
+	if (cpu_has_secondary_exec_ctrls()) {
+		u32 exec_control = vmx_secondary_exec_control(vmx);
+		if (!vmx->rdtscp_enabled)
+			exec_control &= ~SECONDARY_EXEC_RDTSCP;
+		/* Take the following fields only from vmcs12 */
+		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+		if (nested_cpu_has(vmcs12,
+				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
+			exec_control |= vmcs12->secondary_vm_exec_control;
+
+		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
+			/*
+			 * Translate L1 physical address to host physical
+			 * address for vmcs02. Keep the page pinned, so this
+			 * physical address remains valid. We keep a reference
+			 * to it so we can release it later.
+			 */
+			if (vmx->nested.apic_access_page) /* shouldn't happen */
+				nested_release_page(vmx->nested.apic_access_page);
+			vmx->nested.apic_access_page =
+				nested_get_page(vcpu, vmcs12->apic_access_addr);
+			/*
+			 * If translation failed, no matter: This feature asks
+			 * to exit when accessing the given address, and if it
+			 * can never be accessed, this feature won't do
+			 * anything anyway.
+			 */
+			if (!vmx->nested.apic_access_page)
+				exec_control &=
+				  ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+			else
+				vmcs_write64(APIC_ACCESS_ADDR,
+				  page_to_phys(vmx->nested.apic_access_page));
+		}
+
+		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	}
+
+
+	/*
+	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
+	 * Some constant fields are set here by vmx_set_constant_host_state().
+	 * Other fields are different per CPU, and will be set later when
+	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+	 */
+	vmx_set_constant_host_state();
+
+	/*
+	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
+	 * entry, but only if the current (host) sp changed from the value
+	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
+	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
+	 * here we just force the write to happen on entry.
+	 */
+	vmx->host_rsp = 0;
+
+	exec_control = vmx_exec_control(vmx); /* L0's desires */
+	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
+	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+	exec_control &= ~CPU_BASED_TPR_SHADOW;
+	exec_control |= vmcs12->cpu_based_vm_exec_control;
+	/*
+	 * Merging of IO and MSR bitmaps not currently supported.
+	 * Rather, exit every time.
+	 */
+	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
+	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
+	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
+
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
+
+	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
+	 * bitwise-or of what L1 wants to trap for L2, and what we want to
+	 * trap. Note that CR0.TS also needs updating - we do this later.
+	 */
+	update_exception_bitmap(vcpu);
+	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
+	vmcs_write32(VM_EXIT_CONTROLS,
+		vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
+	vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
+		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
+	else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
+
+
+	set_cr4_guest_host_mask(vmx);
+
+	vmcs_write64(TSC_OFFSET,
+		vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+
+	if (enable_vpid) {
+		/*
+		 * Trivially support vpid by letting L2s share their parent
+		 * L1's vpid. TODO: move to a more elaborate solution, giving
+		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 */
+		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+		vmx_flush_tlb(vcpu);
+	}
+
+	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->guest_ia32_efer;
+	if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	/*
+	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
+	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+	 * The CR0_READ_SHADOW is what L2 should have expected to read given
+	 * the specifications by L1; It's not enough to take
+	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+	 * have more bits than L1 expected.
+	 */
+	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
+	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
+
+	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
+	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
+
+	/* shadow page tables on either EPT or shadow page tables */
+	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
+	kvm_mmu_reset_context(vcpu);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+	struct vmcs12 *vmcs12;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct loaded_vmcs *vmcs02;
+
+	if (!nested_vmx_check_permission(vcpu) ||
+	    !nested_vmx_check_vmcs12(vcpu))
+		return 1;
+
+	skip_emulated_instruction(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+
+	/*
+	 * The nested entry process starts with enforcing various prerequisites
+	 * on vmcs12 as required by the Intel SDM, and act appropriately when
+	 * they fail: As the SDM explains, some conditions should cause the
+	 * instruction to fail, while others will cause the instruction to seem
+	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
+	 * To speed up the normal (success) code path, we should avoid checking
+	 * for misconfigurations which will anyway be caught by the processor
+	 * when using the merged vmcs02.
+	 */
+	if (vmcs12->launch_state == launch) {
+		nested_vmx_failValid(vcpu,
+			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+		return 1;
+	}
+
+	if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) &&
+			!IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) {
+		/*TODO: Also verify bits beyond physical address width are 0*/
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
+			!IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) {
+		/*TODO: Also verify bits beyond physical address width are 0*/
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (vmcs12->vm_entry_msr_load_count > 0 ||
+	    vmcs12->vm_exit_msr_load_count > 0 ||
+	    vmcs12->vm_exit_msr_store_count > 0) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING
+			  "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
+	      nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) ||
+	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
+	      nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) ||
+	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
+	      nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) ||
+	    !vmx_control_verify(vmcs12->vm_exit_controls,
+	      nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) ||
+	    !vmx_control_verify(vmcs12->vm_entry_controls,
+	      nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high))
+	{
+		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+		return 1;
+	}
+
+	if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+	    ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+		return 1;
+	}
+
+	if (((vmcs12->guest_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
+	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
+		nested_vmx_entry_failure(vcpu, vmcs12,
+			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+		return 1;
+	}
+	if (vmcs12->vmcs_link_pointer != -1ull) {
+		nested_vmx_entry_failure(vcpu, vmcs12,
+			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+		return 1;
+	}
+
+	/*
+	 * We're finally done with prerequisite checking, and can start with
+	 * the nested entry.
+	 */
+
+	vmcs02 = nested_get_current_vmcs02(vmx);
+	if (!vmcs02)
+		return -ENOMEM;
+
+	enter_guest_mode(vcpu);
+
+	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = vmcs02;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
+	vmcs12->launch_state = 1;
+
+	prepare_vmcs02(vcpu, vmcs12);
+
+	/*
+	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
+	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
+	 * returned as far as L1 is concerned. It will only return (and set
+	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
+	 */
+	return 1;
+}
+
+/*
+ * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
+ * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * This function returns the new value we should put in vmcs12.guest_cr0.
+ * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
+ *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
+ *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
+ *     didn't trap the bit, because if L1 did, so would L0).
+ *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
+ *     been modified by L2, and L1 knows it. So just leave the old value of
+ *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
+ *     isn't relevant, because if L0 traps this bit it can set it to anything.
+ *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
+ *     changed these bits, and therefore they need to be updated, but L0
+ *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
+ *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
+ */
+static inline unsigned long
+vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	return
+	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
+	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
+	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
+			vcpu->arch.cr0_guest_owned_bits));
+}
+
+static inline unsigned long
+vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	return
+	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
+	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
+	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
+			vcpu->arch.cr4_guest_owned_bits));
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	/* update guest state fields: */
+	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
+	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
+
+	kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
+	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
+	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
+	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
+
+	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
+	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
+	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
+	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
+	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
+	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
+	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
+	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
+	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
+	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
+	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
+	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
+	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
+	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
+	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
+	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
+	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
+	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
+	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
+	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
+	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
+	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
+	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
+	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
+	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
+	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
+	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
+	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
+	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
+	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
+	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
+	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
+	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
+	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
+	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
+	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
+
+	vmcs12->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
+	vmcs12->guest_interruptibility_info =
+		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	vmcs12->guest_pending_dbg_exceptions =
+		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
+
+	/* TODO: These cannot have changed unless we have MSR bitmaps and
+	 * the relevant bit asks not to trap the change */
+	vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
+	if (vmcs12->vm_entry_controls & VM_EXIT_SAVE_IA32_PAT)
+		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
+	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
+	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
+	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
+
+	/* update exit information fields: */
+
+	vmcs12->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
+	vmcs12->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+	vmcs12->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+	vmcs12->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+	vmcs12->idt_vectoring_info_field =
+		vmcs_read32(IDT_VECTORING_INFO_FIELD);
+	vmcs12->idt_vectoring_error_code =
+		vmcs_read32(IDT_VECTORING_ERROR_CODE);
+	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+
+	/* clear vm-entry fields which are to be cleared on exit */
+	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
+		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
+}
+
+/*
+ * A part of what we need to when the nested L2 guest exits and we want to
+ * run its L1 parent, is to reset L1's guest state to the host state specified
+ * in vmcs12.
+ * This function is to be called not only on normal nested exit, but also on
+ * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
+ * Failures During or After Loading Guest State").
+ * This function should be called when the active VMCS is L1's (vmcs01).
+ */
+void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
+{
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
+		vcpu->arch.efer = vmcs12->host_ia32_efer;
+	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
+		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
+	else
+		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
+	vmx_set_efer(vcpu, vcpu->arch.efer);
+
+	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
+	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
+	/*
+	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
+	 * actually changed, because it depends on the current state of
+	 * fpu_active (which may have changed).
+	 * Note that vmx_set_cr0 refers to efer set above.
+	 */
+	kvm_set_cr0(vcpu, vmcs12->host_cr0);
+	/*
+	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
+	 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
+	 * but we also need to update cr0_guest_host_mask and exception_bitmap.
+	 */
+	update_exception_bitmap(vcpu);
+	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
+	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
+
+	/*
+	 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
+	 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
+	 */
+	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
+	kvm_set_cr4(vcpu, vmcs12->host_cr4);
+
+	/* shadow page tables on either EPT or shadow page tables */
+	kvm_set_cr3(vcpu, vmcs12->host_cr3);
+	kvm_mmu_reset_context(vcpu);
+
+	if (enable_vpid) {
+		/*
+		 * Trivially support vpid by letting L2s share their parent
+		 * L1's vpid. TODO: move to a more elaborate solution, giving
+		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 */
+		vmx_flush_tlb(vcpu);
+	}
+
+
+	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
+	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
+	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
+	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
+	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+	vmcs_writel(GUEST_TR_BASE, vmcs12->host_tr_base);
+	vmcs_writel(GUEST_GS_BASE, vmcs12->host_gs_base);
+	vmcs_writel(GUEST_FS_BASE, vmcs12->host_fs_base);
+	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->host_es_selector);
+	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->host_cs_selector);
+	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->host_ss_selector);
+	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->host_ds_selector);
+	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->host_fs_selector);
+	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->host_gs_selector);
+	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->host_tr_selector);
+
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT)
+		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
+	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
+			vmcs12->host_ia32_perf_global_ctrl);
+}
+
+/*
+ * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
+ * and modify vmcs12 to make it see what it would expect to see there if
+ * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
+ */
+static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int cpu;
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+	leave_guest_mode(vcpu);
+	prepare_vmcs12(vcpu, vmcs12);
+
+	cpu = get_cpu();
+	vmx->loaded_vmcs = &vmx->vmcs01;
+	vmx_vcpu_put(vcpu);
+	vmx_vcpu_load(vcpu, cpu);
+	vcpu->cpu = cpu;
+	put_cpu();
+
+	/* if no vmcs02 cache requested, remove the one we used */
+	if (VMCS02_POOL_SIZE == 0)
+		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
+
+	load_vmcs12_host_state(vcpu, vmcs12);
+
+	/* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+
+	/* This is needed for same reason as it was needed in prepare_vmcs02 */
+	vmx->host_rsp = 0;
+
+	/* Unpin physical memory we referred to in vmcs02 */
+	if (vmx->nested.apic_access_page) {
+		nested_release_page(vmx->nested.apic_access_page);
+		vmx->nested.apic_access_page = 0;
+	}
+
+	/*
+	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
+	 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
+	 * success or failure flag accordingly.
+	 */
+	if (unlikely(vmx->fail)) {
+		vmx->fail = 0;
+		nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
+	} else
+		nested_vmx_succeed(vcpu);
+}
+
+/*
+ * L1's failure to enter L2 is a subset of a normal exit, as explained in
+ * 23.7 "VM-entry failures during or after loading guest state" (this also
+ * lists the acceptable exit-reason and exit-qualification parameters).
+ * It should only be called before L2 actually succeeded to run, and when
+ * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss).
+ */
+static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
+			struct vmcs12 *vmcs12,
+			u32 reason, unsigned long qualification)
+{
+	load_vmcs12_host_state(vcpu, vmcs12);
+	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
+	vmcs12->exit_qualification = qualification;
+	nested_vmx_succeed(vcpu);
 }
 
 static int vmx_check_intercept(struct kvm_vcpu *vcpu,
@@ -4670,16 +7119,13 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
 	if (enable_ept) {
-		bypass_guest_pf = 0;
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
 				VMX_EPT_EXECUTABLE_MASK);
+		ept_set_mmio_spte_mask();
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
 
-	if (bypass_guest_pf)
-		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
 	return 0;
 
 out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 77c9d8673dc..84a28ea45fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -347,6 +347,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 	vcpu->arch.cr2 = fault->address;
 	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 }
+EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
 void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
@@ -579,6 +580,22 @@ static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
 }
 
+static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_SMEP));
+}
+
+static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
+}
+
 static void update_cpuid(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
@@ -598,14 +615,20 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
-
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
+				   X86_CR4_PAE | X86_CR4_SMEP;
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;
 
 	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
 		return 1;
 
+	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
+		return 1;
+
+	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;
@@ -615,11 +638,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 				   kvm_read_cr3(vcpu)))
 		return 1;
 
-	if (cr4 & X86_CR4_VMXE)
+	if (kvm_x86_ops->set_cr4(vcpu, cr4))
 		return 1;
 
-	kvm_x86_ops->set_cr4(vcpu, cr4);
-
 	if ((cr4 ^ old_cr4) & pdptr_bits)
 		kvm_mmu_reset_context(vcpu);
 
@@ -787,12 +808,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	8
+#define KVM_SAVE_MSRS_BEGIN	9
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
-	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
+	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
 	MSR_STAR,
 #ifdef CONFIG_X86_64
@@ -1388,7 +1409,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 			return 1;
 		kvm_x86_ops->patch_hypercall(vcpu, instructions);
 		((unsigned char *)instructions)[3] = 0xc3; /* ret */
-		if (copy_to_user((void __user *)addr, instructions, 4))
+		if (__copy_to_user((void __user *)addr, instructions, 4))
 			return 1;
 		kvm->arch.hv_hypercall = data;
 		break;
@@ -1415,7 +1436,7 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 				  HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
 		if (kvm_is_error_hva(addr))
 			return 1;
-		if (clear_user((void __user *)addr, PAGE_SIZE))
+		if (__clear_user((void __user *)addr, PAGE_SIZE))
 			return 1;
 		vcpu->arch.hv_vapic = data;
 		break;
@@ -1467,6 +1488,35 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void accumulate_steal_time(struct kvm_vcpu *vcpu)
+{
+	u64 delta;
+
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
+	vcpu->arch.st.last_steal = current->sched_info.run_delay;
+	vcpu->arch.st.accum_steal = delta;
+}
+
+static void record_steal_time(struct kvm_vcpu *vcpu)
+{
+	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
+		return;
+
+	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+		return;
+
+	vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
+	vcpu->arch.st.steal.version += 2;
+	vcpu->arch.st.accum_steal = 0;
+
+	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+}
+
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
@@ -1549,6 +1599,33 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		if (kvm_pv_enable_async_pf(vcpu, data))
 			return 1;
 		break;
+	case MSR_KVM_STEAL_TIME:
+
+		if (unlikely(!sched_info_on()))
+			return 1;
+
+		if (data & KVM_STEAL_RESERVED_MASK)
+			return 1;
+
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+							data & KVM_STEAL_VALID_BITS))
+			return 1;
+
+		vcpu->arch.st.msr_val = data;
+
+		if (!(data & KVM_MSR_ENABLED))
+			break;
+
+		vcpu->arch.st.last_steal = current->sched_info.run_delay;
+
+		preempt_disable();
+		accumulate_steal_time(vcpu);
+		preempt_enable();
+
+		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
+
+		break;
+
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1834,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_KVM_ASYNC_PF_EN:
 		data = vcpu->arch.apf.msr_val;
 		break;
+	case MSR_KVM_STEAL_TIME:
+		data = vcpu->arch.st.msr_val;
+		break;
 	case MSR_IA32_P5_MC_ADDR:
 	case MSR_IA32_P5_MC_TYPE:
 	case MSR_IA32_MCG_CAP:
@@ -2145,6 +2225,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 			kvm_migrate_timers(vcpu);
 		vcpu->cpu = cpu;
 	}
+
+	accumulate_steal_time(vcpu);
+	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -2283,6 +2366,13 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	entry->flags = 0;
 }
 
+static bool supported_xcr0_bit(unsigned bit)
+{
+	u64 mask = ((u64)1 << bit);
+
+	return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0;
+}
+
 #define F(x) bit(X86_FEATURE_##x)
 
 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
@@ -2328,7 +2418,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
-		F(F16C);
+		F(F16C) | F(RDRAND);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
@@ -2342,6 +2432,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
 		F(PMM) | F(PMM_EN);
 
+	/* cpuid 7.0.ebx */
+	const u32 kvm_supported_word9_x86_features =
+		F(SMEP) | F(FSGSBASE) | F(ERMS);
+
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
 	do_cpuid_1_ent(entry, function, index);
@@ -2376,7 +2470,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
-	/* function 4 and 0xb have additional index. */
+	/* function 4 has additional index. */
 	case 4: {
 		int i, cache_type;
 
@@ -2393,6 +2487,22 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 7: {
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		/* Mask ebx against host capbability word 9 */
+		if (index == 0) {
+			entry->ebx &= kvm_supported_word9_x86_features;
+			cpuid_mask(&entry->ebx, 9);
+		} else
+			entry->ebx = 0;
+		entry->eax = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
+	}
+	case 9:
+		break;
+	/* function 0xb has additional index. */
 	case 0xb: {
 		int i, level_type;
 
@@ -2410,16 +2520,17 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	}
 	case 0xd: {
-		int i;
+		int idx, i;
 
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
-		for (i = 1; *nent < maxnent && i < 64; ++i) {
-			if (entry[i].eax == 0)
+		for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) {
+			do_cpuid_1_ent(&entry[i], function, idx);
+			if (entry[i].eax == 0 || !supported_xcr0_bit(idx))
 				continue;
-			do_cpuid_1_ent(&entry[i], function, i);
 			entry[i].flags |=
 			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 			++*nent;
+			++i;
 		}
 		break;
 	}
@@ -2438,6 +2549,10 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
 			     (1 << KVM_FEATURE_ASYNC_PF) |
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+
+		if (sched_info_on())
+			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);
+
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
@@ -2451,6 +2566,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ecx &= kvm_supported_word6_x86_features;
 		cpuid_mask(&entry->ecx, 6);
 		break;
+	case 0x80000008: {
+		unsigned g_phys_as = (entry->eax >> 16) & 0xff;
+		unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
+		unsigned phys_as = entry->eax & 0xff;
+
+		if (!g_phys_as)
+			g_phys_as = phys_as;
+		entry->eax = g_phys_as | (virt_as << 8);
+		entry->ebx = entry->edx = 0;
+		break;
+	}
+	case 0x80000019:
+		entry->ecx = entry->edx = 0;
+		break;
+	case 0x8000001a:
+		break;
+	case 0x8000001d:
+		break;
 	/*Add support for Centaur's CPUID instruction*/
 	case 0xC0000000:
 		/*Just support up to 0xC0000004 now*/
@@ -2460,10 +2593,16 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->edx &= kvm_supported_word5_x86_features;
 		cpuid_mask(&entry->edx, 5);
 		break;
+	case 3: /* Processor serial number */
+	case 5: /* MONITOR/MWAIT */
+	case 6: /* Thermal management */
+	case 0xA: /* Architectural Performance Monitoring */
+	case 0x80000007: /* Advanced power management */
 	case 0xC0000002:
 	case 0xC0000003:
 	case 0xC0000004:
-		/*Now nothing to do, reserved for the future*/
+	default:
+		entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
 		break;
 	}
 
@@ -3817,7 +3956,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 					  exception);
 }
 
-static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 			       gva_t addr, void *val, unsigned int bytes,
 			       struct x86_exception *exception)
 {
@@ -3827,6 +3966,7 @@ static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
 					  exception);
 }
+EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
 
 static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 				      gva_t addr, void *val, unsigned int bytes,
@@ -3836,7 +3976,7 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 
-static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 				       gva_t addr, void *val,
 				       unsigned int bytes,
 				       struct x86_exception *exception)
@@ -3868,6 +4008,42 @@ static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 out:
 	return r;
 }
+EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+
+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+				gpa_t *gpa, struct x86_exception *exception,
+				bool write)
+{
+	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+	if (vcpu_match_mmio_gva(vcpu, gva) &&
+		  check_write_user_access(vcpu, write, access,
+		  vcpu->arch.access)) {
+		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+					(gva & (PAGE_SIZE - 1));
+		trace_vcpu_match_mmio(gva, *gpa, write, false);
+		return 1;
+	}
+
+	if (write)
+		access |= PFERR_WRITE_MASK;
+
+	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+	if (*gpa == UNMAPPED_GVA)
+		return -1;
+
+	/* For APIC access vmexit */
+	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		return 1;
+
+	if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+		trace_vcpu_match_mmio(gva, *gpa, write, true);
+		return 1;
+	}
+
+	return 0;
+}
 
 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 				  unsigned long addr,
@@ -3876,8 +4052,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 				  struct x86_exception *exception)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-	gpa_t                 gpa;
-	int handled;
+	gpa_t gpa;
+	int handled, ret;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -3887,13 +4063,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 		return X86EMUL_CONTINUE;
 	}
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
 
-	if (gpa == UNMAPPED_GVA)
+	if (ret < 0)
 		return X86EMUL_PROPAGATE_FAULT;
 
-	/* For APIC access vmexit */
-	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+	if (ret)
 		goto mmio;
 
 	if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -3944,16 +4119,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct x86_exception *exception,
 					   struct kvm_vcpu *vcpu)
 {
-	gpa_t                 gpa;
-	int handled;
+	gpa_t gpa;
+	int handled, ret;
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
 
-	if (gpa == UNMAPPED_GVA)
+	if (ret < 0)
 		return X86EMUL_PROPAGATE_FAULT;
 
 	/* For APIC access vmexit */
-	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+	if (ret)
 		goto mmio;
 
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
@@ -4473,9 +4648,24 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 		kvm_queue_exception(vcpu, ctxt->exception.vector);
 }
 
+static void init_decode_cache(struct x86_emulate_ctxt *ctxt,
+			      const unsigned long *regs)
+{
+	memset(&ctxt->twobyte, 0,
+	       (void *)&ctxt->regs - (void *)&ctxt->twobyte);
+	memcpy(ctxt->regs, regs, sizeof(ctxt->regs));
+
+	ctxt->fetch.start = 0;
+	ctxt->fetch.end = 0;
+	ctxt->io_read.pos = 0;
+	ctxt->io_read.end = 0;
+	ctxt->mem_read.pos = 0;
+	ctxt->mem_read.end = 0;
+}
+
 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 {
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int cs_db, cs_l;
 
 	/*
@@ -4488,40 +4678,38 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
-	vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
-	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
-	vcpu->arch.emulate_ctxt.mode =
-		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
-		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-		? X86EMUL_MODE_VM86 : cs_l
-		? X86EMUL_MODE_PROT64 :	cs_db
-		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
-	vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
-	memset(c, 0, sizeof(struct decode_cache));
-	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+	ctxt->eflags = kvm_get_rflags(vcpu);
+	ctxt->eip = kvm_rip_read(vcpu);
+	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
+		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
+		     cs_l				? X86EMUL_MODE_PROT64 :
+		     cs_db				? X86EMUL_MODE_PROT32 :
+							  X86EMUL_MODE_PROT16;
+	ctxt->guest_mode = is_guest_mode(vcpu);
+
+	init_decode_cache(ctxt, vcpu->arch.regs);
 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
 }
 
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
 {
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
-	vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
-	vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
-								 inc_eip;
-	ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
+	ctxt->op_bytes = 2;
+	ctxt->ad_bytes = 2;
+	ctxt->_eip = ctxt->eip + inc_eip;
+	ret = emulate_int_real(ctxt, irq);
 
 	if (ret != X86EMUL_CONTINUE)
 		return EMULATE_FAIL;
 
-	vcpu->arch.emulate_ctxt.eip = c->eip;
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	ctxt->eip = ctxt->_eip;
+	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+	kvm_rip_write(vcpu, ctxt->eip);
+	kvm_set_rflags(vcpu, ctxt->eflags);
 
 	if (irq == NMI_VECTOR)
 		vcpu->arch.nmi_pending = false;
@@ -4582,21 +4770,21 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 			    int insn_len)
 {
 	int r;
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	bool writeback = true;
 
 	kvm_clear_exception_queue(vcpu);
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		init_emulate_ctxt(vcpu);
-		vcpu->arch.emulate_ctxt.interruptibility = 0;
-		vcpu->arch.emulate_ctxt.have_exception = false;
-		vcpu->arch.emulate_ctxt.perm_ok = false;
+		ctxt->interruptibility = 0;
+		ctxt->have_exception = false;
+		ctxt->perm_ok = false;
 
-		vcpu->arch.emulate_ctxt.only_vendor_specific_insn
+		ctxt->only_vendor_specific_insn
 			= emulation_type & EMULTYPE_TRAP_UD;
 
-		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
+		r = x86_decode_insn(ctxt, insn, insn_len);
 
 		trace_kvm_emulate_insn_start(vcpu);
 		++vcpu->stat.insn_emulation;
@@ -4612,7 +4800,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	if (emulation_type & EMULTYPE_SKIP) {
-		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+		kvm_rip_write(vcpu, ctxt->_eip);
 		return EMULATE_DONE;
 	}
 
@@ -4620,11 +4808,11 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
 	   changes registers values  during IO operation */
 	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
 		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
-		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		memcpy(ctxt->regs, vcpu->arch.regs, sizeof ctxt->regs);
 	}
 
 restart:
-	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
+	r = x86_emulate_insn(ctxt);
 
 	if (r == EMULATION_INTERCEPTED)
 		return EMULATE_DONE;
@@ -4636,7 +4824,7 @@ restart:
 		return handle_emulation_failure(vcpu);
 	}
 
-	if (vcpu->arch.emulate_ctxt.have_exception) {
+	if (ctxt->have_exception) {
 		inject_emulated_exception(vcpu);
 		r = EMULATE_DONE;
 	} else if (vcpu->arch.pio.count) {
@@ -4655,13 +4843,12 @@ restart:
 		r = EMULATE_DONE;
 
 	if (writeback) {
-		toggle_interruptibility(vcpu,
-				vcpu->arch.emulate_ctxt.interruptibility);
-		kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+		toggle_interruptibility(vcpu, ctxt->interruptibility);
+		kvm_set_rflags(vcpu, ctxt->eflags);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
-		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+		kvm_rip_write(vcpu, ctxt->eip);
 	} else
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
 
@@ -4878,6 +5065,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
 
+static void kvm_set_mmio_spte_mask(void)
+{
+	u64 mask;
+	int maxphyaddr = boot_cpu_data.x86_phys_bits;
+
+	/*
+	 * Set the reserved bits and the present bit of an paging-structure
+	 * entry to generate page fault with PFER.RSV = 1.
+	 */
+	mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+	mask |= 1ull;
+
+#ifdef CONFIG_X86_64
+	/*
+	 * If reserved bit is not supported, clear the present bit to disable
+	 * mmio page fault.
+	 */
+	if (maxphyaddr == 52)
+		mask &= ~1ull;
+#endif
+
+	kvm_mmu_set_mmio_spte_mask(mask);
+}
+
 int kvm_arch_init(void *opaque)
 {
 	int r;
@@ -4904,10 +5115,10 @@ int kvm_arch_init(void *opaque)
 	if (r)
 		goto out;
 
+	kvm_set_mmio_spte_mask();
 	kvm_init_msr_list();
 
 	kvm_x86_ops = ops;
-	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
@@ -5082,8 +5293,7 @@ int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
-				       rip, instruction, 3, NULL);
+	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
 }
 
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5384,6 +5594,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			r = 1;
 			goto out;
 		}
+		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
+			record_steal_time(vcpu);
+
 	}
 
 	r = kvm_mmu_reload(vcpu);
@@ -5671,8 +5884,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 		 * that usually, but some bad designed PV devices (vmware
 		 * backdoor interface) need this to work
 		 */
-		struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
-		memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+		struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+		memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 	}
 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -5801,21 +6014,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
-	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
 
-	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
-				   tss_selector, reason, has_error_code,
-				   error_code);
+	ret = emulator_task_switch(ctxt, tss_selector, reason,
+				   has_error_code, error_code);
 
 	if (ret)
 		return EMULATE_FAIL;
 
-	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	memcpy(vcpu->arch.regs, ctxt->regs, sizeof ctxt->regs);
+	kvm_rip_write(vcpu, ctxt->eip);
+	kvm_set_rflags(vcpu, ctxt->eflags);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	return EMULATE_DONE;
 }
@@ -6093,12 +6305,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	if (r == 0)
 		r = kvm_mmu_setup(vcpu);
 	vcpu_put(vcpu);
-	if (r < 0)
-		goto free_vcpu;
 
-	return 0;
-free_vcpu:
-	kvm_x86_ops->vcpu_free(vcpu);
 	return r;
 }
 
@@ -6126,6 +6333,7 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.apf.msr_val = 0;
+	vcpu->arch.st.msr_val = 0;
 
 	kvmclock_reset(vcpu);
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e407ed3df81..d36fe237c66 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,10 +75,54 @@ static inline u32 bit(int bitno)
 	return 1 << (bitno & 31);
 }
 
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+					gva_t gva, gfn_t gfn, unsigned access)
+{
+	vcpu->arch.mmio_gva = gva & PAGE_MASK;
+	vcpu->arch.access = access;
+	vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva,
+ * specially, if gva is ~0ul, we clear all mmio cache info.
+ */
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+		return;
+
+	vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+	if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+		return true;
+
+	return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+	if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+		return true;
+
+	return false;
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
 
+int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
+	gva_t addr, void *val, unsigned int bytes,
+	struct x86_exception *exception);
+
+int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
+	gva_t addr, void *val, unsigned int bytes,
+	struct x86_exception *exception);
+
 #endif
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index db832fd65ec..13ee258442a 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -71,7 +71,8 @@
 #include <asm/stackprotector.h>
 #include <asm/reboot.h>		/* for struct machine_ops */
 
-/*G:010 Welcome to the Guest!
+/*G:010
+ * Welcome to the Guest!
  *
  * The Guest in our tale is a simple creature: identical to the Host but
  * behaving in simplified but equivalent ways.  In particular, the Guest is the
@@ -190,15 +191,23 @@ static void lazy_hcall4(unsigned long call,
 #endif
 
 /*G:036
- * When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls.
-:*/
+ * When lazy mode is turned off, we issue the do-nothing hypercall to
+ * flush any stored calls, and call the generic helper to reset the
+ * per-cpu lazy mode variable.
+ */
 static void lguest_leave_lazy_mmu_mode(void)
 {
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
 	paravirt_leave_lazy_mmu();
 }
 
+/*
+ * We also catch the end of context switch; we enter lazy mode for much of
+ * that too, so again we need to flush here.
+ *
+ * (Technically, this is lazy CPU mode, and normally we're in lazy MMU
+ * mode, but unlike Xen, lguest doesn't care about the difference).
+ */
 static void lguest_end_context_switch(struct task_struct *next)
 {
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
@@ -391,7 +400,7 @@ static void lguest_load_tr_desc(void)
  * giant ball of hair.  Its entry in the current Intel manual runs to 28 pages.
  *
  * This instruction even it has its own Wikipedia entry.  The Wikipedia entry
- * has been translated into 5 languages.  I am not making this up!
+ * has been translated into 6 languages.  I am not making this up!
  *
  * We could get funky here and identify ourselves as "GenuineLguest", but
  * instead we just use the real "cpuid" instruction.  Then I pretty much turned
@@ -458,7 +467,7 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
 	/*
 	 * PAE systems can mark pages as non-executable.  Linux calls this the
 	 * NX bit.  Intel calls it XD (eXecute Disable), AMD EVP (Enhanced
-	 * Virus Protection).  We just switch turn if off here, since we don't
+	 * Virus Protection).  We just switch it off here, since we don't
 	 * support it.
 	 */
 	case 0x80000001:
@@ -520,17 +529,16 @@ static unsigned long lguest_read_cr2(void)
 
 /* See lguest_set_pte() below. */
 static bool cr3_changed = false;
+static unsigned long current_cr3;
 
 /*
  * cr3 is the current toplevel pagetable page: the principle is the same as
- * cr0.  Keep a local copy, and tell the Host when it changes.  The only
- * difference is that our local copy is in lguest_data because the Host needs
- * to set it upon our initial hypercall.
+ * cr0.  Keep a local copy, and tell the Host when it changes.
  */
 static void lguest_write_cr3(unsigned long cr3)
 {
-	lguest_data.pgdir = cr3;
 	lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
+	current_cr3 = cr3;
 
 	/* These two page tables are simple, linear, and used during boot */
 	if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
@@ -539,7 +547,7 @@ static void lguest_write_cr3(unsigned long cr3)
 
 static unsigned long lguest_read_cr3(void)
 {
-	return lguest_data.pgdir;
+	return current_cr3;
 }
 
 /* cr4 is used to enable and disable PGE, but we don't care. */
@@ -641,7 +649,7 @@ static void lguest_write_cr4(unsigned long val)
 
 /*
  * The Guest calls this after it has set a second-level entry (pte), ie. to map
- * a page into a process' address space.  Wetell the Host the toplevel and
+ * a page into a process' address space.  We tell the Host the toplevel and
  * address this corresponds to.  The Guest uses one pagetable per process, so
  * we need to tell the Host which one we're changing (mm->pgd).
  */
@@ -758,7 +766,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
 static void lguest_flush_tlb_single(unsigned long addr)
 {
 	/* Simply set it to zero: if it was not, it will fault back in. */
-	lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
+	lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
 }
 
 /*
@@ -1140,7 +1148,7 @@ static struct notifier_block paniced = {
 static __init char *lguest_memory_setup(void)
 {
 	/*
-	 *The Linux bootloader header contains an "e820" memory map: the
+	 * The Linux bootloader header contains an "e820" memory map: the
 	 * Launcher populated the first entry with our memory limit.
 	 */
 	e820_add_region(boot_params.e820_map[0].addr,
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 4f420c2f2d5..6ddfe4fc23c 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,18 +6,22 @@
 #include <asm/processor-flags.h>
 
 /*G:020
- * Our story starts with the kernel booting into startup_32 in
- * arch/x86/kernel/head_32.S.  It expects a boot header, which is created by
- * the bootloader (the Launcher in our case).
+
+ * Our story starts with the bzImage: booting starts at startup_32 in
+ * arch/x86/boot/compressed/head_32.S.  This merely uncompresses the real
+ * kernel in place and then jumps into it: startup_32 in
+ * arch/x86/kernel/head_32.S.  Both routines expects a boot header in the %esi
+ * register, which is created by the bootloader (the Launcher in our case).
  *
  * The startup_32 function does very little: it clears the uninitialized global
  * C variables which we expect to be zero (ie. BSS) and then copies the boot
- * header and kernel command line somewhere safe.  Finally it checks the
- * 'hardware_subarch' field.  This was introduced in 2.6.24 for lguest and Xen:
- * if it's set to '1' (lguest's assigned number), then it calls us here.
+ * header and kernel command line somewhere safe, and populates some initial
+ * page tables.  Finally it checks the 'hardware_subarch' field.  This was
+ * introduced in 2.6.24 for lguest and Xen: if it's set to '1' (lguest's
+ * assigned number), then it calls us here.
  *
  * WARNING: be very careful here!  We're running at addresses equal to physical
- * addesses (around 0), not above PAGE_OFFSET as most code expectes
+ * addresses (around 0), not above PAGE_OFFSET as most code expects
  * (eg. 0xC0000000).  Jumps are relative, so they're OK, but we can't touch any
  * data without remembering to subtract __PAGE_OFFSET!
  *
@@ -27,13 +31,18 @@
 .section .init.text, "ax", @progbits
 ENTRY(lguest_entry)
 	/*
-	 * We make the "initialization" hypercall now to tell the Host about
-	 * us, and also find out where it put our page tables.
+	 * We make the "initialization" hypercall now to tell the Host where
+	 * our lguest_data struct is.
 	 */
 	movl $LHCALL_LGUEST_INIT, %eax
 	movl $lguest_data - __PAGE_OFFSET, %ebx
 	int $LGUEST_TRAP_ENTRY
 
+	/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
+	movl $LHCALL_NEW_PGTABLE, %eax
+	movl $(initial_page_table - __PAGE_OFFSET), %ebx
+	int $LGUEST_TRAP_ENTRY
+
 	/* Set up the initial stack so we can run C code. */
 	movl $(init_thread_union+THREAD_SIZE),%esp
 
@@ -96,12 +105,8 @@ send_interrupts:
 	 */
 	pushl %eax
 	movl $LHCALL_SEND_INTERRUPTS, %eax
-	/*
-	 * This is a vmcall instruction (same thing that KVM uses).  Older
-	 * assembler versions might not know the "vmcall" instruction, so we
-	 * create one manually here.
-	 */
-	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	/* This is the actual hypercall trap. */
+	int  $LGUEST_TRAP_ENTRY
 	/* Put eax back the way we found it. */
 	popl %eax
 	ret
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2479f19ddd..b00f6785da7 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -18,8 +18,10 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
 
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
-lib-y += usercopy_$(BITS).o getuser.o putuser.o
+lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
+lib-$(CONFIG_SMP) += rwlock.o
+lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o
@@ -29,7 +31,7 @@ ifeq ($(CONFIG_X86_32),y)
         lib-y += atomic64_cx8_32.o
         lib-y += checksum_32.o
         lib-y += strstr_32.o
-        lib-y += semaphore_32.o string_32.o
+        lib-y += string_32.o
         lib-y += cmpxchg.o
 ifneq ($(CONFIG_X86_CMPXCHG64),y)
         lib-y += cmpxchg8b_emu.o atomic64_386_32.o
@@ -40,7 +42,6 @@ else
         lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
         lib-y += thunk_64.o clear_page_64.o copy_page_64.o
         lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
-	lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
+        lib-y += copy_user_64.o copy_user_nocache_64.o
 	lib-y += cmpxchg16b_emu.o
 endif
diff --git a/arch/x86/lib/atomic64_32.c b/arch/x86/lib/atomic64_32.c
index 540179e8e9f..042f6826bf5 100644
--- a/arch/x86/lib/atomic64_32.c
+++ b/arch/x86/lib/atomic64_32.c
@@ -4,7 +4,7 @@
 
 #include <asm/processor.h>
 #include <asm/cmpxchg.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 
 long long atomic64_read_cx8(long long, const atomic64_t *v);
 EXPORT_SYMBOL(atomic64_read_cx8);
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1cebe..01c805ba535 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,6 +2,7 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/alternative-asm.h>
 
 	ALIGN
 copy_page_c:
@@ -110,10 +111,6 @@ ENDPROC(copy_page)
 2:
 	.previous
 	.section .altinstructions,"a"
-	.align 8
-	.quad copy_page
-	.quad 1b
-	.word X86_FEATURE_REP_GOOD
-	.byte .Lcopy_page_end - copy_page
-	.byte 2b - 1b
+	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
+		.Lcopy_page_end-copy_page, 2b-1b
 	.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index d0ec9c2936d..ee164610ec4 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -9,6 +9,7 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
 
 #undef memmove
 
@@ -214,11 +215,9 @@ ENTRY(memmove)
 	.previous
 
 	.section .altinstructions,"a"
-	.align 8
-	.quad .Lmemmove_begin_forward
-	.quad .Lmemmove_begin_forward_efs
-	.word X86_FEATURE_ERMS
-	.byte .Lmemmove_end_forward-.Lmemmove_begin_forward
-	.byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+	altinstruction_entry .Lmemmove_begin_forward,		\
+		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
+		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
+		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
 	.previous
 ENDPROC(memmove)
diff --git a/arch/x86/lib/rwlock.S b/arch/x86/lib/rwlock.S
new file mode 100644
index 00000000000..1cad22139c8
--- /dev/null
+++ b/arch/x86/lib/rwlock.S
@@ -0,0 +1,44 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/frame.h>
+#include <asm/rwlock.h>
+
+#ifdef CONFIG_X86_32
+# define __lock_ptr eax
+#else
+# define __lock_ptr rdi
+#endif
+
+ENTRY(__write_lock_failed)
+	CFI_STARTPROC
+	FRAME
+0:	LOCK_PREFIX
+	WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
+1:	rep; nop
+	cmpl	$WRITE_LOCK_CMP, (%__lock_ptr)
+	jne	1b
+	LOCK_PREFIX
+	WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
+	jnz	0b
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+END(__write_lock_failed)
+
+ENTRY(__read_lock_failed)
+	CFI_STARTPROC
+	FRAME
+0:	LOCK_PREFIX
+	READ_LOCK_SIZE(inc) (%__lock_ptr)
+1:	rep; nop
+	READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
+	js	1b
+	LOCK_PREFIX
+	READ_LOCK_SIZE(dec) (%__lock_ptr)
+	js	0b
+	ENDFRAME
+	ret
+	CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
deleted file mode 100644
index 05ea55f7140..00000000000
--- a/arch/x86/lib/rwlock_64.S
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Slow paths of read/write spinlocks. */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
-
-/* rdi:	pointer to rwlock_t */
-ENTRY(__write_lock_failed)
-	CFI_STARTPROC
-	LOCK_PREFIX
-	addl $RW_LOCK_BIAS,(%rdi)
-1:	rep
-	nop
-	cmpl $RW_LOCK_BIAS,(%rdi)
-	jne 1b
-	LOCK_PREFIX
-	subl $RW_LOCK_BIAS,(%rdi)
-	jnz  __write_lock_failed
-	ret
-	CFI_ENDPROC
-END(__write_lock_failed)
-
-/* rdi:	pointer to rwlock_t */
-ENTRY(__read_lock_failed)
-	CFI_STARTPROC
-	LOCK_PREFIX
-	incl (%rdi)
-1:	rep
-	nop
-	cmpl $1,(%rdi)
-	js 1b
-	LOCK_PREFIX
-	decl (%rdi)
-	js __read_lock_failed
-	ret
-	CFI_ENDPROC
-END(__read_lock_failed)
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem.S
index 67743977398..5dff5f04246 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem.S
@@ -1,4 +1,51 @@
 /*
+ * x86 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/dwarf2.h>
+
+#define __ASM_HALF_REG(reg)	__ASM_SEL(reg, e##reg)
+#define __ASM_HALF_SIZE(inst)	__ASM_SEL(inst##w, inst##l)
+
+#ifdef CONFIG_X86_32
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax whish is either a return
+ * value or just clobbered..
+ */
+
+#define save_common_regs \
+	pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+
+#define restore_common_regs \
+	popl_cfi %ecx; CFI_RESTORE ecx
+
+	/* Avoid uglifying the argument copying x86-64 needs to do. */
+	.macro movq src, dst
+	.endm
+
+#else
+
+/*
  * x86-64 rwsem wrappers
  *
  * This interfaces the inline asm code to the slow-path
@@ -16,12 +63,6 @@
  * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
  */
 
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
 #define save_common_regs \
 	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
 	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
@@ -40,16 +81,18 @@
 	popq_cfi %rsi; CFI_RESTORE rsi; \
 	popq_cfi %rdi; CFI_RESTORE rdi
 
+#endif
+
 /* Fix up special calling conventions */
 ENTRY(call_rwsem_down_read_failed)
 	CFI_STARTPROC
 	save_common_regs
-	pushq_cfi %rdx
-	CFI_REL_OFFSET rdx, 0
+	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+	CFI_REL_OFFSET __ASM_REG(dx), 0
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	popq_cfi %rdx
-	CFI_RESTORE rdx
+	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+	CFI_RESTORE __ASM_REG(dx)
 	restore_common_regs
 	ret
 	CFI_ENDPROC
@@ -67,7 +110,8 @@ ENDPROC(call_rwsem_down_write_failed)
 
 ENTRY(call_rwsem_wake)
 	CFI_STARTPROC
-	decl %edx	/* do nothing if still outstanding active readers */
+	/* do nothing if still outstanding active readers */
+	__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
 	jnz 1f
 	save_common_regs
 	movq %rax,%rdi
@@ -77,16 +121,15 @@ ENTRY(call_rwsem_wake)
 	CFI_ENDPROC
 ENDPROC(call_rwsem_wake)
 
-/* Fix up special calling conventions */
 ENTRY(call_rwsem_downgrade_wake)
 	CFI_STARTPROC
 	save_common_regs
-	pushq_cfi %rdx
-	CFI_REL_OFFSET rdx, 0
+	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
+	CFI_REL_OFFSET __ASM_REG(dx), 0
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	popq_cfi %rdx
-	CFI_RESTORE rdx
+	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
+	CFI_RESTORE __ASM_REG(dx)
 	restore_common_regs
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
deleted file mode 100644
index 06691daa410..00000000000
--- a/arch/x86/lib/semaphore_32.S
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * i386 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-
-#include <linux/linkage.h>
-#include <asm/rwlock.h>
-#include <asm/alternative-asm.h>
-#include <asm/frame.h>
-#include <asm/dwarf2.h>
-
-/*
- * The semaphore operations have a special calling sequence that
- * allow us to do a simpler in-line version of them. These routines
- * need to convert that sequence back into the C sequence when
- * there is contention on the semaphore.
- *
- * %eax contains the semaphore pointer on entry. Save the C-clobbered
- * registers (%eax, %edx and %ecx) except %eax whish is either a return
- * value or just clobbered..
- */
-	.section .sched.text, "ax"
-
-/*
- * rw spinlock fallbacks
- */
-#ifdef CONFIG_SMP
-ENTRY(__write_lock_failed)
-	CFI_STARTPROC
-	FRAME
-2: 	LOCK_PREFIX
-	addl	$ RW_LOCK_BIAS,(%eax)
-1:	rep; nop
-	cmpl	$ RW_LOCK_BIAS,(%eax)
-	jne	1b
-	LOCK_PREFIX
-	subl	$ RW_LOCK_BIAS,(%eax)
-	jnz	2b
-	ENDFRAME
-	ret
-	CFI_ENDPROC
-	ENDPROC(__write_lock_failed)
-
-ENTRY(__read_lock_failed)
-	CFI_STARTPROC
-	FRAME
-2: 	LOCK_PREFIX
-	incl	(%eax)
-1:	rep; nop
-	cmpl	$1,(%eax)
-	js	1b
-	LOCK_PREFIX
-	decl	(%eax)
-	js	2b
-	ENDFRAME
-	ret
-	CFI_ENDPROC
-	ENDPROC(__read_lock_failed)
-
-#endif
-
-#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_down_read_failed)
-	CFI_STARTPROC
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx,0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx,0
-	call rwsem_down_read_failed
-	popl_cfi %edx
-	popl_cfi %ecx
-	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_down_read_failed)
-
-ENTRY(call_rwsem_down_write_failed)
-	CFI_STARTPROC
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx,0
-	calll rwsem_down_write_failed
-	popl_cfi %ecx
-	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_down_write_failed)
-
-ENTRY(call_rwsem_wake)
-	CFI_STARTPROC
-	decw %dx    /* do nothing if still outstanding active readers */
-	jnz 1f
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx,0
-	call rwsem_wake
-	popl_cfi %ecx
-1:	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_wake)
-
-/* Fix up special calling conventions */
-ENTRY(call_rwsem_downgrade_wake)
-	CFI_STARTPROC
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx,0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx,0
-	call rwsem_downgrade_wake
-	popl_cfi %edx
-	popl_cfi %ecx
-	ret
-	CFI_ENDPROC
-	ENDPROC(call_rwsem_downgrade_wake)
-
-#endif
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 782b082c9ff..a63efd6bb6a 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -5,50 +5,41 @@
  * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
  * Subject to the GNU public license, v.2. No warranty of any kind.
  */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/calling.h>
 
-	#include <linux/linkage.h>
-	#include <asm/dwarf2.h>
-	#include <asm/calling.h>			
-	#include <asm/rwlock.h>
-		
-	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */ 	
-	.macro thunk name,func
-	.globl \name
-\name:	
-	CFI_STARTPROC
-	SAVE_ARGS
-	call \func
-	jmp  restore
-	CFI_ENDPROC
-	.endm
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-	/* put return address in rdi (arg1) */
-	.macro thunk_ra name,func
+	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */
+	.macro THUNK name, func, put_ret_addr_in_rdi=0
 	.globl \name
 \name:
 	CFI_STARTPROC
+
+	/* this one pushes 9 elems, the next one would be %rIP */
 	SAVE_ARGS
-	/* SAVE_ARGS pushs 9 elements */
-	/* the next element would be the rip */
-	movq 9*8(%rsp), %rdi
+
+	.if \put_ret_addr_in_rdi
+	movq_cfi_restore 9*8, rdi
+	.endif
+
 	call \func
 	jmp  restore
 	CFI_ENDPROC
 	.endm
 
-	thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller
-	thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller
+#ifdef CONFIG_TRACE_IRQFLAGS
+	THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
+	THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
-	thunk lockdep_sys_exit_thunk,lockdep_sys_exit
+	THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
 #endif
-	
+
 	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
 	CFI_STARTPROC
 	SAVE_ARGS
 restore:
 	RESTORE_ARGS
-	ret	
+	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
new file mode 100644
index 00000000000..97be9cb5448
--- /dev/null
+++ b/arch/x86/lib/usercopy.c
@@ -0,0 +1,43 @@
+/*
+ * User address space access functions.
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/highmem.h>
+#include <linux/module.h>
+
+/*
+ * best effort, GUP based copy_from_user() that is NMI-safe
+ */
+unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+	unsigned long offset, addr = (unsigned long)from;
+	unsigned long size, len = 0;
+	struct page *page;
+	void *map;
+	int ret;
+
+	do {
+		ret = __get_user_pages_fast(addr, 1, 0, &page);
+		if (!ret)
+			break;
+
+		offset = addr & (PAGE_SIZE - 1);
+		size = min(PAGE_SIZE - offset, n - len);
+
+		map = kmap_atomic(page);
+		memcpy(to, map+offset, size);
+		kunmap_atomic(map);
+		put_page(page);
+
+		len  += size;
+		to   += size;
+		addr += size;
+
+	} while (len < n);
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nmi);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 2dbf6bf4c7e..4d09df054e3 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1059,7 +1059,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1161,11 +1161,11 @@ good_area:
 	if (flags & FAULT_FLAG_ALLOW_RETRY) {
 		if (fault & VM_FAULT_MAJOR) {
 			tsk->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
 				      regs, address);
 		} else {
 			tsk->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
 				      regs, address);
 		}
 		if (fault & VM_FAULT_RETRY) {
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 704a37ceddd..dab41876cdd 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
 	e->trace.entries = e->trace_entries;
 	e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
 	e->trace.skip = 0;
-	save_stack_trace_regs(&e->trace, regs);
+	save_stack_trace_regs(regs, &e->trace);
 
 	/* Round address down to nearest 16 bytes */
 	shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 3adff7dcc14..67421f38a21 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -34,7 +34,7 @@
 #include <asm/pgtable.h>
 #include <linux/mmiotrace.h>
 #include <asm/e820.h> /* for ISA_START_ADDRESS */
-#include <asm/atomic.h>
+#include <linux/atomic.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f5510d889a2..fbeaaf41661 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -496,6 +496,7 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
 
 static int __init numa_register_memblks(struct numa_meminfo *mi)
 {
+	unsigned long uninitialized_var(pfn_align);
 	int i, nid;
 
 	/* Account for nodes with cpus and no memory */
@@ -511,6 +512,20 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 
 	/* for out of order entries */
 	sort_node_map();
+
+	/*
+	 * If sections array is gonna be used for pfn -> nid mapping, check
+	 * whether its granularity is fine enough.
+	 */
+#ifdef NODE_NOT_IN_PAGE_FLAGS
+	pfn_align = node_map_pfn_alignment();
+	if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+		printk(KERN_WARNING "Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+		       PFN_PHYS(pfn_align) >> 20,
+		       PFN_PHYS(PAGES_PER_SECTION) >> 20);
+		return -EINVAL;
+	}
+#endif
 	if (!numa_meminfo_cover_memory(mi))
 		return -EINVAL;
 
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 849a975d3fa..3adebe7e536 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -41,7 +41,7 @@
  *     physnode_map[16-31] = 1;
  *     physnode_map[32- ] = -1;
  */
-s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
 
 void memory_present(int nid, unsigned long start, unsigned long end)
@@ -52,8 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 			nid, start, end);
 	printk(KERN_DEBUG "  Setting physnode_map array to node %d for pfns:\n", nid);
 	printk(KERN_DEBUG "  ");
-	for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) {
-		physnode_map[pfn / PAGES_PER_ELEMENT] = nid;
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+		physnode_map[pfn / PAGES_PER_SECTION] = nid;
 		printk(KERN_CONT "%lx ", pfn);
 	}
 	printk(KERN_CONT "\n");
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index e1d10690921..b0086567271 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -123,12 +123,11 @@ static int pageattr_test(void)
 	if (print)
 		printk(KERN_INFO "CPA self-test:\n");
 
-	bm = vmalloc((max_pfn_mapped + 7) / 8);
+	bm = vzalloc((max_pfn_mapped + 7) / 8);
 	if (!bm) {
 		printk(KERN_ERR "CPA Cannot vmalloc bitmap\n");
 		return -ENOMEM;
 	}
-	memset(bm, 0, (max_pfn_mapped + 7) / 8);
 
 	failed += print_split(&sa);
 	srandom32(100);
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index a5b64ab4cd6..bff89dfe361 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -11,10 +11,11 @@
 #include <linux/oprofile.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
 #include <asm/ptrace.h>
-#include <asm/uaccess.h>
 #include <asm/stacktrace.h>
-#include <linux/compat.h>
 
 static int backtrace_stack(void *data, char *name)
 {
@@ -40,13 +41,13 @@ static struct stacktrace_ops backtrace_ops = {
 static struct stack_frame_ia32 *
 dump_user_backtrace_32(struct stack_frame_ia32 *head)
 {
+	/* Also check accessibility of one struct frame_head beyond: */
 	struct stack_frame_ia32 bufhead[2];
 	struct stack_frame_ia32 *fp;
+	unsigned long bytes;
 
-	/* Also check accessibility of one struct frame_head beyond */
-	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
-		return NULL;
-	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+	bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+	if (bytes != sizeof(bufhead))
 		return NULL;
 
 	fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
@@ -87,12 +88,12 @@ x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
 
 static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
 {
+	/* Also check accessibility of one struct frame_head beyond: */
 	struct stack_frame bufhead[2];
+	unsigned long bytes;
 
-	/* Also check accessibility of one struct stack_frame beyond */
-	if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
-		return NULL;
-	if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
+	bytes = copy_from_user_nmi(bufhead, head, sizeof(bufhead));
+	if (bytes != sizeof(bufhead))
 		return NULL;
 
 	oprofile_add_trace(bufhead[0].return_address);
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 68c3c139520..ae3cb23cd89 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -246,10 +246,9 @@ static void add_resources(struct pci_root_info *info)
 
 		conflict = insert_resource_conflict(root, res);
 		if (conflict)
-			dev_err(&info->bridge->dev,
-				"address space collision: host bridge window %pR "
-				"conflicts with %s %pR\n",
-				res, conflict->name, conflict);
+			dev_info(&info->bridge->dev,
+				 "ignoring host bridge window %pR (conflicts with %s %pR)\n",
+				 res, conflict->name, conflict);
 		else
 			pci_bus_add_resource(info->bus, res, 0);
 	}
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
index 67858be4b52..99176094500 100644
--- a/arch/x86/pci/ce4100.c
+++ b/arch/x86/pci/ce4100.c
@@ -257,6 +257,7 @@ static int ce4100_conf_read(unsigned int seg, unsigned int bus,
 {
 	int i;
 
+	WARN_ON(seg);
 	if (bus == 1) {
 		for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
 			if (bus1_fixups[i].dev_func == devfn &&
@@ -282,6 +283,7 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus,
 {
 	int i;
 
+	WARN_ON(seg);
 	if (bus == 1) {
 		for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
 			if (bus1_fixups[i].dev_func == devfn &&
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 5fe75026ecc..92df322e0b5 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -247,13 +247,6 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
 	},
 #endif		/* __i386__ */
 	{
-		.callback = find_sort_method,
-		.ident = "Dell System",
-		.matches = {
-			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
-		},
-	},
-	{
 		.callback = set_bf_sort,
 		.ident = "Dell PowerEdge 1950",
 		.matches = {
@@ -294,6 +287,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
 		},
 	},
 	{
+		.callback = find_sort_method,
+		.ident = "Dell System",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+		},
+	},
+	{
 		.callback = set_bf_sort,
 		.ident = "HP ProLiant BL20p G3",
 		.matches = {
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index e6fd8473fb7..4f2c70439d7 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -22,7 +22,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 {
 	unsigned long flags;
 
-	if ((bus > 255) || (devfn > 255) || (reg > 4095)) {
+	if (seg || (bus > 255) || (devfn > 255) || (reg > 4095)) {
 		*value = -1;
 		return -EINVAL;
 	}
@@ -53,7 +53,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 {
 	unsigned long flags;
 
-	if ((bus > 255) || (devfn > 255) || (reg > 4095))
+	if (seg || (bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&pci_config_lock, flags);
@@ -97,6 +97,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	int dev, fn;
 
+	WARN_ON(seg);
 	if ((bus > 255) || (devfn > 255) || (reg > 255)) {
 		*value = -1;
 		return -EINVAL;
@@ -138,6 +139,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	int dev, fn;
 
+	WARN_ON(seg);
 	if ((bus > 255) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 750c346ef50..301e325992f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -519,7 +519,8 @@ static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
 	if (cfg->address < 0xFFFFFFFF)
 		return 0;
 
-	if (!strcmp(mcfg->header.oem_id, "SGI"))
+	if (!strcmp(mcfg->header.oem_id, "SGI") ||
+			!strcmp(mcfg->header.oem_id, "SGI2"))
 		return 0;
 
 	if (mcfg->header.revision >= 1) {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 5c9e2458df4..512a88c4150 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -34,6 +34,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
 
+	WARN_ON(seg);
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
@@ -73,6 +74,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus));
 
+	WARN_ON(seg);
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index 13700ec8e2e..5262603b04d 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -206,6 +206,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus,
 {
 	uint32_t *addr;
 
+	WARN_ON(seg);
+
 	/* Use the hardware mechanism for non-simulated devices */
 	if (!is_simulated(bus, devfn))
 		return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
@@ -264,6 +266,8 @@ static int pci_olpc_read(unsigned int seg, unsigned int bus,
 static int pci_olpc_write(unsigned int seg, unsigned int bus,
 		unsigned int devfn, int reg, int len, uint32_t value)
 {
+	WARN_ON(seg);
+
 	/* Use the hardware mechanism for non-simulated devices */
 	if (!is_simulated(bus, devfn))
 		return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index a5f7d0d63de..f6855355146 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -181,6 +181,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	unsigned long bx = (bus << 8) | devfn;
 
+	WARN_ON(seg);
 	if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
@@ -247,6 +248,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 	unsigned long flags;
 	unsigned long bx = (bus << 8) | devfn;
 
+	WARN_ON(seg);
 	if ((bus > 255) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 03008f72eb0..6f2f8eeed17 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -24,7 +24,7 @@ static void pci_visws_disable_irq(struct pci_dev *dev) { }
 
 unsigned int pci_bus0, pci_bus1;
 
-static int __init visws_map_irq(struct pci_dev *dev, u8 slot, u8 pin)
+static int __init visws_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 {
 	int irq, bus = dev->bus->number;
 
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index f567965c062..1017c7bee38 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -1,8 +1,13 @@
 /*
- * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
- *			   x86 PCI core to support the Xen PCI Frontend
+ * Xen PCI - handle PCI (INTx) and MSI infrastructure calls for PV, HVM and
+ * initial domain support. We also handle the DSDT _PRT callbacks for GSI's
+ * used in HVM and initial domain mode (PV does not parse ACPI, so it has no
+ * concept of GSIs). Under PV we hook under the pnbbios API for IRQs and
+ * 0xcf8 PCI configuration read/write.
  *
  *   Author: Ryan Wilson <hap9@epoch.ncsc.mil>
+ *           Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *           Stefano Stabellini <stefano.stabellini@eu.citrix.com>
  */
 #include <linux/module.h>
 #include <linux/init.h>
@@ -19,22 +24,53 @@
 #include <xen/events.h>
 #include <asm/xen/pci.h>
 
+static int xen_pcifront_enable_irq(struct pci_dev *dev)
+{
+	int rc;
+	int share = 1;
+	int pirq;
+	u8 gsi;
+
+	rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
+	if (rc < 0) {
+		dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
+			 rc);
+		return rc;
+	}
+	/* In PV DomU the Xen PCI backend puts the PIRQ in the interrupt line.*/
+	pirq = gsi;
+
+	if (gsi < NR_IRQS_LEGACY)
+		share = 0;
+
+	rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
+	if (rc < 0) {
+		dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
+			 gsi, pirq, rc);
+		return rc;
+	}
+
+	dev->irq = rc;
+	dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
+	return 0;
+}
+
 #ifdef CONFIG_ACPI
-static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
-				 int trigger, int polarity)
+static int xen_register_pirq(u32 gsi, int gsi_override, int triggering,
+			     bool set_pirq)
 {
-	int rc, irq;
+	int rc, pirq = -1, irq = -1;
 	struct physdev_map_pirq map_irq;
 	int shareable = 0;
 	char *name;
 
-	if (!xen_hvm_domain())
-		return -1;
+	if (set_pirq)
+		pirq = gsi;
 
 	map_irq.domid = DOMID_SELF;
 	map_irq.type = MAP_PIRQ_TYPE_GSI;
 	map_irq.index = gsi;
-	map_irq.pirq = -1;
+	map_irq.pirq = pirq;
 
 	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
 	if (rc) {
@@ -42,7 +78,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
 		return -1;
 	}
 
-	if (trigger == ACPI_EDGE_SENSITIVE) {
+	if (triggering == ACPI_EDGE_SENSITIVE) {
 		shareable = 0;
 		name = "ioapic-edge";
 	} else {
@@ -50,12 +86,63 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
 		name = "ioapic-level";
 	}
 
+	if (gsi_override >= 0)
+		gsi = gsi_override;
+
 	irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
+	if (irq < 0)
+		goto out;
+
+	printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", map_irq.pirq, irq, gsi);
+out:
+	return irq;
+}
+
+static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
+				     int trigger, int polarity)
+{
+	if (!xen_hvm_domain())
+		return -1;
 
-	printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
+	return xen_register_pirq(gsi, -1 /* no GSI override */, trigger,
+				 false /* no mapping of GSI to PIRQ */);
+}
+
+#ifdef CONFIG_XEN_DOM0
+static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
+{
+	int rc, irq;
+	struct physdev_setup_gsi setup_gsi;
+
+	if (!xen_pv_domain())
+		return -1;
+
+	printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
+			gsi, triggering, polarity);
+
+	irq = xen_register_pirq(gsi, gsi_override, triggering, true);
+
+	setup_gsi.gsi = gsi;
+	setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
+	setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
+	rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
+	if (rc == -EEXIST)
+		printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
+	else if (rc) {
+		printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
+				gsi, rc);
+	}
 
 	return irq;
 }
+
+static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
+				 int trigger, int polarity)
+{
+	return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
+}
+#endif
 #endif
 
 #if defined(CONFIG_PCI_MSI)
@@ -65,6 +152,43 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
 struct xen_pci_frontend_ops *xen_pci_frontend;
 EXPORT_SYMBOL_GPL(xen_pci_frontend);
 
+static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	int irq, ret, i;
+	struct msi_desc *msidesc;
+	int *v;
+
+	v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
+	if (!v)
+		return -ENOMEM;
+
+	if (type == PCI_CAP_ID_MSIX)
+		ret = xen_pci_frontend_enable_msix(dev, v, nvec);
+	else
+		ret = xen_pci_frontend_enable_msi(dev, v);
+	if (ret)
+		goto error;
+	i = 0;
+	list_for_each_entry(msidesc, &dev->msi_list, list) {
+		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
+					       (type == PCI_CAP_ID_MSIX) ?
+					       "pcifront-msi-x" :
+					       "pcifront-msi",
+						DOMID_SELF);
+		if (irq < 0)
+			goto free;
+		i++;
+	}
+	kfree(v);
+	return 0;
+
+error:
+	dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
+free:
+	kfree(v);
+	return ret;
+}
+
 #define XEN_PIRQ_MSI_DATA  (MSI_DATA_TRIGGER_EDGE | \
 		MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
 
@@ -123,67 +247,6 @@ error:
 	return -ENODEV;
 }
 
-/*
- * For MSI interrupts we have to use drivers/xen/event.s functions to
- * allocate an irq_desc and setup the right */
-
-
-static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
-	int irq, ret, i;
-	struct msi_desc *msidesc;
-	int *v;
-
-	v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
-	if (!v)
-		return -ENOMEM;
-
-	if (type == PCI_CAP_ID_MSIX)
-		ret = xen_pci_frontend_enable_msix(dev, v, nvec);
-	else
-		ret = xen_pci_frontend_enable_msi(dev, v);
-	if (ret)
-		goto error;
-	i = 0;
-	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
-					       (type == PCI_CAP_ID_MSIX) ?
-					       "pcifront-msi-x" :
-					       "pcifront-msi",
-						DOMID_SELF);
-		if (irq < 0)
-			goto free;
-		i++;
-	}
-	kfree(v);
-	return 0;
-
-error:
-	dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
-free:
-	kfree(v);
-	return ret;
-}
-
-static void xen_teardown_msi_irqs(struct pci_dev *dev)
-{
-	struct msi_desc *msidesc;
-
-	msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
-	if (msidesc->msi_attrib.is_msix)
-		xen_pci_frontend_disable_msix(dev);
-	else
-		xen_pci_frontend_disable_msi(dev);
-
-	/* Free the IRQ's and the msidesc using the generic code. */
-	default_teardown_msi_irqs(dev);
-}
-
-static void xen_teardown_msi_irq(unsigned int irq)
-{
-	xen_destroy_irq(irq);
-}
-
 #ifdef CONFIG_XEN_DOM0
 static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
@@ -242,45 +305,28 @@ out:
 	return ret;
 }
 #endif
-#endif
 
-static int xen_pcifront_enable_irq(struct pci_dev *dev)
+static void xen_teardown_msi_irqs(struct pci_dev *dev)
 {
-	int rc;
-	int share = 1;
-	int pirq;
-	u8 gsi;
-
-	rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
-	if (rc < 0) {
-		dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
-			 rc);
-		return rc;
-	}
-
-	rc = xen_allocate_pirq_gsi(gsi);
-	if (rc < 0) {
-		dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
-			 gsi, rc);
-		return rc;
-	}
-	pirq = rc;
+	struct msi_desc *msidesc;
 
-	if (gsi < NR_IRQS_LEGACY)
-		share = 0;
+	msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
+	if (msidesc->msi_attrib.is_msix)
+		xen_pci_frontend_disable_msix(dev);
+	else
+		xen_pci_frontend_disable_msi(dev);
 
-	rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
-	if (rc < 0) {
-		dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
-			 gsi, pirq, rc);
-		return rc;
-	}
+	/* Free the IRQ's and the msidesc using the generic code. */
+	default_teardown_msi_irqs(dev);
+}
 
-	dev->irq = rc;
-	dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
-	return 0;
+static void xen_teardown_msi_irq(unsigned int irq)
+{
+	xen_destroy_irq(irq);
 }
 
+#endif
+
 int __init pci_xen_init(void)
 {
 	if (!xen_pv_domain() || xen_initial_domain())
@@ -327,79 +373,6 @@ int __init pci_xen_hvm_init(void)
 }
 
 #ifdef CONFIG_XEN_DOM0
-static int xen_register_pirq(u32 gsi, int gsi_override, int triggering)
-{
-	int rc, pirq, irq = -1;
-	struct physdev_map_pirq map_irq;
-	int shareable = 0;
-	char *name;
-
-	if (!xen_pv_domain())
-		return -1;
-
-	if (triggering == ACPI_EDGE_SENSITIVE) {
-		shareable = 0;
-		name = "ioapic-edge";
-	} else {
-		shareable = 1;
-		name = "ioapic-level";
-	}
-	pirq = xen_allocate_pirq_gsi(gsi);
-	if (pirq < 0)
-		goto out;
-
-	if (gsi_override >= 0)
-		irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name);
-	else
-		irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
-	if (irq < 0)
-		goto out;
-
-	printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi);
-
-	map_irq.domid = DOMID_SELF;
-	map_irq.type = MAP_PIRQ_TYPE_GSI;
-	map_irq.index = gsi;
-	map_irq.pirq = pirq;
-
-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
-	if (rc) {
-		printk(KERN_WARNING "xen map irq failed %d\n", rc);
-		return -1;
-	}
-
-out:
-	return irq;
-}
-
-static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
-{
-	int rc, irq;
-	struct physdev_setup_gsi setup_gsi;
-
-	if (!xen_pv_domain())
-		return -1;
-
-	printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
-			gsi, triggering, polarity);
-
-	irq = xen_register_pirq(gsi, gsi_override, triggering);
-
-	setup_gsi.gsi = gsi;
-	setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
-	setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-
-	rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
-	if (rc == -EEXIST)
-		printk(KERN_INFO "Already setup the GSI :%d\n", gsi);
-	else if (rc) {
-		printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n",
-				gsi, rc);
-	}
-
-	return irq;
-}
-
 static __init void xen_setup_acpi_sci(void)
 {
 	int rc;
@@ -419,7 +392,7 @@ static __init void xen_setup_acpi_sci(void)
 	}
 	trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
 	polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
-	
+
 	printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
 			"polarity=%d\n", gsi, trigger, polarity);
 
@@ -434,10 +407,9 @@ static __init void xen_setup_acpi_sci(void)
 	 * the ACPI interpreter and keels over since IRQ 9 has not been
 	 * setup as we had setup IRQ 20 for it).
 	 */
-	/* Check whether the GSI != IRQ */
 	if (acpi_gsi_to_irq(gsi, &irq) == 0) {
-		if (irq >= 0 && irq != gsi)
-			/* Bugger, we MUST have that IRQ. */
+		/* Use the provided value if it's valid. */
+		if (irq >= 0)
 			gsi_override = irq;
 	}
 
@@ -447,41 +419,16 @@ static __init void xen_setup_acpi_sci(void)
 	return;
 }
 
-static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
-				 int trigger, int polarity)
+int __init pci_xen_initial_domain(void)
 {
-	return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
-}
+	int irq;
 
-static int __init pci_xen_initial_domain(void)
-{
 #ifdef CONFIG_PCI_MSI
 	x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
 	x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
 #endif
 	xen_setup_acpi_sci();
 	__acpi_register_gsi = acpi_register_gsi_xen;
-
-	return 0;
-}
-
-void __init xen_setup_pirqs(void)
-{
-	int pirq, irq;
-
-	pci_xen_initial_domain();
-
-	if (0 == nr_ioapics) {
-		for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
-			pirq = xen_allocate_pirq_gsi(irq);
-			if (WARN(pirq < 0,
-				 "Could not allocate PIRQ for legacy interrupt\n"))
-				break;
-			irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
-		}
-		return;
-	}
-
 	/* Pre-allocate legacy irqs */
 	for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
 		int trigger, polarity;
@@ -490,12 +437,16 @@ void __init xen_setup_pirqs(void)
 			continue;
 
 		xen_register_pirq(irq, -1 /* no GSI override */,
-			trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
+			trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE,
+			true /* Map GSI to PIRQ */);
 	}
+	if (0 == nr_ioapics) {
+		for (irq = 0; irq < NR_IRQS_LEGACY; irq++)
+			xen_bind_pirq_gsi_to_irq(irq, irq, 0, "xt-pic");
+	}
+	return 0;
 }
-#endif
 
-#ifdef CONFIG_XEN_DOM0
 struct xen_device_domain_owner {
 	domid_t domain;
 	struct pci_dev *dev;
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 899e393d8e7..3ae4128013e 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -51,7 +51,17 @@
 int efi_enabled;
 EXPORT_SYMBOL(efi_enabled);
 
-struct efi efi;
+struct efi __read_mostly efi = {
+	.mps        = EFI_INVALID_TABLE_ADDR,
+	.acpi       = EFI_INVALID_TABLE_ADDR,
+	.acpi20     = EFI_INVALID_TABLE_ADDR,
+	.smbios     = EFI_INVALID_TABLE_ADDR,
+	.sal_systab = EFI_INVALID_TABLE_ADDR,
+	.boot_info  = EFI_INVALID_TABLE_ADDR,
+	.hcdp       = EFI_INVALID_TABLE_ADDR,
+	.uga        = EFI_INVALID_TABLE_ADDR,
+	.uv_systab  = EFI_INVALID_TABLE_ADDR,
+};
 EXPORT_SYMBOL(efi);
 
 struct efi_memory_map memmap;
@@ -79,26 +89,50 @@ early_param("add_efi_memmap", setup_add_efi_memmap);
 
 static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
 {
-	return efi_call_virt2(get_time, tm, tc);
+	unsigned long flags;
+	efi_status_t status;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	status = efi_call_virt2(get_time, tm, tc);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	return status;
 }
 
 static efi_status_t virt_efi_set_time(efi_time_t *tm)
 {
-	return efi_call_virt1(set_time, tm);
+	unsigned long flags;
+	efi_status_t status;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	status = efi_call_virt1(set_time, tm);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	return status;
 }
 
 static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
 					     efi_bool_t *pending,
 					     efi_time_t *tm)
 {
-	return efi_call_virt3(get_wakeup_time,
-			      enabled, pending, tm);
+	unsigned long flags;
+	efi_status_t status;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	status = efi_call_virt3(get_wakeup_time,
+				enabled, pending, tm);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	return status;
 }
 
 static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
 {
-	return efi_call_virt2(set_wakeup_time,
-			      enabled, tm);
+	unsigned long flags;
+	efi_status_t status;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	status = efi_call_virt2(set_wakeup_time,
+				enabled, tm);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	return status;
 }
 
 static efi_status_t virt_efi_get_variable(efi_char16_t *name,
@@ -122,7 +156,7 @@ static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
 
 static efi_status_t virt_efi_set_variable(efi_char16_t *name,
 					  efi_guid_t *vendor,
-					  unsigned long attr,
+					  u32 attr,
 					  unsigned long data_size,
 					  void *data)
 {
@@ -131,6 +165,18 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name,
 			      data_size, data);
 }
 
+static efi_status_t virt_efi_query_variable_info(u32 attr,
+						 u64 *storage_space,
+						 u64 *remaining_space,
+						 u64 *max_variable_size)
+{
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	return efi_call_virt4(query_variable_info, attr, storage_space,
+			      remaining_space, max_variable_size);
+}
+
 static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
 {
 	return efi_call_virt1(get_next_high_mono_count, count);
@@ -145,6 +191,28 @@ static void virt_efi_reset_system(int reset_type,
 		       data_size, data);
 }
 
+static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules,
+					    unsigned long count,
+					    unsigned long sg_list)
+{
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	return efi_call_virt3(update_capsule, capsules, count, sg_list);
+}
+
+static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules,
+						unsigned long count,
+						u64 *max_size,
+						int *reset_type)
+{
+	if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+		return EFI_UNSUPPORTED;
+
+	return efi_call_virt4(query_capsule_caps, capsules, count, max_size,
+			      reset_type);
+}
+
 static efi_status_t __init phys_efi_set_virtual_address_map(
 	unsigned long memory_map_size,
 	unsigned long descriptor_size,
@@ -164,11 +232,14 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
 static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
 					     efi_time_cap_t *tc)
 {
+	unsigned long flags;
 	efi_status_t status;
 
+	spin_lock_irqsave(&rtc_lock, flags);
 	efi_call_phys_prelog();
 	status = efi_call_phys2(efi_phys.get_time, tm, tc);
 	efi_call_phys_epilog();
+	spin_unlock_irqrestore(&rtc_lock, flags);
 	return status;
 }
 
@@ -669,6 +740,9 @@ void __init efi_enter_virtual_mode(void)
 	efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
 	efi.reset_system = virt_efi_reset_system;
 	efi.set_virtual_address_map = NULL;
+	efi.query_variable_info = virt_efi_query_variable_info;
+	efi.update_capsule = virt_efi_update_capsule;
+	efi.query_capsule_caps = virt_efi_query_capsule_caps;
 	if (__supported_pte_mask & _PAGE_NX)
 		runtime_code_page_mkexec();
 	early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
index f61ccdd4934..1ea38775a6d 100644
--- a/arch/x86/platform/mrst/Makefile
+++ b/arch/x86/platform/mrst/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-$(CONFIG_X86_MRST)		+= vrtc.o
 obj-$(CONFIG_EARLY_PRINTK_MRST)	+= early_printk_mrst.o
+obj-$(CONFIG_X86_MRST)		+= pmu.o
diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c
new file mode 100644
index 00000000000..9281da7d91b
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.c
@@ -0,0 +1,817 @@
+/*
+ * mrst/pmu.c - driver for MRST Power Management Unit
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/cpuidle.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/seq_file.h>
+#include <linux/sfi.h>
+#include <asm/intel_scu_ipc.h>
+#include "pmu.h"
+
+#define IPCMSG_FW_REVISION	0xF4
+
+struct mrst_device {
+	u16 pci_dev_num;	/* DEBUG only */
+	u16 lss;
+	u16 latest_request;
+	unsigned int pci_state_counts[PCI_D3cold + 1]; /* DEBUG only */
+};
+
+/*
+ * comlete list of MRST PCI devices
+ */
+static struct mrst_device mrst_devs[] = {
+/*  0 */ { 0x0800, LSS_SPI0 },		/* Moorestown SPI Ctrl 0 */
+/*  1 */ { 0x0801, LSS_SPI1 },		/* Moorestown SPI Ctrl 1 */
+/*  2 */ { 0x0802, LSS_I2C0 },		/* Moorestown I2C 0 */
+/*  3 */ { 0x0803, LSS_I2C1 },		/* Moorestown I2C 1 */
+/*  4 */ { 0x0804, LSS_I2C2 },		/* Moorestown I2C 2 */
+/*  5 */ { 0x0805, LSS_KBD },		/* Moorestown Keyboard Ctrl */
+/*  6 */ { 0x0806, LSS_USB_HC },	/* Moorestown USB Ctrl */
+/*  7 */ { 0x0807, LSS_SD_HC0 },	/* Moorestown SD Host Ctrl 0 */
+/*  8 */ { 0x0808, LSS_SD_HC1 },	/* Moorestown SD Host Ctrl 1 */
+/*  9 */ { 0x0809, LSS_NAND },		/* Moorestown NAND Ctrl */
+/* 10 */ { 0x080a, LSS_AUDIO },		/* Moorestown Audio Ctrl */
+/* 11 */ { 0x080b, LSS_IMAGING },	/* Moorestown ISP */
+/* 12 */ { 0x080c, LSS_SECURITY },	/* Moorestown Security Controller */
+/* 13 */ { 0x080d, LSS_DISPLAY },	/* Moorestown External Displays */
+/* 14 */ { 0x080e, 0 },			/* Moorestown SCU IPC */
+/* 15 */ { 0x080f, LSS_GPIO },		/* Moorestown GPIO Controller */
+/* 16 */ { 0x0810, 0 },			/* Moorestown Power Management Unit */
+/* 17 */ { 0x0811, LSS_USB_OTG },	/* Moorestown OTG Ctrl */
+/* 18 */ { 0x0812, LSS_SPI2 },		/* Moorestown SPI Ctrl 2 */
+/* 19 */ { 0x0813, 0 },			/* Moorestown SC DMA */
+/* 20 */ { 0x0814, LSS_AUDIO_LPE },	/* Moorestown LPE DMA */
+/* 21 */ { 0x0815, LSS_AUDIO_SSP },	/* Moorestown SSP0 */
+
+/* 22 */ { 0x084F, LSS_SD_HC2 },	/* Moorestown SD Host Ctrl 2 */
+
+/* 23 */ { 0x4102, 0 },			/* Lincroft */
+/* 24 */ { 0x4110, 0 },			/* Lincroft */
+};
+
+/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */
+static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0};
+static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803,
+					0x0804, 0x0805, 0x080f, 0};
+
+/* handle concurrent SMP invokations of pmu_pci_set_power_state() */
+static spinlock_t mrst_pmu_power_state_lock;
+
+static unsigned int wake_counters[MRST_NUM_LSS];	/* DEBUG only */
+static unsigned int pmu_irq_stats[INT_INVALID + 1];	/* DEBUG only */
+
+static int graphics_is_off;
+static int lss_s0i3_enabled;
+static bool mrst_pmu_s0i3_enable;
+
+/*  debug counters */
+static u32 pmu_wait_ready_calls;
+static u32 pmu_wait_ready_udelays;
+static u32 pmu_wait_ready_udelays_max;
+static u32 pmu_wait_done_calls;
+static u32 pmu_wait_done_udelays;
+static u32 pmu_wait_done_udelays_max;
+static u32 pmu_set_power_state_entry;
+static u32 pmu_set_power_state_send_cmd;
+
+static struct mrst_device *pci_id_2_mrst_dev(u16 pci_dev_num)
+{
+	int index = 0;
+
+	if ((pci_dev_num >= 0x0800) && (pci_dev_num <= 0x815))
+		index = pci_dev_num - 0x800;
+	else if (pci_dev_num == 0x084F)
+		index = 22;
+	else if (pci_dev_num == 0x4102)
+		index = 23;
+	else if (pci_dev_num == 0x4110)
+		index = 24;
+
+	if (pci_dev_num != mrst_devs[index].pci_dev_num) {
+		WARN_ONCE(1, FW_BUG "Unknown PCI device 0x%04X\n", pci_dev_num);
+		return 0;
+	}
+
+	return &mrst_devs[index];
+}
+
+/**
+ * mrst_pmu_validate_cstates
+ * @dev: cpuidle_device
+ *
+ * Certain states are not appropriate for governor to pick in some cases.
+ * This function will be called as cpuidle_device's prepare callback and
+ * thus tells governor to ignore such states when selecting the next state
+ * to enter.
+ */
+
+#define IDLE_STATE4_IS_C6	4
+#define IDLE_STATE5_IS_S0I3	5
+
+int mrst_pmu_invalid_cstates(void)
+{
+	int cpu = smp_processor_id();
+
+	/*
+	 * Demote to C4 if the PMU is busy.
+	 * Since LSS changes leave the busy bit clear...
+	 * busy means either the PMU is waiting for an ACK-C6 that
+	 * isn't coming due to an MWAIT that returned immediately;
+	 * or we returned from S0i3 successfully, and the PMU
+	 * is not done sending us interrupts.
+	 */
+	if (pmu_read_busy_status())
+		return 1 << IDLE_STATE4_IS_C6 | 1 << IDLE_STATE5_IS_S0I3;
+
+	/*
+	 * Disallow S0i3 if: PMU is not initialized, or CPU1 is active,
+	 * or if device LSS is insufficient, or the GPU is active,
+	 * or if it has been explicitly disabled.
+	 */
+	if (!pmu_reg || !cpumask_equal(cpu_online_mask, cpumask_of(cpu)) ||
+	    !lss_s0i3_enabled || !graphics_is_off || !mrst_pmu_s0i3_enable)
+		return 1 << IDLE_STATE5_IS_S0I3;
+	else
+		return 0;
+}
+
+/*
+ * pmu_update_wake_counters(): read PM_WKS, update wake_counters[]
+ * DEBUG only.
+ */
+static void pmu_update_wake_counters(void)
+{
+	int lss;
+	u32 wake_status;
+
+	wake_status = pmu_read_wks();
+
+	for (lss = 0; lss < MRST_NUM_LSS; ++lss) {
+		if (wake_status & (1 << lss))
+			wake_counters[lss]++;
+	}
+}
+
+int mrst_pmu_s0i3_entry(void)
+{
+	int status;
+
+	/* Clear any possible error conditions */
+	pmu_write_ics(0x300);
+
+	/* set wake control to current D-states */
+	pmu_write_wssc(S0I3_SSS_TARGET);
+
+	status = mrst_s0i3_entry(PM_S0I3_COMMAND, &pmu_reg->pm_cmd);
+	pmu_update_wake_counters();
+	return status;
+}
+
+/* poll for maximum of 5ms for busy bit to clear */
+static int pmu_wait_ready(void)
+{
+	int udelays;
+
+	pmu_wait_ready_calls++;
+
+	for (udelays = 0; udelays < 500; ++udelays) {
+		if (udelays > pmu_wait_ready_udelays_max)
+			pmu_wait_ready_udelays_max = udelays;
+
+		if (pmu_read_busy_status() == 0)
+			return 0;
+
+		udelay(10);
+		pmu_wait_ready_udelays++;
+	}
+
+	/*
+	 * if this fires, observe
+	 * /sys/kernel/debug/mrst_pmu_wait_ready_calls
+	 * /sys/kernel/debug/mrst_pmu_wait_ready_udelays
+	 */
+	WARN_ONCE(1, "SCU not ready for 5ms");
+	return -EBUSY;
+}
+/* poll for maximum of 50ms us for busy bit to clear */
+static int pmu_wait_done(void)
+{
+	int udelays;
+
+	pmu_wait_done_calls++;
+
+	for (udelays = 0; udelays < 500; ++udelays) {
+		if (udelays > pmu_wait_done_udelays_max)
+			pmu_wait_done_udelays_max = udelays;
+
+		if (pmu_read_busy_status() == 0)
+			return 0;
+
+		udelay(100);
+		pmu_wait_done_udelays++;
+	}
+
+	/*
+	 * if this fires, observe
+	 * /sys/kernel/debug/mrst_pmu_wait_done_calls
+	 * /sys/kernel/debug/mrst_pmu_wait_done_udelays
+	 */
+	WARN_ONCE(1, "SCU not done for 50ms");
+	return -EBUSY;
+}
+
+u32 mrst_pmu_msi_is_disabled(void)
+{
+	return pmu_msi_is_disabled();
+}
+
+void mrst_pmu_enable_msi(void)
+{
+	pmu_msi_enable();
+}
+
+/**
+ * pmu_irq - pmu driver interrupt handler
+ * Context: interrupt context
+ */
+static irqreturn_t pmu_irq(int irq, void *dummy)
+{
+	union pmu_pm_ics pmu_ics;
+
+	pmu_ics.value = pmu_read_ics();
+
+	if (!pmu_ics.bits.pending)
+		return IRQ_NONE;
+
+	switch (pmu_ics.bits.cause) {
+	case INT_SPURIOUS:
+	case INT_CMD_DONE:
+	case INT_CMD_ERR:
+	case INT_WAKE_RX:
+	case INT_SS_ERROR:
+	case INT_S0IX_MISS:
+	case INT_NO_ACKC6:
+		pmu_irq_stats[pmu_ics.bits.cause]++;
+		break;
+	default:
+		pmu_irq_stats[INT_INVALID]++;
+	}
+
+	pmu_write_ics(pmu_ics.value); /* Clear pending interrupt */
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Translate PCI power management to MRST LSS D-states
+ */
+static int pci_2_mrst_state(int lss, pci_power_t pci_state)
+{
+	switch (pci_state) {
+	case PCI_D0:
+		if (SSMSK(D0i1, lss) & D0I1_ACG_SSS_TARGET)
+			return D0i1;
+		else
+			return D0;
+	case PCI_D1:
+		return D0i1;
+	case PCI_D2:
+		return D0i2;
+	case PCI_D3hot:
+	case PCI_D3cold:
+		return D0i3;
+	default:
+		WARN(1, "pci_state %d\n", pci_state);
+		return 0;
+	}
+}
+
+static int pmu_issue_command(u32 pm_ssc)
+{
+	union pmu_pm_set_cfg_cmd_t command;
+
+	if (pmu_read_busy_status()) {
+		pr_debug("pmu is busy, Operation not permitted\n");
+		return -1;
+	}
+
+	/*
+	 * enable interrupts in PMU so that interrupts are
+	 * propagated when ioc bit for a particular set
+	 * command is set
+	 */
+
+	pmu_irq_enable();
+
+	/* Configure the sub systems for pmu2 */
+
+	pmu_write_ssc(pm_ssc);
+
+	/*
+	 * Send the set config command for pmu its configured
+	 * for mode CM_IMMEDIATE & hence with No Trigger
+	 */
+
+	command.pmu2_params.d_param.cfg_mode = CM_IMMEDIATE;
+	command.pmu2_params.d_param.cfg_delay = 0;
+	command.pmu2_params.d_param.rsvd = 0;
+
+	/* construct the command to send SET_CFG to particular PMU */
+	command.pmu2_params.d_param.cmd = SET_CFG_CMD;
+	command.pmu2_params.d_param.ioc = 0;
+	command.pmu2_params.d_param.mode_id = 0;
+	command.pmu2_params.d_param.sys_state = SYS_STATE_S0I0;
+
+	/* write the value of PM_CMD into particular PMU */
+	pr_debug("pmu command being written %x\n",
+			command.pmu_pm_set_cfg_cmd_value);
+
+	pmu_write_cmd(command.pmu_pm_set_cfg_cmd_value);
+
+	return 0;
+}
+
+static u16 pmu_min_lss_pci_req(u16 *ids, u16 pci_state)
+{
+	u16 existing_request;
+	int i;
+
+	for (i = 0; ids[i]; ++i) {
+		struct mrst_device *mrst_dev;
+
+		mrst_dev = pci_id_2_mrst_dev(ids[i]);
+		if (unlikely(!mrst_dev))
+			continue;
+
+		existing_request = mrst_dev->latest_request;
+		if (existing_request < pci_state)
+			pci_state = existing_request;
+	}
+	return pci_state;
+}
+
+/**
+ * pmu_pci_set_power_state - Callback function is used by all the PCI devices
+ *			for a platform  specific device power on/shutdown.
+ */
+
+int pmu_pci_set_power_state(struct pci_dev *pdev, pci_power_t pci_state)
+{
+	u32 old_sss, new_sss;
+	int status = 0;
+	struct mrst_device *mrst_dev;
+
+	pmu_set_power_state_entry++;
+
+	BUG_ON(pdev->vendor != PCI_VENDOR_ID_INTEL);
+	BUG_ON(pci_state < PCI_D0 || pci_state > PCI_D3cold);
+
+	mrst_dev = pci_id_2_mrst_dev(pdev->device);
+	if (unlikely(!mrst_dev))
+		return -ENODEV;
+
+	mrst_dev->pci_state_counts[pci_state]++;	/* count invocations */
+
+	/* PMU driver calls self as part of PCI initialization, ignore */
+	if (pdev->device == PCI_DEV_ID_MRST_PMU)
+		return 0;
+
+	BUG_ON(!pmu_reg); /* SW bug if called before initialized */
+
+	spin_lock(&mrst_pmu_power_state_lock);
+
+	if (pdev->d3_delay) {
+		dev_dbg(&pdev->dev, "d3_delay %d, should be 0\n",
+			pdev->d3_delay);
+		pdev->d3_delay = 0;
+	}
+	/*
+	 * If Lincroft graphics, simply remember state
+	 */
+	if ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY
+		&& !((pdev->class & PCI_SUB_CLASS_MASK) >> 8)) {
+		if (pci_state == PCI_D0)
+			graphics_is_off = 0;
+		else
+			graphics_is_off = 1;
+		goto ret;
+	}
+
+	if (!mrst_dev->lss)
+		goto ret;	/* device with no LSS */
+
+	if (mrst_dev->latest_request == pci_state)
+		goto ret;	/* no change */
+
+	mrst_dev->latest_request = pci_state;	/* record latest request */
+
+	/*
+	 * LSS9 and LSS10 contain multiple PCI devices.
+	 * Use the lowest numbered (highest power) state in the LSS
+	 */
+	if (mrst_dev->lss == 9)
+		pci_state = pmu_min_lss_pci_req(mrst_lss9_pci_ids, pci_state);
+	else if (mrst_dev->lss == 10)
+		pci_state = pmu_min_lss_pci_req(mrst_lss10_pci_ids, pci_state);
+
+	status = pmu_wait_ready();
+	if (status)
+		goto ret;
+
+	old_sss = pmu_read_sss();
+	new_sss = old_sss & ~SSMSK(3, mrst_dev->lss);
+	new_sss |= SSMSK(pci_2_mrst_state(mrst_dev->lss, pci_state),
+			mrst_dev->lss);
+
+	if (new_sss == old_sss)
+		goto ret;	/* nothing to do */
+
+	pmu_set_power_state_send_cmd++;
+
+	status = pmu_issue_command(new_sss);
+
+	if (unlikely(status != 0)) {
+		dev_err(&pdev->dev, "Failed to Issue a PM command\n");
+		goto ret;
+	}
+
+	if (pmu_wait_done())
+		goto ret;
+
+	lss_s0i3_enabled =
+	((pmu_read_sss() & S0I3_SSS_TARGET) == S0I3_SSS_TARGET);
+ret:
+	spin_unlock(&mrst_pmu_power_state_lock);
+	return status;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static char *d0ix_names[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static inline const char *d0ix_name(int state)
+{
+	return d0ix_names[(int) state];
+}
+
+static int debug_mrst_pmu_show(struct seq_file *s, void *unused)
+{
+	struct pci_dev *pdev = NULL;
+	u32 cur_pmsss;
+	int lss;
+
+	seq_printf(s, "0x%08X D0I1_ACG_SSS_TARGET\n", D0I1_ACG_SSS_TARGET);
+
+	cur_pmsss = pmu_read_sss();
+
+	seq_printf(s, "0x%08X S0I3_SSS_TARGET\n", S0I3_SSS_TARGET);
+
+	seq_printf(s, "0x%08X Current SSS ", cur_pmsss);
+	seq_printf(s, lss_s0i3_enabled ? "\n" : "[BLOCKS s0i3]\n");
+
+	if (cpumask_equal(cpu_online_mask, cpumask_of(0)))
+		seq_printf(s, "cpu0 is only cpu online\n");
+	else
+		seq_printf(s, "cpu0 is NOT only cpu online [BLOCKS S0i3]\n");
+
+	seq_printf(s, "GFX: %s\n", graphics_is_off ? "" : "[BLOCKS s0i3]");
+
+
+	for_each_pci_dev(pdev) {
+		int pos;
+		u16 pmcsr;
+		struct mrst_device *mrst_dev;
+		int i;
+
+		mrst_dev = pci_id_2_mrst_dev(pdev->device);
+
+		seq_printf(s, "%s %04x/%04X %-16.16s ",
+			dev_name(&pdev->dev),
+			pdev->vendor, pdev->device,
+			dev_driver_string(&pdev->dev));
+
+		if (unlikely (!mrst_dev)) {
+			seq_printf(s, " UNKNOWN\n");
+			continue;
+		}
+
+		if (mrst_dev->lss)
+			seq_printf(s, "LSS %2d %-4s ", mrst_dev->lss,
+				d0ix_name(((cur_pmsss >>
+					(mrst_dev->lss * 2)) & 0x3)));
+		else
+			seq_printf(s, "            ");
+
+		/* PCI PM config space setting */
+		pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+		if (pos != 0) {
+			pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+		seq_printf(s, "PCI-%-4s",
+			pci_power_name(pmcsr & PCI_PM_CTRL_STATE_MASK));
+		} else {
+			seq_printf(s, "        ");
+		}
+
+		seq_printf(s, " %s ", pci_power_name(mrst_dev->latest_request));
+		for (i = 0; i <= PCI_D3cold; ++i)
+			seq_printf(s, "%d ", mrst_dev->pci_state_counts[i]);
+
+		if (mrst_dev->lss) {
+			unsigned int lssmask;
+
+			lssmask = SSMSK(D0i3, mrst_dev->lss);
+
+			if ((lssmask & S0I3_SSS_TARGET) &&
+				((lssmask & cur_pmsss) !=
+					(lssmask & S0I3_SSS_TARGET)))
+						seq_printf(s , "[BLOCKS s0i3]");
+		}
+
+		seq_printf(s, "\n");
+	}
+	seq_printf(s, "Wake Counters:\n");
+	for (lss = 0; lss < MRST_NUM_LSS; ++lss)
+		seq_printf(s, "LSS%d %d\n", lss, wake_counters[lss]);
+
+	seq_printf(s, "Interrupt Counters:\n");
+	seq_printf(s,
+		"INT_SPURIOUS \t%8u\n" "INT_CMD_DONE \t%8u\n"
+		"INT_CMD_ERR  \t%8u\n" "INT_WAKE_RX  \t%8u\n"
+		"INT_SS_ERROR \t%8u\n" "INT_S0IX_MISS\t%8u\n"
+		"INT_NO_ACKC6 \t%8u\n" "INT_INVALID  \t%8u\n",
+		pmu_irq_stats[INT_SPURIOUS], pmu_irq_stats[INT_CMD_DONE],
+		pmu_irq_stats[INT_CMD_ERR], pmu_irq_stats[INT_WAKE_RX],
+		pmu_irq_stats[INT_SS_ERROR], pmu_irq_stats[INT_S0IX_MISS],
+		pmu_irq_stats[INT_NO_ACKC6], pmu_irq_stats[INT_INVALID]);
+
+	seq_printf(s, "mrst_pmu_wait_ready_calls          %8d\n",
+			pmu_wait_ready_calls);
+	seq_printf(s, "mrst_pmu_wait_ready_udelays        %8d\n",
+			pmu_wait_ready_udelays);
+	seq_printf(s, "mrst_pmu_wait_ready_udelays_max    %8d\n",
+			pmu_wait_ready_udelays_max);
+	seq_printf(s, "mrst_pmu_wait_done_calls           %8d\n",
+			pmu_wait_done_calls);
+	seq_printf(s, "mrst_pmu_wait_done_udelays         %8d\n",
+			pmu_wait_done_udelays);
+	seq_printf(s, "mrst_pmu_wait_done_udelays_max     %8d\n",
+			pmu_wait_done_udelays_max);
+	seq_printf(s, "mrst_pmu_set_power_state_entry     %8d\n",
+			pmu_set_power_state_entry);
+	seq_printf(s, "mrst_pmu_set_power_state_send_cmd  %8d\n",
+			pmu_set_power_state_send_cmd);
+	seq_printf(s, "SCU busy: %d\n", pmu_read_busy_status());
+
+	return 0;
+}
+
+static int debug_mrst_pmu_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, debug_mrst_pmu_show, NULL);
+}
+
+static const struct file_operations devices_state_operations = {
+	.open		= debug_mrst_pmu_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif	/* DEBUG_FS */
+
+/*
+ * Validate SCU PCI shim PCI vendor capability byte
+ * against LSS hard-coded in mrst_devs[] above.
+ * DEBUG only.
+ */
+static void pmu_scu_firmware_debug(void)
+{
+	struct pci_dev *pdev = NULL;
+
+	for_each_pci_dev(pdev) {
+		struct mrst_device *mrst_dev;
+		u8 pci_config_lss;
+		int pos;
+
+		mrst_dev = pci_id_2_mrst_dev(pdev->device);
+		if (unlikely(!mrst_dev)) {
+			printk(KERN_ERR FW_BUG "pmu: Unknown "
+				"PCI device 0x%04X\n", pdev->device);
+			continue;
+		}
+
+		if (mrst_dev->lss == 0)
+			continue;	 /* no LSS in our table */
+
+		pos = pci_find_capability(pdev, PCI_CAP_ID_VNDR);
+		if (!pos != 0) {
+			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+				"missing PCI Vendor Capability\n",
+				pdev->device);
+			continue;
+		}
+		pci_read_config_byte(pdev, pos + 4, &pci_config_lss);
+		if (!(pci_config_lss & PCI_VENDOR_CAP_LOG_SS_MASK)) {
+			printk(KERN_ERR FW_BUG "pmu: 0x%04X "
+				"invalid PCI Vendor Capability 0x%x "
+				" expected LSS 0x%X\n",
+				pdev->device, pci_config_lss, mrst_dev->lss);
+			continue;
+		}
+		pci_config_lss &= PCI_VENDOR_CAP_LOG_ID_MASK;
+
+		if (mrst_dev->lss == pci_config_lss)
+			continue;
+
+		printk(KERN_ERR FW_BUG "pmu: 0x%04X LSS = %d, expected %d\n",
+			pdev->device, pci_config_lss, mrst_dev->lss);
+	}
+}
+
+/**
+ * pmu_probe
+ */
+static int __devinit pmu_probe(struct pci_dev *pdev,
+				   const struct pci_device_id *pci_id)
+{
+	int ret;
+	struct mrst_pmu_reg *pmu;
+
+	/* Init the device */
+	ret = pci_enable_device(pdev);
+	if (ret) {
+		dev_err(&pdev->dev, "Unable to Enable PCI device\n");
+		return ret;
+	}
+
+	ret = pci_request_regions(pdev, MRST_PMU_DRV_NAME);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Cannot obtain PCI resources, aborting\n");
+		goto out_err1;
+	}
+
+	/* Map the memory of PMU reg base */
+	pmu = pci_iomap(pdev, 0, 0);
+	if (!pmu) {
+		dev_err(&pdev->dev, "Unable to map the PMU address space\n");
+		ret = -ENOMEM;
+		goto out_err2;
+	}
+
+#ifdef CONFIG_DEBUG_FS
+	/* /sys/kernel/debug/mrst_pmu */
+	(void) debugfs_create_file("mrst_pmu", S_IFREG | S_IRUGO,
+				NULL, NULL, &devices_state_operations);
+#endif
+	pmu_reg = pmu;	/* success */
+
+	if (request_irq(pdev->irq, pmu_irq, 0, MRST_PMU_DRV_NAME, NULL)) {
+		dev_err(&pdev->dev, "Registering isr has failed\n");
+		ret = -1;
+		goto out_err3;
+	}
+
+	pmu_scu_firmware_debug();
+
+	pmu_write_wkc(S0I3_WAKE_SOURCES);	/* Enable S0i3 wakeup sources */
+
+	pmu_wait_ready();
+
+	pmu_write_ssc(D0I1_ACG_SSS_TARGET);	/* Enable Auto-Clock_Gating */
+	pmu_write_cmd(0x201);
+
+	spin_lock_init(&mrst_pmu_power_state_lock);
+
+	/* Enable the hardware interrupt */
+	pmu_irq_enable();
+	return 0;
+
+out_err3:
+	free_irq(pdev->irq, NULL);
+	pci_iounmap(pdev, pmu_reg);
+	pmu_reg = NULL;
+out_err2:
+	pci_release_region(pdev, 0);
+out_err1:
+	pci_disable_device(pdev);
+	return ret;
+}
+
+static void __devexit pmu_remove(struct pci_dev *pdev)
+{
+	dev_err(&pdev->dev, "Mid PM pmu_remove called\n");
+
+	/* Freeing up the irq */
+	free_irq(pdev->irq, NULL);
+
+	pci_iounmap(pdev, pmu_reg);
+	pmu_reg = NULL;
+
+	/* disable the current PCI device */
+	pci_release_region(pdev, 0);
+	pci_disable_device(pdev);
+}
+
+static DEFINE_PCI_DEVICE_TABLE(pmu_pci_ids) = {
+	{ PCI_VDEVICE(INTEL, PCI_DEV_ID_MRST_PMU), 0 },
+	{ }
+};
+
+MODULE_DEVICE_TABLE(pci, pmu_pci_ids);
+
+static struct pci_driver driver = {
+	.name = MRST_PMU_DRV_NAME,
+	.id_table = pmu_pci_ids,
+	.probe = pmu_probe,
+	.remove = __devexit_p(pmu_remove),
+};
+
+/**
+ * pmu_pci_register - register the PMU driver as PCI device
+ */
+static int __init pmu_pci_register(void)
+{
+	return pci_register_driver(&driver);
+}
+
+/* Register and probe via fs_initcall() to preceed device_initcall() */
+fs_initcall(pmu_pci_register);
+
+static void __exit mid_pci_cleanup(void)
+{
+	pci_unregister_driver(&driver);
+}
+
+static int ia_major;
+static int ia_minor;
+
+static int pmu_sfi_parse_oem(struct sfi_table_header *table)
+{
+	struct sfi_table_simple *sb;
+
+	sb = (struct sfi_table_simple *)table;
+	ia_major = (sb->pentry[1] >> 0) & 0xFFFF;
+	ia_minor = (sb->pentry[1] >> 16) & 0xFFFF;
+	printk(KERN_INFO "mrst_pmu: IA FW version v%x.%x\n",
+		ia_major, ia_minor);
+
+	return 0;
+}
+
+static int __init scu_fw_check(void)
+{
+	int ret;
+	u32 fw_version;
+
+	if (!pmu_reg)
+		return 0;	/* this driver didn't probe-out */
+
+	sfi_table_parse("OEMB", NULL, NULL, pmu_sfi_parse_oem);
+
+	if (ia_major < 0x6005 || ia_minor < 0x1525) {
+		WARN(1, "mrst_pmu: IA FW version too old\n");
+		return -1;
+	}
+
+	ret = intel_scu_ipc_command(IPCMSG_FW_REVISION, 0, NULL, 0,
+					&fw_version, 1);
+
+	if (ret) {
+		WARN(1, "mrst_pmu: IPC FW version? %d\n", ret);
+	} else {
+		int scu_major = (fw_version >> 8) & 0xFF;
+		int scu_minor = (fw_version >> 0) & 0xFF;
+
+		printk(KERN_INFO "mrst_pmu: firmware v%x\n", fw_version);
+
+		if ((scu_major >= 0xC0) && (scu_minor >= 0x49)) {
+			printk(KERN_INFO "mrst_pmu: enabling S0i3\n");
+			mrst_pmu_s0i3_enable = true;
+		} else {
+			WARN(1, "mrst_pmu: S0i3 disabled, old firmware %X.%X",
+					scu_major, scu_minor);
+		}
+	}
+	return 0;
+}
+late_initcall(scu_fw_check);
+module_exit(mid_pci_cleanup);
diff --git a/arch/x86/platform/mrst/pmu.h b/arch/x86/platform/mrst/pmu.h
new file mode 100644
index 00000000000..bfbfe64b167
--- /dev/null
+++ b/arch/x86/platform/mrst/pmu.h
@@ -0,0 +1,234 @@
+/*
+ * mrst/pmu.h - private definitions for MRST Power Management Unit mrst/pmu.c
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#ifndef _MRST_PMU_H_
+#define _MRST_PMU_H_
+
+#define PCI_DEV_ID_MRST_PMU		0x0810
+#define MRST_PMU_DRV_NAME		"mrst_pmu"
+#define	PCI_SUB_CLASS_MASK		0xFF00
+
+#define	PCI_VENDOR_CAP_LOG_ID_MASK	0x7F
+#define PCI_VENDOR_CAP_LOG_SS_MASK	0x80
+
+#define SUB_SYS_ALL_D0I1	0x01155555
+#define S0I3_WAKE_SOURCES	0x00001FFF
+
+#define PM_S0I3_COMMAND					\
+	((0 << 31) |	/* Reserved */			\
+	(0 << 30) |	/* Core must be idle */		\
+	(0xc2 << 22) |	/* ACK C6 trigger */		\
+	(3 << 19) |	/* Trigger on DMI message */	\
+	(3 << 16) |	/* Enter S0i3 */		\
+	(0 << 13) |	/* Numeric mode ID (sw) */	\
+	(3 << 9) |	/* Trigger mode */		\
+	(0 << 8) |	/* Do not interrupt */		\
+	(1 << 0))	/* Set configuration */
+
+#define	LSS_DMI		0
+#define	LSS_SD_HC0	1
+#define	LSS_SD_HC1	2
+#define	LSS_NAND	3
+#define	LSS_IMAGING	4
+#define	LSS_SECURITY	5
+#define	LSS_DISPLAY	6
+#define	LSS_USB_HC	7
+#define	LSS_USB_OTG	8
+#define	LSS_AUDIO	9
+#define	LSS_AUDIO_LPE	9
+#define	LSS_AUDIO_SSP	9
+#define	LSS_I2C0	10
+#define	LSS_I2C1	10
+#define	LSS_I2C2	10
+#define	LSS_KBD		10
+#define	LSS_SPI0	10
+#define	LSS_SPI1	10
+#define	LSS_SPI2	10
+#define	LSS_GPIO	10
+#define	LSS_SRAM	11	/* used by SCU, do not touch */
+#define	LSS_SD_HC2	12
+/* LSS hardware bits 15,14,13 are hardwired to 0, thus unusable */
+#define MRST_NUM_LSS	13
+
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+#define	SSMSK(mask, lss) ((mask) << ((lss) * 2))
+#define	D0	0
+#define	D0i1	1
+#define	D0i2	2
+#define	D0i3	3
+
+#define S0I3_SSS_TARGET	(		\
+	SSMSK(D0i1, LSS_DMI) |		\
+	SSMSK(D0i3, LSS_SD_HC0) |	\
+	SSMSK(D0i3, LSS_SD_HC1) |	\
+	SSMSK(D0i3, LSS_NAND) |		\
+	SSMSK(D0i3, LSS_SD_HC2) |	\
+	SSMSK(D0i3, LSS_IMAGING) |	\
+	SSMSK(D0i3, LSS_SECURITY) |	\
+	SSMSK(D0i3, LSS_DISPLAY) |	\
+	SSMSK(D0i3, LSS_USB_HC) |	\
+	SSMSK(D0i3, LSS_USB_OTG) |	\
+	SSMSK(D0i3, LSS_AUDIO) |	\
+	SSMSK(D0i1, LSS_I2C0))
+
+/*
+ * D0i1 on Langwell is Autonomous Clock Gating (ACG).
+ * Enable ACG on every LSS except camera and audio
+ */
+#define D0I1_ACG_SSS_TARGET	 \
+	(SUB_SYS_ALL_D0I1 & ~SSMSK(D0i1, LSS_IMAGING) & ~SSMSK(D0i1, LSS_AUDIO))
+
+enum cm_mode {
+	CM_NOP,			/* ignore the config mode value */
+	CM_IMMEDIATE,
+	CM_DELAY,
+	CM_TRIGGER,
+	CM_INVALID
+};
+
+enum sys_state {
+	SYS_STATE_S0I0,
+	SYS_STATE_S0I1,
+	SYS_STATE_S0I2,
+	SYS_STATE_S0I3,
+	SYS_STATE_S3,
+	SYS_STATE_S5
+};
+
+#define SET_CFG_CMD	1
+
+enum int_status {
+	INT_SPURIOUS = 0,
+	INT_CMD_DONE = 1,
+	INT_CMD_ERR = 2,
+	INT_WAKE_RX = 3,
+	INT_SS_ERROR = 4,
+	INT_S0IX_MISS = 5,
+	INT_NO_ACKC6 = 6,
+	INT_INVALID = 7,
+};
+
+/* PMU register interface */
+static struct mrst_pmu_reg {
+	u32 pm_sts;		/* 0x00 */
+	u32 pm_cmd;		/* 0x04 */
+	u32 pm_ics;		/* 0x08 */
+	u32 _resv1;		/* 0x0C */
+	u32 pm_wkc[2];		/* 0x10 */
+	u32 pm_wks[2];		/* 0x18 */
+	u32 pm_ssc[4];		/* 0x20 */
+	u32 pm_sss[4];		/* 0x30 */
+	u32 pm_wssc[4];		/* 0x40 */
+	u32 pm_c3c4;		/* 0x50 */
+	u32 pm_c5c6;		/* 0x54 */
+	u32 pm_msi_disable;	/* 0x58 */
+} *pmu_reg;
+
+static inline u32 pmu_read_sts(void) { return readl(&pmu_reg->pm_sts); }
+static inline u32 pmu_read_ics(void) { return readl(&pmu_reg->pm_ics); }
+static inline u32 pmu_read_wks(void) { return readl(&pmu_reg->pm_wks[0]); }
+static inline u32 pmu_read_sss(void) { return readl(&pmu_reg->pm_sss[0]); }
+
+static inline void pmu_write_cmd(u32 arg) { writel(arg, &pmu_reg->pm_cmd); }
+static inline void pmu_write_ics(u32 arg) { writel(arg, &pmu_reg->pm_ics); }
+static inline void pmu_write_wkc(u32 arg) { writel(arg, &pmu_reg->pm_wkc[0]); }
+static inline void pmu_write_ssc(u32 arg) { writel(arg, &pmu_reg->pm_ssc[0]); }
+static inline void pmu_write_wssc(u32 arg)
+					{ writel(arg, &pmu_reg->pm_wssc[0]); }
+
+static inline void pmu_msi_enable(void) { writel(0, &pmu_reg->pm_msi_disable); }
+static inline u32 pmu_msi_is_disabled(void)
+				{ return readl(&pmu_reg->pm_msi_disable); }
+
+union pmu_pm_ics {
+	struct {
+		u32 cause:8;
+		u32 enable:1;
+		u32 pending:1;
+		u32 reserved:22;
+	} bits;
+	u32 value;
+};
+
+static inline void pmu_irq_enable(void)
+{
+	union pmu_pm_ics pmu_ics;
+
+	pmu_ics.value = pmu_read_ics();
+	pmu_ics.bits.enable = 1;
+	pmu_write_ics(pmu_ics.value);
+}
+
+union pmu_pm_status {
+	struct {
+		u32 pmu_rev:8;
+		u32 pmu_busy:1;
+		u32 mode_id:4;
+		u32 Reserved:19;
+	} pmu_status_parts;
+	u32 pmu_status_value;
+};
+
+static inline int pmu_read_busy_status(void)
+{
+	union pmu_pm_status result;
+
+	result.pmu_status_value = pmu_read_sts();
+
+	return result.pmu_status_parts.pmu_busy;
+}
+
+/* pmu set config parameters */
+struct cfg_delay_param_t {
+	u32 cmd:8;
+	u32 ioc:1;
+	u32 cfg_mode:4;
+	u32 mode_id:3;
+	u32 sys_state:3;
+	u32 cfg_delay:8;
+	u32 rsvd:5;
+};
+
+struct cfg_trig_param_t {
+	u32 cmd:8;
+	u32 ioc:1;
+	u32 cfg_mode:4;
+	u32 mode_id:3;
+	u32 sys_state:3;
+	u32 cfg_trig_type:3;
+	u32 cfg_trig_val:8;
+	u32 cmbi:1;
+	u32 rsvd1:1;
+};
+
+union pmu_pm_set_cfg_cmd_t {
+	union {
+		struct cfg_delay_param_t d_param;
+		struct cfg_trig_param_t t_param;
+	} pmu2_params;
+	u32 pmu_pm_set_cfg_cmd_value;
+};
+
+#ifdef FUTURE_PATCH
+extern int mrst_s0i3_entry(u32 regval, u32 *regaddr);
+#else
+static inline int mrst_s0i3_entry(u32 regval, u32 *regaddr) { return -1; }
+#endif
+#endif
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index 81c5e2165c2..fd332c53394 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,2 +1,5 @@
 obj-$(CONFIG_OLPC)		+= olpc.o olpc_ofw.o olpc_dt.o
-obj-$(CONFIG_OLPC_XO1)		+= olpc-xo1.o
+obj-$(CONFIG_OLPC_XO1_PM)		+= olpc-xo1-pm.o xo1-wakeup.o
+obj-$(CONFIG_OLPC_XO1_RTC)	+= olpc-xo1-rtc.o
+obj-$(CONFIG_OLPC_XO1_SCI)	+= olpc-xo1-sci.o
+obj-$(CONFIG_OLPC_XO15_SCI)	+= olpc-xo15-sci.o
diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c
new file mode 100644
index 00000000000..6f3855a5a2f
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-pm.c
@@ -0,0 +1,215 @@
+/*
+ * Support for power management features of the OLPC XO-1 laptop
+ *
+ * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/cs5535.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/mfd/core.h>
+#include <linux/suspend.h>
+
+#include <asm/io.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME "olpc-xo1-pm"
+
+static unsigned long acpi_base;
+static unsigned long pms_base;
+
+static u16 wakeup_mask = CS5536_PM_PWRBTN;
+
+static struct {
+	unsigned long address;
+	unsigned short segment;
+} ofw_bios_entry = { 0xF0000 + PAGE_OFFSET, __KERNEL_CS };
+
+/* Set bits in the wakeup mask */
+void olpc_xo1_pm_wakeup_set(u16 value)
+{
+	wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_set);
+
+/* Clear bits in the wakeup mask */
+void olpc_xo1_pm_wakeup_clear(u16 value)
+{
+	wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_xo1_pm_wakeup_clear);
+
+static int xo1_power_state_enter(suspend_state_t pm_state)
+{
+	unsigned long saved_sci_mask;
+	int r;
+
+	/* Only STR is supported */
+	if (pm_state != PM_SUSPEND_MEM)
+		return -EINVAL;
+
+	r = olpc_ec_cmd(EC_SET_SCI_INHIBIT, NULL, 0, NULL, 0);
+	if (r)
+		return r;
+
+	/*
+	 * Save SCI mask (this gets lost since PM1_EN is used as a mask for
+	 * wakeup events, which is not necessarily the same event set)
+	 */
+	saved_sci_mask = inl(acpi_base + CS5536_PM1_STS);
+	saved_sci_mask &= 0xffff0000;
+
+	/* Save CPU state */
+	do_olpc_suspend_lowlevel();
+
+	/* Resume path starts here */
+
+	/* Restore SCI mask (using dword access to CS5536_PM1_EN) */
+	outl(saved_sci_mask, acpi_base + CS5536_PM1_STS);
+
+	/* Tell the EC to stop inhibiting SCIs */
+	olpc_ec_cmd(EC_SET_SCI_INHIBIT_RELEASE, NULL, 0, NULL, 0);
+
+	/*
+	 * Tell the wireless module to restart USB communication.
+	 * Must be done twice.
+	 */
+	olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+	olpc_ec_cmd(EC_WAKE_UP_WLAN, NULL, 0, NULL, 0);
+
+	return 0;
+}
+
+asmlinkage int xo1_do_sleep(u8 sleep_state)
+{
+	void *pgd_addr = __va(read_cr3());
+
+	/* Program wakeup mask (using dword access to CS5536_PM1_EN) */
+	outl(wakeup_mask << 16, acpi_base + CS5536_PM1_STS);
+
+	__asm__("movl %0,%%eax" : : "r" (pgd_addr));
+	__asm__("call *(%%edi); cld"
+		: : "D" (&ofw_bios_entry));
+	__asm__("movb $0x34, %al\n\t"
+		"outb %al, $0x70\n\t"
+		"movb $0x30, %al\n\t"
+		"outb %al, $0x71\n\t");
+	return 0;
+}
+
+static void xo1_power_off(void)
+{
+	printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
+
+	/* Enable all of these controls with 0 delay */
+	outl(0x40000000, pms_base + CS5536_PM_SCLK);
+	outl(0x40000000, pms_base + CS5536_PM_IN_SLPCTL);
+	outl(0x40000000, pms_base + CS5536_PM_WKXD);
+	outl(0x40000000, pms_base + CS5536_PM_WKD);
+
+	/* Clear status bits (possibly unnecessary) */
+	outl(0x0002ffff, pms_base  + CS5536_PM_SSC);
+	outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+
+	/* Write SLP_EN bit to start the machinery */
+	outl(0x00002000, acpi_base + CS5536_PM1_CNT);
+}
+
+static int xo1_power_state_valid(suspend_state_t pm_state)
+{
+	/* suspend-to-RAM only */
+	return pm_state == PM_SUSPEND_MEM;
+}
+
+static const struct platform_suspend_ops xo1_suspend_ops = {
+	.valid = xo1_power_state_valid,
+	.enter = xo1_power_state_enter,
+};
+
+static int __devinit xo1_pm_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	int err;
+
+	/* don't run on non-XOs */
+	if (!machine_is_olpc())
+		return -ENODEV;
+
+	err = mfd_cell_enable(pdev);
+	if (err)
+		return err;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "can't fetch device resource info\n");
+		return -EIO;
+	}
+	if (strcmp(pdev->name, "cs5535-pms") == 0)
+		pms_base = res->start;
+	else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
+		acpi_base = res->start;
+
+	/* If we have both addresses, we can override the poweroff hook */
+	if (pms_base && acpi_base) {
+		suspend_set_ops(&xo1_suspend_ops);
+		pm_power_off = xo1_power_off;
+		printk(KERN_INFO "OLPC XO-1 support registered\n");
+	}
+
+	return 0;
+}
+
+static int __devexit xo1_pm_remove(struct platform_device *pdev)
+{
+	mfd_cell_disable(pdev);
+
+	if (strcmp(pdev->name, "cs5535-pms") == 0)
+		pms_base = 0;
+	else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
+		acpi_base = 0;
+
+	pm_power_off = NULL;
+	return 0;
+}
+
+static struct platform_driver cs5535_pms_driver = {
+	.driver = {
+		.name = "cs5535-pms",
+		.owner = THIS_MODULE,
+	},
+	.probe = xo1_pm_probe,
+	.remove = __devexit_p(xo1_pm_remove),
+};
+
+static struct platform_driver cs5535_acpi_driver = {
+	.driver = {
+		.name = "olpc-xo1-pm-acpi",
+		.owner = THIS_MODULE,
+	},
+	.probe = xo1_pm_probe,
+	.remove = __devexit_p(xo1_pm_remove),
+};
+
+static int __init xo1_pm_init(void)
+{
+	int r;
+
+	r = platform_driver_register(&cs5535_pms_driver);
+	if (r)
+		return r;
+
+	r = platform_driver_register(&cs5535_acpi_driver);
+	if (r)
+		platform_driver_unregister(&cs5535_pms_driver);
+
+	return r;
+}
+arch_initcall(xo1_pm_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-rtc.c b/arch/x86/platform/olpc/olpc-xo1-rtc.c
new file mode 100644
index 00000000000..a2b4efddd61
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-rtc.c
@@ -0,0 +1,81 @@
+/*
+ * Support for OLPC XO-1 Real Time Clock (RTC)
+ *
+ * Copyright (C) 2011 One Laptop per Child
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/mc146818rtc.h>
+#include <linux/platform_device.h>
+#include <linux/rtc.h>
+#include <linux/of.h>
+
+#include <asm/msr.h>
+#include <asm/olpc.h>
+
+static void rtc_wake_on(struct device *dev)
+{
+	olpc_xo1_pm_wakeup_set(CS5536_PM_RTC);
+}
+
+static void rtc_wake_off(struct device *dev)
+{
+	olpc_xo1_pm_wakeup_clear(CS5536_PM_RTC);
+}
+
+static struct resource rtc_platform_resource[] = {
+	[0] = {
+		.start	= RTC_PORT(0),
+		.end	= RTC_PORT(1),
+		.flags	= IORESOURCE_IO,
+	},
+	[1] = {
+		.start	= RTC_IRQ,
+		.end	= RTC_IRQ,
+		.flags	= IORESOURCE_IRQ,
+	}
+};
+
+static struct cmos_rtc_board_info rtc_info = {
+	.rtc_day_alarm = 0,
+	.rtc_mon_alarm = 0,
+	.rtc_century = 0,
+	.wake_on = rtc_wake_on,
+	.wake_off = rtc_wake_off,
+};
+
+static struct platform_device xo1_rtc_device = {
+	.name = "rtc_cmos",
+	.id = -1,
+	.num_resources = ARRAY_SIZE(rtc_platform_resource),
+	.dev.platform_data = &rtc_info,
+	.resource = rtc_platform_resource,
+};
+
+static int __init xo1_rtc_init(void)
+{
+	int r;
+	struct device_node *node;
+
+	node = of_find_compatible_node(NULL, NULL, "olpc,xo1-rtc");
+	if (!node)
+		return 0;
+	of_node_put(node);
+
+	pr_info("olpc-xo1-rtc: Initializing OLPC XO-1 RTC\n");
+	rdmsrl(MSR_RTC_DOMA_OFFSET, rtc_info.rtc_day_alarm);
+	rdmsrl(MSR_RTC_MONA_OFFSET, rtc_info.rtc_mon_alarm);
+	rdmsrl(MSR_RTC_CEN_OFFSET, rtc_info.rtc_century);
+
+	r = platform_device_register(&xo1_rtc_device);
+	if (r)
+		return r;
+
+	device_init_wakeup(&xo1_rtc_device.dev, 1);
+	return 0;
+}
+arch_initcall(xo1_rtc_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
new file mode 100644
index 00000000000..1d4c783d732
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -0,0 +1,614 @@
+/*
+ * Support for OLPC XO-1 System Control Interrupts (SCI)
+ *
+ * Copyright (C) 2010 One Laptop per Child
+ * Copyright (C) 2006 Red Hat, Inc.
+ * Copyright (C) 2006 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/cs5535.h>
+#include <linux/device.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/mfd/core.h>
+#include <linux/power_supply.h>
+#include <linux/suspend.h>
+#include <linux/workqueue.h>
+
+#include <asm/io.h>
+#include <asm/msr.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME	"olpc-xo1-sci"
+#define PFX		DRV_NAME ": "
+
+static unsigned long acpi_base;
+static struct input_dev *power_button_idev;
+static struct input_dev *ebook_switch_idev;
+static struct input_dev *lid_switch_idev;
+
+static int sci_irq;
+
+static bool lid_open;
+static bool lid_inverted;
+static int lid_wake_mode;
+
+enum lid_wake_modes {
+	LID_WAKE_ALWAYS,
+	LID_WAKE_OPEN,
+	LID_WAKE_CLOSE,
+};
+
+static const char * const lid_wake_mode_names[] = {
+	[LID_WAKE_ALWAYS] = "always",
+	[LID_WAKE_OPEN] = "open",
+	[LID_WAKE_CLOSE] = "close",
+};
+
+static void battery_status_changed(void)
+{
+	struct power_supply *psy = power_supply_get_by_name("olpc-battery");
+
+	if (psy) {
+		power_supply_changed(psy);
+		put_device(psy->dev);
+	}
+}
+
+static void ac_status_changed(void)
+{
+	struct power_supply *psy = power_supply_get_by_name("olpc-ac");
+
+	if (psy) {
+		power_supply_changed(psy);
+		put_device(psy->dev);
+	}
+}
+
+/* Report current ebook switch state through input layer */
+static void send_ebook_state(void)
+{
+	unsigned char state;
+
+	if (olpc_ec_cmd(EC_READ_EB_MODE, NULL, 0, &state, 1)) {
+		pr_err(PFX "failed to get ebook state\n");
+		return;
+	}
+
+	input_report_switch(ebook_switch_idev, SW_TABLET_MODE, state);
+	input_sync(ebook_switch_idev);
+}
+
+static void flip_lid_inverter(void)
+{
+	/* gpio is high; invert so we'll get l->h event interrupt */
+	if (lid_inverted)
+		cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+	else
+		cs5535_gpio_set(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+	lid_inverted = !lid_inverted;
+}
+
+static void detect_lid_state(void)
+{
+	/*
+	 * the edge detector hookup on the gpio inputs on the geode is
+	 * odd, to say the least.  See http://dev.laptop.org/ticket/5703
+	 * for details, but in a nutshell:  we don't use the edge
+	 * detectors.  instead, we make use of an anomoly:  with the both
+	 * edge detectors turned off, we still get an edge event on a
+	 * positive edge transition.  to take advantage of this, we use the
+	 * front-end inverter to ensure that that's the edge we're always
+	 * going to see next.
+	 */
+
+	int state;
+
+	state = cs5535_gpio_isset(OLPC_GPIO_LID, GPIO_READ_BACK);
+	lid_open = !state ^ !lid_inverted; /* x ^^ y */
+	if (!state)
+		return;
+
+	flip_lid_inverter();
+}
+
+/* Report current lid switch state through input layer */
+static void send_lid_state(void)
+{
+	input_report_switch(lid_switch_idev, SW_LID, !lid_open);
+	input_sync(lid_switch_idev);
+}
+
+static ssize_t lid_wake_mode_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	const char *mode = lid_wake_mode_names[lid_wake_mode];
+	return sprintf(buf, "%s\n", mode);
+}
+static ssize_t lid_wake_mode_set(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(lid_wake_mode_names); i++) {
+		const char *mode = lid_wake_mode_names[i];
+		if (strlen(mode) != count || strncasecmp(mode, buf, count))
+			continue;
+
+		lid_wake_mode = i;
+		return count;
+	}
+	return -EINVAL;
+}
+static DEVICE_ATTR(lid_wake_mode, S_IWUSR | S_IRUGO, lid_wake_mode_show,
+		   lid_wake_mode_set);
+
+/*
+ * Process all items in the EC's SCI queue.
+ *
+ * This is handled in a workqueue because olpc_ec_cmd can be slow (and
+ * can even timeout).
+ *
+ * If propagate_events is false, the queue is drained without events being
+ * generated for the interrupts.
+ */
+static void process_sci_queue(bool propagate_events)
+{
+	int r;
+	u16 data;
+
+	do {
+		r = olpc_ec_sci_query(&data);
+		if (r || !data)
+			break;
+
+		pr_debug(PFX "SCI 0x%x received\n", data);
+
+		switch (data) {
+		case EC_SCI_SRC_BATERR:
+		case EC_SCI_SRC_BATSOC:
+		case EC_SCI_SRC_BATTERY:
+		case EC_SCI_SRC_BATCRIT:
+			battery_status_changed();
+			break;
+		case EC_SCI_SRC_ACPWR:
+			ac_status_changed();
+			break;
+		}
+
+		if (data == EC_SCI_SRC_EBOOK && propagate_events)
+			send_ebook_state();
+	} while (data);
+
+	if (r)
+		pr_err(PFX "Failed to clear SCI queue");
+}
+
+static void process_sci_queue_work(struct work_struct *work)
+{
+	process_sci_queue(true);
+}
+
+static DECLARE_WORK(sci_work, process_sci_queue_work);
+
+static irqreturn_t xo1_sci_intr(int irq, void *dev_id)
+{
+	struct platform_device *pdev = dev_id;
+	u32 sts;
+	u32 gpe;
+
+	sts = inl(acpi_base + CS5536_PM1_STS);
+	outl(sts | 0xffff, acpi_base + CS5536_PM1_STS);
+
+	gpe = inl(acpi_base + CS5536_PM_GPE0_STS);
+	outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+
+	dev_dbg(&pdev->dev, "sts %x gpe %x\n", sts, gpe);
+
+	if (sts & CS5536_PWRBTN_FLAG && !(sts & CS5536_WAK_FLAG)) {
+		input_report_key(power_button_idev, KEY_POWER, 1);
+		input_sync(power_button_idev);
+		input_report_key(power_button_idev, KEY_POWER, 0);
+		input_sync(power_button_idev);
+	}
+
+	if (gpe & CS5536_GPIOM7_PME_FLAG) { /* EC GPIO */
+		cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
+		schedule_work(&sci_work);
+	}
+
+	cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+	cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+	detect_lid_state();
+	send_lid_state();
+
+	return IRQ_HANDLED;
+}
+
+static int xo1_sci_suspend(struct platform_device *pdev, pm_message_t state)
+{
+	if (device_may_wakeup(&power_button_idev->dev))
+		olpc_xo1_pm_wakeup_set(CS5536_PM_PWRBTN);
+	else
+		olpc_xo1_pm_wakeup_clear(CS5536_PM_PWRBTN);
+
+	if (device_may_wakeup(&ebook_switch_idev->dev))
+		olpc_ec_wakeup_set(EC_SCI_SRC_EBOOK);
+	else
+		olpc_ec_wakeup_clear(EC_SCI_SRC_EBOOK);
+
+	if (!device_may_wakeup(&lid_switch_idev->dev)) {
+		cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+	} else if ((lid_open && lid_wake_mode == LID_WAKE_OPEN) ||
+		   (!lid_open && lid_wake_mode == LID_WAKE_CLOSE)) {
+		flip_lid_inverter();
+
+		/* we may have just caused an event */
+		cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+		cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+
+		cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+	}
+
+	return 0;
+}
+
+static int xo1_sci_resume(struct platform_device *pdev)
+{
+	/*
+	 * We don't know what may have happened while we were asleep.
+	 * Reestablish our lid setup so we're sure to catch all transitions.
+	 */
+	detect_lid_state();
+	send_lid_state();
+	cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+
+	/* Enable all EC events */
+	olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+	/* Power/battery status might have changed too */
+	battery_status_changed();
+	ac_status_changed();
+	return 0;
+}
+
+static int __devinit setup_sci_interrupt(struct platform_device *pdev)
+{
+	u32 lo, hi;
+	u32 sts;
+	int r;
+
+	rdmsr(0x51400020, lo, hi);
+	sci_irq = (lo >> 20) & 15;
+
+	if (sci_irq) {
+		dev_info(&pdev->dev, "SCI is mapped to IRQ %d\n", sci_irq);
+	} else {
+		/* Zero means masked */
+		dev_info(&pdev->dev, "SCI unmapped. Mapping to IRQ 3\n");
+		sci_irq = 3;
+		lo |= 0x00300000;
+		wrmsrl(0x51400020, lo);
+	}
+
+	/* Select level triggered in PIC */
+	if (sci_irq < 8) {
+		lo = inb(CS5536_PIC_INT_SEL1);
+		lo |= 1 << sci_irq;
+		outb(lo, CS5536_PIC_INT_SEL1);
+	} else {
+		lo = inb(CS5536_PIC_INT_SEL2);
+		lo |= 1 << (sci_irq - 8);
+		outb(lo, CS5536_PIC_INT_SEL2);
+	}
+
+	/* Enable SCI from power button, and clear pending interrupts */
+	sts = inl(acpi_base + CS5536_PM1_STS);
+	outl((CS5536_PM_PWRBTN << 16) | 0xffff, acpi_base + CS5536_PM1_STS);
+
+	r = request_irq(sci_irq, xo1_sci_intr, 0, DRV_NAME, pdev);
+	if (r)
+		dev_err(&pdev->dev, "can't request interrupt\n");
+
+	return r;
+}
+
+static int __devinit setup_ec_sci(void)
+{
+	int r;
+
+	r = gpio_request(OLPC_GPIO_ECSCI, "OLPC-ECSCI");
+	if (r)
+		return r;
+
+	gpio_direction_input(OLPC_GPIO_ECSCI);
+
+	/* Clear pending EC SCI events */
+	cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_NEGATIVE_EDGE_STS);
+	cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_POSITIVE_EDGE_STS);
+
+	/*
+	 * Enable EC SCI events, and map them to both a PME and the SCI
+	 * interrupt.
+	 *
+	 * Ordinarily, in addition to functioning as GPIOs, Geode GPIOs can
+	 * be mapped to regular interrupts *or* Geode-specific Power
+	 * Management Events (PMEs) - events that bring the system out of
+	 * suspend. In this case, we want both of those things - the system
+	 * wakeup, *and* the ability to get an interrupt when an event occurs.
+	 *
+	 * To achieve this, we map the GPIO to a PME, and then we use one
+	 * of the many generic knobs on the CS5535 PIC to additionally map the
+	 * PME to the regular SCI interrupt line.
+	 */
+	cs5535_gpio_set(OLPC_GPIO_ECSCI, GPIO_EVENTS_ENABLE);
+
+	/* Set the SCI to cause a PME event on group 7 */
+	cs5535_gpio_setup_event(OLPC_GPIO_ECSCI, 7, 1);
+
+	/* And have group 7 also fire the SCI interrupt */
+	cs5535_pic_unreqz_select_high(7, sci_irq);
+
+	return 0;
+}
+
+static void free_ec_sci(void)
+{
+	gpio_free(OLPC_GPIO_ECSCI);
+}
+
+static int __devinit setup_lid_events(void)
+{
+	int r;
+
+	r = gpio_request(OLPC_GPIO_LID, "OLPC-LID");
+	if (r)
+		return r;
+
+	gpio_direction_input(OLPC_GPIO_LID);
+
+	cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_INPUT_INVERT);
+	lid_inverted = 0;
+
+	/* Clear edge detection and event enable for now */
+	cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+	cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_EN);
+	cs5535_gpio_clear(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_EN);
+	cs5535_gpio_set(OLPC_GPIO_LID, GPIO_NEGATIVE_EDGE_STS);
+	cs5535_gpio_set(OLPC_GPIO_LID, GPIO_POSITIVE_EDGE_STS);
+
+	/* Set the LID to cause an PME event on group 6 */
+	cs5535_gpio_setup_event(OLPC_GPIO_LID, 6, 1);
+
+	/* Set PME group 6 to fire the SCI interrupt */
+	cs5535_gpio_set_irq(6, sci_irq);
+
+	/* Enable the event */
+	cs5535_gpio_set(OLPC_GPIO_LID, GPIO_EVENTS_ENABLE);
+
+	return 0;
+}
+
+static void free_lid_events(void)
+{
+	gpio_free(OLPC_GPIO_LID);
+}
+
+static int __devinit setup_power_button(struct platform_device *pdev)
+{
+	int r;
+
+	power_button_idev = input_allocate_device();
+	if (!power_button_idev)
+		return -ENOMEM;
+
+	power_button_idev->name = "Power Button";
+	power_button_idev->phys = DRV_NAME "/input0";
+	set_bit(EV_KEY, power_button_idev->evbit);
+	set_bit(KEY_POWER, power_button_idev->keybit);
+
+	power_button_idev->dev.parent = &pdev->dev;
+	device_init_wakeup(&power_button_idev->dev, 1);
+
+	r = input_register_device(power_button_idev);
+	if (r) {
+		dev_err(&pdev->dev, "failed to register power button: %d\n", r);
+		input_free_device(power_button_idev);
+	}
+
+	return r;
+}
+
+static void free_power_button(void)
+{
+	input_unregister_device(power_button_idev);
+	input_free_device(power_button_idev);
+}
+
+static int __devinit setup_ebook_switch(struct platform_device *pdev)
+{
+	int r;
+
+	ebook_switch_idev = input_allocate_device();
+	if (!ebook_switch_idev)
+		return -ENOMEM;
+
+	ebook_switch_idev->name = "EBook Switch";
+	ebook_switch_idev->phys = DRV_NAME "/input1";
+	set_bit(EV_SW, ebook_switch_idev->evbit);
+	set_bit(SW_TABLET_MODE, ebook_switch_idev->swbit);
+
+	ebook_switch_idev->dev.parent = &pdev->dev;
+	device_set_wakeup_capable(&ebook_switch_idev->dev, true);
+
+	r = input_register_device(ebook_switch_idev);
+	if (r) {
+		dev_err(&pdev->dev, "failed to register ebook switch: %d\n", r);
+		input_free_device(ebook_switch_idev);
+	}
+
+	return r;
+}
+
+static void free_ebook_switch(void)
+{
+	input_unregister_device(ebook_switch_idev);
+	input_free_device(ebook_switch_idev);
+}
+
+static int __devinit setup_lid_switch(struct platform_device *pdev)
+{
+	int r;
+
+	lid_switch_idev = input_allocate_device();
+	if (!lid_switch_idev)
+		return -ENOMEM;
+
+	lid_switch_idev->name = "Lid Switch";
+	lid_switch_idev->phys = DRV_NAME "/input2";
+	set_bit(EV_SW, lid_switch_idev->evbit);
+	set_bit(SW_LID, lid_switch_idev->swbit);
+
+	lid_switch_idev->dev.parent = &pdev->dev;
+	device_set_wakeup_capable(&lid_switch_idev->dev, true);
+
+	r = input_register_device(lid_switch_idev);
+	if (r) {
+		dev_err(&pdev->dev, "failed to register lid switch: %d\n", r);
+		goto err_register;
+	}
+
+	r = device_create_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
+	if (r) {
+		dev_err(&pdev->dev, "failed to create wake mode attr: %d\n", r);
+		goto err_create_attr;
+	}
+
+	return 0;
+
+err_create_attr:
+	input_unregister_device(lid_switch_idev);
+err_register:
+	input_free_device(lid_switch_idev);
+	return r;
+}
+
+static void free_lid_switch(void)
+{
+	device_remove_file(&lid_switch_idev->dev, &dev_attr_lid_wake_mode);
+	input_unregister_device(lid_switch_idev);
+	input_free_device(lid_switch_idev);
+}
+
+static int __devinit xo1_sci_probe(struct platform_device *pdev)
+{
+	struct resource *res;
+	int r;
+
+	/* don't run on non-XOs */
+	if (!machine_is_olpc())
+		return -ENODEV;
+
+	r = mfd_cell_enable(pdev);
+	if (r)
+		return r;
+
+	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "can't fetch device resource info\n");
+		return -EIO;
+	}
+	acpi_base = res->start;
+
+	r = setup_power_button(pdev);
+	if (r)
+		return r;
+
+	r = setup_ebook_switch(pdev);
+	if (r)
+		goto err_ebook;
+
+	r = setup_lid_switch(pdev);
+	if (r)
+		goto err_lid;
+
+	r = setup_lid_events();
+	if (r)
+		goto err_lidevt;
+
+	r = setup_ec_sci();
+	if (r)
+		goto err_ecsci;
+
+	/* Enable PME generation for EC-generated events */
+	outl(CS5536_GPIOM6_PME_EN | CS5536_GPIOM7_PME_EN,
+		acpi_base + CS5536_PM_GPE0_EN);
+
+	/* Clear pending events */
+	outl(0xffffffff, acpi_base + CS5536_PM_GPE0_STS);
+	process_sci_queue(false);
+
+	/* Initial sync */
+	send_ebook_state();
+	detect_lid_state();
+	send_lid_state();
+
+	r = setup_sci_interrupt(pdev);
+	if (r)
+		goto err_sci;
+
+	/* Enable all EC events */
+	olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+	return r;
+
+err_sci:
+	free_ec_sci();
+err_ecsci:
+	free_lid_events();
+err_lidevt:
+	free_lid_switch();
+err_lid:
+	free_ebook_switch();
+err_ebook:
+	free_power_button();
+	return r;
+}
+
+static int __devexit xo1_sci_remove(struct platform_device *pdev)
+{
+	mfd_cell_disable(pdev);
+	free_irq(sci_irq, pdev);
+	cancel_work_sync(&sci_work);
+	free_ec_sci();
+	free_lid_events();
+	free_lid_switch();
+	free_ebook_switch();
+	free_power_button();
+	acpi_base = 0;
+	return 0;
+}
+
+static struct platform_driver xo1_sci_driver = {
+	.driver = {
+		.name = "olpc-xo1-sci-acpi",
+	},
+	.probe = xo1_sci_probe,
+	.remove = __devexit_p(xo1_sci_remove),
+	.suspend = xo1_sci_suspend,
+	.resume = xo1_sci_resume,
+};
+
+static int __init xo1_sci_init(void)
+{
+	return platform_driver_register(&xo1_sci_driver);
+}
+arch_initcall(xo1_sci_init);
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
deleted file mode 100644
index ab81fb27176..00000000000
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Support for features of the OLPC XO-1 laptop
- *
- * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
- * Copyright (C) 2010 One Laptop per Child
- * Copyright (C) 2006 Red Hat, Inc.
- * Copyright (C) 2006 Advanced Micro Devices, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/platform_device.h>
-#include <linux/pm.h>
-#include <linux/mfd/core.h>
-
-#include <asm/io.h>
-#include <asm/olpc.h>
-
-#define DRV_NAME "olpc-xo1"
-
-/* PMC registers (PMS block) */
-#define PM_SCLK		0x10
-#define PM_IN_SLPCTL	0x20
-#define PM_WKXD		0x34
-#define PM_WKD		0x30
-#define PM_SSC		0x54
-
-/* PM registers (ACPI block) */
-#define PM1_CNT		0x08
-#define PM_GPE0_STS	0x18
-
-static unsigned long acpi_base;
-static unsigned long pms_base;
-
-static void xo1_power_off(void)
-{
-	printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
-
-	/* Enable all of these controls with 0 delay */
-	outl(0x40000000, pms_base + PM_SCLK);
-	outl(0x40000000, pms_base + PM_IN_SLPCTL);
-	outl(0x40000000, pms_base + PM_WKXD);
-	outl(0x40000000, pms_base + PM_WKD);
-
-	/* Clear status bits (possibly unnecessary) */
-	outl(0x0002ffff, pms_base  + PM_SSC);
-	outl(0xffffffff, acpi_base + PM_GPE0_STS);
-
-	/* Write SLP_EN bit to start the machinery */
-	outl(0x00002000, acpi_base + PM1_CNT);
-}
-
-static int __devinit olpc_xo1_probe(struct platform_device *pdev)
-{
-	struct resource *res;
-	int err;
-
-	/* don't run on non-XOs */
-	if (!machine_is_olpc())
-		return -ENODEV;
-
-	err = mfd_cell_enable(pdev);
-	if (err)
-		return err;
-
-	res = platform_get_resource(pdev, IORESOURCE_IO, 0);
-	if (!res) {
-		dev_err(&pdev->dev, "can't fetch device resource info\n");
-		return -EIO;
-	}
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
-		pms_base = res->start;
-	else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
-		acpi_base = res->start;
-
-	/* If we have both addresses, we can override the poweroff hook */
-	if (pms_base && acpi_base) {
-		pm_power_off = xo1_power_off;
-		printk(KERN_INFO "OLPC XO-1 support registered\n");
-	}
-
-	return 0;
-}
-
-static int __devexit olpc_xo1_remove(struct platform_device *pdev)
-{
-	mfd_cell_disable(pdev);
-
-	if (strcmp(pdev->name, "cs5535-pms") == 0)
-		pms_base = 0;
-	else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
-		acpi_base = 0;
-
-	pm_power_off = NULL;
-	return 0;
-}
-
-static struct platform_driver cs5535_pms_drv = {
-	.driver = {
-		.name = "cs5535-pms",
-		.owner = THIS_MODULE,
-	},
-	.probe = olpc_xo1_probe,
-	.remove = __devexit_p(olpc_xo1_remove),
-};
-
-static struct platform_driver cs5535_acpi_drv = {
-	.driver = {
-		.name = "olpc-xo1-pm-acpi",
-		.owner = THIS_MODULE,
-	},
-	.probe = olpc_xo1_probe,
-	.remove = __devexit_p(olpc_xo1_remove),
-};
-
-static int __init olpc_xo1_init(void)
-{
-	int r;
-
-	r = platform_driver_register(&cs5535_pms_drv);
-	if (r)
-		return r;
-
-	r = platform_driver_register(&cs5535_acpi_drv);
-	if (r)
-		platform_driver_unregister(&cs5535_pms_drv);
-
-	return r;
-}
-
-static void __exit olpc_xo1_exit(void)
-{
-	platform_driver_unregister(&cs5535_acpi_drv);
-	platform_driver_unregister(&cs5535_pms_drv);
-}
-
-MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("platform:cs5535-pms");
-
-module_init(olpc_xo1_init);
-module_exit(olpc_xo1_exit);
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
new file mode 100644
index 00000000000..2b235b77d9a
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -0,0 +1,168 @@
+/*
+ * Support for OLPC XO-1.5 System Control Interrupts (SCI)
+ *
+ * Copyright (C) 2009-2010 One Laptop per Child
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/power_supply.h>
+
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#include <asm/olpc.h>
+
+#define DRV_NAME			"olpc-xo15-sci"
+#define PFX				DRV_NAME ": "
+#define XO15_SCI_CLASS			DRV_NAME
+#define XO15_SCI_DEVICE_NAME		"OLPC XO-1.5 SCI"
+
+static unsigned long xo15_sci_gpe;
+
+static void battery_status_changed(void)
+{
+	struct power_supply *psy = power_supply_get_by_name("olpc-battery");
+
+	if (psy) {
+		power_supply_changed(psy);
+		put_device(psy->dev);
+	}
+}
+
+static void ac_status_changed(void)
+{
+	struct power_supply *psy = power_supply_get_by_name("olpc-ac");
+
+	if (psy) {
+		power_supply_changed(psy);
+		put_device(psy->dev);
+	}
+}
+
+static void process_sci_queue(void)
+{
+	u16 data;
+	int r;
+
+	do {
+		r = olpc_ec_sci_query(&data);
+		if (r || !data)
+			break;
+
+		pr_debug(PFX "SCI 0x%x received\n", data);
+
+		switch (data) {
+		case EC_SCI_SRC_BATERR:
+		case EC_SCI_SRC_BATSOC:
+		case EC_SCI_SRC_BATTERY:
+		case EC_SCI_SRC_BATCRIT:
+			battery_status_changed();
+			break;
+		case EC_SCI_SRC_ACPWR:
+			ac_status_changed();
+			break;
+		}
+	} while (data);
+
+	if (r)
+		pr_err(PFX "Failed to clear SCI queue");
+}
+
+static void process_sci_queue_work(struct work_struct *work)
+{
+	process_sci_queue();
+}
+
+static DECLARE_WORK(sci_work, process_sci_queue_work);
+
+static u32 xo15_sci_gpe_handler(acpi_handle gpe_device, u32 gpe, void *context)
+{
+	schedule_work(&sci_work);
+	return ACPI_INTERRUPT_HANDLED | ACPI_REENABLE_GPE;
+}
+
+static int xo15_sci_add(struct acpi_device *device)
+{
+	unsigned long long tmp;
+	acpi_status status;
+
+	if (!device)
+		return -EINVAL;
+
+	strcpy(acpi_device_name(device), XO15_SCI_DEVICE_NAME);
+	strcpy(acpi_device_class(device), XO15_SCI_CLASS);
+
+	/* Get GPE bit assignment (EC events). */
+	status = acpi_evaluate_integer(device->handle, "_GPE", NULL, &tmp);
+	if (ACPI_FAILURE(status))
+		return -EINVAL;
+
+	xo15_sci_gpe = tmp;
+	status = acpi_install_gpe_handler(NULL, xo15_sci_gpe,
+					  ACPI_GPE_EDGE_TRIGGERED,
+					  xo15_sci_gpe_handler, device);
+	if (ACPI_FAILURE(status))
+		return -ENODEV;
+
+	dev_info(&device->dev, "Initialized, GPE = 0x%lx\n", xo15_sci_gpe);
+
+	/* Flush queue, and enable all SCI events */
+	process_sci_queue();
+	olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+	acpi_enable_gpe(NULL, xo15_sci_gpe);
+
+	/* Enable wake-on-EC */
+	if (device->wakeup.flags.valid)
+		device_init_wakeup(&device->dev, true);
+
+	return 0;
+}
+
+static int xo15_sci_remove(struct acpi_device *device, int type)
+{
+	acpi_disable_gpe(NULL, xo15_sci_gpe);
+	acpi_remove_gpe_handler(NULL, xo15_sci_gpe, xo15_sci_gpe_handler);
+	cancel_work_sync(&sci_work);
+	return 0;
+}
+
+static int xo15_sci_resume(struct acpi_device *device)
+{
+	/* Enable all EC events */
+	olpc_ec_mask_write(EC_SCI_SRC_ALL);
+
+	/* Power/battery status might have changed */
+	battery_status_changed();
+	ac_status_changed();
+
+	return 0;
+}
+
+static const struct acpi_device_id xo15_sci_device_ids[] = {
+	{"XO15EC", 0},
+	{"", 0},
+};
+
+static struct acpi_driver xo15_sci_drv = {
+	.name = DRV_NAME,
+	.class = XO15_SCI_CLASS,
+	.ids = xo15_sci_device_ids,
+	.ops = {
+		.add = xo15_sci_add,
+		.remove = xo15_sci_remove,
+		.resume = xo15_sci_resume,
+	},
+};
+
+static int __init xo15_sci_init(void)
+{
+	return acpi_bus_register_driver(&xo15_sci_drv);
+}
+device_initcall(xo15_sci_init);
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index 0060fd59ea0..8b9940e78e2 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -19,6 +19,7 @@
 #include <linux/string.h>
 #include <linux/platform_device.h>
 #include <linux/of.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/geode.h>
 #include <asm/setup.h>
@@ -30,6 +31,9 @@ EXPORT_SYMBOL_GPL(olpc_platform_info);
 
 static DEFINE_SPINLOCK(ec_lock);
 
+/* EC event mask to be applied during suspend (defining wakeup sources). */
+static u16 ec_wakeup_mask;
+
 /* what the timeout *should* be (in ms) */
 #define EC_BASE_TIMEOUT 20
 
@@ -188,6 +192,88 @@ err:
 }
 EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 
+void olpc_ec_wakeup_set(u16 value)
+{
+	ec_wakeup_mask |= value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_set);
+
+void olpc_ec_wakeup_clear(u16 value)
+{
+	ec_wakeup_mask &= ~value;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_clear);
+
+/*
+ * Returns true if the compile and runtime configurations allow for EC events
+ * to wake the system.
+ */
+bool olpc_ec_wakeup_available(void)
+{
+	if (!machine_is_olpc())
+		return false;
+
+	/*
+	 * XO-1 EC wakeups are available when olpc-xo1-sci driver is
+	 * compiled in
+	 */
+#ifdef CONFIG_OLPC_XO1_SCI
+	if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) /* XO-1 */
+		return true;
+#endif
+
+	/*
+	 * XO-1.5 EC wakeups are available when olpc-xo15-sci driver is
+	 * compiled in
+	 */
+#ifdef CONFIG_OLPC_XO15_SCI
+	if (olpc_platform_info.boardrev >= olpc_board_pre(0xd0)) /* XO-1.5 */
+		return true;
+#endif
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_wakeup_available);
+
+int olpc_ec_mask_write(u16 bits)
+{
+	if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
+		__be16 ec_word = cpu_to_be16(bits);
+		return olpc_ec_cmd(EC_WRITE_EXT_SCI_MASK, (void *) &ec_word, 2,
+				   NULL, 0);
+	} else {
+		unsigned char ec_byte = bits & 0xff;
+		return olpc_ec_cmd(EC_WRITE_SCI_MASK, &ec_byte, 1, NULL, 0);
+	}
+}
+EXPORT_SYMBOL_GPL(olpc_ec_mask_write);
+
+int olpc_ec_sci_query(u16 *sci_value)
+{
+	int ret;
+
+	if (olpc_platform_info.flags & OLPC_F_EC_WIDE_SCI) {
+		__be16 ec_word;
+		ret = olpc_ec_cmd(EC_EXT_SCI_QUERY,
+			NULL, 0, (void *) &ec_word, 2);
+		if (ret == 0)
+			*sci_value = be16_to_cpu(ec_word);
+	} else {
+		unsigned char ec_byte;
+		ret = olpc_ec_cmd(EC_SCI_QUERY, NULL, 0, &ec_byte, 1);
+		if (ret == 0)
+			*sci_value = ec_byte;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(olpc_ec_sci_query);
+
+static int olpc_ec_suspend(void)
+{
+	return olpc_ec_mask_write(ec_wakeup_mask);
+}
+
 static bool __init check_ofw_architecture(struct device_node *root)
 {
 	const char *olpc_arch;
@@ -242,6 +328,10 @@ static int __init add_xo1_platform_devices(void)
 	return 0;
 }
 
+static struct syscore_ops olpc_syscore_ops = {
+	.suspend = olpc_ec_suspend,
+};
+
 static int __init olpc_init(void)
 {
 	int r = 0;
@@ -266,6 +356,9 @@ static int __init olpc_init(void)
 			!cs5535_has_vsa2())
 		x86_init.pci.arch_init = pci_olpc_init;
 #endif
+	/* EC version 0x5f adds support for wide SCI mask */
+	if (olpc_platform_info.ecver >= 0x5f)
+		olpc_platform_info.flags |= OLPC_F_EC_WIDE_SCI;
 
 	printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
 			((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
@@ -278,6 +371,8 @@ static int __init olpc_init(void)
 			return r;
 	}
 
+	register_syscore_ops(&olpc_syscore_ops);
+
 	return 0;
 }
 
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index d39f63d017d..d6ee9298692 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -165,6 +165,107 @@ static struct of_pdt_ops prom_olpc_ops __initdata = {
 	.pkg2path = olpc_dt_pkg2path,
 };
 
+static phandle __init olpc_dt_finddevice(const char *path)
+{
+	phandle node;
+	const void *args[] = { path };
+	void *res[] = { &node };
+
+	if (olpc_ofw("finddevice", args, res)) {
+		pr_err("olpc_dt: finddevice failed!\n");
+		return 0;
+	}
+
+	if ((s32) node == -1)
+		return 0;
+
+	return node;
+}
+
+static int __init olpc_dt_interpret(const char *words)
+{
+	int result;
+	const void *args[] = { words };
+	void *res[] = { &result };
+
+	if (olpc_ofw("interpret", args, res)) {
+		pr_err("olpc_dt: interpret failed!\n");
+		return -1;
+	}
+
+	return result;
+}
+
+/*
+ * Extract board revision directly from OFW device tree.
+ * We can't use olpc_platform_info because that hasn't been set up yet.
+ */
+static u32 __init olpc_dt_get_board_revision(void)
+{
+	phandle node;
+	__be32 rev;
+	int r;
+
+	node = olpc_dt_finddevice("/");
+	if (!node)
+		return 0;
+
+	r = olpc_dt_getproperty(node, "board-revision-int",
+				(char *) &rev, sizeof(rev));
+	if (r < 0)
+		return 0;
+
+	return be32_to_cpu(rev);
+}
+
+void __init olpc_dt_fixup(void)
+{
+	int r;
+	char buf[64];
+	phandle node;
+	u32 board_rev;
+
+	node = olpc_dt_finddevice("/battery@0");
+	if (!node)
+		return;
+
+	/*
+	 * If the battery node has a compatible property, we are running a new
+	 * enough firmware and don't have fixups to make.
+	 */
+	r = olpc_dt_getproperty(node, "compatible", buf, sizeof(buf));
+	if (r > 0)
+		return;
+
+	pr_info("PROM DT: Old firmware detected, applying fixes\n");
+
+	/* Add olpc,xo1-battery compatible marker to battery node */
+	olpc_dt_interpret("\" /battery@0\" find-device"
+		" \" olpc,xo1-battery\" +compatible"
+		" device-end");
+
+	board_rev = olpc_dt_get_board_revision();
+	if (!board_rev)
+		return;
+
+	if (board_rev >= olpc_board_pre(0xd0)) {
+		/* XO-1.5: add dcon device */
+		olpc_dt_interpret("\" /pci/display@1\" find-device"
+			" new-device"
+			" \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
+			" finish-device device-end");
+	} else {
+		/* XO-1: add dcon device, mark RTC as olpc,xo1-rtc */
+		olpc_dt_interpret("\" /pci/display@1,1\" find-device"
+			" new-device"
+			" \" dcon\" device-name \" olpc,xo1-dcon\" +compatible"
+			" finish-device device-end"
+			" \" /rtc\" find-device"
+			" \" olpc,xo1-rtc\" +compatible"
+			" device-end");
+	}
+}
+
 void __init olpc_dt_build_devicetree(void)
 {
 	phandle root;
@@ -172,6 +273,8 @@ void __init olpc_dt_build_devicetree(void)
 	if (!olpc_ofw_is_installed())
 		return;
 
+	olpc_dt_fixup();
+
 	root = olpc_dt_getsibling(0);
 	if (!root) {
 		pr_err("PROM: unable to get root node from OFW!\n");
diff --git a/arch/x86/platform/olpc/xo1-wakeup.S b/arch/x86/platform/olpc/xo1-wakeup.S
new file mode 100644
index 00000000000..948deb28975
--- /dev/null
+++ b/arch/x86/platform/olpc/xo1-wakeup.S
@@ -0,0 +1,124 @@
+.text
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/page.h>
+#include <asm/pgtable_32.h>
+
+	.macro writepost,value
+		movb $0x34, %al
+		outb %al, $0x70
+		movb $\value, %al
+		outb %al, $0x71
+	.endm
+
+wakeup_start:
+	# OFW lands us here, running in protected mode, with a
+	# kernel-compatible GDT already setup.
+
+	# Clear any dangerous flags
+	pushl $0
+	popfl
+
+	writepost 0x31
+
+	# Set up %cr3
+	movl $initial_page_table - __PAGE_OFFSET, %eax
+	movl %eax, %cr3
+
+	movl saved_cr4, %eax
+	movl %eax, %cr4
+
+	movl saved_cr0, %eax
+	movl %eax, %cr0
+
+	# Control registers were modified, pipeline resync is needed
+	jmp 1f
+1:
+
+	movw    $__KERNEL_DS, %ax
+	movw    %ax, %ss
+	movw    %ax, %ds
+	movw    %ax, %es
+	movw    %ax, %fs
+	movw    %ax, %gs
+
+	lgdt    saved_gdt
+	lidt    saved_idt
+	lldt    saved_ldt
+	ljmp    $(__KERNEL_CS),$1f
+1:
+	movl    %cr3, %eax
+	movl    %eax, %cr3
+	wbinvd
+
+	# Go back to the return point
+	jmp ret_point
+
+save_registers:
+	sgdt  saved_gdt
+	sidt  saved_idt
+	sldt  saved_ldt
+
+	pushl %edx
+	movl %cr4, %edx
+	movl %edx, saved_cr4
+
+	movl %cr0, %edx
+	movl %edx, saved_cr0
+
+	popl %edx
+
+	movl %ebx, saved_context_ebx
+	movl %ebp, saved_context_ebp
+	movl %esi, saved_context_esi
+	movl %edi, saved_context_edi
+
+	pushfl
+	popl saved_context_eflags
+
+	ret
+
+restore_registers:
+	movl saved_context_ebp, %ebp
+	movl saved_context_ebx, %ebx
+	movl saved_context_esi, %esi
+	movl saved_context_edi, %edi
+
+	pushl saved_context_eflags
+	popfl
+
+	ret
+
+ENTRY(do_olpc_suspend_lowlevel)
+	call	save_processor_state
+	call	save_registers
+
+	# This is the stack context we want to remember
+	movl %esp, saved_context_esp
+
+	pushl	$3
+	call	xo1_do_sleep
+
+	jmp	wakeup_start
+	.p2align 4,,7
+ret_point:
+	movl    saved_context_esp, %esp
+
+	writepost 0x32
+
+	call	restore_registers
+	call	restore_processor_state
+	ret
+
+.data
+saved_gdt:             .long   0,0
+saved_idt:             .long   0,0
+saved_ldt:             .long   0
+saved_cr4:             .long   0
+saved_cr0:             .long   0
+saved_context_esp:     .long   0
+saved_context_edi:     .long   0
+saved_context_esi:     .long   0
+saved_context_ebx:     .long   0
+saved_context_ebp:     .long   0
+saved_context_eflags:  .long   0
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 68e467f69fe..db8b915f54b 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -296,14 +296,18 @@ static void bau_process_message(struct msg_desc *mdp,
 }
 
 /*
- * Determine the first cpu on a uvhub.
+ * Determine the first cpu on a pnode.
  */
-static int uvhub_to_first_cpu(int uvhub)
+static int pnode_to_first_cpu(int pnode, struct bau_control *smaster)
 {
 	int cpu;
-	for_each_present_cpu(cpu)
-		if (uvhub == uv_cpu_to_blade_id(cpu))
+	struct hub_and_pnode *hpp;
+
+	for_each_present_cpu(cpu) {
+		hpp = &smaster->thp[cpu];
+		if (pnode == hpp->pnode)
 			return cpu;
+	}
 	return -1;
 }
 
@@ -366,28 +370,32 @@ static void do_reset(void *ptr)
  * Use IPI to get all target uvhubs to release resources held by
  * a given sending cpu number.
  */
-static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender)
+static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 {
-	int uvhub;
+	int pnode;
+	int apnode;
 	int maskbits;
-	cpumask_t mask;
+	int sender = bcp->cpu;
+	cpumask_t *mask = bcp->uvhub_master->cpumask;
+	struct bau_control *smaster = bcp->socket_master;
 	struct reset_args reset_args;
 
 	reset_args.sender = sender;
-	cpus_clear(mask);
+	cpus_clear(*mask);
 	/* find a single cpu for each uvhub in this distribution mask */
-	maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE;
-	for (uvhub = 0; uvhub < maskbits; uvhub++) {
+	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
+	/* each bit is a pnode relative to the partition base pnode */
+	for (pnode = 0; pnode < maskbits; pnode++) {
 		int cpu;
-		if (!bau_uvhub_isset(uvhub, distribution))
+		if (!bau_uvhub_isset(pnode, distribution))
 			continue;
-		/* find a cpu for this uvhub */
-		cpu = uvhub_to_first_cpu(uvhub);
-		cpu_set(cpu, mask);
+		apnode = pnode + bcp->partition_base_pnode;
+		cpu = pnode_to_first_cpu(apnode, smaster);
+		cpu_set(cpu, *mask);
 	}
 
 	/* IPI all cpus; preemption is already disabled */
-	smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1);
+	smp_call_function_many(mask, do_reset, (void *)&reset_args, 1);
 	return;
 }
 
@@ -604,7 +612,7 @@ static void destination_plugged(struct bau_desc *bau_desc,
 		quiesce_local_uvhub(hmaster);
 
 		spin_lock(&hmaster->queue_lock);
-		reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+		reset_with_ipi(&bau_desc->distribution, bcp);
 		spin_unlock(&hmaster->queue_lock);
 
 		end_uvhub_quiesce(hmaster);
@@ -626,7 +634,7 @@ static void destination_timeout(struct bau_desc *bau_desc,
 		quiesce_local_uvhub(hmaster);
 
 		spin_lock(&hmaster->queue_lock);
-		reset_with_ipi(&bau_desc->distribution, bcp->cpu);
+		reset_with_ipi(&bau_desc->distribution, bcp);
 		spin_unlock(&hmaster->queue_lock);
 
 		end_uvhub_quiesce(hmaster);
@@ -1334,9 +1342,10 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
 
 	instr[count] = '\0';
 
-	bcp = &per_cpu(bau_control, smp_processor_id());
-
+	cpu = get_cpu();
+	bcp = &per_cpu(bau_control, cpu);
 	ret = parse_tunables_write(bcp, instr, count);
+	put_cpu();
 	if (ret)
 		return ret;
 
@@ -1687,6 +1696,16 @@ static void make_per_cpu_thp(struct bau_control *smaster)
 }
 
 /*
+ * Each uvhub is to get a local cpumask.
+ */
+static void make_per_hub_cpumask(struct bau_control *hmaster)
+{
+	int sz = sizeof(cpumask_t);
+
+	hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode);
+}
+
+/*
  * Initialize all the per_cpu information for the cpu's on a given socket,
  * given what has been gathered into the socket_desc struct.
  * And reports the chosen hub and socket masters back to the caller.
@@ -1751,11 +1770,12 @@ static int __init summarize_uvhub_sockets(int nuvhubs,
 				sdp = &bdp->socket[socket];
 				if (scan_sock(sdp, bdp, &smaster, &hmaster))
 					return 1;
+				make_per_cpu_thp(smaster);
 			}
 			socket++;
 			socket_mask = (socket_mask >> 1);
-			make_per_cpu_thp(smaster);
 		}
+		make_per_hub_cpumask(hmaster);
 	}
 	return 0;
 }
@@ -1777,15 +1797,20 @@ static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
 	uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
 
 	if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
-		return 1;
+		goto fail;
 
 	if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
-		return 1;
+		goto fail;
 
 	kfree(uvhub_descs);
 	kfree(uvhub_mask);
 	init_per_cpu_tunables();
 	return 0;
+
+fail:
+	kfree(uvhub_descs);
+	kfree(uvhub_mask);
+	return 1;
 }
 
 /*
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index bef0bc96240..5d179502a52 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -26,6 +26,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
 export CPPFLAGS_vdso.lds += -P -C
 
 VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
+			-Wl,--no-undefined \
 		      	-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
 
 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index a724905fdae..6bc0e723b6e 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -6,7 +6,6 @@
  *
  * The code should have no internal unresolved relocations.
  * Check with readelf after changing.
- * Also alternative() doesn't work.
  */
 
 /* Disable profiling for userspace code: */
@@ -17,6 +16,7 @@
 #include <linux/time.h>
 #include <linux/string.h>
 #include <asm/vsyscall.h>
+#include <asm/fixmap.h>
 #include <asm/vgtod.h>
 #include <asm/timex.h>
 #include <asm/hpet.h>
@@ -25,6 +25,43 @@
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
+notrace static cycle_t vread_tsc(void)
+{
+	cycle_t ret;
+	u64 last;
+
+	/*
+	 * Empirically, a fence (of type that depends on the CPU)
+	 * before rdtsc is enough to ensure that rdtsc is ordered
+	 * with respect to loads.  The various CPU manuals are unclear
+	 * as to whether rdtsc can be reordered with later loads,
+	 * but no one has ever seen it happen.
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+
+	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+	if (likely(ret >= last))
+		return ret;
+
+	/*
+	 * GCC likes to generate cmov here, but this branch is extremely
+	 * predictable (it's just a funciton of time and the likely is
+	 * very likely) and there's a data dependence, so force GCC
+	 * to generate a branch instead.  I don't barrier() because
+	 * we don't actually need a barrier, and if this function
+	 * ever gets inlined it will generate worse code.
+	 */
+	asm volatile ("");
+	return last;
+}
+
+static notrace cycle_t vread_hpet(void)
+{
+	return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
+
 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 {
 	long ret;
@@ -36,9 +73,12 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
 notrace static inline long vgetns(void)
 {
 	long v;
-	cycles_t (*vread)(void);
-	vread = gtod->clock.vread;
-	v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask;
+	cycles_t cycles;
+	if (gtod->clock.vclock_mode == VCLOCK_TSC)
+		cycles = vread_tsc();
+	else
+		cycles = vread_hpet();
+	v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
 	return (v * gtod->clock.mult) >> gtod->clock.shift;
 }
 
@@ -116,21 +156,21 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
 
 notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 {
-	if (likely(gtod->sysctl_enabled))
-		switch (clock) {
-		case CLOCK_REALTIME:
-			if (likely(gtod->clock.vread))
-				return do_realtime(ts);
-			break;
-		case CLOCK_MONOTONIC:
-			if (likely(gtod->clock.vread))
-				return do_monotonic(ts);
-			break;
-		case CLOCK_REALTIME_COARSE:
-			return do_realtime_coarse(ts);
-		case CLOCK_MONOTONIC_COARSE:
-			return do_monotonic_coarse(ts);
-		}
+	switch (clock) {
+	case CLOCK_REALTIME:
+		if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
+			return do_realtime(ts);
+		break;
+	case CLOCK_MONOTONIC:
+		if (likely(gtod->clock.vclock_mode != VCLOCK_NONE))
+			return do_monotonic(ts);
+		break;
+	case CLOCK_REALTIME_COARSE:
+		return do_realtime_coarse(ts);
+	case CLOCK_MONOTONIC_COARSE:
+		return do_monotonic_coarse(ts);
+	}
+
 	return vdso_fallback_gettime(clock, ts);
 }
 int clock_gettime(clockid_t, struct timespec *)
@@ -139,7 +179,7 @@ int clock_gettime(clockid_t, struct timespec *)
 notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 {
 	long ret;
-	if (likely(gtod->sysctl_enabled && gtod->clock.vread)) {
+	if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) {
 		if (likely(tv != NULL)) {
 			BUILD_BUG_ON(offsetof(struct timeval, tv_usec) !=
 				     offsetof(struct timespec, tv_nsec) ||
@@ -161,27 +201,14 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
 int gettimeofday(struct timeval *, struct timezone *)
 	__attribute__((weak, alias("__vdso_gettimeofday")));
 
-/* This will break when the xtime seconds get inaccurate, but that is
- * unlikely */
-
-static __always_inline long time_syscall(long *t)
-{
-	long secs;
-	asm volatile("syscall"
-		     : "=a" (secs)
-		     : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
-	return secs;
-}
-
+/*
+ * This will break when the xtime seconds get inaccurate, but that is
+ * unlikely
+ */
 notrace time_t __vdso_time(time_t *t)
 {
-	time_t result;
-
-	if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
-		return time_syscall(t);
-
 	/* This is atomic on x86_64 so we don't need any locks. */
-	result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
+	time_t result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
 
 	if (t)
 		*t = result;
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 1d3aa6b8718..1b979c12ba8 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,10 +1,21 @@
+#include <asm/page_types.h>
+#include <linux/linkage.h>
 #include <linux/init.h>
 
-__INITDATA
+__PAGE_ALIGNED_DATA
 
 	.globl vdso_start, vdso_end
+	.align PAGE_SIZE
 vdso_start:
 	.incbin "arch/x86/vdso/vdso.so"
 vdso_end:
 
-__FINIT
+.previous
+
+	.globl vdso_pages
+	.bss
+	.align 8
+	.type vdso_pages, @object
+vdso_pages:
+	.zero (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE * 8
+	.size vdso_pages, .-vdso_pages
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7abd2be0f9b..316fbca3490 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -14,41 +14,61 @@
 #include <asm/vgtod.h>
 #include <asm/proto.h>
 #include <asm/vdso.h>
+#include <asm/page.h>
 
 unsigned int __read_mostly vdso_enabled = 1;
 
 extern char vdso_start[], vdso_end[];
 extern unsigned short vdso_sync_cpuid;
 
-static struct page **vdso_pages;
+extern struct page *vdso_pages[];
 static unsigned vdso_size;
 
-static int __init init_vdso_vars(void)
+static void __init patch_vdso(void *vdso, size_t len)
+{
+	Elf64_Ehdr *hdr = vdso;
+	Elf64_Shdr *sechdrs, *alt_sec = 0;
+	char *secstrings;
+	void *alt_data;
+	int i;
+
+	BUG_ON(len < sizeof(Elf64_Ehdr));
+	BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0);
+
+	sechdrs = (void *)hdr + hdr->e_shoff;
+	secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		Elf64_Shdr *shdr = &sechdrs[i];
+		if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) {
+			alt_sec = shdr;
+			goto found;
+		}
+	}
+
+	/* If we get here, it's probably a bug. */
+	pr_warning("patch_vdso: .altinstructions not found\n");
+	return;  /* nothing to patch */
+
+found:
+	alt_data = (void *)hdr + alt_sec->sh_offset;
+	apply_alternatives(alt_data, alt_data + alt_sec->sh_size);
+}
+
+static int __init init_vdso(void)
 {
 	int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
 	int i;
 
+	patch_vdso(vdso_start, vdso_end - vdso_start);
+
 	vdso_size = npages << PAGE_SHIFT;
-	vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
-	if (!vdso_pages)
-		goto oom;
-	for (i = 0; i < npages; i++) {
-		struct page *p;
-		p = alloc_page(GFP_KERNEL);
-		if (!p)
-			goto oom;
-		vdso_pages[i] = p;
-		copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
-	}
+	for (i = 0; i < npages; i++)
+		vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE);
 
 	return 0;
-
- oom:
-	printk("Cannot allocate vdso\n");
-	vdso_enabled = 0;
-	return -ENOMEM;
 }
-subsys_initcall(init_vdso_vars);
+subsys_initcall(init_vdso);
 
 struct linux_binprm;
 
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 17c565de3d6..45e94aca5bc 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -15,8 +15,10 @@ obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			grant-table.o suspend.o platform-pci-unplug.o \
 			p2m.o
 
+obj-$(CONFIG_FUNCTION_TRACER) += trace.o
+
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
-
+obj-$(CONFIG_XEN_DOM0)		+= vga.o
 obj-$(CONFIG_SWIOTLB_XEN)	+= pci-swiotlb-xen.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5525163a039..974a528458a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -341,6 +341,8 @@ static void xen_set_ldt(const void *addr, unsigned entries)
 	struct mmuext_op *op;
 	struct multicall_space mcs = xen_mc_entry(sizeof(*op));
 
+	trace_xen_cpu_set_ldt(addr, entries);
+
 	op = mcs.args;
 	op->cmd = MMUEXT_SET_LDT;
 	op->arg1.linear_addr = (unsigned long)addr;
@@ -496,6 +498,8 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
 	u64 entry = *(u64 *)ptr;
 
+	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);
+
 	preempt_disable();
 
 	xen_mc_flush();
@@ -565,6 +569,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
 	unsigned long p = (unsigned long)&dt[entrynum];
 	unsigned long start, end;
 
+	trace_xen_cpu_write_idt_entry(dt, entrynum, g);
+
 	preempt_disable();
 
 	start = __this_cpu_read(idt_desc.address);
@@ -619,6 +625,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
 	static DEFINE_SPINLOCK(lock);
 	static struct trap_info traps[257];
 
+	trace_xen_cpu_load_idt(desc);
+
 	spin_lock(&lock);
 
 	__get_cpu_var(idt_desc) = *desc;
@@ -637,6 +645,8 @@ static void xen_load_idt(const struct desc_ptr *desc)
 static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 				const void *desc, int type)
 {
+	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
 	preempt_disable();
 
 	switch (type) {
@@ -665,6 +675,8 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 					    const void *desc, int type)
 {
+	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);
+
 	switch (type) {
 	case DESC_LDT:
 	case DESC_TSS:
@@ -684,7 +696,9 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 static void xen_load_sp0(struct tss_struct *tss,
 			 struct thread_struct *thread)
 {
-	struct multicall_space mcs = xen_mc_entry(0);
+	struct multicall_space mcs;
+
+	mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
@@ -1248,6 +1262,14 @@ asmlinkage void __init xen_start_kernel(void)
 		if (pci_xen)
 			x86_init.pci.arch_init = pci_xen_init;
 	} else {
+		const struct dom0_vga_console_info *info =
+			(void *)((char *)xen_start_info +
+				 xen_start_info->console.dom0.info_off);
+
+		xen_init_vga(info, xen_start_info->console.dom0.info_size);
+		xen_start_info->console.domU.mfn = 0;
+		xen_start_info->console.domU.evtchn = 0;
+
 		/* Make sure ACS will be enabled */
 		pci_request_acs();
 	}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0ccccb67a99..f987bde77c4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -48,6 +48,8 @@
 #include <linux/memblock.h>
 #include <linux/seq_file.h>
 
+#include <trace/events/xen.h>
+
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
@@ -194,6 +196,8 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 	struct multicall_space mcs;
 	struct mmu_update *u;
 
+	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
+
 	mcs = xen_mc_entry(sizeof(*u));
 	u = mcs.args;
 
@@ -225,6 +229,24 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
 	*u = *update;
 }
 
+static void xen_extend_mmuext_op(const struct mmuext_op *op)
+{
+	struct multicall_space mcs;
+	struct mmuext_op *u;
+
+	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
+
+	if (mcs.mc != NULL) {
+		mcs.mc->args[1]++;
+	} else {
+		mcs = __xen_mc_entry(sizeof(*u));
+		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+	}
+
+	u = mcs.args;
+	*u = *op;
+}
+
 static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 {
 	struct mmu_update u;
@@ -245,6 +267,8 @@ static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 static void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
+	trace_xen_mmu_set_pmd(ptr, val);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!xen_page_pinned(ptr)) {
@@ -282,22 +306,30 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
 	return true;
 }
 
-static void xen_set_pte(pte_t *ptep, pte_t pteval)
+static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
 {
 	if (!xen_batched_set_pte(ptep, pteval))
 		native_set_pte(ptep, pteval);
 }
 
+static void xen_set_pte(pte_t *ptep, pte_t pteval)
+{
+	trace_xen_mmu_set_pte(ptep, pteval);
+	__xen_set_pte(ptep, pteval);
+}
+
 static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
-	xen_set_pte(ptep, pteval);
+	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
+	__xen_set_pte(ptep, pteval);
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
 				 unsigned long addr, pte_t *ptep)
 {
 	/* Just return the pte as-is.  We preserve the bits on commit */
+	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
 	return *ptep;
 }
 
@@ -306,6 +338,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 {
 	struct mmu_update u;
 
+	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
 	xen_mc_batch();
 
 	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
@@ -530,6 +563,8 @@ static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 static void xen_set_pud(pud_t *ptr, pud_t val)
 {
+	trace_xen_mmu_set_pud(ptr, val);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!xen_page_pinned(ptr)) {
@@ -543,17 +578,20 @@ static void xen_set_pud(pud_t *ptr, pud_t val)
 #ifdef CONFIG_X86_PAE
 static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
+	trace_xen_mmu_set_pte_atomic(ptep, pte);
 	set_64bit((u64 *)ptep, native_pte_val(pte));
 }
 
 static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
+	trace_xen_mmu_pte_clear(mm, addr, ptep);
 	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
 		native_pte_clear(mm, addr, ptep);
 }
 
 static void xen_pmd_clear(pmd_t *pmdp)
 {
+	trace_xen_mmu_pmd_clear(pmdp);
 	set_pmd(pmdp, __pmd(0));
 }
 #endif	/* CONFIG_X86_PAE */
@@ -629,6 +667,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
 	pgd_t *user_ptr = xen_get_user_pgd(ptr);
 
+	trace_xen_mmu_set_pgd(ptr, user_ptr, val);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
 	if (!xen_page_pinned(ptr)) {
@@ -788,14 +828,12 @@ static void xen_pte_unlock(void *v)
 
 static void xen_do_pin(unsigned level, unsigned long pfn)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
+	struct mmuext_op op;
 
-	mcs = __xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-	op->cmd = level;
-	op->arg1.mfn = pfn_to_mfn(pfn);
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	op.cmd = level;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+
+	xen_extend_mmuext_op(&op);
 }
 
 static int xen_pin_page(struct mm_struct *mm, struct page *page,
@@ -863,6 +901,8 @@ static int xen_pin_page(struct mm_struct *mm, struct page *page,
    read-only, and can be pinned. */
 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 {
+	trace_xen_mmu_pgd_pin(mm, pgd);
+
 	xen_mc_batch();
 
 	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
@@ -988,6 +1028,8 @@ static int xen_unpin_page(struct mm_struct *mm, struct page *page,
 /* Release a pagetables pages back as normal RW */
 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 {
+	trace_xen_mmu_pgd_unpin(mm, pgd);
+
 	xen_mc_batch();
 
 	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1196,6 +1238,8 @@ static void xen_flush_tlb(void)
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
+	trace_xen_mmu_flush_tlb(0);
+
 	preempt_disable();
 
 	mcs = xen_mc_entry(sizeof(*op));
@@ -1214,6 +1258,8 @@ static void xen_flush_tlb_single(unsigned long addr)
 	struct mmuext_op *op;
 	struct multicall_space mcs;
 
+	trace_xen_mmu_flush_tlb_single(addr);
+
 	preempt_disable();
 
 	mcs = xen_mc_entry(sizeof(*op));
@@ -1240,6 +1286,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
+	trace_xen_mmu_flush_tlb_others(cpus, mm, va);
+
 	if (cpumask_empty(cpus))
 		return;		/* nothing to do */
 
@@ -1275,10 +1323,11 @@ static void set_current_cr3(void *v)
 
 static void __xen_write_cr3(bool kernel, unsigned long cr3)
 {
-	struct mmuext_op *op;
-	struct multicall_space mcs;
+	struct mmuext_op op;
 	unsigned long mfn;
 
+	trace_xen_mmu_write_cr3(kernel, cr3);
+
 	if (cr3)
 		mfn = pfn_to_mfn(PFN_DOWN(cr3));
 	else
@@ -1286,13 +1335,10 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
 
 	WARN_ON(mfn == 0 && kernel);
 
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-	op->arg1.mfn = mfn;
+	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+	op.arg1.mfn = mfn;
 
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+	xen_extend_mmuext_op(&op);
 
 	if (kernel) {
 		percpu_write(xen_cr3, cr3);
@@ -1451,19 +1497,52 @@ static void __init xen_release_pmd_init(unsigned long pfn)
 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
+static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct multicall_space mcs;
+	struct mmuext_op *op;
+
+	mcs = __xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = cmd;
+	op->arg1.mfn = pfn_to_mfn(pfn);
+
+	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+}
+
+static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+	struct multicall_space mcs;
+	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
+
+	mcs = __xen_mc_entry(0);
+	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
+				pfn_pte(pfn, prot), 0);
+}
+
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
+				    unsigned level)
 {
-	struct page *page = pfn_to_page(pfn);
+	bool pinned = PagePinned(virt_to_page(mm->pgd));
+
+	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
+
+	if (pinned) {
+		struct page *page = pfn_to_page(pfn);
 
-	if (PagePinned(virt_to_page(mm->pgd))) {
 		SetPagePinned(page);
 
 		if (!PageHighMem(page)) {
-			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+			xen_mc_batch();
+
+			__set_pfn_prot(pfn, PAGE_KERNEL_RO);
+
 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+
+			xen_mc_issue(PARAVIRT_LAZY_MMU);
 		} else {
 			/* make sure there are no stray mappings of
 			   this page */
@@ -1483,15 +1562,23 @@ static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
 }
 
 /* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
+static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
 {
 	struct page *page = pfn_to_page(pfn);
+	bool pinned = PagePinned(page);
 
-	if (PagePinned(page)) {
+	trace_xen_mmu_release_ptpage(pfn, level, pinned);
+
+	if (pinned) {
 		if (!PageHighMem(page)) {
+			xen_mc_batch();
+
 			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+
+			__set_pfn_prot(pfn, PAGE_KERNEL);
+
+			xen_mc_issue(PARAVIRT_LAZY_MMU);
 		}
 		ClearPagePinned(page);
 	}
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 1b2b73ff0a6..0d82003e76a 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -30,12 +30,13 @@
 
 #define MC_BATCH	32
 
-#define MC_DEBUG	1
+#define MC_DEBUG	0
 
 #define MC_ARGS		(MC_BATCH * 16)
 
 
 struct mc_buffer {
+	unsigned mcidx, argidx, cbidx;
 	struct multicall_entry entries[MC_BATCH];
 #if MC_DEBUG
 	struct multicall_entry debug[MC_BATCH];
@@ -46,85 +47,15 @@ struct mc_buffer {
 		void (*fn)(void *);
 		void *data;
 	} callbacks[MC_BATCH];
-	unsigned mcidx, argidx, cbidx;
 };
 
 static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
 DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
 
-/* flush reasons 0- slots, 1- args, 2- callbacks */
-enum flush_reasons
-{
-	FL_SLOTS,
-	FL_ARGS,
-	FL_CALLBACKS,
-
-	FL_N_REASONS
-};
-
-#ifdef CONFIG_XEN_DEBUG_FS
-#define NHYPERCALLS	40		/* not really */
-
-static struct {
-	unsigned histo[MC_BATCH+1];
-
-	unsigned issued;
-	unsigned arg_total;
-	unsigned hypercalls;
-	unsigned histo_hypercalls[NHYPERCALLS];
-
-	unsigned flush[FL_N_REASONS];
-} mc_stats;
-
-static u8 zero_stats;
-
-static inline void check_zero(void)
-{
-	if (unlikely(zero_stats)) {
-		memset(&mc_stats, 0, sizeof(mc_stats));
-		zero_stats = 0;
-	}
-}
-
-static void mc_add_stats(const struct mc_buffer *mc)
-{
-	int i;
-
-	check_zero();
-
-	mc_stats.issued++;
-	mc_stats.hypercalls += mc->mcidx;
-	mc_stats.arg_total += mc->argidx;
-
-	mc_stats.histo[mc->mcidx]++;
-	for(i = 0; i < mc->mcidx; i++) {
-		unsigned op = mc->entries[i].op;
-		if (op < NHYPERCALLS)
-			mc_stats.histo_hypercalls[op]++;
-	}
-}
-
-static void mc_stats_flush(enum flush_reasons idx)
-{
-	check_zero();
-
-	mc_stats.flush[idx]++;
-}
-
-#else  /* !CONFIG_XEN_DEBUG_FS */
-
-static inline void mc_add_stats(const struct mc_buffer *mc)
-{
-}
-
-static inline void mc_stats_flush(enum flush_reasons idx)
-{
-}
-#endif	/* CONFIG_XEN_DEBUG_FS */
-
 void xen_mc_flush(void)
 {
 	struct mc_buffer *b = &__get_cpu_var(mc_buffer);
+	struct multicall_entry *mc;
 	int ret = 0;
 	unsigned long flags;
 	int i;
@@ -135,9 +66,26 @@ void xen_mc_flush(void)
 	   something in the middle */
 	local_irq_save(flags);
 
-	mc_add_stats(b);
+	trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx);
+
+	switch (b->mcidx) {
+	case 0:
+		/* no-op */
+		BUG_ON(b->argidx != 0);
+		break;
+
+	case 1:
+		/* Singleton multicall - bypass multicall machinery
+		   and just do the call directly. */
+		mc = &b->entries[0];
+
+		mc->result = privcmd_call(mc->op,
+					  mc->args[0], mc->args[1], mc->args[2], 
+					  mc->args[3], mc->args[4]);
+		ret = mc->result < 0;
+		break;
 
-	if (b->mcidx) {
+	default:
 #if MC_DEBUG
 		memcpy(b->debug, b->entries,
 		       b->mcidx * sizeof(struct multicall_entry));
@@ -164,11 +112,10 @@ void xen_mc_flush(void)
 			}
 		}
 #endif
+	}
 
-		b->mcidx = 0;
-		b->argidx = 0;
-	} else
-		BUG_ON(b->argidx != 0);
+	b->mcidx = 0;
+	b->argidx = 0;
 
 	for (i = 0; i < b->cbidx; i++) {
 		struct callback *cb = &b->callbacks[i];
@@ -188,18 +135,21 @@ struct multicall_space __xen_mc_entry(size_t args)
 	struct multicall_space ret;
 	unsigned argidx = roundup(b->argidx, sizeof(u64));
 
+	trace_xen_mc_entry_alloc(args);
+
 	BUG_ON(preemptible());
 	BUG_ON(b->argidx >= MC_ARGS);
 
-	if (b->mcidx == MC_BATCH ||
-	    (argidx + args) >= MC_ARGS) {
-		mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
+	if (unlikely(b->mcidx == MC_BATCH ||
+		     (argidx + args) >= MC_ARGS)) {
+		trace_xen_mc_flush_reason((b->mcidx == MC_BATCH) ?
+					  XEN_MC_FL_BATCH : XEN_MC_FL_ARGS);
 		xen_mc_flush();
 		argidx = roundup(b->argidx, sizeof(u64));
 	}
 
 	ret.mc = &b->entries[b->mcidx];
-#ifdef MC_DEBUG
+#if MC_DEBUG
 	b->caller[b->mcidx] = __builtin_return_address(0);
 #endif
 	b->mcidx++;
@@ -218,20 +168,25 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
 	BUG_ON(preemptible());
 	BUG_ON(b->argidx >= MC_ARGS);
 
-	if (b->mcidx == 0)
-		return ret;
-
-	if (b->entries[b->mcidx - 1].op != op)
-		return ret;
+	if (unlikely(b->mcidx == 0 ||
+		     b->entries[b->mcidx - 1].op != op)) {
+		trace_xen_mc_extend_args(op, size, XEN_MC_XE_BAD_OP);
+		goto out;
+	}
 
-	if ((b->argidx + size) >= MC_ARGS)
-		return ret;
+	if (unlikely((b->argidx + size) >= MC_ARGS)) {
+		trace_xen_mc_extend_args(op, size, XEN_MC_XE_NO_SPACE);
+		goto out;
+	}
 
 	ret.mc = &b->entries[b->mcidx - 1];
 	ret.args = &b->args[b->argidx];
 	b->argidx += size;
 
 	BUG_ON(b->argidx >= MC_ARGS);
+
+	trace_xen_mc_extend_args(op, size, XEN_MC_XE_OK);
+out:
 	return ret;
 }
 
@@ -241,43 +196,13 @@ void xen_mc_callback(void (*fn)(void *), void *data)
 	struct callback *cb;
 
 	if (b->cbidx == MC_BATCH) {
-		mc_stats_flush(FL_CALLBACKS);
+		trace_xen_mc_flush_reason(XEN_MC_FL_CALLBACK);
 		xen_mc_flush();
 	}
 
+	trace_xen_mc_callback(fn, data);
+
 	cb = &b->callbacks[b->cbidx++];
 	cb->fn = fn;
 	cb->data = data;
 }
-
-#ifdef CONFIG_XEN_DEBUG_FS
-
-static struct dentry *d_mc_debug;
-
-static int __init xen_mc_debugfs(void)
-{
-	struct dentry *d_xen = xen_init_debugfs();
-
-	if (d_xen == NULL)
-		return -ENOMEM;
-
-	d_mc_debug = debugfs_create_dir("multicalls", d_xen);
-
-	debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
-
-	debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
-	debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
-	debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
-
-	xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
-				     mc_stats.histo, MC_BATCH);
-	xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
-				     mc_stats.histo_hypercalls, NHYPERCALLS);
-	xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
-				     mc_stats.flush, FL_N_REASONS);
-
-	return 0;
-}
-fs_initcall(xen_mc_debugfs);
-
-#endif	/* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 4ec8035e321..dee79b78a90 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -1,6 +1,8 @@
 #ifndef _XEN_MULTICALLS_H
 #define _XEN_MULTICALLS_H
 
+#include <trace/events/xen.h>
+
 #include "xen-ops.h"
 
 /* Multicalls */
@@ -20,8 +22,10 @@ DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags);
 static inline void xen_mc_batch(void)
 {
 	unsigned long flags;
+
 	/* need to disable interrupts until this entry is complete */
 	local_irq_save(flags);
+	trace_xen_mc_batch(paravirt_get_lazy_mode());
 	__this_cpu_write(xen_mc_irq_flags, flags);
 }
 
@@ -37,6 +41,8 @@ void xen_mc_flush(void);
 /* Issue a multicall if we're not in a lazy mode */
 static inline void xen_mc_issue(unsigned mode)
 {
+	trace_xen_mc_issue(mode);
+
 	if ((paravirt_get_lazy_mode() & mode) == 0)
 		xen_mc_flush();
 
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 25c52f94a27..ffcf2615640 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
 #ifdef CONFIG_XEN_PVHVM
 static int xen_emul_unplug;
 
-static int __init check_platform_magic(void)
+static int check_platform_magic(void)
 {
 	short magic;
 	char protocol;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 60aeeb56948..a9627e2e329 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -9,6 +9,7 @@
 #include <linux/mm.h>
 #include <linux/pm.h>
 #include <linux/memblock.h>
+#include <linux/cpuidle.h>
 
 #include <asm/elf.h>
 #include <asm/vdso.h>
@@ -426,7 +427,7 @@ void __init xen_arch_setup(void)
 #ifdef CONFIG_X86_32
 	boot_cpu_data.hlt_works_ok = 1;
 #endif
-	pm_idle = default_idle;
+	disable_cpuidle();
 	boot_option_idle_override = IDLE_HALT;
 
 	fiddle_vdso();
diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
new file mode 100644
index 00000000000..734beba2a08
--- /dev/null
+++ b/arch/x86/xen/trace.c
@@ -0,0 +1,61 @@
+#include <linux/ftrace.h>
+
+#define N(x)	[__HYPERVISOR_##x] = "("#x")"
+static const char *xen_hypercall_names[] = {
+	N(set_trap_table),
+	N(mmu_update),
+	N(set_gdt),
+	N(stack_switch),
+	N(set_callbacks),
+	N(fpu_taskswitch),
+	N(sched_op_compat),
+	N(dom0_op),
+	N(set_debugreg),
+	N(get_debugreg),
+	N(update_descriptor),
+	N(memory_op),
+	N(multicall),
+	N(update_va_mapping),
+	N(set_timer_op),
+	N(event_channel_op_compat),
+	N(xen_version),
+	N(console_io),
+	N(physdev_op_compat),
+	N(grant_table_op),
+	N(vm_assist),
+	N(update_va_mapping_otherdomain),
+	N(iret),
+	N(vcpu_op),
+	N(set_segment_base),
+	N(mmuext_op),
+	N(acm_op),
+	N(nmi_op),
+	N(sched_op),
+	N(callback_op),
+	N(xenoprof_op),
+	N(event_channel_op),
+	N(physdev_op),
+	N(hvm_op),
+
+/* Architecture-specific hypercall definitions. */
+	N(arch_0),
+	N(arch_1),
+	N(arch_2),
+	N(arch_3),
+	N(arch_4),
+	N(arch_5),
+	N(arch_6),
+	N(arch_7),
+};
+#undef N
+
+static const char *xen_hypercall_name(unsigned op)
+{
+	if (op < ARRAY_SIZE(xen_hypercall_names) && xen_hypercall_names[op] != NULL)
+		return xen_hypercall_names[op];
+
+	return "";
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/xen.h>
diff --git a/arch/x86/xen/vga.c b/arch/x86/xen/vga.c
new file mode 100644
index 00000000000..1cd7f4d11e2
--- /dev/null
+++ b/arch/x86/xen/vga.c
@@ -0,0 +1,67 @@
+#include <linux/screen_info.h>
+#include <linux/init.h>
+
+#include <asm/bootparam.h>
+#include <asm/setup.h>
+
+#include <xen/interface/xen.h>
+
+#include "xen-ops.h"
+
+void __init xen_init_vga(const struct dom0_vga_console_info *info, size_t size)
+{
+	struct screen_info *screen_info = &boot_params.screen_info;
+
+	/* This is drawn from a dump from vgacon:startup in
+	 * standard Linux. */
+	screen_info->orig_video_mode = 3;
+	screen_info->orig_video_isVGA = 1;
+	screen_info->orig_video_lines = 25;
+	screen_info->orig_video_cols = 80;
+	screen_info->orig_video_ega_bx = 3;
+	screen_info->orig_video_points = 16;
+	screen_info->orig_y = screen_info->orig_video_lines - 1;
+
+	switch (info->video_type) {
+	case XEN_VGATYPE_TEXT_MODE_3:
+		if (size < offsetof(struct dom0_vga_console_info, u.text_mode_3)
+		    + sizeof(info->u.text_mode_3))
+			break;
+		screen_info->orig_video_lines = info->u.text_mode_3.rows;
+		screen_info->orig_video_cols = info->u.text_mode_3.columns;
+		screen_info->orig_x = info->u.text_mode_3.cursor_x;
+		screen_info->orig_y = info->u.text_mode_3.cursor_y;
+		screen_info->orig_video_points =
+			info->u.text_mode_3.font_height;
+		break;
+
+	case XEN_VGATYPE_VESA_LFB:
+		if (size < offsetof(struct dom0_vga_console_info,
+				    u.vesa_lfb.gbl_caps))
+			break;
+		screen_info->orig_video_isVGA = VIDEO_TYPE_VLFB;
+		screen_info->lfb_width = info->u.vesa_lfb.width;
+		screen_info->lfb_height = info->u.vesa_lfb.height;
+		screen_info->lfb_depth = info->u.vesa_lfb.bits_per_pixel;
+		screen_info->lfb_base = info->u.vesa_lfb.lfb_base;
+		screen_info->lfb_size = info->u.vesa_lfb.lfb_size;
+		screen_info->lfb_linelength = info->u.vesa_lfb.bytes_per_line;
+		screen_info->red_size = info->u.vesa_lfb.red_size;
+		screen_info->red_pos = info->u.vesa_lfb.red_pos;
+		screen_info->green_size = info->u.vesa_lfb.green_size;
+		screen_info->green_pos = info->u.vesa_lfb.green_pos;
+		screen_info->blue_size = info->u.vesa_lfb.blue_size;
+		screen_info->blue_pos = info->u.vesa_lfb.blue_pos;
+		screen_info->rsvd_size = info->u.vesa_lfb.rsvd_size;
+		screen_info->rsvd_pos = info->u.vesa_lfb.rsvd_pos;
+		if (size >= offsetof(struct dom0_vga_console_info,
+				     u.vesa_lfb.gbl_caps)
+		    + sizeof(info->u.vesa_lfb.gbl_caps))
+			screen_info->capabilities = info->u.vesa_lfb.gbl_caps;
+		if (size >= offsetof(struct dom0_vga_console_info,
+				     u.vesa_lfb.mode_attrs)
+		    + sizeof(info->u.vesa_lfb.mode_attrs))
+			screen_info->vesa_attributes = info->u.vesa_lfb.mode_attrs;
+		break;
+	}
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 97dfdc8757b..b095739ccd4 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -88,6 +88,17 @@ static inline void xen_uninit_lock_cpu(int cpu)
 }
 #endif
 
+struct dom0_vga_console_info;
+
+#ifdef CONFIG_XEN_DOM0
+void __init xen_init_vga(const struct dom0_vga_console_info *, size_t size);
+#else
+static inline void __init xen_init_vga(const struct dom0_vga_console_info *info,
+				       size_t size)
+{
+}
+#endif
+
 /* Declare an asm function, along with symbols needed to make it
    inlineable */
 #define DECL_ASM(ret, name, ...)		\