91 files changed, 4139 insertions, 2271 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
new file mode 100644
index 00000000000..028079065af
--- /dev/null
+++ b/arch/x86/.gitignore
@@ -0,0 +1,3 @@
+boot/compressed/vmlinux
+tools/test_get_len
+
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a2d3a5fbeed..dcb0593b4a6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,9 @@ config SBUS
 config NEED_DMA_MAP_STATE
        def_bool (X86_64 || DMAR || DMA_API_DEBUG)
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 config GENERIC_ISA_DMA
 	def_bool y
 
@@ -1703,6 +1706,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool X86_64
 	depends on NUMA
 
+config USE_PERCPU_NUMA_NODE_ID
+	def_bool X86_64
+	depends on NUMA
+
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
@@ -1923,6 +1930,14 @@ config PCI_MMCONFIG
 	bool "Support mmconfig PCI config space access"
 	depends on X86_64 && PCI && ACPI
 
+config PCI_CNB20LE_QUIRK
+	bool "Read CNB20LE Host Bridge Windows"
+	depends on PCI
+	help
+	  Read the PCI windows out of the CNB20LE host bridge. This allows
+	  PCI hotplug to work on systems with the CNB20LE chipset which do
+	  not have ACPI.
+
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
 	depends on PCI_MSI && ACPI && EXPERIMENTAL
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index bcbd36c4143..5c228129d17 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -77,7 +77,7 @@ int main(int argc, char *argv[])
 	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
 	offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
 
-	printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+	printf(".section \".rodata..compressed\",\"a\",@progbits\n");
 	printf(".globl z_input_len\n");
 	printf("z_input_len = %lu\n", ilen);
 	printf(".globl z_output_len\n");
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index a6f1a59a5b0..5ddabceee12 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -26,8 +26,8 @@ SECTIONS
 		HEAD_TEXT
 		_ehead = . ;
 	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
+	.rodata..compressed : {
+		*(.rodata..compressed)
 	}
 	.text :	{
 		_text = .; 	/* Text */
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 20bb0e1ac68..ff16756a51c 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,9 @@
 #define IN	IN1
 #define KEY	%xmm2
 #define IV	%xmm3
+#define BSWAP_MASK %xmm10
+#define CTR	%xmm11
+#define INC	%xmm12
 
 #define KEYP	%rdi
 #define OUTP	%rsi
@@ -42,6 +45,7 @@
 #define T1	%r10
 #define TKEYP	T1
 #define T2	%r11
+#define TCTR_LOW T2
 
 _key_expansion_128:
 _key_expansion_256a:
@@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec)
 	movups IV, (IVP)
 .Lcbc_dec_just_ret:
 	ret
+
+.align 16
+.Lbswap_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * _aesni_inc_init:	internal ABI
+ *	setup registers used by _aesni_inc
+ * input:
+ *	IV
+ * output:
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ */
+_aesni_inc_init:
+	movaps .Lbswap_mask, BSWAP_MASK
+	movaps IV, CTR
+	PSHUFB_XMM BSWAP_MASK CTR
+	mov $1, TCTR_LOW
+	MOVQ_R64_XMM TCTR_LOW INC
+	MOVQ_R64_XMM CTR TCTR_LOW
+	ret
+
+/*
+ * _aesni_inc:		internal ABI
+ *	Increase IV by 1, IV is in big endian
+ * input:
+ *	IV
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ * output:
+ *	IV:	Increase by 1
+ * changed:
+ *	CTR:	== output IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ */
+_aesni_inc:
+	paddq INC, CTR
+	add $1, TCTR_LOW
+	jnc .Linc_low
+	pslldq $8, INC
+	paddq INC, CTR
+	psrldq $8, INC
+.Linc_low:
+	movaps CTR, IV
+	PSHUFB_XMM BSWAP_MASK IV
+	ret
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+	cmp $16, LEN
+	jb .Lctr_enc_just_ret
+	mov 480(KEYP), KLEN
+	movups (IVP), IV
+	call _aesni_inc_init
+	cmp $64, LEN
+	jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+	movaps IV, STATE1
+	call _aesni_inc
+	movups (INP), IN1
+	movaps IV, STATE2
+	call _aesni_inc
+	movups 0x10(INP), IN2
+	movaps IV, STATE3
+	call _aesni_inc
+	movups 0x20(INP), IN3
+	movaps IV, STATE4
+	call _aesni_inc
+	movups 0x30(INP), IN4
+	call _aesni_enc4
+	pxor IN1, STATE1
+	movups STATE1, (OUTP)
+	pxor IN2, STATE2
+	movups STATE2, 0x10(OUTP)
+	pxor IN3, STATE3
+	movups STATE3, 0x20(OUTP)
+	pxor IN4, STATE4
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lctr_enc_loop4
+	cmp $16, LEN
+	jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+	movaps IV, STATE
+	call _aesni_inc
+	movups (INP), IN
+	call _aesni_enc1
+	pxor IN, STATE
+	movups STATE, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+	movups IV, (IVP)
+.Lctr_enc_just_ret:
+	ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 49c552c060e..2cb3dcc4490 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -18,6 +18,7 @@
 #include <crypto/algapi.h>
 #include <crypto/aes.h>
 #include <crypto/cryptd.h>
+#include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
 
@@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
@@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = {
 	},
 };
 
+static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
+			    struct blkcipher_walk *walk)
+{
+	u8 *ctrblk = walk->iv;
+	u8 keystream[AES_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	aesni_enc(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+	crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc,
+		     struct scatterlist *dst, struct scatterlist *src,
+		     unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+		aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK, walk.iv);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	if (walk.nbytes) {
+		ctr_crypt_final(ctx, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+	.cra_name		= "__ctr-aes-aesni",
+	.cra_driver_name	= "__driver-ctr-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+};
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
-#ifdef HAS_CTR
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
 
-	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
-					     0, 0);
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 	ablk_init_common(tfm, cryptd_tfm);
@@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = {
 			.ivsize		= AES_BLOCK_SIZE,
 			.setkey		= ablk_set_key,
 			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
+			.decrypt	= ablk_encrypt,
 			.geniv		= "chainiv",
 		},
 	},
 };
+
+#ifdef HAS_CTR
+static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher(
+		"rfc3686(__driver-ctr-aes-aesni)", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_rfc3686_ctr_alg = {
+	.cra_name		= "rfc3686(ctr(aes))",
+	.cra_driver_name	= "rfc3686-ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
+	.cra_init		= ablk_rfc3686_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
+			.ivsize	     = CTR_RFC3686_IV_SIZE,
+			.setkey	     = ablk_set_key,
+			.encrypt     = ablk_encrypt,
+			.decrypt     = ablk_decrypt,
+			.geniv	     = "seqiv",
+		},
+	},
+};
 #endif
 
 #ifdef HAS_LRW
@@ -640,13 +746,17 @@ static int __init aesni_init(void)
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
 		goto blk_cbc_err;
+	if ((err = crypto_register_alg(&blk_ctr_alg)))
+		goto blk_ctr_err;
 	if ((err = crypto_register_alg(&ablk_ecb_alg)))
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
-#ifdef HAS_CTR
 	if ((err = crypto_register_alg(&ablk_ctr_alg)))
 		goto ablk_ctr_err;
+#ifdef HAS_CTR
+	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
+		goto ablk_rfc3686_ctr_err;
 #endif
 #ifdef HAS_LRW
 	if ((err = crypto_register_alg(&ablk_lrw_alg)))
@@ -675,13 +785,17 @@ ablk_pcbc_err:
 ablk_lrw_err:
 #endif
 #ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
+ablk_rfc3686_ctr_err:
+#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
-#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
@@ -705,10 +819,12 @@ static void __exit aesni_exit(void)
 	crypto_unregister_alg(&ablk_lrw_alg);
 #endif
 #ifdef HAS_CTR
-	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
+	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 56f462cf22d..aa2c39d968f 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -85,7 +85,6 @@ extern int acpi_ioapic;
 extern int acpi_noirq;
 extern int acpi_strict;
 extern int acpi_disabled;
-extern int acpi_ht;
 extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
@@ -97,7 +96,6 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
 static inline void disable_acpi(void)
 {
 	acpi_disabled = 1;
-	acpi_ht = 0;
 	acpi_pci_disabled = 1;
 	acpi_noirq = 1;
 }
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 2f9047cfaac..48f99f15452 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -7,7 +7,7 @@
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
 #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
 #define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index dca9c545f44..46814591438 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -332,6 +332,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 #endif
 }
 
+#if __GNUC__ >= 4
 #define static_cpu_has(bit)					\
 (								\
 	__builtin_constant_p(boot_cpu_has(bit)) ?		\
@@ -340,6 +341,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 		__static_cpu_has(bit) :				\
 		boot_cpu_has(bit)				\
 )
+#else
+/*
+ * gcc 3.x is too stupid to do the static test; fall back to dynamic.
+ */
+#define static_cpu_has(bit) boot_cpu_has(bit)
+#endif
 
 #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 14cf526091f..280bf7fb6ab 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -7,7 +7,66 @@
 
 #ifdef __ASSEMBLY__
 
+#define REG_NUM_INVALID		100
+
+#define REG_TYPE_R64		0
+#define REG_TYPE_XMM		1
+#define REG_TYPE_INVALID	100
+
+	.macro R64_NUM opd r64
+	\opd = REG_NUM_INVALID
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+	.endm
+
 	.macro XMM_NUM opd xmm
+	\opd = REG_NUM_INVALID
 	.ifc \xmm,%xmm0
 	\opd = 0
 	.endif
@@ -58,13 +117,25 @@
 	.endif
 	.endm
 
+	.macro REG_TYPE type reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 <> REG_NUM_INVALID
+	\type = REG_TYPE_R64
+	.elseif reg_type_xmm <> REG_NUM_INVALID
+	\type = REG_TYPE_XMM
+	.else
+	\type = REG_TYPE_INVALID
+	.endif
+	.endm
+
 	.macro PFX_OPD_SIZE
 	.byte 0x66
 	.endm
 
-	.macro PFX_REX opd1 opd2
-	.if (\opd1 | \opd2) & 8
-	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
 	.endif
 	.endm
 
@@ -145,6 +216,25 @@
 	.byte 0x0f, 0x38, 0xdf
 	MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
 	.endm
+
+	.macro MOVQ_R64_XMM opd1 opd2
+	REG_TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
 #endif
 
 #endif
diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
new file mode 100644
index 00000000000..4470c9ad4a3
--- /dev/null
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_INTEL_SCU_IPC_H_
+#define  _ASM_X86_INTEL_SCU_IPC_H_
+
+/* Read single register */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data);
+
+/* Read two sequential registers */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data);
+
+/* Read four sequential registers */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data);
+
+/* Read a vector */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
+
+/* Write single register */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data);
+
+/* Write two sequential registers */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data);
+
+/* Write four sequential registers */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data);
+
+/* Write a vector */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
+
+/* Update single register based on the mask */
+int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
+
+/*
+ * Indirect register read
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_read(u32 addr, u32 *data);
+
+/*
+ * Indirect register write
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_write(u32 addr, u32 data);
+
+/* Issue commands to the SCU with or without data */
+int intel_scu_ipc_simple_command(int cmd, int sub);
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+							u32 *out, int outlen);
+/* I2C control api */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
+
+/* Update FW version */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
+
+#endif
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16..ff90055c7f0 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
 #define __KVM_HAVE_PIT_STATE2
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -257,6 +258,11 @@ struct kvm_reinject_control {
 /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
 #define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS	0x01
+#define KVM_X86_SHADOW_INT_STI		0x02
 
 /* for KVM_GET/SET_VCPU_EVENTS */
 struct kvm_vcpu_events {
@@ -271,7 +277,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
@@ -284,4 +290,13 @@ struct kvm_vcpu_events {
 	__u32 reserved[10];
 };
 
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13b..0b2729bf207 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
 #ifndef _ASM_X86_KVM_X86_EMULATE_H
 #define _ASM_X86_KVM_X86_EMULATE_H
 
+#include <asm/desc_defs.h>
+
 struct x86_emulate_ctxt;
 
 /*
@@ -63,6 +65,15 @@ struct x86_emulate_ops {
 			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
 
 	/*
+	 * write_std: Write bytes of standard (non-emulated/special) memory.
+	 *            Used for descriptor writing.
+	 *  @addr:  [IN ] Linear address to which to write.
+	 *  @val:   [OUT] Value write to memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to write to memory.
+	 */
+	int (*write_std)(unsigned long addr, void *val,
+			 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+	/*
 	 * fetch: Read bytes of standard (non-emulated/special) memory.
 	 *        Used for instruction fetch.
 	 *  @addr:  [IN ] Linear address from which to read.
@@ -109,6 +120,23 @@ struct x86_emulate_ops {
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
 
+	int (*pio_in_emulated)(int size, unsigned short port, void *val,
+			       unsigned int count, struct kvm_vcpu *vcpu);
+
+	int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+				unsigned int count, struct kvm_vcpu *vcpu);
+
+	bool (*get_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	void (*set_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
+	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+	int (*cpl)(struct kvm_vcpu *vcpu);
+	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
@@ -124,6 +152,12 @@ struct fetch_cache {
 	unsigned long end;
 };
 
+struct read_cache {
+	u8 data[1024];
+	unsigned long pos;
+	unsigned long end;
+};
+
 struct decode_cache {
 	u8 twobyte;
 	u8 b;
@@ -139,7 +173,7 @@ struct decode_cache {
 	u8 seg_override;
 	unsigned int d;
 	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip, eip_orig;
+	unsigned long eip;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
@@ -151,16 +185,15 @@ struct decode_cache {
 	void *modrm_ptr;
 	unsigned long modrm_val;
 	struct fetch_cache fetch;
+	struct read_cache io_read;
 };
 
-#define X86_SHADOW_INT_MOV_SS  1
-#define X86_SHADOW_INT_STI     2
-
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
 
 	unsigned long eflags;
+	unsigned long eip; /* eip before instruction emulation */
 	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
 	int mode;
 	u32 cs_base;
@@ -168,6 +201,7 @@ struct x86_emulate_ctxt {
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
 
+	bool restart; /* restart string instruction after writeback */
 	/* decode cache */
 	struct decode_cache decode;
 };
@@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
 		    struct x86_emulate_ops *ops);
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
 		     struct x86_emulate_ops *ops);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37..76f5483cffe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,15 +171,15 @@ struct kvm_pte_chain {
 union kvm_mmu_page_role {
 	unsigned word;
 	struct {
-		unsigned glevels:4;
 		unsigned level:4;
+		unsigned cr4_pae:1;
 		unsigned quadrant:2;
 		unsigned pad_for_nice_hex_output:6;
 		unsigned direct:1;
 		unsigned access:3;
 		unsigned invalid:1;
-		unsigned cr4_pge:1;
 		unsigned nxe:1;
+		unsigned cr0_wp:1;
 	};
 };
 
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 
-	struct list_head oos_link;
-
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
@@ -204,9 +202,9 @@ struct kvm_mmu_page {
 	 * in this shadow page.
 	 */
 	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	int multimapped;         /* More than one parent_pte? */
-	int root_count;          /* Currently serving as active root */
+	bool multimapped;         /* More than one parent_pte? */
 	bool unsync;
+	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer {
 
 struct kvm_pio_request {
 	unsigned long count;
-	int cur_count;
-	gva_t guest_gva;
 	int in;
 	int port;
 	int size;
-	int string;
-	int down;
-	int rep;
 };
 
 /*
@@ -320,6 +313,7 @@ struct kvm_vcpu_arch {
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
+		bool reinject;
 		u8 nr;
 		u32 error_code;
 	} exception;
@@ -362,8 +356,8 @@ struct kvm_vcpu_arch {
 	u64 *mce_banks;
 
 	/* used for guest single stepping over the given code position */
-	u16 singlestep_cs;
 	unsigned long singlestep_rip;
+
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
 };
@@ -389,6 +383,7 @@ struct kvm_arch {
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
+	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
@@ -461,11 +456,6 @@ struct kvm_vcpu_stat {
 	u32 nmi_injections;
 };
 
-struct descriptor_table {
-	u16 limit;
-	unsigned long base;
-} __attribute__((packed));
-
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -503,12 +493,11 @@ struct kvm_x86_ops {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
-	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
-	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -527,7 +516,8 @@ struct kvm_x86_ops {
 	void (*set_irq)(struct kvm_vcpu *vcpu);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code);
+				bool has_error_code, u32 error_code,
+				bool reinject);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -541,6 +531,8 @@ struct kvm_x86_ops {
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 
+	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
@@ -587,23 +579,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags);
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
-		     unsigned long *rflags);
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
-		     int size, unsigned port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-			   int size, unsigned long count, int down,
-			    gva_t address, int rep, unsigned port);
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
@@ -616,12 +599,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code);
 
 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
@@ -634,6 +620,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
 			   u32 error_code);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
@@ -649,8 +637,6 @@ int emulator_write_emulated(unsigned long addr,
 			    unsigned int bytes,
 			    struct kvm_vcpu *vcpu);
 
-unsigned long segment_base(u16 selector);
-
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
@@ -675,7 +661,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
@@ -724,23 +709,6 @@ static inline void kvm_load_ldt(u16 sel)
 	asm("lldt %0" : : "rm"(sel));
 }
 
-static inline void kvm_get_idt(struct descriptor_table *table)
-{
-	asm("sidt %0" : "=m"(*table));
-}
-
-static inline void kvm_get_gdt(struct descriptor_table *table)
-{
-	asm("sgdt %0" : "=m"(*table));
-}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
-	u16 tr;
-	asm("str %0" : "=g"(tr));
-	return segment_base(tr);
-}
-
 #ifdef CONFIG_X86_64
 static inline unsigned long read_msr(unsigned long msr)
 {
@@ -826,4 +794,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae1420e7d..05eba5e9a8e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,10 +16,23 @@
 #define KVM_FEATURE_CLOCKSOURCE		0
 #define KVM_FEATURE_NOP_IO_DELAY	1
 #define KVM_FEATURE_MMU_OP		2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2        3
+
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT	24
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 /* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6c3fdd631ed..f32a4301c4d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
 static inline void mcheck_intel_therm_init(void) { }
 #endif
 
+/*
+ * Used by APEI to report memory error via /dev/mcelog
+ */
+
+struct cper_sec_mem_err;
+extern void apei_mce_report_mem_error(int corrected,
+				      struct cper_sec_mem_err *mem_err);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473acfa7f..8c7ae431862 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -110,6 +110,7 @@
 #define MSR_AMD64_PATCH_LOADER		0xc0010020
 #define MSR_AMD64_OSVW_ID_LENGTH	0xc0010140
 #define MSR_AMD64_OSVW_STATUS		0xc0010141
+#define MSR_AMD64_DC_CFG		0xc0011022
 #define MSR_AMD64_IBSFETCHCTL		0xc0011030
 #define MSR_AMD64_IBSFETCHLINAD		0xc0011031
 #define MSR_AMD64_IBSFETCHPHYSAD	0xc0011032
@@ -202,8 +203,9 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
-#define FEATURE_CONTROL_LOCKED		(1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED	(1<<2)
+#define FEATURE_CONTROL_LOCKED				(1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1<<2)
 
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
@@ -235,6 +237,8 @@
 
 #define MSR_IA32_MISC_ENABLE		0x000001a0
 
+#define MSR_IA32_TEMPERATURE_TARGET	0x000001a2
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 1a0422348d6..cd2a31dc5fb 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -53,6 +53,8 @@ extern int pcibios_last_bus;
 extern struct pci_bus *pci_root_bus;
 extern struct pci_ops pci_root_ops;
 
+void pcibios_scan_specific_bus(int busn);
+
 /* pci-irq.c */
 
 struct irq_info {
@@ -83,7 +85,7 @@ struct irq_routing_table {
 
 extern unsigned int pcibios_irq_mask;
 
-extern spinlock_t pci_config_lock;
+extern raw_spinlock_t pci_config_lock;
 
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b05400a542f..64a8ebff06f 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -89,7 +89,8 @@
 	P4_CCCR_ENABLE)
 
 /* HT mask */
-#define P4_CCCR_MASK_HT	(P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
+#define P4_CCCR_MASK_HT				\
+	(P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
 
 #define P4_GEN_ESCR_EMASK(class, name, bit)	\
 	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f262..35f2d1948ad 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
 	u64   system_time;
 	u32   tsc_to_system_mul;
 	s8    tsc_shift;
-	u8    pad[3];
+	u8    flags;
+	u8    pad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
 struct pvclock_wall_clock {
@@ -38,5 +39,6 @@ struct pvclock_wall_clock {
 	u32   nsec;
 } __attribute__((__packed__));
 
+#define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8c..cd02f324aa6 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
deleted file mode 100644
index c8e9c8bed3d..00000000000
--- a/arch/x86/include/asm/rdc321x_defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#define PFX	"rdc321x: "
-
-/* General purpose configuration and data registers */
-#define RDC3210_CFGREG_ADDR     0x0CF8
-#define RDC3210_CFGREG_DATA     0x0CFC
-
-#define RDC321X_GPIO_CTRL_REG1	0x48
-#define RDC321X_GPIO_CTRL_REG2	0x84
-#define RDC321X_GPIO_DATA_REG1	0x4c
-#define RDC321X_GPIO_DATA_REG2	0x88
-
-#define RDC321X_MAX_GPIO	58
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index 75af592677e..fb0b1874396 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
 #ifndef _ASM_X86_SCATTERLIST_H
 #define _ASM_X86_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
 #include <asm-generic/scatterlist.h>
 
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+#define ARCH_HAS_SG_CHAIN
+
 #endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 48dcfa62ea0..fd921c3a684 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -15,6 +15,8 @@ static inline int arch_prepare_suspend(void) { return 0; }
 struct saved_context {
 	u16 es, fs, gs, ss;
 	unsigned long cr0, cr2, cr3, cr4;
+	u64 misc_enable;
+	bool misc_enable_saved;
 	struct desc_ptr gdt;
 	struct desc_ptr idt;
 	u16 ldt;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 06284f42b75..8d942afae68 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -27,6 +27,8 @@ struct saved_context {
 	u16 ds, es, fs, gs, ss;
 	unsigned long gs_base, gs_kernel_base, fs_base;
 	unsigned long cr0, cr2, cr3, cr4, cr8;
+	u64 misc_enable;
+	bool misc_enable_saved;
 	unsigned long efer;
 	u16 gdt_pad;
 	u16 gdt_limit;
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4..0e831059ac5 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 event_inj_err;
 	u64 nested_cr3;
 	u64 lbr_ctl;
-	u8 reserved_5[832];
+	u64 reserved_5;
+	u64 next_rip;
+	u8 reserved_6[816];
 };
 
 
@@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
 #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
 
+#define SVM_VM_CR_VALID_MASK	0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
+
 struct __attribute__ ((__packed__)) vmcb_seg {
 	u16 selector;
 	u16 attrib;
@@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
 
 #define	SVM_EXIT_READ_CR0 	0x000
 #define	SVM_EXIT_READ_CR3 	0x003
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 62ba9400cc4..f0b6e5dbc5a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -239,8 +239,8 @@ static inline struct thread_info *current_thread_info(void)
 #define TS_USEDFPU		0x0001	/* FPU was used by this task
 					   this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* true if in idle loop
-					   and not sleeping */
+#define TS_POLLING		0x0004	/* idle task polling need_resched,
+					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c5087d79658..21899cc31e5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -53,33 +53,29 @@
 extern int cpu_to_node_map[];
 
 /* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
+static inline int __cpu_to_node(int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
-#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
+#define early_cpu_to_node __cpu_to_node
+#define cpu_to_node __cpu_to_node
 
 #else /* CONFIG_X86_64 */
 
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
-/* Returns the number of the current Node. */
-DECLARE_PER_CPU(int, node_number);
-#define numa_node_id()		percpu_read(node_number)
-
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-extern int cpu_to_node(int cpu);
+/*
+ * override generic percpu implementation of cpu_to_node
+ */
+extern int __cpu_to_node(int cpu);
+#define cpu_to_node __cpu_to_node
+
 extern int early_cpu_to_node(int cpu);
 
 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
 /* Same function but used if called before per_cpu areas are setup */
 static inline int early_cpu_to_node(int cpu)
 {
@@ -170,6 +166,10 @@ static inline int numa_node_id(void)
 {
 	return 0;
 }
+/*
+ * indicate override:
+ */
+#define numa_node_id numa_node_id
 
 static inline int early_cpu_to_node(int cpu)
 {
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080740e..9e6779f7cf2 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
  *
  */
 
+#include <linux/types.h>
+
 /*
  * Definitions of Primary Processor-Based VM-Execution Controls.
  */
@@ -120,6 +122,8 @@ enum vmcs_field {
 	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
 	GUEST_IA32_PAT			= 0x00002804,
 	GUEST_IA32_PAT_HIGH		= 0x00002805,
+	GUEST_IA32_EFER			= 0x00002806,
+	GUEST_IA32_EFER_HIGH		= 0x00002807,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -130,6 +134,8 @@ enum vmcs_field {
 	GUEST_PDPTR3_HIGH               = 0x00002811,
 	HOST_IA32_PAT			= 0x00002c00,
 	HOST_IA32_PAT_HIGH		= 0x00002c01,
+	HOST_IA32_EFER			= 0x00002c02,
+	HOST_IA32_EFER_HIGH		= 0x00002c03,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
@@ -394,6 +400,10 @@ enum vmcs_field {
 #define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
 #define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 
-
+struct vmx_msr_entry {
+	u32 index;
+	u32 reserved;
+	u64 value;
+} __aligned(16);
 
 #endif
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 488be461a38..60cc4058ed5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled);
 int acpi_noirq;				/* skip ACPI IRQ initialization */
 int acpi_pci_disabled;		/* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1;	/* enable HT */
 
 int acpi_lapic;
 int acpi_ioapic;
@@ -1501,9 +1500,8 @@ void __init acpi_boot_table_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return; 
 
 	/*
@@ -1534,9 +1532,8 @@ int __init early_acpi_boot_init(void)
 {
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	/*
@@ -1554,9 +1551,8 @@ int __init acpi_boot_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1591,21 +1587,12 @@ static int __init parse_acpi(char *arg)
 	/* acpi=force to over-ride black-list */
 	else if (strcmp(arg, "force") == 0) {
 		acpi_force = 1;
-		acpi_ht = 1;
 		acpi_disabled = 0;
 	}
 	/* acpi=strict disables out-of-spec workarounds */
 	else if (strcmp(arg, "strict") == 0) {
 		acpi_strict = 1;
 	}
-	/* Limit ACPI just to boot-time to enable HT */
-	else if (strcmp(arg, "ht") == 0) {
-		if (!acpi_force) {
-			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
-			disable_acpi();
-		}
-		acpi_ht = 1;
-	}
 	/* acpi=rsdt use RSDT instead of XSDT */
 	else if (strcmp(arg, "rsdt") == 0) {
 		acpi_rsdt_forced = 1;
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e55..82e508677b9 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str)
 #endif
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
-		if (strncmp(str, "sci_force_enable", 16) == 0)
-			acpi_set_sci_en_on_resume();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b059..13ab720573e 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
-	.section .text.page_aligned
+	.section .text..page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index fa5a1474cd1..0d20286d78c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1487,6 +1487,7 @@ static int __attach_device(struct device *dev,
 			   struct protection_domain *domain)
 {
 	struct iommu_dev_data *dev_data, *alias_data;
+	int ret;
 
 	dev_data   = get_dev_data(dev);
 	alias_data = get_dev_data(dev_data->alias);
@@ -1498,13 +1499,14 @@ static int __attach_device(struct device *dev,
 	spin_lock(&domain->lock);
 
 	/* Some sanity checks */
+	ret = -EBUSY;
 	if (alias_data->domain != NULL &&
 	    alias_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	if (dev_data->domain != NULL &&
 	    dev_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	/* Do real assignment */
 	if (dev_data->alias != dev) {
@@ -1520,10 +1522,14 @@ static int __attach_device(struct device *dev,
 
 	atomic_inc(&dev_data->bind);
 
+	ret = 0;
+
+out_unlock:
+
 	/* ready */
 	spin_unlock(&domain->lock);
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -2324,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void)
 
 	iommu_detected = 1;
 	swiotlb = 0;
-#ifdef CONFIG_GART_IOMMU
-	gart_iommu_aperture_disabled = 1;
-	gart_iommu_aperture = 0;
-#endif
 
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3bacb4d0844..3cc63e2b8dd 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -287,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address)
 {
 	u8 *ret;
 
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
+		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
+			address);
+		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
 		return NULL;
+	}
 
 	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
 	if (ret != NULL)
@@ -1314,7 +1318,7 @@ static int __init amd_iommu_init(void)
 		ret = amd_iommu_init_dma_ops();
 
 	if (ret)
-		goto free;
+		goto free_disable;
 
 	amd_iommu_init_api();
 
@@ -1332,9 +1336,10 @@ static int __init amd_iommu_init(void)
 out:
 	return ret;
 
-free:
+free_disable:
 	disable_iommus();
 
+free:
 	amd_iommu_uninit_devices();
 
 	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
@@ -1353,6 +1358,15 @@ free:
 
 	free_unity_maps();
 
+#ifdef CONFIG_GART_IOMMU
+	/*
+	 * We failed to initialize the AMD IOMMU - try fallback to GART
+	 * if possible.
+	 */
+	gart_iommu_init();
+
+#endif
+
 	goto out;
 }
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 192cd7ee35c..0bcc5aeda99 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
 #include <asm/smp.h>
 #include <asm/mce.h>
 #include <asm/kvm_para.h>
+#include <asm/tsc.h>
 
 unsigned int num_processors;
 
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
  */
 void __cpuinit setup_local_APIC(void)
 {
-	unsigned int value;
-	int i, j;
+	unsigned int value, queued;
+	int i, j, acked = 0;
+	unsigned long long tsc = 0, ntsc;
+	long long max_loops = cpu_khz;
+
+	if (cpu_has_tsc)
+		rdtscll(tsc);
 
 	if (disable_apic) {
 		arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
 	 * the interrupt. Hence a vector might get locked. It was noticed
 	 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
 	 */
-	for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-		value = apic_read(APIC_ISR + i*0x10);
-		for (j = 31; j >= 0; j--) {
-			if (value & (1<<j))
-				ack_APIC_irq();
+	do {
+		queued = 0;
+		for (i = APIC_ISR_NR - 1; i >= 0; i--)
+			queued |= apic_read(APIC_IRR + i*0x10);
+
+		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+			value = apic_read(APIC_ISR + i*0x10);
+			for (j = 31; j >= 0; j--) {
+				if (value & (1<<j)) {
+					ack_APIC_irq();
+					acked++;
+				}
+			}
 		}
-	}
+		if (acked > 256) {
+			printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+			       acked);
+			break;
+		}
+		if (cpu_has_tsc) {
+			rdtscll(ntsc);
+			max_loops = (cpu_khz << 10) - (ntsc - tsc);
+		} else
+			max_loops--;
+	} while (queued && max_loops > 0);
+	WARN_ON(max_loops <= 0);
 
 	/*
 	 * Now that we are all set up, enable the APIC
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc83a002786..68e4a6f2211 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,9 +1121,9 @@ void __cpuinit cpu_init(void)
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
-	if (cpu != 0 && percpu_read(node_number) == 0 &&
-	    cpu_to_node(cpu) != NUMA_NO_NODE)
-		percpu_write(node_number, cpu_to_node(cpu));
+	if (cpu != 0 && percpu_read(numa_node) == 0 &&
+	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
+		set_numa_node(early_cpu_to_node(cpu));
 #endif
 
 	me = current;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6f3dc8fbbfd..7ec2123838e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
  * simply keep the boost-disable flag in sync with the current global
  * state.
  */
-static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
-				void *hcpu)
+static int cpb_notify(struct notifier_block *nb, unsigned long action,
+		      void *hcpu)
 {
 	unsigned cpu = (long)hcpu;
 	u32 lo, hi;
@@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata cpb_nb = {
+static struct notifier_block cpb_nb = {
 	.notifier_call		= cpb_notify,
 };
 
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11..bb34b03af25 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
 obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
 
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI)		+= mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 00000000000..745b54f9be8
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+{
+	struct mce m;
+
+	/* Only corrected MC is reported */
+	if (!corrected)
+		return;
+
+	mce_setup(&m);
+	m.bank = 1;
+	/* Fake a memory read corrected error with unknown channel */
+	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+	m.addr = mem_err->physical_addr;
+	mce_log(&m);
+	mce_notify_irq();
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE						\
+	UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,	\
+		0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE						\
+	UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96,	\
+		0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+	struct cper_record_header hdr;
+	struct cper_section_descriptor sec_hdr;
+	struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+	struct cper_mce_record rcd;
+
+	memset(&rcd, 0, sizeof(rcd));
+	memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	rcd.hdr.revision = CPER_RECORD_REV;
+	rcd.hdr.signature_end = CPER_SIG_END;
+	rcd.hdr.section_count = 1;
+	rcd.hdr.error_severity = CPER_SER_FATAL;
+	/* timestamp, platform_id, partition_id are all invalid */
+	rcd.hdr.validation_bits = 0;
+	rcd.hdr.record_length = sizeof(rcd);
+	rcd.hdr.creator_id = CPER_CREATOR_MCE;
+	rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+	rcd.hdr.record_id = cper_next_record_id();
+	rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+	rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+	rcd.sec_hdr.section_length = sizeof(rcd.mce);
+	rcd.sec_hdr.revision = CPER_SEC_REV;
+	/* fru_id and fru_text is invalid */
+	rcd.sec_hdr.validation_bits = 0;
+	rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+	rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+	rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+
+	memcpy(&rcd.mce, m, sizeof(*m));
+
+	return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	struct cper_mce_record rcd;
+	ssize_t len;
+
+	len = erst_read_next(&rcd.hdr, sizeof(rcd));
+	if (len <= 0)
+		return len;
+	/* Can not skip other records in storage via ERST unless clear them */
+	else if (len != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
+		if (printk_ratelimit())
+			pr_warning(
+			"MCE-APEI: Can not skip the unknown record in ERST");
+		return -EIO;
+	}
+
+	memcpy(m, &rcd.mce, sizeof(*m));
+	*record_id = rcd.hdr.record_id;
+
+	return sizeof(*m);
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+	return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+	return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab6..fefcc69ee8b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
 
 extern struct mce_bank *mce_banks;
 
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+	return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	return 0;
+}
+static inline int apei_check_mce(void)
+{
+	return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+	return -EINVAL;
+}
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a355ddcc64..18cc4256225 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/debugfs.h>
+#include <linux/edac_mce.h>
 
 #include <asm/processor.h>
 #include <asm/hw_irq.h>
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce)
 		entry = rcu_dereference_check_mce(mcelog.next);
 		for (;;) {
 			/*
+			 * If edac_mce is enabled, it will check the error type
+			 * and will process it, if it is a known error.
+			 * Otherwise, the error will be sent through mcelog
+			 * interface
+			 */
+			if (edac_mce_parse(mce))
+				return;
+
+			/*
 			 * When the buffer fills up discard new entries.
 			 * Assume that the earlier errors are the more
 			 * interesting ones:
@@ -264,7 +274,7 @@ static void wait_for_panic(void)
 
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
-	int i;
+	int i, apei_err = 0;
 
 	if (!fake_panic) {
 		/*
@@ -287,8 +297,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		struct mce *m = &mcelog.entry[i];
 		if (!(m->status & MCI_STATUS_VAL))
 			continue;
-		if (!(m->status & MCI_STATUS_UC))
+		if (!(m->status & MCI_STATUS_UC)) {
 			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
 	}
 	/* Now print uncorrected but with the final one last */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,11 +310,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 			continue;
 		if (!(m->status & MCI_STATUS_UC))
 			continue;
-		if (!final || memcmp(m, final, sizeof(struct mce)))
+		if (!final || memcmp(m, final, sizeof(struct mce))) {
 			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
 	}
-	if (final)
+	if (final) {
 		print_mce(final);
+		if (!apei_err)
+			apei_err = apei_write_mce(final);
+	}
 	if (cpu_missing)
 		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	print_mce_tail();
@@ -1493,6 +1512,43 @@ static void collect_tscs(void *data)
 	rdtscll(cpu_tsc[smp_processor_id()]);
 }
 
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+	int rc;
+	u64 record_id;
+	struct mce m;
+
+	if (usize < sizeof(struct mce))
+		return -EINVAL;
+
+	rc = apei_read_mce(&m, &record_id);
+	/* Error or no more MCE record */
+	if (rc <= 0) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	rc = -EFAULT;
+	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+		return rc;
+	/*
+	 * In fact, we should have cleared the record after that has
+	 * been flushed to the disk or sent to network in
+	 * /sbin/mcelog, but we have no interface to support that now,
+	 * so just clear it to avoid duplication.
+	 */
+	rc = apei_clear_mce(record_id);
+	if (rc) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	*ubuf += sizeof(struct mce);
+
+	return 0;
+}
+
 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 			loff_t *off)
 {
@@ -1506,15 +1562,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 		return -ENOMEM;
 
 	mutex_lock(&mce_read_mutex);
+
+	if (!mce_apei_read_done) {
+		err = __mce_read_apei(&buf, usize);
+		if (err || buf != ubuf)
+			goto out;
+	}
+
 	next = rcu_dereference_check_mce(mcelog.next);
 
 	/* Only supports full reads right now */
-	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
-		mutex_unlock(&mce_read_mutex);
-		kfree(cpu_tsc);
-
-		return -EINVAL;
-	}
+	err = -EINVAL;
+	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+		goto out;
 
 	err = 0;
 	prev = 0;
@@ -1562,10 +1622,15 @@ timeout:
 			memset(&mcelog.entry[i], 0, sizeof(struct mce));
 		}
 	}
+
+	if (err)
+		err = -EFAULT;
+
+out:
 	mutex_unlock(&mce_read_mutex);
 	kfree(cpu_tsc);
 
-	return err ? -EFAULT : buf - ubuf;
+	return err ? err : buf - ubuf;
 }
 
 static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1638,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &mce_wait, wait);
 	if (rcu_dereference_check_mce(mcelog.next))
 		return POLLIN | POLLRDNORM;
+	if (!mce_apei_read_done && apei_check_mce())
+		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb2..e1a0a3bf971 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 		mutex_unlock(&therm_cpu_lock);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index fd4db0db370..5db5b7d65a1 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -106,6 +106,7 @@ struct cpu_hw_events {
 
 	int			n_events;
 	int			n_added;
+	int			n_txn;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
 out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
+	cpuc->n_txn += n - n0;
 
 	return 0;
 }
@@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int i;
 
+	/*
+	 * If we're called during a txn, we don't need to do anything.
+	 * The events never got scheduled and ->cancel_txn will truncate
+	 * the event_list.
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		return;
+
 	x86_pmu_stop(event);
 
 	for (i = 0; i < cpuc->n_events; i++) {
@@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->n_txn = 0;
 }
 
 /*
@@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	/*
+	 * Truncate the collected events.
+	 */
+	cpuc->n_added -= cpuc->n_txn;
+	cpuc->n_events -= cpuc->n_txn;
 }
 
 /*
@@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+	/*
+	 * Clear out the txn count so that ->cancel_txn() which gets
+	 * run after ->commit_txn() doesn't undo things.
+	 */
+	cpuc->n_txn = 0;
+
 	return 0;
 }
 
@@ -1717,7 +1739,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	 */
 	regs->bp = rewind_frame_pointer(skip + 1);
 	regs->cs = __KERNEL_CS;
-	local_save_flags(regs->flags);
+	/*
+	 * We abuse bit 3 to pass exact information, see perf_misc_flags
+	 * and the comment with PERF_EFLAGS_EXACT.
+	 */
+	regs->flags = 0;
 }
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 424fc8de68e..ae85d69644d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -465,15 +465,21 @@ out:
 	return rc;
 }
 
-static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 {
-	unsigned long dummy;
+	int overflow = 0;
+	u32 low, high;
 
-	rdmsrl(hwc->config_base + hwc->idx, dummy);
-	if (dummy & P4_CCCR_OVF) {
+	rdmsr(hwc->config_base + hwc->idx, low, high);
+
+	/* we need to check high bit for unflagged overflows */
+	if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
+		overflow = 1;
 		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-			((u64)dummy) & ~P4_CCCR_OVF);
+			((u64)low) & ~P4_CCCR_OVF);
 	}
+
+	return overflow;
 }
 
 static inline void p4_pmu_disable_event(struct perf_event *event)
@@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 		WARN_ON_ONCE(hwc->idx != idx);
 
-		/*
-		 * FIXME: Redundant call, actually not needed
-		 * but just to check if we're screwed
-		 */
-		p4_pmu_clear_cccr_ovf(hwc);
+		/* it might be unflagged overflow */
+		handled = p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+		if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
 			continue;
 
-		/*
-		 * event overflow
-		 */
-		handled		= 1;
-		data.period	= event->hw.last_period;
+		/* event overflow for sure */
+		data.period = event->hw.last_period;
 
 		if (!x86_perf_event_set_period(event))
 			continue;
@@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 
 /*
  * ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
  * the metric between any ESCRs is laid in range [0xa0,0xe1]
  *
  * so we make ~70% filled hashtable
@@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr)
 {
 	unsigned int idx = P4_ESCR_MSR_IDX(addr);
 
-	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
-			!p4_escr_table[idx])) {
+	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE	||
+			!p4_escr_table[idx]		||
+			p4_escr_table[idx] != addr)) {
 		WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
 		return -1;
 	}
@@ -762,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
-	int cpu = raw_smp_processor_id();
+	int cpu = smp_processor_id();
 	struct hw_perf_event *hwc;
 	struct p4_event_bind *bind;
 	unsigned int i, thread, num;
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900f..1b7b31ab7d8 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
 		cpuid_device_destroy(cpu);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 3a54dcb9cd0..43e9ccf4494 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task);
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data.cacheline_aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4..eb9b76c716c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
 #define KVM_SCALE 22
 
 static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 
 static int parse_no_kvmclock(char *arg)
 {
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
-	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+
+	native_write_msr(msr_kvm_wall_clock, low, high);
 
 	vcpu_time = &get_cpu_var(hv_clock);
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
-	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+
+	return native_write_msr_safe(msr_kvm_system_time, low, high);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 #ifdef CONFIG_KEXEC
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_crash_shutdown(regs);
 }
 #endif
 
 static void kvm_shutdown(void)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_shutdown();
 }
 
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
 	if (!kvm_para_available())
 		return;
 
-	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-		if (kvm_register_clock("boot clock"))
-			return;
-		pv_time_ops.sched_clock = kvm_clock_read;
-		x86_platform.calibrate_tsc = kvm_get_tsc_khz;
-		x86_platform.get_wallclock = kvm_get_wallclock;
-		x86_platform.set_wallclock = kvm_set_wallclock;
+	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+	} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+		return;
+
+	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+		msr_kvm_system_time, msr_kvm_wall_clock);
+
+	if (kvm_register_clock("boot clock"))
+		return;
+	pv_time_ops.sched_clock = kvm_clock_read;
+	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+	x86_platform.get_wallclock = kvm_get_wallclock;
+	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		x86_cpuinit.setup_percpu_clockev =
-			kvm_setup_secondary_clock;
+	x86_cpuinit.setup_percpu_clockev =
+		kvm_setup_secondary_clock;
 #endif
 #ifdef CONFIG_SMP
-		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
-		machine_ops.shutdown  = kvm_shutdown;
+	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
-		machine_ops.crash_shutdown  = kvm_crash_shutdown;
+	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
-		kvm_get_preset_lpj();
-		clocksource_register(&kvm_clock);
-		pv_info.paravirt_enabled = 1;
-		pv_info.name = "KVM";
-	}
+	kvm_get_preset_lpj();
+	clocksource_register(&kvm_clock);
+	pv_info.paravirt_enabled = 1;
+	pv_info.name = "KVM";
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 2cd8c544e41..fa6551d36c1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
 }
 
 MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
 #else
 #define microcode_dev_init()	0
 #define microcode_dev_exit()	do { } while (0)
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47..7bf2dc4c8f7 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
 		msr_device_destroy(cpu);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __refdata msr_class_cpu_notifier = {
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20..a5bc528d432 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
 	.free_coherent = swiotlb_free_coherent,
 	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
 	.sync_single_for_device = swiotlb_sync_single_for_device,
-	.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
-	.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
 	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
 	.sync_sg_for_device = swiotlb_sync_sg_for_device,
 	.map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761..239427ca02a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
 	u32 tsc_to_nsec_mul;
 	int tsc_shift;
 	u32 version;
+	u8  flags;
 };
 
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+	valid_flags = flags;
+}
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 		dst->system_timestamp  = src->system_time;
 		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
 		dst->tsc_shift         = src->tsc_shift;
+		dst->flags             = src->flags;
 		rmb();		/* test version after fetching data */
 	} while ((src->version & 1) || (dst->version != src->version));
 
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 	return pv_tsc_khz;
 }
 
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
 	unsigned version;
 	cycle_t ret, offset;
+	u64 last;
 
 	do {
 		version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		return ret;
+
+	/*
+	 * Assumption here is that last_value, a global accumulator, always goes
+	 * forward. If we are less than that, we should not be much smaller.
+	 * We assume there is an error marging we're inside, and then the correction
+	 * does not sacrifice accuracy.
+	 *
+	 * For reads: global may have changed between test and return,
+	 * but this means someone else updated poked the clock at a later time.
+	 * We just need to make sure we are not seeing a backwards event.
+	 *
+	 * For updates: last_value = ret is not enough, since two vcpus could be
+	 * updating at the same time, and one of them could be slightly behind,
+	 * making the assumption that last_value always go forward fail to hold.
+	 */
+	last = atomic64_read(&last_value);
+	do {
+		if (ret < last)
+			return last;
+		last = atomic64_cmpxchg(&last_value, last, ret);
+	} while (unlikely(last != ret));
+
 	return ret;
 }
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e8029896309..b4ae4acbd03 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
 		},
 	},
+	/*
+	 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
+	 * match on the product name.
+	 */
+	{
+		.callback = dmi_low_memory_corruption,
+		.ident = "Phoenix BIOS",
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
+		},
+	},
 #endif
 	{}
 };
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e7..de3b63ae3da 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,12 +21,6 @@
 #include <asm/cpu.h>
 #include <asm/stackprotector.h>
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
-#else
-# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
-#endif
-
 DEFINE_PER_CPU(int, cpu_number);
 EXPORT_PER_CPU_SYMBOL(cpu_number);
 
@@ -247,7 +241,7 @@ void __init setup_per_cpu_areas(void)
 #endif
 #endif
 		/*
-		 * Up to this point, the boot CPU has been using .data.init
+		 * Up to this point, the boot CPU has been using .init.data
 		 * area.  Reload any changed state for the boot CPU.
 		 */
 		if (cpu == boot_cpu_id)
@@ -265,10 +259,10 @@ void __init setup_per_cpu_areas(void)
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
 	/*
-	 * make sure boot cpu node_number is right, when boot cpu is on the
+	 * make sure boot cpu numa_node is right, when boot cpu is on the
 	 * node that doesn't have mem installed
 	 */
-	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+	set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
 #endif
 
 	/* Setup node to cpumask map */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a..c4f33b2e77d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 static void __cpuinit announce_cpu(int cpu, int apicid)
 {
 	static int current_node = -1;
-	int node = cpu_to_node(cpu);
+	int node = early_cpu_to_node(cpu);
 
 	if (system_state == SYSTEM_BOOTING) {
 		if (node != current_node) {
@@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void)
 	if (!num_processors)
 		num_processors = 1;
 
-	if (setup_possible_cpus == -1)
-		possible = num_processors + disabled_cpus;
-	else
+	i = setup_max_cpus ?: 1;
+	if (setup_possible_cpus == -1) {
+		possible = num_processors;
+#ifdef CONFIG_HOTPLUG_CPU
+		if (setup_max_cpus)
+			possible += disabled_cpus;
+#else
+		if (possible > i)
+			possible = i;
+#endif
+	} else
 		possible = setup_possible_cpus;
 
 	total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void)
 		possible = nr_cpu_ids;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	if (!setup_max_cpus)
+#endif
+	if (possible > i) {
+		printk(KERN_WARNING
+			"%d Processors exceeds max_cpus limit of %u\n",
+			possible, setup_max_cpus);
+		possible = i;
+	}
+
 	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
 		possible, max_t(int, possible - num_processors, 0));
 
 	for (i = 0; i < possible; i++)
 		set_cpu_possible(i, true);
+	for (; i < NR_CPUS; i++)
+		set_cpu_possible(i, false);
 
 	nr_cpu_ids = possible;
 }
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index cc2c60474fd..c2f1b26141e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
 
 /* Global pointer to shared data; NULL means no measured launch. */
 struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
 
 /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
 #define AP_WAIT_TIMEOUT		1
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2cc249718c4..d0bb52296fa 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,7 +97,7 @@ SECTIONS
 		HEAD_TEXT
 #ifdef CONFIG_X86_32
 		. = ALIGN(PAGE_SIZE);
-		*(.text.page_aligned)
+		*(.text..page_aligned)
 #endif
 		. = ALIGN(8);
 		_stext = .;
@@ -305,7 +305,7 @@ SECTIONS
 	. = ALIGN(PAGE_SIZE);
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		__bss_start = .;
-		*(.bss.page_aligned)
+		*(.bss..page_aligned)
 		*(.bss)
 		. = ALIGN(4);
 		__bss_stop = .;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac082..5ac0bb465ed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
 #include <asm/kvm_emulate.h>
 
 #include "x86.h"
+#include "tss.h"
 
 /*
  * Opcode effective-address decode tables.
@@ -50,6 +51,8 @@
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
+#define DstMem64    (6<<1)	/* 64bit memory operand */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
@@ -63,6 +66,7 @@
 #define SrcOne      (7<<4)	/* Implied '1' */
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
+#define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -85,6 +89,9 @@
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
 #define Src2Imm16   (4<<29)
+#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be
+			       in memory and second argument is located
+			       immediately after the first one in memory. */
 #define Src2Mask    (7<<29)
 
 enum {
@@ -147,8 +154,8 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0,
 	/* 0x68 - 0x6F */
 	SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+	DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
+	SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
 	/* 0x70 - 0x77 */
 	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -173,12 +180,12 @@ static u32 opcode_table[256] = {
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
 	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
-	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
+	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	/* 0xA8 - 0xAF */
-	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
+	ByteOp | DstDI | String, DstDI | String,
 	/* 0xB0 - 0xB7 */
 	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
 	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -204,13 +211,13 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0,
-	ByteOp | SrcImmUByte, SrcImmUByte,
-	ByteOp | SrcImmUByte, SrcImmUByte,
+	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
+	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
+	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
 	ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -343,7 +350,8 @@ static u32 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+	SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+	SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
@@ -353,14 +361,14 @@ static u32 group_table[] = {
 	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
 	DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
 	[Group9*8] =
-	0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+	0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
 };
 
 static u32 group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov, 0,
+	SrcMem16 | ModRM | Mov | Priv, 0,
 	[Group9*8] =
 	0, 0, 0, 0, 0, 0, 0, 0,
 };
@@ -562,7 +570,7 @@ static u32 group2_table[] = {
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
 	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
-	if (rc != 0)							\
+	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	(_eip) += (_size);						\
 	(_type)_x;							\
@@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
-			      unsigned long linear, u8 *dest)
+			      unsigned long eip, u8 *dest)
 {
 	struct fetch_cache *fc = &ctxt->decode.fetch;
 	int rc;
-	int size;
+	int size, cur_size;
 
-	if (linear < fc->start || linear >= fc->end) {
-		size = min(15UL, PAGE_SIZE - offset_in_page(linear));
-		rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
-		if (rc)
+	if (eip == fc->end) {
+		cur_size = fc->end - fc->start;
+		size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+		rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
+				size, ctxt->vcpu, NULL);
+		if (rc != X86EMUL_CONTINUE)
 			return rc;
-		fc->start = linear;
-		fc->end = linear + size;
+		fc->end += size;
 	}
-	*dest = fc->data[linear - fc->start];
-	return 0;
+	*dest = fc->data[eip - fc->start];
+	return X86EMUL_CONTINUE;
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
 			 unsigned long eip, void *dest, unsigned size)
 {
-	int rc = 0;
+	int rc;
 
 	/* x86 instructions are limited to 15 bytes. */
-	if (eip + size - ctxt->decode.eip_orig > 15)
+	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
-	eip += ctxt->cs_base;
 	while (size--) {
 		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 /*
@@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	*address = 0;
 	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
 			   ctxt->vcpu, NULL);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
 			   ctxt->vcpu, NULL);
@@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	if (c->rex_prefix) {
 		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
@@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
 		      struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	switch (c->ad_bytes) {
 	case 2:
@@ -916,14 +924,18 @@ int
 x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, group;
 
-	/* Shadow copy of register state. Committed on successful emulation. */
 
+	/* we cannot decode insn before we complete previous rep insn */
+	WARN_ON(ctxt->restart);
+
+	/* Shadow copy of register state. Committed on successful emulation. */
 	memset(c, 0, sizeof(struct decode_cache));
-	c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+	c->eip = ctxt->eip;
+	c->fetch.start = c->fetch.end = c->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1015,11 +1027,6 @@ done_prefixes:
 		}
 	}
 
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-		kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
-		return -1;
-	}
-
 	if (c->d & Group) {
 		group = c->d & GroupMask;
 		c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1046,7 +1053,7 @@ done_prefixes:
 		rc = decode_modrm(ctxt, ops);
 	else if (c->d & MemAbs)
 		rc = decode_abs(ctxt, ops);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
 	if (!c->has_seg_override)
@@ -1057,6 +1064,10 @@ done_prefixes:
 
 	if (c->ad_bytes != 8)
 		c->modrm_ea = (u32)c->modrm_ea;
+
+	if (c->rip_relative)
+		c->modrm_ea += c->eip;
+
 	/*
 	 * Decode and fetch the source operand: register, memory
 	 * or immediate.
@@ -1091,6 +1102,8 @@ done_prefixes:
 			break;
 		}
 		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.val = 0;
 		break;
 	case SrcImm:
 	case SrcImmU:
@@ -1139,6 +1152,14 @@ done_prefixes:
 		c->src.bytes = 1;
 		c->src.val = 1;
 		break;
+	case SrcSI:
+		c->src.type = OP_MEM;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = (unsigned long *)
+			register_address(c,  seg_override_base(ctxt, c),
+					 c->regs[VCPU_REGS_RSI]);
+		c->src.val = 0;
+		break;
 	}
 
 	/*
@@ -1168,6 +1189,12 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
 		break;
+	case Src2Mem16:
+		c->src2.type = OP_MEM;
+		c->src2.bytes = 2;
+		c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
+		c->src2.val = 0;
+		break;
 	}
 
 	/* Decode and fetch the destination operand: register or memory. */
@@ -1180,6 +1207,7 @@ done_prefixes:
 			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
 		break;
 	case DstMem:
+	case DstMem64:
 		if ((c->d & ModRM) && c->modrm_mod == 3) {
 			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 			c->dst.type = OP_REG;
@@ -1188,12 +1216,24 @@ done_prefixes:
 			break;
 		}
 		c->dst.type = OP_MEM;
+		c->dst.ptr = (unsigned long *)c->modrm_ea;
+		if ((c->d & DstMask) == DstMem64)
+			c->dst.bytes = 8;
+		else
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.val = 0;
+		if (c->d & BitOp) {
+			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+			c->dst.ptr = (void *)c->dst.ptr +
+						   (c->src.val & mask) / 8;
+		}
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
-		c->dst.bytes = c->op_bytes;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->op_bytes) {
+		switch (c->dst.bytes) {
 			case 1:
 				c->dst.val = *(u8 *)c->dst.ptr;
 				break;
@@ -1203,18 +1243,248 @@ done_prefixes:
 			case 4:
 				c->dst.val = *(u32 *)c->dst.ptr;
 				break;
+			case 8:
+				c->dst.val = *(u64 *)c->dst.ptr;
+				break;
 		}
 		c->dst.orig_val = c->dst.val;
 		break;
+	case DstDI:
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)
+			register_address(c, es_base(ctxt),
+					 c->regs[VCPU_REGS_RDI]);
+		c->dst.val = 0;
+		break;
 	}
 
-	if (c->rip_relative)
-		c->modrm_ea += c->eip;
-
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops,
+			   unsigned int size, unsigned short port,
+			   void *dest)
+{
+	struct read_cache *rc = &ctxt->decode.io_read;
+
+	if (rc->pos == rc->end) { /* refill pio read ahead */
+		struct decode_cache *c = &ctxt->decode;
+		unsigned int in_page, n;
+		unsigned int count = c->rep_prefix ?
+			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		in_page = (ctxt->eflags & EFLG_DF) ?
+			offset_in_page(c->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+			count);
+		if (n == 0)
+			n = 1;
+		rc->pos = rc->end = 0;
+		if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+			return 0;
+		rc->end = n * size;
+	}
+
+	memcpy(dest, rc->data + rc->pos, size);
+	rc->pos += size;
+	return 1;
+}
+
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+	u32 limit = get_desc_limit(desc);
+
+	return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+				     struct x86_emulate_ops *ops,
+				     u16 selector, struct desc_ptr *dt)
+{
+	if (selector & 1 << 2) {
+		struct desc_struct desc;
+		memset (dt, 0, sizeof *dt);
+		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+			return;
+
+		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+		dt->address = get_desc_base(&desc);
+	} else
+		ops->get_gdt(dt, ctxt->vcpu);
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	int ret;
+	u32 err;
+	ulong addr;
+
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+	if (dt.size < index * 8 + 7) {
+		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+	addr = dt.address + index * 8;
+	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+       return ret;
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops *ops,
+				    u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	u32 err;
+	ulong addr;
+	int ret;
+
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+	if (dt.size < index * 8 + 7) {
+		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	addr = dt.address + index * 8;
+	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+	return ret;
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, int seg)
+{
+	struct desc_struct seg_desc;
+	u8 dpl, rpl, cpl;
+	unsigned err_vec = GP_VECTOR;
+	u32 err_code = 0;
+	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+	int ret;
+
+	memset(&seg_desc, 0, sizeof seg_desc);
+
+	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+	    || ctxt->mode == X86EMUL_MODE_REAL) {
+		/* set real mode segment descriptor */
+		set_desc_base(&seg_desc, selector << 4);
+		set_desc_limit(&seg_desc, 0xffff);
+		seg_desc.type = 3;
+		seg_desc.p = 1;
+		seg_desc.s = 1;
+		goto load;
+	}
+
+	/* NULL selector is not valid for TR, CS and SS */
+	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+	    && null_selector)
+		goto exception;
+
+	/* TR should be in GDT only */
+	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+		goto exception;
+
+	if (null_selector) /* for NULL selector skip all following checks */
+		goto load;
+
+	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	err_code = selector & 0xfffc;
+	err_vec = GP_VECTOR;
+
+	/* can't load system descriptor into segment selecor */
+	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+		goto exception;
+
+	if (!seg_desc.p) {
+		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+		goto exception;
+	}
+
+	rpl = selector & 3;
+	dpl = seg_desc.dpl;
+	cpl = ops->cpl(ctxt->vcpu);
+
+	switch (seg) {
+	case VCPU_SREG_SS:
+		/*
+		 * segment is not a writable data segment or segment
+		 * selector's RPL != CPL or segment selector's RPL != CPL
+		 */
+		if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+			goto exception;
+		break;
+	case VCPU_SREG_CS:
+		if (!(seg_desc.type & 8))
+			goto exception;
+
+		if (seg_desc.type & 4) {
+			/* conforming */
+			if (dpl > cpl)
+				goto exception;
+		} else {
+			/* nonconforming */
+			if (rpl > cpl || dpl != cpl)
+				goto exception;
+		}
+		/* CS(RPL) <- CPL */
+		selector = (selector & 0xfffc) | cpl;
+		break;
+	case VCPU_SREG_TR:
+		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+			goto exception;
+		break;
+	case VCPU_SREG_LDTR:
+		if (seg_desc.s || seg_desc.type != 2)
+			goto exception;
+		break;
+	default: /*  DS, ES, FS, or GS */
+		/*
+		 * segment is not a data or readable code segment or
+		 * ((segment is a data or nonconforming code segment)
+		 * and (both RPL and CPL > DPL))
+		 */
+		if ((seg_desc.type & 0xa) == 0x8 ||
+		    (((seg_desc.type & 0xc) != 0xc) &&
+		     (rpl > dpl && cpl > dpl)))
+			goto exception;
+		break;
+	}
+
+	if (seg_desc.s) {
+		/* mark segment as accessed */
+		seg_desc.type |= 1;
+		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		if (ret != X86EMUL_CONTINUE)
+			return ret;
+	}
+load:
+	ops->set_segment_selector(selector, seg, ctxt->vcpu);
+	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+	return X86EMUL_CONTINUE;
+exception:
+	kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
 static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	unsigned long val, change_mask;
 	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+	int cpl = ops->cpl(ctxt->vcpu);
 
 	rc = emulate_pop(ctxt, ops, &val, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	int rc;
 
 	rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
-	if (rc != 0)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
+	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
 	return rc;
 }
 
@@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
 
 	while (reg >= VCPU_REGS_RAX) {
@@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 		}
 
 		rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
 	}
@@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc;
 
-	rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
-	if (rc != 0)
-		return rc;
-	return 0;
+	return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
 }
 
 static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 		emulate_1op("neg", c->dst, ctxt->eflags);
 		break;
 	default:
-		DPRINTF("Cannot emulate %02x\n", c->b);
-		rc = X86EMUL_UNHANDLEABLE;
-		break;
+		return 0;
 	}
-	return rc;
+	return 1;
 }
 
 static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 		emulate_push(ctxt);
 		break;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
-			       struct x86_emulate_ops *ops,
-			       unsigned long memop)
+			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	u64 old, new;
-	int rc;
-
-	rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
+	u64 old = c->dst.orig_val;
 
 	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
 	    ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
 		c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
 		ctxt->eflags &= ~EFLG_ZF;
-
 	} else {
-		new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+		c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
 		       (u32) c->regs[VCPU_REGS_RBX];
 
-		rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
 		ctxt->eflags |= EFLG_ZF;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	unsigned long cs;
 
 	rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	if (c->op_bytes == 4)
 		c->eip = (u32)c->eip;
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
 	return rc;
 }
 
@@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	default:
 		break;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 	u64 msr_data;
 
 	/* syscall is not available in real mode */
-	if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
-		return X86EMUL_UNHANDLEABLE;
+	if (ctxt->mode == X86EMUL_MODE_REAL ||
+	    ctxt->mode == X86EMUL_MODE_VM86) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
@@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
 		kvm_inject_gp(ctxt->vcpu, 0);
-		return X86EMUL_UNHANDLEABLE;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	* Therefore, we inject an #UD.
 	*/
-	if (ctxt->mode == X86EMUL_MODE_PROT64)
-		return X86EMUL_UNHANDLEABLE;
+	if (ctxt->mode == X86EMUL_MODE_PROT64) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
@@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
 		kvm_inject_gp(ctxt->vcpu, 0);
-		return X86EMUL_UNHANDLEABLE;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops)
 {
 	int iopl;
 	if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_VM86)
 		return true;
 	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+	return ops->cpl(ctxt->vcpu) > iopl;
 }
 
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
-	if (emulator_bad_iopl(ctxt))
+	if (emulator_bad_iopl(ctxt, ops))
 		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
 			return false;
 	return true;
 }
 
+static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
+				      struct x86_emulate_ops *ops,
+				      int seg)
+{
+	struct desc_struct desc;
+	if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
+		return get_desc_base(&desc);
+	else
+		return ~0;
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops,
+				struct tss_segment_16 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	tss->ip = c->eip;
+	tss->flag = ctxt->eflags;
+	tss->ax = c->regs[VCPU_REGS_RAX];
+	tss->cx = c->regs[VCPU_REGS_RCX];
+	tss->dx = c->regs[VCPU_REGS_RDX];
+	tss->bx = c->regs[VCPU_REGS_RBX];
+	tss->sp = c->regs[VCPU_REGS_RSP];
+	tss->bp = c->regs[VCPU_REGS_RBP];
+	tss->si = c->regs[VCPU_REGS_RSI];
+	tss->di = c->regs[VCPU_REGS_RDI];
+
+	tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+	tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+	tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+	tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+				 struct x86_emulate_ops *ops,
+				 struct tss_segment_16 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int ret;
+
+	c->eip = tss->ip;
+	ctxt->eflags = tss->flag | 2;
+	c->regs[VCPU_REGS_RAX] = tss->ax;
+	c->regs[VCPU_REGS_RCX] = tss->cx;
+	c->regs[VCPU_REGS_RDX] = tss->dx;
+	c->regs[VCPU_REGS_RBX] = tss->bx;
+	c->regs[VCPU_REGS_RSP] = tss->sp;
+	c->regs[VCPU_REGS_RBP] = tss->bp;
+	c->regs[VCPU_REGS_RSI] = tss->si;
+	c->regs[VCPU_REGS_RDI] = tss->di;
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
+	ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+	ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+
+	/*
+	 * Now load segment descriptors. If fault happenes at this stage
+	 * it is handled in a context of new task
+	 */
+	ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops,
+			  u16 tss_selector, u16 old_tss_sel,
+			  ulong old_tss_base, struct desc_struct *new_desc)
+{
+	struct tss_segment_16 tss_seg;
+	int ret;
+	u32 err, new_tss_base = get_desc_base(new_desc);
+
+	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	save_state_to_tss16(ctxt, ops, &tss_seg);
+
+	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			     &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		return ret;
+	}
+
+	if (old_tss_sel != 0xffff) {
+		tss_seg.prev_task_link = old_tss_sel;
+
+		ret = ops->write_std(new_tss_base,
+				     &tss_seg.prev_task_link,
+				     sizeof tss_seg.prev_task_link,
+				     ctxt->vcpu, &err);
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			/* FIXME: need to provide precise fault address */
+			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			return ret;
+		}
+	}
+
+	return load_state_from_tss16(ctxt, ops, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops,
+				struct tss_segment_32 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+	tss->eip = c->eip;
+	tss->eflags = ctxt->eflags;
+	tss->eax = c->regs[VCPU_REGS_RAX];
+	tss->ecx = c->regs[VCPU_REGS_RCX];
+	tss->edx = c->regs[VCPU_REGS_RDX];
+	tss->ebx = c->regs[VCPU_REGS_RBX];
+	tss->esp = c->regs[VCPU_REGS_RSP];
+	tss->ebp = c->regs[VCPU_REGS_RBP];
+	tss->esi = c->regs[VCPU_REGS_RSI];
+	tss->edi = c->regs[VCPU_REGS_RDI];
+
+	tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+	tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+	tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+	tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
+	tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
+	tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+				 struct x86_emulate_ops *ops,
+				 struct tss_segment_32 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int ret;
+
+	ops->set_cr(3, tss->cr3, ctxt->vcpu);
+	c->eip = tss->eip;
+	ctxt->eflags = tss->eflags | 2;
+	c->regs[VCPU_REGS_RAX] = tss->eax;
+	c->regs[VCPU_REGS_RCX] = tss->ecx;
+	c->regs[VCPU_REGS_RDX] = tss->edx;
+	c->regs[VCPU_REGS_RBX] = tss->ebx;
+	c->regs[VCPU_REGS_RSP] = tss->esp;
+	c->regs[VCPU_REGS_RBP] = tss->ebp;
+	c->regs[VCPU_REGS_RSI] = tss->esi;
+	c->regs[VCPU_REGS_RDI] = tss->edi;
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
+	ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+	ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+	ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
+	ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+
+	/*
+	 * Now load segment descriptors. If fault happenes at this stage
+	 * it is handled in a context of new task
+	 */
+	ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops,
+			  u16 tss_selector, u16 old_tss_sel,
+			  ulong old_tss_base, struct desc_struct *new_desc)
+{
+	struct tss_segment_32 tss_seg;
+	int ret;
+	u32 err, new_tss_base = get_desc_base(new_desc);
+
+	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	save_state_to_tss32(ctxt, ops, &tss_seg);
+
+	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			     &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		return ret;
+	}
+
+	if (old_tss_sel != 0xffff) {
+		tss_seg.prev_task_link = old_tss_sel;
+
+		ret = ops->write_std(new_tss_base,
+				     &tss_seg.prev_task_link,
+				     sizeof tss_seg.prev_task_link,
+				     ctxt->vcpu, &err);
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			/* FIXME: need to provide precise fault address */
+			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			return ret;
+		}
+	}
+
+	return load_state_from_tss32(ctxt, ops, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 tss_selector, int reason,
+				   bool has_error_code, u32 error_code)
+{
+	struct desc_struct curr_tss_desc, next_tss_desc;
+	int ret;
+	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+	ulong old_tss_base =
+		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+	u32 desc_limit;
+
+	/* FIXME: old_tss_base == ~0 ? */
+
+	ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	/* FIXME: check that next_tss_desc is tss */
+
+	if (reason != TASK_SWITCH_IRET) {
+		if ((tss_selector & 3) > next_tss_desc.dpl ||
+		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			return X86EMUL_PROPAGATE_FAULT;
+		}
+	}
+
+	desc_limit = desc_limit_scaled(&next_tss_desc);
+	if (!next_tss_desc.p ||
+	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+	     desc_limit < 0x2b)) {
+		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
+				      tss_selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+		write_segment_descriptor(ctxt, ops, old_tss_sel,
+					 &curr_tss_desc);
+	}
+
+	if (reason == TASK_SWITCH_IRET)
+		ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
+	if (next_tss_desc.type & 8)
+		ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+				     old_tss_base, &next_tss_desc);
+	else
+		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+				     old_tss_base, &next_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+		ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+	if (reason != TASK_SWITCH_IRET) {
+		next_tss_desc.type |= (1 << 1); /* set busy flag */
+		write_segment_descriptor(ctxt, ops, tss_selector,
+					 &next_tss_desc);
+	}
+
+	ops->set_cr(0,  ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
+	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+
+	if (has_error_code) {
+		struct decode_cache *c = &ctxt->decode;
+
+		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+		c->lock_prefix = 0;
+		c->src.val = (unsigned long) error_code;
+		emulate_push(ctxt);
+	}
+
+	return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	memset(c, 0, sizeof(struct decode_cache));
+	c->eip = ctxt->eip;
+	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+	c->dst.type = OP_NONE;
+
+	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+				     has_error_code, error_code);
+
+	if (rc == X86EMUL_CONTINUE) {
+		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+		kvm_rip_write(ctxt->vcpu, c->eip);
+		rc = writeback(ctxt, ops);
+	}
+
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
+}
+
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+			    int reg, struct operand *op)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+
+	register_address_increment(c, &c->regs[reg], df * op->bytes);
+	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
-	unsigned long memop = 0;
 	u64 msr_data;
-	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
-	unsigned int port;
-	int io_dir_in;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
+	int saved_dst_type = c->dst.type;
 
 	ctxt->interruptibility = 0;
 
@@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	 */
 
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-	saved_eip = c->eip;
+
+	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		goto done;
+	}
 
 	/* LOCK prefix is allowed only with some instructions */
-	if (c->lock_prefix && !(c->d & Lock)) {
+	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 		goto done;
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
-	if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 		kvm_inject_gp(ctxt->vcpu, 0);
 		goto done;
 	}
 
-	if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
-		memop = c->modrm_ea;
-
 	if (c->rep_prefix && (c->d & String)) {
+		ctxt->restart = true;
 		/* All REP prefixes have the same first termination condition */
-		if (c->regs[VCPU_REGS_RCX] == 0) {
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
+		string_done:
+			ctxt->restart = false;
 			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
@@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		 * 	- if REPNE/REPNZ and ZF = 1 then done
 		 */
 		if ((c->b == 0xa6) || (c->b == 0xa7) ||
-				(c->b == 0xae) || (c->b == 0xaf)) {
+		    (c->b == 0xae) || (c->b == 0xaf)) {
 			if ((c->rep_prefix == REPE_PREFIX) &&
-				((ctxt->eflags & EFLG_ZF) == 0)) {
-					kvm_rip_write(ctxt->vcpu, c->eip);
-					goto done;
-			}
+			    ((ctxt->eflags & EFLG_ZF) == 0))
+				goto string_done;
 			if ((c->rep_prefix == REPNE_PREFIX) &&
-				((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-				kvm_rip_write(ctxt->vcpu, c->eip);
-				goto done;
-			}
+			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
+				goto string_done;
 		}
-		c->regs[VCPU_REGS_RCX]--;
-		c->eip = kvm_rip_read(ctxt->vcpu);
+		c->eip = ctxt->eip;
 	}
 
 	if (c->src.type == OP_MEM) {
-		c->src.ptr = (unsigned long *)memop;
-		c->src.val = 0;
 		rc = ops->read_emulated((unsigned long)c->src.ptr,
 					&c->src.val,
 					c->src.bytes,
@@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		c->src.orig_val = c->src.val;
 	}
 
+	if (c->src2.type == OP_MEM) {
+		rc = ops->read_emulated((unsigned long)c->src2.ptr,
+					&c->src2.val,
+					c->src2.bytes,
+					ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
+
 	if ((c->d & DstMask) == ImplicitOps)
 		goto special_insn;
 
 
-	if (c->dst.type == OP_MEM) {
-		c->dst.ptr = (unsigned long *)memop;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.val = 0;
-		if (c->d & BitOp) {
-			unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
-			c->dst.ptr = (void *)c->dst.ptr +
-						   (c->src.val & mask) / 8;
-		}
-		if (!(c->d & Mov)) {
-			/* optimisation - avoid slow emulated read */
-			rc = ops->read_emulated((unsigned long)c->dst.ptr,
-						&c->dst.val,
-						c->dst.bytes,
-						ctxt->vcpu);
-			if (rc != X86EMUL_CONTINUE)
-				goto done;
-		}
+	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+		/* optimisation - avoid slow emulated read if Mov */
+		rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
+					c->dst.bytes, ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
 	}
 	c->dst.orig_val = c->dst.val;
 
@@ -1926,7 +2575,7 @@ special_insn:
 		break;
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x08 ... 0x0d:
@@ -1945,7 +2594,7 @@ special_insn:
 		break;
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x18 ... 0x1d:
@@ -1957,7 +2606,7 @@ special_insn:
 		break;
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x20 ... 0x25:
@@ -1988,7 +2637,7 @@ special_insn:
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x60:	/* pusha */
@@ -1996,7 +2645,7 @@ special_insn:
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x63:		/* movsxd */
@@ -2010,47 +2659,29 @@ special_insn:
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
+		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+					  c->dst.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio_string(ctxt->vcpu,
-				1,
-				(c->d & ByteOp) ? 1 : c->op_bytes,
-				c->rep_prefix ?
-				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
-				(ctxt->eflags & EFLG_DF),
-				register_address(c, es_base(ctxt),
-						 c->regs[VCPU_REGS_RDI]),
-				c->rep_prefix,
-				c->regs[VCPU_REGS_RDX]) == 0) {
-			c->eip = saved_eip;
-			return -1;
-		}
-		return 0;
+		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
+				     c->regs[VCPU_REGS_RDX], &c->dst.val))
+			goto done; /* IO is needed, skip writeback */
+		break;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
+		c->src.bytes = min(c->src.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+					  c->src.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio_string(ctxt->vcpu,
-				0,
-				(c->d & ByteOp) ? 1 : c->op_bytes,
-				c->rep_prefix ?
-				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
-				(ctxt->eflags & EFLG_DF),
-					 register_address(c,
-					  seg_override_base(ctxt, c),
-						 c->regs[VCPU_REGS_RSI]),
-				c->rep_prefix,
-				c->regs[VCPU_REGS_RDX]) == 0) {
-			c->eip = saved_eip;
-			return -1;
-		}
-		return 0;
+		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
+				      &c->src.val, 1, ctxt->vcpu);
+
+		c->dst.type = OP_NONE; /* nothing to writeback */
+		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
@@ -2107,12 +2738,11 @@ special_insn:
 	case 0x8c: { /* mov r/m, sreg */
 		struct kvm_segment segreg;
 
-		if (c->modrm_reg <= 5)
+		if (c->modrm_reg <= VCPU_SREG_GS)
 			kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
 		else {
-			printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
-			       c->modrm);
-			goto cannot_emulate;
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
 		}
 		c->dst.val = segreg.selector;
 		break;
@@ -2132,16 +2762,16 @@ special_insn:
 		}
 
 		if (c->modrm_reg == VCPU_SREG_SS)
-			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
 
-		rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
+		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 
 		c->dst.type = OP_NONE;  /* Disable writeback. */
 		break;
 	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x90: /* nop / xchg r8,rax */
@@ -2175,89 +2805,16 @@ special_insn:
 		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
 		break;
 	case 0xa4 ... 0xa5:	/* movs */
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
-		rc = ops->read_emulated(register_address(c,
-						seg_override_base(ctxt, c),
-						c->regs[VCPU_REGS_RSI]),
-					&c->dst.val,
-					c->dst.bytes, ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		break;
+		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
-		c->src.type = OP_NONE; /* Disable writeback. */
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)register_address(c,
-				       seg_override_base(ctxt, c),
-						   c->regs[VCPU_REGS_RSI]);
-		rc = ops->read_emulated((unsigned long)c->src.ptr,
-					&c->src.val,
-					c->src.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
-		rc = ops->read_emulated((unsigned long)c->dst.ptr,
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
-		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->src.bytes
-								  : c->src.bytes);
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-								  : c->dst.bytes);
-
-		break;
+		goto cmp;
 	case 0xaa ... 0xab:	/* stos */
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
 		c->dst.val = c->regs[VCPU_REGS_RAX];
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
 		break;
 	case 0xac ... 0xad:	/* lods */
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-		rc = ops->read_emulated(register_address(c,
-						seg_override_base(ctxt, c),
-						c->regs[VCPU_REGS_RSI]),
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		break;
+		goto mov;
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
@@ -2277,7 +2834,7 @@ special_insn:
 		break;
 	case 0xcb:		/* ret far */
 		rc = emulate_ret_far(ctxt, ops);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
@@ -2290,14 +2847,10 @@ special_insn:
 		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
-		port = c->src.val;
-		io_dir_in = 1;
-		goto do_io;
+		goto do_io_in;
 	case 0xe6: /* outb */
 	case 0xe7: /* out */
-		port = c->src.val;
-		io_dir_in = 0;
-		goto do_io;
+		goto do_io_out;
 	case 0xe8: /* call (near) */ {
 		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
@@ -2308,8 +2861,9 @@ special_insn:
 	case 0xe9: /* jmp rel */
 		goto jmp;
 	case 0xea: /* jmp far */
-		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
-						VCPU_SREG_CS))
+	jump_far:
+		if (load_segment_descriptor(ctxt, ops, c->src2.val,
+					    VCPU_SREG_CS))
 			goto done;
 
 		c->eip = c->src.val;
@@ -2321,25 +2875,29 @@ special_insn:
 		break;
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
-		port = c->regs[VCPU_REGS_RDX];
-		io_dir_in = 1;
-		goto do_io;
+		c->src.val = c->regs[VCPU_REGS_RDX];
+	do_io_in:
+		c->dst.bytes = min(c->dst.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			goto done;
+		}
+		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+				     &c->dst.val))
+			goto done; /* IO is needed */
+		break;
 	case 0xee: /* out al,dx */
 	case 0xef: /* out (e/r)ax,dx */
-		port = c->regs[VCPU_REGS_RDX];
-		io_dir_in = 0;
-	do_io:
-		if (!emulator_io_permited(ctxt, ops, port,
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+		c->src.val = c->regs[VCPU_REGS_RDX];
+	do_io_out:
+		c->dst.bytes = min(c->dst.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
-				   (c->d & ByteOp) ? 1 : c->op_bytes,
-				   port) != 0) {
-			c->eip = saved_eip;
-			goto cannot_emulate;
-		}
+		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
+				      ctxt->vcpu);
+		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
@@ -2350,16 +2908,15 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
-		rc = emulate_grp3(ctxt, ops);
-		if (rc != 0)
-			goto done;
+		if (!emulate_grp3(ctxt, ops))
+			goto cannot_emulate;
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt))
+		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
 			ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2367,10 +2924,10 @@ special_insn:
 		}
 		break;
 	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt))
+		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
-			toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
 			ctxt->eflags |= X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
@@ -2383,28 +2940,55 @@ special_insn:
 		ctxt->eflags |= EFLG_DF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
-	case 0xfe ... 0xff:	/* Grp4/Grp5 */
+	case 0xfe: /* Grp4 */
+	grp45:
 		rc = emulate_grp45(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
+	case 0xff: /* Grp5 */
+		if (c->modrm_reg == 5)
+			goto jump_far;
+		goto grp45;
 	}
 
 writeback:
 	rc = writeback(ctxt, ops);
-	if (rc != 0)
+	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
+	/*
+	 * restore dst type in case the decoding will be reused
+	 * (happens for string instruction )
+	 */
+	c->dst.type = saved_dst_type;
+
+	if ((c->d & SrcMask) == SrcSI)
+		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
+				&c->src);
+
+	if ((c->d & DstMask) == DstDI)
+		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+
+	if (c->rep_prefix && (c->d & String)) {
+		struct read_cache *rc = &ctxt->decode.io_read;
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+		/*
+		 * Re-enter guest when pio read ahead buffer is empty or,
+		 * if it is not used, after each 1024 iteration.
+		 */
+		if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+		    (rc->end != 0 && rc->end == rc->pos))
+			ctxt->restart = false;
+	}
+
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(ctxt->vcpu, c->eip);
+	ops->set_rflags(ctxt->vcpu, ctxt->eflags);
 
 done:
-	if (rc == X86EMUL_UNHANDLEABLE) {
-		c->eip = saved_eip;
-		return -1;
-	}
-	return 0;
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 
 twobyte_insn:
 	switch (c->b) {
@@ -2418,18 +3002,18 @@ twobyte_insn:
 				goto cannot_emulate;
 
 			rc = kvm_fix_hypercall(ctxt->vcpu);
-			if (rc)
+			if (rc != X86EMUL_CONTINUE)
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
 		case 2: /* lgdt */
 			rc = read_descriptor(ctxt, ops, c->src.ptr,
 					     &size, &address, c->op_bytes);
-			if (rc)
+			if (rc != X86EMUL_CONTINUE)
 				goto done;
 			realmode_lgdt(ctxt->vcpu, size, address);
 			/* Disable writeback. */
@@ -2440,7 +3024,7 @@ twobyte_insn:
 				switch (c->modrm_rm) {
 				case 1:
 					rc = kvm_fix_hypercall(ctxt->vcpu);
-					if (rc)
+					if (rc != X86EMUL_CONTINUE)
 						goto done;
 					break;
 				default:
@@ -2450,7 +3034,7 @@ twobyte_insn:
 				rc = read_descriptor(ctxt, ops, c->src.ptr,
 						     &size, &address,
 						     c->op_bytes);
-				if (rc)
+				if (rc != X86EMUL_CONTINUE)
 					goto done;
 				realmode_lidt(ctxt->vcpu, size, address);
 			}
@@ -2459,15 +3043,18 @@ twobyte_insn:
 			break;
 		case 4: /* smsw */
 			c->dst.bytes = 2;
-			c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
+			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
-			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
-				      &ctxt->eflags);
+			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
+		case 5: /* not defined */
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
 		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, memop);
+			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -2493,54 +3080,54 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
-		c->regs[c->modrm_rm] =
-				realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+		switch (c->modrm_reg) {
+		case 1:
+		case 5 ... 7:
+		case 9 ... 15:
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
-		rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
-		if (rc)
-			goto cannot_emulate;
+		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
-		realmode_set_cr(ctxt->vcpu,
-				c->modrm_reg, c->modrm_val, &ctxt->eflags);
+		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
-		rc = emulator_set_dr(ctxt, c->modrm_reg,
-				     c->regs[c->modrm_rm]);
-		if (rc)
-			goto cannot_emulate;
+		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
 		/* wrmsr */
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
-		if (rc) {
+		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
 		break;
 	case 0x32:
 		/* rdmsr */
-		rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
-		if (rc) {
+		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			goto done;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -2577,7 +3164,7 @@ twobyte_insn:
 		break;
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xa3:
@@ -2596,7 +3183,7 @@ twobyte_insn:
 		break;
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xab:
@@ -2668,16 +3255,14 @@ twobyte_insn:
 							(u64) c->src.val;
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
-		rc = emulate_grp9(ctxt, ops, memop);
-		if (rc != 0)
+		rc = emulate_grp9(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
-		c->dst.type = OP_NONE;
 		break;
 	}
 	goto writeback;
 
 cannot_emulate:
 	DPRINTF("Cannot emulate %02x\n", c->b);
-	c->eip = saved_eip;
 	return -1;
 }
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9..93825ff3338 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+static void pic_lock(struct kvm_pic *s)
+	__acquires(&s->lock)
+{
+	raw_spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+	__releases(&s->lock)
+{
+	bool wakeup = s->wakeup_needed;
+	struct kvm_vcpu *vcpu;
+
+	s->wakeup_needed = false;
+
+	raw_spin_unlock(&s->lock);
+
+	if (wakeup) {
+		vcpu = s->kvm->bsp_vcpu;
+		if (vcpu)
+			kvm_vcpu_kick(vcpu);
+	}
+}
+
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 	 * Other interrupt may be delivered to PIC while lock is dropped but
 	 * it should be safe since PIC state is already updated at this stage.
 	 */
-	raw_spin_unlock(&s->pics_state->lock);
+	pic_unlock(s->pics_state);
 	kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
-	raw_spin_lock(&s->pics_state->lock);
+	pic_lock(s->pics_state);
 }
 
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
 {
 	struct kvm_pic *s = pic_irqchip(kvm);
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	s->pics[0].isr_ack = 0xff;
 	s->pics[1].isr_ack = 0xff;
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 }
 
 /*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	pic_update_irq(s);
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 }
 
 int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 	struct kvm_pic *s = opaque;
 	int ret = -1;
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
 		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
 				      s->pics[irq >> 3].imr, ret == 0);
 	}
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 
 	return ret;
 }
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	int irq, irq2, intno;
 	struct kvm_pic *s = pic_irqchip(kvm);
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
 		pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 
 	return intno;
 }
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte write\n");
 		return 0;
 	}
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
 		elcr_ioport_write(&s->pics[addr & 1], addr, data);
 		break;
 	}
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 	return 0;
 }
 
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte read\n");
 		return 0;
 	}
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
 		break;
 	}
 	*(unsigned char *)val = data;
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 	return 0;
 }
 
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
 	s->output = level;
 	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
 		s->pics[0].isr_ack &= ~(1 << irq);
-		kvm_vcpu_kick(vcpu);
+		s->wakeup_needed = true;
 	}
 }
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754..cd1f362f413 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
 
 struct kvm_pic {
 	raw_spinlock_t lock;
+	bool wakeup_needed;
 	unsigned pending_acks;
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda5..64bc6ea78d9 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
 };
 
 struct kvm_timer_ops {
-        bool (*is_periodic)(struct kvm_timer *);
+	bool (*is_periodic)(struct kvm_timer *);
 };
 
-
 enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
-
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa..a6f695d7692 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
@@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-
-struct kvm_unsync_walk {
-	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static int is_write_protection(struct kvm_vcpu *vcpu)
+static bool is_write_protection(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
@@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 		page = alloc_page(GFP_KERNEL);
 		if (!page)
 			return -ENOMEM;
-		set_page_private(page, 0);
 		cache->objects[cache->nobjs++] = page_address(page);
 	}
 	return 0;
@@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	int i;
 
 	gfn = unalias_gfn(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		slot          = gfn_to_memslot_unaliased(kvm, gfn);
 		write_count   = slot_largepage_idx(gfn, slot, i);
 		*write_count -= 1;
 		WARN_ON(*write_count < 0);
@@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
-	struct kvm_rmap_desc *prev_desc;
 	u64 *prev_spte;
 	int i;
 
@@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 		return NULL;
 	}
 	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-	prev_desc = NULL;
 	prev_spte = NULL;
 	while (desc) {
 		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
@@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 	int retval = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&sp->oos_link);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
@@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 }
 
 
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			    mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
 	struct kvm_pte_chain *pte_chain;
 	struct hlist_node *node;
@@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	if (!sp->multimapped && sp->parent_pte) {
 		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(vcpu, parent_sp);
-		mmu_parent_walk(vcpu, parent_sp, fn);
+		fn(parent_sp);
+		mmu_parent_walk(parent_sp, fn);
 		return;
 	}
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			if (!pte_chain->parent_ptes[i])
 				break;
 			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-			fn(vcpu, parent_sp);
-			mmu_parent_walk(vcpu, parent_sp, fn);
+			fn(parent_sp);
+			mmu_parent_walk(parent_sp, fn);
 		}
 }
 
@@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 		}
 }
 
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int unsync_walk_fn(struct kvm_mmu_page *sp)
 {
 	kvm_mmu_update_parents_unsync(sp);
 	return 1;
 }
 
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
-					struct kvm_mmu_page *sp)
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	mmu_parent_walk(sp, unsync_walk_fn);
 	kvm_mmu_update_parents_unsync(sp);
 }
 
@@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
+	trace_kvm_mmu_sync_page(sp);
 	sp->unsync = 0;
 	--kvm->stat.mmu_unsync;
 }
@@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		return 1;
 	}
 
-	trace_kvm_mmu_sync_page(sp);
 	if (rmap_write_protect(vcpu->kvm, sp->gfn))
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
 	role.direct = direct;
+	if (role.direct)
+		role.cr4_pae = 0;
 	role.access = access;
 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			if (sp->unsync_children) {
 				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-				kvm_mmu_mark_parents_unsync(vcpu, sp);
+				kvm_mmu_mark_parents_unsync(sp);
 			}
 			trace_kvm_mmu_get_page(sp, false);
 			return sp;
@@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	r = 0;
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
+restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.direct) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
 			r = 1;
 			if (kvm_mmu_zap_page(kvm, sp))
-				n = bucket->first;
+				goto restart;
 		}
 	return r;
 }
@@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
+restart:
 	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
 		if (sp->gfn == gfn && !sp->role.direct
 		    && !sp->role.invalid) {
 			pgprintk("%s: zap %lx %x\n",
 				 __func__, gfn, sp->role.word);
 			if (kvm_mmu_zap_page(kvm, sp))
-				nn = bucket->first;
+				goto restart;
 		}
 	}
 }
@@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
 	}
 }
 
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	struct page *page;
-
-	gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
-	if (gpa == UNMAPPED_GVA)
-		return NULL;
-
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
-	return page;
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
 
-	trace_kvm_mmu_unsync_page(sp);
 	index = kvm_page_table_hashfn(sp->gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	/* don't unsync if pagetable is shadowed with multiple roles */
@@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
+	trace_kvm_mmu_unsync_page(sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-	kvm_mmu_mark_parents_unsync(vcpu, sp);
+	kvm_mmu_mark_parents_unsync(sp);
 
 	mmu_convert_notrap(sp);
 	return 0;
@@ -1837,6 +1815,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK;
 
+		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+			spte &= ~PT_USER_MASK;
+
 		/*
 		 * Optimization: for pte sync, if spte was writable the hash
 		 * lookup is unnecessary (and expensive). Write protection
@@ -1892,6 +1873,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, sptep);
+			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
@@ -2081,21 +2064,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
-		if (tdp_enabled)
-			direct = 1;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
+		if (tdp_enabled) {
+			direct = 1;
+			root_gfn = 0;
+		}
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu.root_hpa = root;
 		return 0;
 	}
 	direct = !is_paging(vcpu);
-	if (tdp_enabled)
-		direct = 1;
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2111,11 +2096,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
+		if (tdp_enabled) {
+			direct = 1;
+			root_gfn = i << 30;
+		}
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+
 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
 	}
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2299,13 +2291,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
 		context->rsvd_bits_mask[0][0] = 0;
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+		if (!is_pse(vcpu)) {
+			context->rsvd_bits_mask[1][1] = 0;
+			break;
+		}
+
 		if (is_cpuid_PSE36())
 			/* 36bits PSE 4MB page */
 			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
 		else
 			/* 32 bits PSE 4MB page */
 			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
 		break;
 	case PT32E_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][2] =
@@ -2318,7 +2316,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	case PT64_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2336,7 +2334,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	}
 }
@@ -2438,7 +2436,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	else
 		r = paging32_init_context(vcpu);
 
-	vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
 
 	return r;
 }
@@ -2478,7 +2477,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 		goto out;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	r = mmu_alloc_roots(vcpu);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (r)
@@ -2527,7 +2528,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
         }
 
 	++vcpu->kvm->stat.mmu_pte_updated;
-	if (sp->role.glevels == PT32_ROOT_LEVEL)
+	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
 	else
 		paging64_update_pte(vcpu, sp, spte, new);
@@ -2562,36 +2563,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 }
 
 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-					  const u8 *new, int bytes)
+					  u64 gpte)
 {
 	gfn_t gfn;
-	int r;
-	u64 gpte = 0;
 	pfn_t pfn;
 
-	if (bytes != 4 && bytes != 8)
-		return;
-
-	/*
-	 * Assume that the pte write on a page table of the same type
-	 * as the current vcpu paging mode.  This is nearly always true
-	 * (might be false while changing modes).  Note it is verified later
-	 * by update_pte().
-	 */
-	if (is_pae(vcpu)) {
-		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		if ((bytes == 4) && (gpa % 4 == 0)) {
-			r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
-			if (r)
-				return;
-			memcpy((void *)&gpte + (gpa % 8), new, 4);
-		} else if ((bytes == 8) && (gpa % 8 == 0)) {
-			memcpy((void *)&gpte, new, 8);
-		}
-	} else {
-		if ((bytes == 4) && (gpa % 4 == 0))
-			memcpy((void *)&gpte, new, 4);
-	}
 	if (!is_present_gpte(gpte))
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,10 +2616,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int flooded = 0;
 	int npte;
 	int r;
+	int invlpg_counter;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+
+	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
+
+	/*
+	 * Assume that the pte write on a page table of the same type
+	 * as the current vcpu paging mode.  This is nearly always true
+	 * (might be false while changing modes).  Note it is verified later
+	 * by update_pte().
+	 */
+	if ((is_pae(vcpu) && bytes == 4) || !new) {
+		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+		if (is_pae(vcpu)) {
+			gpa &= ~(gpa_t)7;
+			bytes = 8;
+		}
+		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
+		if (r)
+			gentry = 0;
+		new = (const u8 *)&gentry;
+	}
+
+	switch (bytes) {
+	case 4:
+		gentry = *(const u32 *)new;
+		break;
+	case 8:
+		gentry = *(const u64 *)new;
+		break;
+	default:
+		gentry = 0;
+		break;
+	}
+
+	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+		gentry = 0;
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
@@ -2662,10 +2674,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+
+restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
 		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
 			continue;
-		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
 		if (misaligned || flooded) {
@@ -2682,14 +2696,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 			if (kvm_mmu_zap_page(vcpu->kvm, sp))
-				n = bucket->first;
+				goto restart;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
 		page_offset = offset;
 		level = sp->role.level;
 		npte = 1;
-		if (sp->role.glevels == PT32_ROOT_LEVEL) {
+		if (!sp->role.cr4_pae) {
 			page_offset <<= 1;	/* 32->64 */
 			/*
 			 * A 32-bit pde maps 4MB while the shadow pdes map
@@ -2707,20 +2721,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 				continue;
 		}
 		spte = &sp->spt[page_offset / sizeof(*spte)];
-		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
-			gentry = 0;
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  gpa & ~(u64)(pte_size - 1),
-						  &gentry, pte_size);
-			new = (const void *)&gentry;
-			if (r < 0)
-				new = NULL;
-		}
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			if (new)
-				mmu_pte_write_new_pte(vcpu, sp, spte, new);
+			if (gentry)
+				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
 			++spte;
 		}
@@ -2900,22 +2905,23 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 	struct kvm_mmu_page *sp, *node;
 
 	spin_lock(&kvm->mmu_lock);
+restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 		if (kvm_mmu_zap_page(kvm, sp))
-			node = container_of(kvm->arch.active_mmu_pages.next,
-					    struct kvm_mmu_page, link);
+			goto restart;
+
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
 }
 
-static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	kvm_mmu_zap_page(kvm, page);
+	return kvm_mmu_zap_page(kvm, page) + 1;
 }
 
 static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2927,7 +2933,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx;
+		int npages, idx, freed_pages;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
@@ -2935,8 +2941,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 			 kvm->arch.n_free_mmu_pages;
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-			kvm_mmu_remove_one_alloc_mmu_page(kvm);
-			cache_count--;
+			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
@@ -3011,7 +3017,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	unsigned int  nr_pages = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
+
 	for (i = 0; i < slots->nmemslots; i++)
 		nr_pages += slots->memslots[i].npages;
 
@@ -3174,8 +3181,7 @@ static gva_t canonicalize(gva_t gva)
 }
 
 
-typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
-				 u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
 
 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
 			    inspect_spte_fn fn)
@@ -3191,7 +3197,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
 				child = page_header(ent & PT64_BASE_ADDR_MASK);
 				__mmu_spte_walk(kvm, child, fn);
 			} else
-				fn(kvm, sp, &sp->spt[i]);
+				fn(kvm, &sp->spt[i]);
 		}
 	}
 }
@@ -3282,11 +3288,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memslots *slots;
 	int nmaps = 0;
 	int i, j, k, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *m = &slots->memslots[i];
 		struct kvm_rmap_desc *d;
@@ -3315,7 +3323,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 	return nmaps;
 }
 
-void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
@@ -3331,14 +3339,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
 			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
 					 audit_msg, gfn);
 			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
-					audit_msg, sptep - rev_sp->spt,
+			       audit_msg, (long int)(sptep - rev_sp->spt),
 					rev_sp->gfn);
 			dump_stack();
 			return;
 		}
 
 		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-				    is_large_pte(*sptep));
+				    rev_sp->role.level);
 		if (!*rmapp) {
 			if (!printk_ratelimit())
 				return;
@@ -3373,7 +3381,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 				continue;
 			if (!(ent & PT_WRITABLE_MASK))
 				continue;
-			inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
 	}
 	return;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a..42f07b1bfbc 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,14 +6,12 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE mmutrace
 
 #define KVM_MMU_PAGE_FIELDS \
 	__field(__u64, gfn) \
 	__field(__u32, role) \
 	__field(__u32, root_count) \
-	__field(__u32, unsync)
+	__field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp)			     \
 	__entry->gfn = sp->gfn;			     \
@@ -30,14 +28,14 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge"	\
+	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"		\
 			 " %snxe root %u %s%c",				\
-			 __entry->gfn, role.level, role.glevels,	\
+			 __entry->gfn, role.level,			\
+			 role.cr4_pae ? " pae" : "",			\
 			 role.quadrant,					\
 			 role.direct ? " direct" : "",			\
 			 access_str[role.access],			\
 			 role.invalid ? " invalid" : "",		\
-			 role.cr4_pge ? "" : "!",			\
 			 role.nxe ? "" : "!",				\
 			 __entry->root_count,				\
 			 __entry->unsync ? "unsync" : "sync", 0);	\
@@ -94,15 +92,15 @@ TRACE_EVENT(
 	TP_printk("pte %llx level %u", __entry->pte, __entry->level)
 );
 
-/* We set a pte accessed bit */
-TRACE_EVENT(
-	kvm_mmu_set_accessed_bit,
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
 	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
 	TP_ARGS(table_gfn, index, size),
 
 	TP_STRUCT__entry(
 		__field(__u64, gpa)
-		),
+	),
 
 	TP_fast_assign(
 		__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -112,22 +110,20 @@ TRACE_EVENT(
 	TP_printk("gpa %llx", __entry->gpa)
 );
 
-/* We set a pte dirty bit */
-TRACE_EVENT(
-	kvm_mmu_set_dirty_bit,
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
 	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-	TP_ARGS(table_gfn, index, size),
 
-	TP_STRUCT__entry(
-		__field(__u64, gpa)
-		),
+	TP_ARGS(table_gfn, index, size)
+);
 
-	TP_fast_assign(
-		__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
-				+ index * size;
-		),
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
 
-	TP_printk("gpa %llx", __entry->gpa)
+	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+	TP_ARGS(table_gfn, index, size)
 );
 
 TRACE_EVENT(
@@ -166,55 +162,45 @@ TRACE_EVENT(
 		  __entry->created ? "new" : "existing")
 );
 
-TRACE_EVENT(
-	kvm_mmu_sync_page,
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
 	TP_PROTO(struct kvm_mmu_page *sp),
 	TP_ARGS(sp),
 
 	TP_STRUCT__entry(
 		KVM_MMU_PAGE_FIELDS
-		),
+	),
 
 	TP_fast_assign(
 		KVM_MMU_PAGE_ASSIGN(sp)
-		),
+	),
 
 	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
 );
 
-TRACE_EVENT(
-	kvm_mmu_unsync_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
-	TP_ARGS(sp),
-
-	TP_STRUCT__entry(
-		KVM_MMU_PAGE_FIELDS
-		),
 
-	TP_fast_assign(
-		KVM_MMU_PAGE_ASSIGN(sp)
-		),
-
-	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+	TP_ARGS(sp)
 );
 
-TRACE_EVENT(
-	kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
-	TP_ARGS(sp),
 
-	TP_STRUCT__entry(
-		KVM_MMU_PAGE_FIELDS
-		),
+	TP_ARGS(sp)
+);
 
-	TP_fast_assign(
-		KVM_MMU_PAGE_ASSIGN(sp)
-		),
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+	TP_PROTO(struct kvm_mmu_page *sp),
 
-	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+	TP_ARGS(sp)
 );
-
 #endif /* _TRACE_KVMMMU_H */
 
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6..89d66ca4d87 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
 			goto access_error;
 
 #if PTTYPE == 64
-		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+		if (fetch_fault && (pte & PT64_NX_MASK))
 			goto access_error;
 #endif
 
@@ -190,10 +190,10 @@ walk:
 
 		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
 		    ((walker->level == PT_DIRECTORY_LEVEL) &&
-				(pte & PT_PAGE_SIZE_MASK)  &&
+				is_large_pte(pte) &&
 				(PTTYPE == 64 || is_pse(vcpu))) ||
 		    ((walker->level == PT_PDPE_LEVEL) &&
-				(pte & PT_PAGE_SIZE_MASK)  &&
+				is_large_pte(pte) &&
 				is_long_mode(vcpu))) {
 			int lvl = walker->level;
 
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pt_element_t gpte;
 	unsigned pte_access;
 	pfn_t pfn;
+	u64 new_spte;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-		if (!is_present_gpte(gpte))
-			__set_spte(spte, shadow_notrap_nonpresent_pte);
+		if (!is_present_gpte(gpte)) {
+			if (page->unsync)
+				new_spte = shadow_trap_nonpresent_pte;
+			else
+				new_spte = shadow_notrap_nonpresent_pte;
+			__set_spte(spte, new_spte);
+		}
 		return;
 	}
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
@@ -457,6 +463,7 @@ out_unlock:
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
+	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
 	int need_flush = 0;
@@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		level = iterator.level;
 		sptep = iterator.sptep;
 
-		if (level == PT_PAGE_TABLE_LEVEL  ||
-		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
-		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+		if (is_last_spte(*sptep, level)) {
+			struct kvm_mmu_page *sp = page_header(__pa(sptep));
+			int offset, shift;
+
+			shift = PAGE_SHIFT -
+				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
+			offset = sp->role.quadrant << shift;
+
+			pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
+			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
 				rmap_remove(vcpu->kvm, sptep);
@@ -487,7 +501,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 	if (need_flush)
 		kvm_flush_remote_tlbs(vcpu->kvm);
+
+	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	if (pte_gpa == -1)
+		return;
+
+	if (mmu_topup_memory_caches(vcpu))
+		return;
+	kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
@@ -551,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int i, offset, nr_present;
 	bool reset_host_protection;
+	gpa_t first_pte_gpa;
 
 	offset = nr_present = 0;
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
 
+	first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 		unsigned pte_access;
 		pt_element_t gpte;
@@ -566,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
 
-		pte_gpa = gfn_to_gpa(sp->gfn);
-		pte_gpa += (i+offset) * sizeof(pt_element_t);
+		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
 
 		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
 					  sizeof(pt_element_t)))
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 737361fcd50..ce438e0fdd2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 
+#include <asm/tlbflush.h>
 #include <asm/desc.h>
 
 #include <asm/virtext.h>
@@ -44,10 +45,11 @@ MODULE_LICENSE("GPL");
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
-#define SVM_FEATURE_NPT  (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+#define SVM_FEATURE_NPT            (1 <<  0)
+#define SVM_FEATURE_LBRV           (1 <<  1)
+#define SVM_FEATURE_SVML           (1 <<  2)
+#define SVM_FEATURE_NRIP           (1 <<  3)
+#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
 
 #define NESTED_EXIT_HOST	0	/* Exit handled on host level */
 #define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
@@ -55,6 +57,8 @@ MODULE_LICENSE("GPL");
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
+static bool erratum_383_found __read_mostly;
+
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
 	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -70,6 +74,7 @@ struct kvm_vcpu;
 struct nested_state {
 	struct vmcb *hsave;
 	u64 hsave_msr;
+	u64 vm_cr_msr;
 	u64 vmcb;
 
 	/* These are the merged vectors */
@@ -77,6 +82,7 @@ struct nested_state {
 
 	/* gpa pointers to the real vectors */
 	u64 vmcb_msrpm;
+	u64 vmcb_iopm;
 
 	/* A VMEXIT is required but not yet emulated */
 	bool exit_required;
@@ -91,6 +97,9 @@ struct nested_state {
 
 };
 
+#define MSRPM_OFFSETS	16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
 struct vcpu_svm {
 	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
@@ -110,13 +119,39 @@ struct vcpu_svm {
 	struct nested_state nested;
 
 	bool nmi_singlestep;
+
+	unsigned int3_injected;
+	unsigned long int3_rip;
+};
+
+#define MSR_INVALID			0xffffffffU
+
+static struct svm_direct_access_msrs {
+	u32 index;   /* Index of the MSR */
+	bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+	{ .index = MSR_K6_STAR,				.always = true  },
+	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
+#ifdef CONFIG_X86_64
+	{ .index = MSR_GS_BASE,				.always = true  },
+	{ .index = MSR_FS_BASE,				.always = true  },
+	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
+	{ .index = MSR_LSTAR,				.always = true  },
+	{ .index = MSR_CSTAR,				.always = true  },
+	{ .index = MSR_SYSCALL_MASK,			.always = true  },
+#endif
+	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
+	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
+	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
+	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
+	{ .index = MSR_INVALID,				.always = false },
 };
 
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
 #else
-static bool npt_enabled = false;
+static bool npt_enabled;
 #endif
 static int npt = 1;
 
@@ -129,6 +164,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
@@ -163,8 +199,8 @@ static unsigned long iopm_base;
 struct kvm_ldttss_desc {
 	u16 limit0;
 	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+	unsigned base1:8, type:5, dpl:2, p:1;
+	unsigned limit1:4, zero0:3, g:1, base2:8;
 	u32 base3;
 	u32 zero1;
 } __attribute__((packed));
@@ -194,6 +230,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 #define MSRS_RANGE_SIZE 2048
 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 
+static u32 svm_msrpm_offset(u32 msr)
+{
+	u32 offset;
+	int i;
+
+	for (i = 0; i < NUM_MSR_MAPS; i++) {
+		if (msr < msrpm_ranges[i] ||
+		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+			continue;
+
+		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
+
+		/* Now we have the u8 offset - but need the u32 offset */
+		return offset / 4;
+	}
+
+	/* MSR not in any range */
+	return MSR_INVALID;
+}
+
 #define MAX_INST_SIZE 15
 
 static inline u32 svm_has(u32 feat)
@@ -213,7 +270,7 @@ static inline void stgi(void)
 
 static inline void invlpga(unsigned long addr, u32 asid)
 {
-	asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
+	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 }
 
 static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -235,23 +292,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vcpu->arch.efer = efer;
 }
 
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	/* If we are within a nested VM we'd better #VMEXIT and let the
-	   guest handle the exception */
-	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
-		return;
-
-	svm->vmcb->control.event_inj = nr
-		| SVM_EVTINJ_VALID
-		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
-		| SVM_EVTINJ_TYPE_EXEPT;
-	svm->vmcb->control.event_inj_err = error_code;
-}
-
 static int is_external_interrupt(u32 info)
 {
 	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -264,7 +304,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	u32 ret = 0;
 
 	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
-		ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+		ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 	return ret & mask;
 }
 
@@ -283,6 +323,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (svm->vmcb->control.next_rip != 0)
+		svm->next_rip = svm->vmcb->control.next_rip;
+
 	if (!svm->next_rip) {
 		if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
 				EMULATE_DONE)
@@ -297,6 +340,68 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	svm_set_interrupt_shadow(vcpu, 0);
 }
 
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code,
+				bool reinject)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/*
+	 * If we are within a nested VM we'd better #VMEXIT and let the guest
+	 * handle the exception
+	 */
+	if (!reinject &&
+	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
+		return;
+
+	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+		/*
+		 * For guest debugging where we have to reinject #BP if some
+		 * INT3 is guest-owned:
+		 * Emulate nRIP by moving RIP forward. Will fail if injection
+		 * raises a fault that is not intercepted. Still better than
+		 * failing in all cases.
+		 */
+		skip_emulated_instruction(&svm->vcpu);
+		rip = kvm_rip_read(&svm->vcpu);
+		svm->int3_rip = rip + svm->vmcb->save.cs.base;
+		svm->int3_injected = rip - old_rip;
+	}
+
+	svm->vmcb->control.event_inj = nr
+		| SVM_EVTINJ_VALID
+		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+		| SVM_EVTINJ_TYPE_EXEPT;
+	svm->vmcb->control.event_inj_err = error_code;
+}
+
+static void svm_init_erratum_383(void)
+{
+	u32 low, high;
+	int err;
+	u64 val;
+
+	/* Only Fam10h is affected */
+	if (boot_cpu_data.x86 != 0x10)
+		return;
+
+	/* Use _safe variants to not break nested virtualization */
+	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+	if (err)
+		return;
+
+	val |= (1ULL << 47);
+
+	low  = lower_32_bits(val);
+	high = upper_32_bits(val);
+
+	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+	erratum_383_found = true;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
@@ -319,7 +424,7 @@ static int svm_hardware_enable(void *garbage)
 
 	struct svm_cpu_data *sd;
 	uint64_t efer;
-	struct descriptor_table gdt_descr;
+	struct desc_ptr gdt_descr;
 	struct desc_struct *gdt;
 	int me = raw_smp_processor_id();
 
@@ -344,14 +449,16 @@ static int svm_hardware_enable(void *garbage)
 	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 	sd->next_asid = sd->max_asid + 1;
 
-	kvm_get_gdt(&gdt_descr);
-	gdt = (struct desc_struct *)gdt_descr.base;
+	native_store_gdt(&gdt_descr);
+	gdt = (struct desc_struct *)gdt_descr.address;
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
 	wrmsrl(MSR_EFER, efer | EFER_SVME);
 
 	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 
+	svm_init_erratum_383();
+
 	return 0;
 }
 
@@ -391,42 +498,98 @@ err_1:
 
 }
 
+static bool valid_msr_intercept(u32 index)
+{
+	int i;
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+		if (direct_access_msrs[i].index == index)
+			return true;
+
+	return false;
+}
+
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 				 int read, int write)
 {
+	u8 bit_read, bit_write;
+	unsigned long tmp;
+	u32 offset;
+
+	/*
+	 * If this warning triggers extend the direct_access_msrs list at the
+	 * beginning of the file
+	 */
+	WARN_ON(!valid_msr_intercept(msr));
+
+	offset    = svm_msrpm_offset(msr);
+	bit_read  = 2 * (msr & 0x0f);
+	bit_write = 2 * (msr & 0x0f) + 1;
+	tmp       = msrpm[offset];
+
+	BUG_ON(offset == MSR_INVALID);
+
+	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
+	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+	msrpm[offset] = tmp;
+}
+
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
 	int i;
 
-	for (i = 0; i < NUM_MSR_MAPS; i++) {
-		if (msr >= msrpm_ranges[i] &&
-		    msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
-			u32 msr_offset = (i * MSRS_IN_RANGE + msr -
-					  msrpm_ranges[i]) * 2;
-
-			u32 *base = msrpm + (msr_offset / 32);
-			u32 msr_shift = msr_offset % 32;
-			u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
-			*base = (*base & ~(0x3 << msr_shift)) |
-				(mask << msr_shift);
+	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+		if (!direct_access_msrs[i].always)
+			continue;
+
+		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+	}
+}
+
+static void add_msr_offset(u32 offset)
+{
+	int i;
+
+	for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+		/* Offset already in list? */
+		if (msrpm_offsets[i] == offset)
 			return;
-		}
+
+		/* Slot used by another offset? */
+		if (msrpm_offsets[i] != MSR_INVALID)
+			continue;
+
+		/* Add offset to list */
+		msrpm_offsets[i] = offset;
+
+		return;
 	}
+
+	/*
+	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
+	 * increase MSRPM_OFFSETS in this case.
+	 */
 	BUG();
 }
 
-static void svm_vcpu_init_msrpm(u32 *msrpm)
+static void init_msrpm_offsets(void)
 {
-	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+	int i;
 
-#ifdef CONFIG_X86_64
-	set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
-	set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
-	set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
-#endif
-	set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
-	set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+		u32 offset;
+
+		offset = svm_msrpm_offset(direct_access_msrs[i].index);
+		BUG_ON(offset == MSR_INVALID);
+
+		add_msr_offset(offset);
+	}
 }
 
 static void svm_enable_lbrv(struct vcpu_svm *svm)
@@ -467,6 +630,8 @@ static __init int svm_hardware_setup(void)
 	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
+	init_msrpm_offsets();
+
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
@@ -523,7 +688,7 @@ static void init_seg(struct vmcb_seg *seg)
 {
 	seg->selector = 0;
 	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
-		SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
 	seg->limit = 0xffff;
 	seg->base = 0;
 }
@@ -543,16 +708,16 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 	svm->vcpu.fpu_active = 1;
 
-	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
+	control->intercept_cr_read =	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
 					INTERCEPT_CR4_MASK;
 
-	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
+	control->intercept_cr_write =	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
 					INTERCEPT_CR4_MASK |
 					INTERCEPT_CR8_MASK;
 
-	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
+	control->intercept_dr_read =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
 					INTERCEPT_DR2_MASK |
 					INTERCEPT_DR3_MASK |
@@ -561,7 +726,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 					INTERCEPT_DR6_MASK |
 					INTERCEPT_DR7_MASK;
 
-	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |
+	control->intercept_dr_write =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
 					INTERCEPT_DR2_MASK |
 					INTERCEPT_DR3_MASK |
@@ -575,7 +740,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 					(1 << MC_VECTOR);
 
 
-	control->intercept = 	(1ULL << INTERCEPT_INTR) |
+	control->intercept =	(1ULL << INTERCEPT_INTR) |
 				(1ULL << INTERCEPT_NMI) |
 				(1ULL << INTERCEPT_SMI) |
 				(1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -636,7 +801,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	save->rip = 0x0000fff0;
 	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
-	/* This is the guest-visible cr0 value.
+	/*
+	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
 	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -729,6 +895,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm_vcpu_init_msrpm(svm->msrpm);
 
 	svm->nested.msrpm = page_address(nested_msrpm_pages);
+	svm_vcpu_init_msrpm(svm->nested.msrpm);
 
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
@@ -882,7 +1049,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
 	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
 
-	/* AMD's VMCB does not have an explicit unusable field, so emulate it
+	/*
+	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 	 * for cross vendor migration purposes by "not present"
 	 */
 	var->unusable = !var->present || (var->type == 0);
@@ -918,7 +1086,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 			var->type |= 0x1;
 		break;
 	case VCPU_SREG_SS:
-		/* On AMD CPUs sometimes the DB bit in the segment
+		/*
+		 * On AMD CPUs sometimes the DB bit in the segment
 		 * descriptor is left as 1, although the whole segment has
 		 * been made unusable. Clear it here to pass an Intel VMX
 		 * entry check when cross vendor migrating.
@@ -936,36 +1105,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
 	return save->cpl;
 }
 
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	dt->limit = svm->vmcb->save.idtr.limit;
-	dt->base = svm->vmcb->save.idtr.base;
+	dt->size = svm->vmcb->save.idtr.limit;
+	dt->address = svm->vmcb->save.idtr.base;
 }
 
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.idtr.limit = dt->limit;
-	svm->vmcb->save.idtr.base = dt->base ;
+	svm->vmcb->save.idtr.limit = dt->size;
+	svm->vmcb->save.idtr.base = dt->address ;
 }
 
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	dt->limit = svm->vmcb->save.gdtr.limit;
-	dt->base = svm->vmcb->save.gdtr.base;
+	dt->size = svm->vmcb->save.gdtr.limit;
+	dt->address = svm->vmcb->save.gdtr.base;
 }
 
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.gdtr.limit = dt->limit;
-	svm->vmcb->save.gdtr.base = dt->base ;
+	svm->vmcb->save.gdtr.limit = dt->size;
+	svm->vmcb->save.gdtr.base = dt->address ;
 }
 
 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -978,6 +1147,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = svm->vmcb;
 	ulong gcr0 = svm->vcpu.arch.cr0;
 	u64 *hcr0 = &svm->vmcb->save.cr0;
 
@@ -989,11 +1159,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 
 
 	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
-		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+		vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+		vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+		if (is_nested(svm)) {
+			struct vmcb *hsave = svm->nested.hsave;
+
+			hsave->control.intercept_cr_read  &= ~INTERCEPT_CR0_MASK;
+			hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+			vmcb->control.intercept_cr_read  |= svm->nested.intercept_cr_read;
+			vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
+		}
 	} else {
 		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
 		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+		if (is_nested(svm)) {
+			struct vmcb *hsave = svm->nested.hsave;
+
+			hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+			hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+		}
 	}
 }
 
@@ -1001,6 +1185,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm)) {
+		/*
+		 * We are here because we run in nested mode, the host kvm
+		 * intercepts cr0 writes but the l1 hypervisor does not.
+		 * But the L1 hypervisor may intercept selective cr0 writes.
+		 * This needs to be checked here.
+		 */
+		unsigned long old, new;
+
+		/* Remove bits that would trigger a real cr0 write intercept */
+		old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
+		new = cr0 & SVM_CR0_SELECTIVE_MASK;
+
+		if (old == new) {
+			/* cr0 write with ts and mp unchanged */
+			svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+				return;
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.efer & EFER_LME) {
 		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1134,70 +1339,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 	svm->vmcb->control.asid = sd->next_asid++;
 }
 
-static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	switch (dr) {
-	case 0 ... 3:
-		*dest = vcpu->arch.db[dr];
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 6:
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-			*dest = vcpu->arch.dr6;
-		else
-			*dest = svm->vmcb->save.dr6;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 7:
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-			*dest = vcpu->arch.dr7;
-		else
-			*dest = svm->vmcb->save.dr7;
-		break;
-	}
-
-	return EMULATE_DONE;
-}
-
-static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	switch (dr) {
-	case 0 ... 3:
-		vcpu->arch.db[dr] = value;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-			vcpu->arch.eff_db[dr] = value;
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 6:
-		vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 7:
-		vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-			svm->vmcb->save.dr7 = vcpu->arch.dr7;
-			vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
-		}
-		break;
-	}
-
-	return EMULATE_DONE;
+	svm->vmcb->save.dr7 = value;
 }
 
 static int pf_interception(struct vcpu_svm *svm)
@@ -1234,7 +1380,7 @@ static int db_interception(struct vcpu_svm *svm)
 	}
 
 	if (svm->vcpu.guest_debug &
-	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
 		kvm_run->debug.arch.pc =
 			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1268,7 +1414,22 @@ static int ud_interception(struct vcpu_svm *svm)
 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+	u32 excp;
+
+	if (is_nested(svm)) {
+		u32 h_excp, n_excp;
+
+		h_excp  = svm->nested.hsave->control.intercept_exceptions;
+		n_excp  = svm->nested.intercept_exceptions;
+		h_excp &= ~(1 << NM_VECTOR);
+		excp    = h_excp | n_excp;
+	} else {
+		excp  = svm->vmcb->control.intercept_exceptions;
+		excp &= ~(1 << NM_VECTOR);
+	}
+
+	svm->vmcb->control.intercept_exceptions = excp;
+
 	svm->vcpu.fpu_active = 1;
 	update_cr0_intercept(svm);
 }
@@ -1279,8 +1440,59 @@ static int nm_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static bool is_erratum_383(void)
 {
+	int err, i;
+	u64 value;
+
+	if (!erratum_383_found)
+		return false;
+
+	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+	if (err)
+		return false;
+
+	/* Bit 62 may or may not be set for this mce */
+	value &= ~(1ULL << 62);
+
+	if (value != 0xb600000000010015ULL)
+		return false;
+
+	/* Clear MCi_STATUS registers */
+	for (i = 0; i < 6; ++i)
+		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+	if (!err) {
+		u32 low, high;
+
+		value &= ~(1ULL << 2);
+		low    = lower_32_bits(value);
+		high   = upper_32_bits(value);
+
+		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+	}
+
+	/* Flush tlb to evict multi-match entries */
+	__flush_tlb_all();
+
+	return true;
+}
+
+static void svm_handle_mce(struct vcpu_svm *svm)
+{
+	if (is_erratum_383()) {
+		/*
+		 * Erratum 383 triggered. Guest state is corrupt so kill the
+		 * guest.
+		 */
+		pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+
+		return;
+	}
+
 	/*
 	 * On an #MC intercept the MCE handler is not called automatically in
 	 * the host. So do it by hand here.
@@ -1289,6 +1501,11 @@ static int mc_interception(struct vcpu_svm *svm)
 		"int $0x12\n");
 	/* not sure if we ever come back to this point */
 
+	return;
+}
+
+static int mc_interception(struct vcpu_svm *svm)
+{
 	return 1;
 }
 
@@ -1309,29 +1526,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
 
 static int io_interception(struct vcpu_svm *svm)
 {
+	struct kvm_vcpu *vcpu = &svm->vcpu;
 	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
 	int size, in, string;
 	unsigned port;
 
 	++svm->vcpu.stat.io_exits;
-
-	svm->next_rip = svm->vmcb->control.exit_info_2;
-
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
-	if (string) {
-		if (emulate_instruction(&svm->vcpu,
-					0, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
-
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+	if (string || in)
+		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-
+	svm->next_rip = svm->vmcb->control.exit_info_2;
 	skip_emulated_instruction(&svm->vcpu);
-	return kvm_emulate_pio(&svm->vcpu, in, size, port);
+
+	return kvm_fast_pio_out(vcpu, size, port);
 }
 
 static int nmi_interception(struct vcpu_svm *svm)
@@ -1384,6 +1595,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code)
 {
+	int vmexit;
+
 	if (!is_nested(svm))
 		return 0;
 
@@ -1392,21 +1605,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	svm->vmcb->control.exit_info_1 = error_code;
 	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
-	return nested_svm_exit_handled(svm);
+	vmexit = nested_svm_intercept(svm);
+	if (vmexit == NESTED_EXIT_DONE)
+		svm->nested.exit_required = true;
+
+	return vmexit;
 }
 
-static inline int nested_svm_intr(struct vcpu_svm *svm)
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
 {
 	if (!is_nested(svm))
-		return 0;
+		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-		return 0;
+		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
-		return 0;
+		return false;
 
-	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
+	svm->vmcb->control.exit_info_1 = 0;
+	svm->vmcb->control.exit_info_2 = 0;
 
 	if (svm->nested.intercept & 1ULL) {
 		/*
@@ -1417,21 +1637,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 		 */
 		svm->nested.exit_required = true;
 		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-		return 1;
+		return false;
 	}
 
-	return 0;
+	return true;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+	if (!is_nested(svm))
+		return true;
+
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+		return true;
+
+	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+	svm->nested.exit_required = true;
+
+	return false;
+}
+
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
 
+	might_sleep();
+
 	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
 	if (is_error_page(page))
 		goto error;
 
-	return kmap_atomic(page, idx);
+	*_page = page;
+
+	return kmap(page);
 
 error:
 	kvm_release_page_clean(page);
@@ -1440,61 +1679,55 @@ error:
 	return NULL;
 }
 
-static void nested_svm_unmap(void *addr, enum km_type idx)
+static void nested_svm_unmap(struct page *page)
 {
-	struct page *page;
+	kunmap(page);
+	kvm_release_page_dirty(page);
+}
 
-	if (!addr)
-		return;
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+	unsigned port;
+	u8 val, bit;
+	u64 gpa;
 
-	page = kmap_atomic_to_page(addr);
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+		return NESTED_EXIT_HOST;
 
-	kunmap_atomic(addr, idx);
-	kvm_release_page_dirty(page);
+	port = svm->vmcb->control.exit_info_1 >> 16;
+	gpa  = svm->nested.vmcb_iopm + (port / 8);
+	bit  = port % 8;
+	val  = 0;
+
+	if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+		val &= (1 << bit);
+
+	return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
-static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-	u32 param = svm->vmcb->control.exit_info_1 & 1;
-	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	bool ret = false;
-	u32 t0, t1;
-	u8 *msrpm;
+	u32 offset, msr, value;
+	int write, mask;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
-		return false;
+		return NESTED_EXIT_HOST;
 
-	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	offset = svm_msrpm_offset(msr);
+	write  = svm->vmcb->control.exit_info_1 & 1;
+	mask   = 1 << ((2 * (msr & 0xf)) + write);
 
-	if (!msrpm)
-		goto out;
+	if (offset == MSR_INVALID)
+		return NESTED_EXIT_DONE;
 
-	switch (msr) {
-	case 0 ... 0x1fff:
-		t0 = (msr * 2) % 8;
-		t1 = msr / 8;
-		break;
-	case 0xc0000000 ... 0xc0001fff:
-		t0 = (8192 + msr - 0xc0000000) * 2;
-		t1 = (t0 / 8);
-		t0 %= 8;
-		break;
-	case 0xc0010000 ... 0xc0011fff:
-		t0 = (16384 + msr - 0xc0010000) * 2;
-		t1 = (t0 / 8);
-		t0 %= 8;
-		break;
-	default:
-		ret = true;
-		goto out;
-	}
-
-	ret = msrpm[t1] & ((1 << param) << t0);
+	/* Offset is in 32 bit units but need in 8 bit units */
+	offset *= 4;
 
-out:
-	nested_svm_unmap(msrpm, KM_USER0);
+	if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+		return NESTED_EXIT_DONE;
 
-	return ret;
+	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
 static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1504,17 +1737,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 	switch (exit_code) {
 	case SVM_EXIT_INTR:
 	case SVM_EXIT_NMI:
+	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
 		return NESTED_EXIT_HOST;
-		/* For now we are always handling NPFs when using them */
 	case SVM_EXIT_NPF:
+		/* For now we are always handling NPFs when using them */
 		if (npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
-	/* When we're shadowing, trap PFs */
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+		/* When we're shadowing, trap PFs */
 		if (!npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
+	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+		nm_interception(svm);
+		break;
 	default:
 		break;
 	}
@@ -1525,7 +1762,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 /*
  * If this function returns true, this #vmexit was already handled
  */
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int nested_svm_intercept(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
 	int vmexit = NESTED_EXIT_HOST;
@@ -1534,6 +1771,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
 	case SVM_EXIT_MSR:
 		vmexit = nested_svm_exit_handled_msr(svm);
 		break;
+	case SVM_EXIT_IOIO:
+		vmexit = nested_svm_intercept_ioio(svm);
+		break;
 	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
 		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
 		if (svm->nested.intercept_cr_read & cr_bits)
@@ -1564,6 +1804,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
+	case SVM_EXIT_ERR: {
+		vmexit = NESTED_EXIT_DONE;
+		break;
+	}
 	default: {
 		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
 		if (svm->nested.intercept & exit_bits)
@@ -1571,9 +1815,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
 	}
 	}
 
-	if (vmexit == NESTED_EXIT_DONE) {
+	return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+	int vmexit;
+
+	vmexit = nested_svm_intercept(svm);
+
+	if (vmexit == NESTED_EXIT_DONE)
 		nested_svm_vmexit(svm);
-	}
 
 	return vmexit;
 }
@@ -1615,6 +1867,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	struct vmcb *nested_vmcb;
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
 
 	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
 				       vmcb->control.exit_info_1,
@@ -1622,10 +1875,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 				       vmcb->control.exit_int_info,
 				       vmcb->control.exit_int_info_err);
 
-	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
 	if (!nested_vmcb)
 		return 1;
 
+	/* Exit nested SVM mode */
+	svm->nested.vmcb = 0;
+
 	/* Give the current vmcb to the guest */
 	disable_gif(svm);
 
@@ -1635,9 +1891,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.ds     = vmcb->save.ds;
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
-	if (npt_enabled)
-		nested_vmcb->save.cr3    = vmcb->save.cr3;
+	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
+	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
+	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = vmcb->save.rflags;
 	nested_vmcb->save.rip    = vmcb->save.rip;
 	nested_vmcb->save.rsp    = vmcb->save.rsp;
@@ -1709,10 +1966,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	svm->vmcb->save.cpl = 0;
 	svm->vmcb->control.exit_int_info = 0;
 
-	/* Exit nested SVM mode */
-	svm->nested.vmcb = 0;
-
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	kvm_mmu_reset_context(&svm->vcpu);
 	kvm_mmu_load(&svm->vcpu);
@@ -1722,19 +1976,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
-	u32 *nested_msrpm;
+	/*
+	 * This function merges the msr permission bitmaps of kvm and the
+	 * nested vmcb. It is omptimized in that it only merges the parts where
+	 * the kvm msr permission bitmap may contain zero bits
+	 */
 	int i;
 
-	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
-	if (!nested_msrpm)
-		return false;
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+		return true;
 
-	for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
-		svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+	for (i = 0; i < MSRPM_OFFSETS; i++) {
+		u32 value, p;
+		u64 offset;
 
-	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+		if (msrpm_offsets[i] == 0xffffffff)
+			break;
+
+		p      = msrpm_offsets[i];
+		offset = svm->nested.vmcb_msrpm + (p * 4);
 
-	nested_svm_unmap(nested_msrpm, KM_USER0);
+		if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+			return false;
+
+		svm->nested.msrpm[p] = svm->msrpm[p] | value;
+	}
+
+	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
 
 	return true;
 }
@@ -1744,26 +2012,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	struct vmcb *nested_vmcb;
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
+	u64 vmcb_gpa;
+
+	vmcb_gpa = svm->vmcb->save.rax;
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return false;
 
-	/* nested_vmcb is our indicator if nested SVM is activated */
-	svm->nested.vmcb = svm->vmcb->save.rax;
-
-	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
 			       nested_vmcb->control.event_inj,
 			       nested_vmcb->control.nested_ctl);
 
+	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
+				    nested_vmcb->control.intercept_cr_write,
+				    nested_vmcb->control.intercept_exceptions,
+				    nested_vmcb->control.intercept);
+
 	/* Clear internal status */
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
-	/* Save the old vmcb, so we don't need to pick what we save, but
-	   can restore everything when a VMEXIT occurs */
+	/*
+	 * Save the old vmcb, so we don't need to pick what we save, but can
+	 * restore everything when a VMEXIT occurs
+	 */
 	hsave->save.es     = vmcb->save.es;
 	hsave->save.cs     = vmcb->save.cs;
 	hsave->save.ss     = vmcb->save.ss;
@@ -1803,14 +2079,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
 		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
-	} else {
+	} else
 		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-		kvm_mmu_reset_context(&svm->vcpu);
-	}
+
+	/* Guest paging mode is active - reset mmu */
+	kvm_mmu_reset_context(&svm->vcpu);
+
 	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
 	/* In case we don't even reach vcpu_run, the fields are not updated */
 	svm->vmcb->save.rax = nested_vmcb->save.rax;
 	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1819,22 +2098,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
-	/* We don't want a nested guest to be more powerful than the guest,
-	   so all intercepts are ORed */
-	svm->vmcb->control.intercept_cr_read |=
-		nested_vmcb->control.intercept_cr_read;
-	svm->vmcb->control.intercept_cr_write |=
-		nested_vmcb->control.intercept_cr_write;
-	svm->vmcb->control.intercept_dr_read |=
-		nested_vmcb->control.intercept_dr_read;
-	svm->vmcb->control.intercept_dr_write |=
-		nested_vmcb->control.intercept_dr_write;
-	svm->vmcb->control.intercept_exceptions |=
-		nested_vmcb->control.intercept_exceptions;
-
-	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
-	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
+	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
 	/* cache intercepts */
 	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
@@ -1851,13 +2116,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
+	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+		/* We only want the cr8 intercept bits of the guest */
+		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
+		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+	}
+
+	/* We don't want to see VMMCALLs from a nested guest */
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
+	/*
+	 * We don't want a nested guest to be more powerful than the guest, so
+	 * all intercepts are ORed
+	 */
+	svm->vmcb->control.intercept_cr_read |=
+		nested_vmcb->control.intercept_cr_read;
+	svm->vmcb->control.intercept_cr_write |=
+		nested_vmcb->control.intercept_cr_write;
+	svm->vmcb->control.intercept_dr_read |=
+		nested_vmcb->control.intercept_dr_read;
+	svm->vmcb->control.intercept_dr_write |=
+		nested_vmcb->control.intercept_dr_write;
+	svm->vmcb->control.intercept_exceptions |=
+		nested_vmcb->control.intercept_exceptions;
+
+	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
 	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
 	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
 	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
+
+	/* nested_vmcb is our indicator if nested SVM is activated */
+	svm->nested.vmcb = vmcb_gpa;
 
 	enable_gif(svm);
 
@@ -1883,6 +2178,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 static int vmload_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
+	struct page *page;
 
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1890,12 +2186,12 @@ static int vmload_interception(struct vcpu_svm *svm)
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return 1;
 
 	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	return 1;
 }
@@ -1903,6 +2199,7 @@ static int vmload_interception(struct vcpu_svm *svm)
 static int vmsave_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
+	struct page *page;
 
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1910,12 +2207,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return 1;
 
 	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	return 1;
 }
@@ -2018,6 +2315,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
 		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
 	uint32_t idt_v =
 		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+	bool has_error_code = false;
+	u32 error_code = 0;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -2038,6 +2337,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
 			svm->vcpu.arch.nmi_injected = false;
 			break;
 		case SVM_EXITINTINFO_TYPE_EXEPT:
+			if (svm->vmcb->control.exit_info_2 &
+			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+				has_error_code = true;
+				error_code =
+					(u32)svm->vmcb->control.exit_info_2;
+			}
 			kvm_clear_exception_queue(&svm->vcpu);
 			break;
 		case SVM_EXITINTINFO_TYPE_INTR:
@@ -2054,7 +2359,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
+	if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+				has_error_code, error_code) == EMULATE_FAIL) {
+		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		svm->vcpu.run->internal.ndata = 0;
+		return 0;
+	}
+	return 1;
 }
 
 static int cpuid_interception(struct vcpu_svm *svm)
@@ -2145,9 +2457,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	case MSR_IA32_SYSENTER_ESP:
 		*data = svm->sysenter_esp;
 		break;
-	/* Nobody will change the following 5 values in the VMCB so
-	   we can safely return them on rdmsr. They will always be 0
-	   until LBRV is implemented. */
+	/*
+	 * Nobody will change the following 5 values in the VMCB so we can
+	 * safely return them on rdmsr. They will always be 0 until LBRV is
+	 * implemented.
+	 */
 	case MSR_IA32_DEBUGCTLMSR:
 		*data = svm->vmcb->save.dbgctl;
 		break;
@@ -2167,7 +2481,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 		*data = svm->nested.hsave_msr;
 		break;
 	case MSR_VM_CR:
-		*data = 0;
+		*data = svm->nested.vm_cr_msr;
 		break;
 	case MSR_IA32_UCODE_REV:
 		*data = 0x01000065;
@@ -2197,6 +2511,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int svm_dis, chg_mask;
+
+	if (data & ~SVM_VM_CR_VALID_MASK)
+		return 1;
+
+	chg_mask = SVM_VM_CR_VALID_MASK;
+
+	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+	svm->nested.vm_cr_msr &= ~chg_mask;
+	svm->nested.vm_cr_msr |= (data & chg_mask);
+
+	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+	/* check for svm_disable while efer.svme is set */
+	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+		return 1;
+
+	return 0;
+}
+
 static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2263,6 +2602,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		svm->nested.hsave_msr = data;
 		break;
 	case MSR_VM_CR:
+		return svm_set_vm_cr(vcpu, data);
 	case MSR_VM_IGNNE:
 		pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
 		break;
@@ -2326,16 +2666,16 @@ static int pause_interception(struct vcpu_svm *svm)
 }
 
 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
-	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR8]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR0]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR3]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR8]			= emulate_on_interception,
 	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR8]          		= cr8_write_interception,
-	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR0]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
+	[SVM_EXIT_READ_DR0]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
@@ -2354,15 +2694,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
 	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
-	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
-	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
-	[SVM_EXIT_EXCP_BASE + MC_VECTOR] 	= mc_interception,
-	[SVM_EXIT_INTR] 			= intr_interception,
+	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
+	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
+	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
+	[SVM_EXIT_INTR]				= intr_interception,
 	[SVM_EXIT_NMI]				= nmi_interception,
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
-	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
@@ -2370,7 +2709,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_HLT]				= halt_interception,
 	[SVM_EXIT_INVLPG]			= invlpg_interception,
 	[SVM_EXIT_INVLPGA]			= invlpga_interception,
-	[SVM_EXIT_IOIO] 		  	= io_interception,
+	[SVM_EXIT_IOIO]				= io_interception,
 	[SVM_EXIT_MSR]				= msr_interception,
 	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
 	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
@@ -2393,7 +2732,12 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
-	trace_kvm_exit(exit_code, svm->vmcb->save.rip);
+	trace_kvm_exit(exit_code, vcpu);
+
+	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+		vcpu->arch.cr0 = svm->vmcb->save.cr0;
+	if (npt_enabled)
+		vcpu->arch.cr3 = svm->vmcb->save.cr3;
 
 	if (unlikely(svm->nested.exit_required)) {
 		nested_svm_vmexit(svm);
@@ -2422,11 +2766,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	svm_complete_interrupts(svm);
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
-		vcpu->arch.cr0 = svm->vmcb->save.cr0;
-	if (npt_enabled)
-		vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2511,6 +2850,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	if (irr == -1)
 		return;
 
@@ -2522,8 +2864,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
-	return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+	int ret;
+	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+	ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+	return ret;
 }
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2568,13 +2914,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	nested_svm_intr(svm);
-
-	/* In case GIF=0 we can't rely on the CPU to tell us when
-	 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
-	 * The next time we get that intercept, this function will be
-	 * called again though and we'll get the vintr intercept. */
-	if (gif_set(svm)) {
+	/*
+	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
+	 * get that intercept, this function will be called again though and
+	 * we'll get the vintr intercept.
+	 */
+	if (gif_set(svm) && nested_svm_intr(svm)) {
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
 	}
@@ -2588,9 +2934,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	    == HF_NMI_MASK)
 		return; /* IRET will cause a vm exit */
 
-	/* Something prevents NMI from been injected. Single step over
-	   possible problem (IRET or exception injection or interrupt
-	   shadow) */
+	/*
+	 * Something prevents NMI from been injected. Single step over possible
+	 * problem (IRET or exception injection or interrupt shadow)
+	 */
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 	update_db_intercept(vcpu);
@@ -2614,6 +2961,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
 		kvm_set_cr8(vcpu, cr8);
@@ -2625,6 +2975,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	cr8 = kvm_get_cr8(vcpu);
 	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
@@ -2635,6 +2988,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	u8 vector;
 	int type;
 	u32 exitintinfo = svm->vmcb->control.exit_int_info;
+	unsigned int3_injected = svm->int3_injected;
+
+	svm->int3_injected = 0;
 
 	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2654,18 +3010,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		svm->vcpu.arch.nmi_injected = true;
 		break;
 	case SVM_EXITINTINFO_TYPE_EXEPT:
-		/* In case of software exception do not reinject an exception
-		   vector, but re-execute and instruction instead */
-		if (is_nested(svm))
-			break;
-		if (kvm_exception_is_soft(vector))
+		/*
+		 * In case of software exceptions, do not reinject the vector,
+		 * but re-execute the instruction instead. Rewind RIP first
+		 * if we emulated INT3 before.
+		 */
+		if (kvm_exception_is_soft(vector)) {
+			if (vector == BP_VECTOR && int3_injected &&
+			    kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+				kvm_rip_write(&svm->vcpu,
+					      kvm_rip_read(&svm->vcpu) -
+					      int3_injected);
 			break;
+		}
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
-			kvm_queue_exception_e(&svm->vcpu, vector, err);
+			kvm_requeue_exception_e(&svm->vcpu, vector, err);
 
 		} else
-			kvm_queue_exception(&svm->vcpu, vector);
+			kvm_requeue_exception(&svm->vcpu, vector);
 		break;
 	case SVM_EXITINTINFO_TYPE_INTR:
 		kvm_queue_interrupt(&svm->vcpu, vector, false);
@@ -2688,6 +3051,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	u16 gs_selector;
 	u16 ldt_selector;
 
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
 	/*
 	 * A vmexit emulation is required before the vcpu can be executed
 	 * again.
@@ -2695,10 +3062,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->nested.exit_required))
 		return;
 
-	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
@@ -2811,6 +3174,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
 		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
 	}
+
+	/*
+	 * We need to handle MC intercepts here before the vcpu has a chance to
+	 * change the physical cpu
+	 */
+	if (unlikely(svm->vmcb->control.exit_code ==
+		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
+		svm_handle_mce(svm);
 }
 
 #undef R
@@ -2879,25 +3250,39 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 {
 }
 
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+	switch (func) {
+	case 0x8000000A:
+		entry->eax = 1; /* SVM revision 1 */
+		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+				   ASID emulation to nested SVM */
+		entry->ecx = 0; /* Reserved */
+		entry->edx = 0; /* Do not support any additional features */
+
+		break;
+	}
+}
+
 static const struct trace_print_flags svm_exit_reasons_str[] = {
-	{ SVM_EXIT_READ_CR0,           		"read_cr0" },
-	{ SVM_EXIT_READ_CR3,	      		"read_cr3" },
-	{ SVM_EXIT_READ_CR4,	      		"read_cr4" },
-	{ SVM_EXIT_READ_CR8,  	      		"read_cr8" },
-	{ SVM_EXIT_WRITE_CR0,          		"write_cr0" },
-	{ SVM_EXIT_WRITE_CR3,	      		"write_cr3" },
-	{ SVM_EXIT_WRITE_CR4,          		"write_cr4" },
-	{ SVM_EXIT_WRITE_CR8, 	      		"write_cr8" },
-	{ SVM_EXIT_READ_DR0, 	      		"read_dr0" },
-	{ SVM_EXIT_READ_DR1,	      		"read_dr1" },
-	{ SVM_EXIT_READ_DR2,	      		"read_dr2" },
-	{ SVM_EXIT_READ_DR3,	      		"read_dr3" },
-	{ SVM_EXIT_WRITE_DR0,	      		"write_dr0" },
-	{ SVM_EXIT_WRITE_DR1,	      		"write_dr1" },
-	{ SVM_EXIT_WRITE_DR2,	      		"write_dr2" },
-	{ SVM_EXIT_WRITE_DR3,	      		"write_dr3" },
-	{ SVM_EXIT_WRITE_DR5,	      		"write_dr5" },
-	{ SVM_EXIT_WRITE_DR7,	      		"write_dr7" },
+	{ SVM_EXIT_READ_CR0,			"read_cr0" },
+	{ SVM_EXIT_READ_CR3,			"read_cr3" },
+	{ SVM_EXIT_READ_CR4,			"read_cr4" },
+	{ SVM_EXIT_READ_CR8,			"read_cr8" },
+	{ SVM_EXIT_WRITE_CR0,			"write_cr0" },
+	{ SVM_EXIT_WRITE_CR3,			"write_cr3" },
+	{ SVM_EXIT_WRITE_CR4,			"write_cr4" },
+	{ SVM_EXIT_WRITE_CR8,			"write_cr8" },
+	{ SVM_EXIT_READ_DR0,			"read_dr0" },
+	{ SVM_EXIT_READ_DR1,			"read_dr1" },
+	{ SVM_EXIT_READ_DR2,			"read_dr2" },
+	{ SVM_EXIT_READ_DR3,			"read_dr3" },
+	{ SVM_EXIT_WRITE_DR0,			"write_dr0" },
+	{ SVM_EXIT_WRITE_DR1,			"write_dr1" },
+	{ SVM_EXIT_WRITE_DR2,			"write_dr2" },
+	{ SVM_EXIT_WRITE_DR3,			"write_dr3" },
+	{ SVM_EXIT_WRITE_DR5,			"write_dr5" },
+	{ SVM_EXIT_WRITE_DR7,			"write_dr7" },
 	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" },
 	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" },
 	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" },
@@ -2946,8 +3331,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	update_cr0_intercept(svm);
 	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
+	if (is_nested(svm))
+		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+	update_cr0_intercept(svm);
 }
 
 static struct kvm_x86_ops svm_x86_ops = {
@@ -2986,8 +3373,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_idt = svm_set_idt,
 	.get_gdt = svm_get_gdt,
 	.set_gdt = svm_set_gdt,
-	.get_dr = svm_get_dr,
-	.set_dr = svm_set_dr,
+	.set_dr7 = svm_set_dr7,
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
@@ -3023,12 +3409,14 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.cpuid_update = svm_cpuid_update,
 
 	.rdtscp_supported = svm_rdtscp_supported,
+
+	.set_supported_cpuid = svm_set_supported_cpuid,
 };
 
 static int __init svm_init(void)
 {
 	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
-			      THIS_MODULE);
+			__alignof__(struct vcpu_svm), THIS_MODULE);
 }
 
 static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066..4ddadb1a5ff 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 	/*
 	 * There is a race window between reading and incrementing, but we do
 	 * not care about potentially loosing timer events in the !reinject
-	 * case anyway.
+	 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+	 * in vcpu_enter_guest.
 	 */
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f04..a6544b8e7c0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#define TRACE_INCLUDE_FILE trace
 
 /*
  * Tracepoint for guest mode entry.
@@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic,
  * Tracepoint for kvm guest exit:
  */
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
-	TP_ARGS(exit_reason, guest_rip),
+	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_reason, vcpu),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int,	exit_reason	)
@@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit,
 
 	TP_fast_assign(
 		__entry->exit_reason	= exit_reason;
-		__entry->guest_rip	= guest_rip;
+		__entry->guest_rip	= kvm_rip_read(vcpu);
 	),
 
 	TP_printk("reason %s rip 0x%lx",
@@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
 	TP_printk("irq %u", __entry->irq)
 );
 
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc						\
+	EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM),	\
+	EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF),		\
+	EXS(MF), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+	TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+	TP_ARGS(exception, has_error, error_code),
+
+	TP_STRUCT__entry(
+		__field(	u8,	exception	)
+		__field(	u8,	has_error	)
+		__field(	u32,	error_code	)
+	),
+
+	TP_fast_assign(
+		__entry->exception	= exception;
+		__entry->has_error	= has_error;
+		__entry->error_code	= error_code;
+	),
+
+	TP_printk("%s (0x%x)",
+		  __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+		  /* FIXME: don't print error_code if not present */
+		  __entry->has_error ? __entry->error_code : 0)
+);
+
 /*
  * Tracepoint for page fault.
  */
@@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun,
 	),
 
 	TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
-		  "event_inj: 0x%08x npt: %s\n",
+		  "event_inj: 0x%08x npt: %s",
 		__entry->rip, __entry->vmcb, __entry->nested_rip,
 		__entry->int_ctl, __entry->event_inj,
 		__entry->npt ? "on" : "off")
 );
 
+TRACE_EVENT(kvm_nested_intercepts,
+	    TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+	    TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+	TP_STRUCT__entry(
+		__field(	__u16,		cr_read		)
+		__field(	__u16,		cr_write	)
+		__field(	__u32,		exceptions	)
+		__field(	__u64,		intercept	)
+	),
+
+	TP_fast_assign(
+		__entry->cr_read	= cr_read;
+		__entry->cr_write	= cr_write;
+		__entry->exceptions	= exceptions;
+		__entry->intercept	= intercept;
+	),
+
+	TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+		__entry->cr_read, __entry->cr_write, __entry->exceptions,
+		__entry->intercept)
+);
 /*
  * Tracepoint for #VMEXIT while nested
  */
@@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit,
 		__entry->exit_int_info_err	= exit_int_info_err;
 	),
 	TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
-		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  __entry->rip,
 		  ftrace_print_symbols_seq(p, __entry->exit_code,
 					   kvm_x86_ops->exit_reasons_str),
@@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
 	),
 
 	TP_printk("reason: %s ext_inf1: 0x%016llx "
-		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  ftrace_print_symbols_seq(p, __entry->exit_code,
 					   kvm_x86_ops->exit_reasons_str),
 		__entry->exit_info1, __entry->exit_info2,
@@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
 		__entry->rip	=	rip
 	),
 
-	TP_printk("rip: 0x%016llx\n", __entry->rip)
+	TP_printk("rip: 0x%016llx", __entry->rip)
 );
 
 /*
@@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga,
 		__entry->address	=	address;
 	),
 
-	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
 		  __entry->rip, __entry->asid, __entry->address)
 );
 
@@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit,
 		__entry->slb		=	slb;
 	),
 
-	TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+	TP_printk("rip: 0x%016llx slb: 0x%08x",
 		  __entry->rip, __entry->slb)
 );
 
+#define __print_insn(insn, ilen) ({		                 \
+	int i;							 \
+	const char *ret = p->buffer + p->len;			 \
+								 \
+	for (i = 0; i < ilen; ++i)				 \
+		trace_seq_printf(p, " %02x", insn[i]);		 \
+	trace_seq_printf(p, "%c", 0);				 \
+	ret;							 \
+	})
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D   (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L   (1 << 3)
+
+#define kvm_trace_symbol_emul_flags	                  \
+	{ 0,   			    "real" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_EFL_VM, "vm16" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE,   "prot16" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_CS_D,   "prot32" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_CS_L,   "prot64" }
+
+#define kei_decode_mode(mode) ({			\
+	u8 flags = 0xff;				\
+	switch (mode) {					\
+	case X86EMUL_MODE_REAL:				\
+		flags = 0;				\
+		break;					\
+	case X86EMUL_MODE_VM86:				\
+		flags = KVM_EMUL_INSN_F_EFL_VM;		\
+		break;					\
+	case X86EMUL_MODE_PROT16:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE;		\
+		break;					\
+	case X86EMUL_MODE_PROT32:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE		\
+			| KVM_EMUL_INSN_F_CS_D;		\
+		break;					\
+	case X86EMUL_MODE_PROT64:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE		\
+			| KVM_EMUL_INSN_F_CS_L;		\
+		break;					\
+	}						\
+	flags;						\
+	})
+
+TRACE_EVENT(kvm_emulate_insn,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+	TP_ARGS(vcpu, failed),
+
+	TP_STRUCT__entry(
+		__field(    __u64, rip                       )
+		__field(    __u32, csbase                    )
+		__field(    __u8,  len                       )
+		__array(    __u8,  insn,    15	             )
+		__field(    __u8,  flags       	   	     )
+		__field(    __u8,  failed                    )
+		),
+
+	TP_fast_assign(
+		__entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+		__entry->len = vcpu->arch.emulate_ctxt.decode.eip
+			       - vcpu->arch.emulate_ctxt.decode.fetch.start;
+		memcpy(__entry->insn,
+		       vcpu->arch.emulate_ctxt.decode.fetch.data,
+		       15);
+		__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+		__entry->failed = failed;
+		),
+
+	TP_printk("%x:%llx:%s (%s)%s",
+		  __entry->csbase, __entry->rip,
+		  __print_insn(__entry->insn, __entry->len),
+		  __print_symbolic(__entry->flags,
+				   kvm_trace_symbol_emul_flags),
+		  __entry->failed ? " failed" : ""
+		)
+	);
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
 #endif /* _TRACE_KVM_H */
 
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index edca080407a..859a01a07db 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/moduleparam.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
+#include <linux/tboot.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -98,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO);
 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
+#define NR_AUTOLOAD_MSRS 1
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -125,6 +128,11 @@ struct vcpu_vmx {
 	u64 		      msr_guest_kernel_gs_base;
 #endif
 	struct vmcs          *vmcs;
+	struct msr_autoload {
+		unsigned nr;
+		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
+		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+	} msr_autoload;
 	struct {
 		int           loaded;
 		u16           fs_sel, gs_sel, ldt_sel;
@@ -234,56 +242,56 @@ static const u32 vmx_msr_index[] = {
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-static inline int is_page_fault(u32 intr_info)
+static inline bool is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_no_device(u32 intr_info)
+static inline bool is_no_device(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_invalid_opcode(u32 intr_info)
+static inline bool is_invalid_opcode(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_external_interrupt(u32 intr_info)
+static inline bool is_external_interrupt(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_machine_check(u32 intr_info)
+static inline bool is_machine_check(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int cpu_has_vmx_msr_bitmap(void)
+static inline bool cpu_has_vmx_msr_bitmap(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 }
 
-static inline int cpu_has_vmx_tpr_shadow(void)
+static inline bool cpu_has_vmx_tpr_shadow(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 {
 	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 }
 
-static inline int cpu_has_secondary_exec_ctrls(void)
+static inline bool cpu_has_secondary_exec_ctrls(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl &
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
 
 static inline bool cpu_has_vmx_ept_execute_only(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
 }
 
 static inline bool cpu_has_vmx_eptp_uncacheable(void)
 {
-	return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+	return vmx_capability.ept & VMX_EPTP_UC_BIT;
 }
 
 static inline bool cpu_has_vmx_eptp_writeback(void)
 {
-	return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+	return vmx_capability.ept & VMX_EPTP_WB_BIT;
 }
 
 static inline bool cpu_has_vmx_ept_2m_page(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
 }
 
 static inline bool cpu_has_vmx_ept_1g_page(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 }
 
-static inline int cpu_has_vmx_invept_individual_addr(void)
+static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
 }
 
-static inline int cpu_has_vmx_invept_context(void)
+static inline bool cpu_has_vmx_invept_context(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
 }
 
-static inline int cpu_has_vmx_invept_global(void)
+static inline bool cpu_has_vmx_invept_global(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 }
 
-static inline int cpu_has_vmx_ept(void)
+static inline bool cpu_has_vmx_ept(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_EPT;
 }
 
-static inline int cpu_has_vmx_unrestricted_guest(void)
+static inline bool cpu_has_vmx_unrestricted_guest(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_UNRESTRICTED_GUEST;
 }
 
-static inline int cpu_has_vmx_ple(void)
+static inline bool cpu_has_vmx_ple(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return flexpriority_enabled && irqchip_in_kernel(kvm);
 }
 
-static inline int cpu_has_vmx_vpid(void)
+static inline bool cpu_has_vmx_vpid(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_VPID;
 }
 
-static inline int cpu_has_vmx_rdtscp(void)
+static inline bool cpu_has_vmx_rdtscp(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_RDTSCP;
 }
 
-static inline int cpu_has_virtual_nmis(void)
+static inline bool cpu_has_virtual_nmis(void)
 {
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
@@ -595,16 +603,56 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+	unsigned i;
+	struct msr_autoload *m = &vmx->msr_autoload;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			break;
+
+	if (i == m->nr)
+		return;
+	--m->nr;
+	m->guest[i] = m->guest[m->nr];
+	m->host[i] = m->host[m->nr];
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+				  u64 guest_val, u64 host_val)
+{
+	unsigned i;
+	struct msr_autoload *m = &vmx->msr_autoload;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			break;
+
+	if (i == m->nr) {
+		++m->nr;
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+	}
+
+	m->guest[i].index = msr;
+	m->guest[i].value = guest_val;
+	m->host[i].index = msr;
+	m->host[i].value = host_val;
+}
+
 static void reload_tss(void)
 {
 	/*
 	 * VT restores TR but not its size.  Useless.
 	 */
-	struct descriptor_table gdt;
+	struct desc_ptr gdt;
 	struct desc_struct *descs;
 
-	kvm_get_gdt(&gdt);
-	descs = (void *)gdt.base;
+	native_store_gdt(&gdt);
+	descs = (void *)gdt.address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
 }
@@ -631,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 	guest_efer |= host_efer & ignore_bits;
 	vmx->guest_msrs[efer_offset].data = guest_efer;
 	vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+	clear_atomic_switch_msr(vmx, MSR_EFER);
+	/* On ept, can't emulate nx, and must switch nx atomically */
+	if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+		guest_efer = vmx->vcpu.arch.efer;
+		if (!(guest_efer & EFER_LMA))
+			guest_efer &= ~EFER_LME;
+		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+		return false;
+	}
+
 	return true;
 }
 
+static unsigned long segment_base(u16 selector)
+{
+	struct desc_ptr gdt;
+	struct desc_struct *d;
+	unsigned long table_base;
+	unsigned long v;
+
+	if (!(selector & ~3))
+		return 0;
+
+	native_store_gdt(&gdt);
+	table_base = gdt.address;
+
+	if (selector & 4) {           /* from ldt */
+		u16 ldt_selector = kvm_read_ldt();
+
+		if (!(ldt_selector & ~3))
+			return 0;
+
+		table_base = segment_base(ldt_selector);
+	}
+	d = (struct desc_struct *)(table_base + (selector & ~7));
+	v = get_desc_base(d);
+#ifdef CONFIG_X86_64
+       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+#endif
+	return v;
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+	u16 tr;
+	asm("str %0" : "=g"(tr));
+	return segment_base(tr);
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -758,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (vcpu->cpu != cpu) {
-		struct descriptor_table dt;
+		struct desc_ptr dt;
 		unsigned long sysenter_esp;
 
 		vcpu->cpu = cpu;
@@ -767,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * processors.
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
-		kvm_get_gdt(&dt);
-		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+		native_store_gdt(&dt);
+		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -846,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	int ret = 0;
 
 	if (interruptibility & GUEST_INTR_STATE_STI)
-		ret |= X86_SHADOW_INT_STI;
+		ret |= KVM_X86_SHADOW_INT_STI;
 	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
-		ret |= X86_SHADOW_INT_MOV_SS;
+		ret |= KVM_X86_SHADOW_INT_MOV_SS;
 
 	return ret & mask;
 }
@@ -860,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 
 	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
 
-	if (mask & X86_SHADOW_INT_MOV_SS)
+	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
 		interruptibility |= GUEST_INTR_STATE_MOV_SS;
-	if (mask & X86_SHADOW_INT_STI)
+	else if (mask & KVM_X86_SHADOW_INT_STI)
 		interruptibility |= GUEST_INTR_STATE_STI;
 
 	if ((interruptibility != interruptibility_old))
@@ -882,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
+				bool has_error_code, u32 error_code,
+				bool reinject)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
@@ -1176,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED))
-	    == FEATURE_CONTROL_LOCKED;
+	if (msr & FEATURE_CONTROL_LOCKED) {
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+			&& tboot_enabled())
+			return 1;
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+			&& !tboot_enabled())
+			return 1;
+	}
+
+	return 0;
 	/* locked but not enabled */
 }
 
@@ -1186,21 +1290,23 @@ static int hardware_enable(void *garbage)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	u64 old;
+	u64 old, test_bits;
 
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (FEATURE_CONTROL_LOCKED |
-		    FEATURE_CONTROL_VMXON_ENABLED))
-	    != (FEATURE_CONTROL_LOCKED |
-		FEATURE_CONTROL_VMXON_ENABLED))
+
+	test_bits = FEATURE_CONTROL_LOCKED;
+	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+	if (tboot_enabled())
+		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+	if ((old & test_bits) != test_bits) {
 		/* enable and lock */
-		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED);
+		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
@@ -1521,7 +1627,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 		struct kvm_memslots *slots;
 		gfn_t base_gfn;
 
-		slots = rcu_dereference(kvm->memslots);
+		slots = kvm_memslots(kvm);
 		base_gfn = kvm->memslots->memslots[0].base_gfn +
 				 kvm->memslots->memslots[0].npages - 3;
 		return base_gfn << PAGE_SHIFT;
@@ -1649,6 +1755,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 	vmcs_write32(VM_ENTRY_CONTROLS,
 		     vmcs_read32(VM_ENTRY_CONTROLS)
 		     & ~VM_ENTRY_IA32E_MODE);
+	vmx_set_efer(vcpu, vcpu->arch.efer);
 }
 
 #endif
@@ -1934,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 	*l = (ar >> 13) & 1;
 }
 
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_IDTR_BASE);
+	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+	dt->address = vmcs_readl(GUEST_IDTR_BASE);
 }
 
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_IDTR_BASE, dt->base);
+	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+	vmcs_writel(GUEST_IDTR_BASE, dt->address);
 }
 
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_GDTR_BASE);
+	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+	dt->address = vmcs_readl(GUEST_GDTR_BASE);
 }
 
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_GDTR_BASE, dt->base);
+	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+	vmcs_writel(GUEST_GDTR_BASE, dt->address);
 }
 
 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2296,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
+static void free_vpid(struct vcpu_vmx *vmx)
+{
+	if (!enable_vpid)
+		return;
+	spin_lock(&vmx_vpid_lock);
+	if (vmx->vpid != 0)
+		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
+	spin_unlock(&vmx_vpid_lock);
+}
+
 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
 	int f = sizeof(unsigned long);
@@ -2334,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	u32 junk;
 	u64 host_pat, tsc_this, tsc_base;
 	unsigned long a;
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 	int i;
 	unsigned long kvm_vmx_return;
 	u32 exec_control;
@@ -2415,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-	kvm_get_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
+	native_store_idt(&dt);
+	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
 	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
 	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
 	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
@@ -2947,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	int size, in, string;
 	unsigned port;
 
-	++vcpu->stat.io_exits;
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	string = (exit_qualification & 16) != 0;
+	in = (exit_qualification & 8) != 0;
 
-	if (string) {
-		if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
+	++vcpu->stat.io_exits;
 
-	size = (exit_qualification & 7) + 1;
-	in = (exit_qualification & 8) != 0;
-	port = exit_qualification >> 16;
+	if (string || in)
+		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
 
+	port = exit_qualification >> 16;
+	size = (exit_qualification & 7) + 1;
 	skip_emulated_instruction(vcpu);
-	return kvm_emulate_pio(vcpu, in, size, port);
+
+	return kvm_fast_pio_out(vcpu, size, port);
 }
 
 static void
@@ -3053,19 +3170,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int check_dr_alias(struct kvm_vcpu *vcpu)
-{
-	if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return -1;
-	}
-	return 0;
-}
-
 static int handle_dr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
-	unsigned long val;
 	int dr, reg;
 
 	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3100,67 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 	if (exit_qualification & TYPE_MOV_FROM_DR) {
-		switch (dr) {
-		case 0 ... 3:
-			val = vcpu->arch.db[dr];
-			break;
-		case 4:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		case 6:
-			val = vcpu->arch.dr6;
-			break;
-		case 5:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		default: /* 7 */
-			val = vcpu->arch.dr7;
-			break;
-		}
-		kvm_register_write(vcpu, reg, val);
-	} else {
-		val = vcpu->arch.regs[reg];
-		switch (dr) {
-		case 0 ... 3:
-			vcpu->arch.db[dr] = val;
-			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-				vcpu->arch.eff_db[dr] = val;
-			break;
-		case 4:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		case 6:
-			if (val & 0xffffffff00000000ULL) {
-				kvm_inject_gp(vcpu, 0);
-				return 1;
-			}
-			vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
-			break;
-		case 5:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		default: /* 7 */
-			if (val & 0xffffffff00000000ULL) {
-				kvm_inject_gp(vcpu, 0);
-				return 1;
-			}
-			vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-				vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-				vcpu->arch.switch_db_regs =
-					(val & DR7_BP_EN_MASK);
-			}
-			break;
-		}
-	}
+		unsigned long val;
+		if (!kvm_get_dr(vcpu, dr, &val))
+			kvm_register_write(vcpu, reg, val);
+	} else
+		kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
 
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	vmcs_writel(GUEST_DR7, val);
+}
+
 static int handle_cpuid(struct kvm_vcpu *vcpu)
 {
 	kvm_emulate_cpuid(vcpu);
@@ -3292,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
+	bool has_error_code = false;
+	u32 error_code = 0;
 	u16 tss_selector;
 	int reason, type, idt_v;
 
@@ -3314,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 			kvm_clear_interrupt_queue(vcpu);
 			break;
 		case INTR_TYPE_HARD_EXCEPTION:
+			if (vmx->idt_vectoring_info &
+			    VECTORING_INFO_DELIVER_CODE_MASK) {
+				has_error_code = true;
+				error_code =
+					vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			}
+			/* fall through */
 		case INTR_TYPE_SOFT_EXCEPTION:
 			kvm_clear_exception_queue(vcpu);
 			break;
@@ -3328,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 
-	if (!kvm_task_switch(vcpu, tss_selector, reason))
+	if (kvm_task_switch(vcpu, tss_selector, reason,
+				has_error_code, error_code) == EMULATE_FAIL) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
 		return 0;
+	}
 
 	/* clear all local breakpoint enable flags */
 	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
@@ -3574,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+	trace_kvm_exit(exit_reason, vcpu);
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
@@ -3923,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	spin_lock(&vmx_vpid_lock);
-	if (vmx->vpid != 0)
-		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
-	spin_unlock(&vmx_vpid_lock);
+	free_vpid(vmx);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
@@ -3988,6 +4059,7 @@ free_msrs:
 uninit_vcpu:
 	kvm_vcpu_uninit(&vmx->vcpu);
 free_vcpu:
+	free_vpid(vmx);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 	return ERR_PTR(err);
 }
@@ -4118,6 +4190,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -4154,6 +4230,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
+	.set_dr7 = vmx_set_dr7,
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
@@ -4189,6 +4266,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.cpuid_update = vmx_cpuid_update,
 
 	.rdtscp_supported = vmx_rdtscp_supported,
+
+	.set_supported_cpuid = vmx_set_supported_cpuid,
 };
 
 static int __init vmx_init(void)
@@ -4236,7 +4315,8 @@ static int __init vmx_init(void)
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		goto out3;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd9bc8fb81a..05d571f6f19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,7 +42,7 @@
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
+
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
@@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore)
 		kvm_on_user_return(&smsr->urn);
 }
 
-unsigned long segment_base(u16 selector)
-{
-	struct descriptor_table gdt;
-	struct desc_struct *d;
-	unsigned long table_base;
-	unsigned long v;
-
-	if (selector == 0)
-		return 0;
-
-	kvm_get_gdt(&gdt);
-	table_base = gdt.base;
-
-	if (selector & 4) {           /* from ldt */
-		u16 ldt_selector = kvm_read_ldt();
-
-		table_base = segment_base(ldt_selector);
-	}
-	d = (struct desc_struct *)(table_base + (selector & ~7));
-	v = get_desc_base(d);
-#ifdef CONFIG_X86_64
-	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
-	return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
 	if (irqchip_in_kernel(vcpu->kvm))
@@ -293,7 +265,8 @@ static int exception_class(int vector)
 }
 
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code)
+		unsigned nr, bool has_error, u32 error_code,
+		bool reinject)
 {
 	u32 prev_nr;
 	int class1, class2;
@@ -304,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
+		vcpu->arch.exception.reinject = reinject;
 		return;
 	}
 
@@ -332,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-	kvm_multiple_exception(vcpu, nr, false, 0);
+	kvm_multiple_exception(vcpu, nr, false, 0, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+	kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 			   u32 error_code)
 {
@@ -352,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code);
+	kvm_multiple_exception(vcpu, nr, true, error_code, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+	kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
 /*
  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
  * a #GP and return false.
@@ -476,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
-	vcpu->arch.cr0 = cr0;
 
 	kvm_mmu_reset_context(vcpu);
 	return;
@@ -485,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
+	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
@@ -517,7 +502,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
-	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -592,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+	switch (dr) {
+	case 0 ... 3:
+		vcpu->arch.db[dr] = val;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+			vcpu->arch.eff_db[dr] = val;
+		break;
+	case 4:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	case 6:
+		if (val & 0xffffffff00000000ULL) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+		break;
+	case 5:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	default: /* 7 */
+		if (val & 0xffffffff00000000ULL) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+		}
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+	switch (dr) {
+	case 0 ... 3:
+		*val = vcpu->arch.db[dr];
+		break;
+	case 4:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	case 6:
+		*val = vcpu->arch.dr6;
+		break;
+	case 5:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	default: /* 7 */
+		*val = vcpu->arch.dr7;
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
 static inline u32 bit(int bitno)
 {
 	return 1 << (bitno & 31);
@@ -606,9 +664,10 @@ static inline u32 bit(int bitno)
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	5
+#define KVM_SAVE_MSRS_BEGIN	7
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_APIC_ASSIST_PAGE,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -625,48 +684,42 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 };
 
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	if (efer & efer_reserved_bits) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (efer & efer_reserved_bits)
+		return 1;
 
 	if (is_paging(vcpu)
-	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+		return 1;
 
 	if (efer & EFER_FFXSR) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+			return 1;
 	}
 
 	if (efer & EFER_SVME) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+			return 1;
 	}
 
-	kvm_x86_ops->set_efer(vcpu, efer);
-
 	efer &= ~EFER_LMA;
 	efer |= vcpu->arch.efer & EFER_LMA;
 
+	kvm_x86_ops->set_efer(vcpu, efer);
+
 	vcpu->arch.efer = efer;
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
+
+	return 0;
 }
 
 void kvm_enable_efer_bits(u64 mask)
@@ -696,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
-	static int version;
+	int version;
+	int r;
 	struct pvclock_wall_clock wc;
 	struct timespec boot;
 
 	if (!wall_clock)
 		return;
 
-	version++;
+	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+	if (r)
+		return;
+
+	if (version & 1)
+		++version;  /* first time write, random junk */
+
+	++version;
 
 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
@@ -796,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	vcpu->hv_clock.system_time = ts.tv_nsec +
 				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 
+	vcpu->hv_clock.flags = 0;
+
 	/*
 	 * The interface expects us to write an even number signaling that the
 	 * update is finished. Since the guest won't see the intermediate
@@ -1087,10 +1150,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
 	case MSR_EFER:
-		set_efer(vcpu, data);
-		break;
+		return set_efer(vcpu, data);
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
+		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
@@ -1133,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
 		kvm_write_wall_clock(vcpu->kvm, data);
 		break;
+	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
 		if (vcpu->arch.time_page) {
 			kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1408,9 +1473,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = vcpu->arch.efer;
 		break;
 	case MSR_KVM_WALL_CLOCK:
+	case MSR_KVM_WALL_CLOCK_NEW:
 		data = vcpu->kvm->arch.wall_clock;
 		break;
 	case MSR_KVM_SYSTEM_TIME:
+	case MSR_KVM_SYSTEM_TIME_NEW:
 		data = vcpu->arch.time;
 		break;
 	case MSR_IA32_P5_MC_ADDR:
@@ -1549,6 +1616,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_HYPERV_VAPIC:
 	case KVM_CAP_HYPERV_SPIN:
 	case KVM_CAP_PCI_SEGMENT:
+	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 		r = 1;
 		break;
@@ -1769,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
 	int r;
 
+	vcpu_load(vcpu);
 	r = -E2BIG;
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		goto out;
@@ -1780,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 out:
 	cpuid->nent = vcpu->arch.cpuid_nent;
+	vcpu_put(vcpu);
 	return r;
 }
 
@@ -1910,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case KVM_CPUID_SIGNATURE: {
+		char signature[12] = "KVMKVMKVM\0\0";
+		u32 *sigptr = (u32 *)signature;
+		entry->eax = 0;
+		entry->ebx = sigptr[0];
+		entry->ecx = sigptr[1];
+		entry->edx = sigptr[2];
+		break;
+	}
+	case KVM_CPUID_FEATURES:
+		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
 	case 0x80000000:
 		entry->eax = min(entry->eax, 0x8000001a);
 		break;
@@ -1918,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ecx &= kvm_supported_word6_x86_features;
 		break;
 	}
+
+	kvm_x86_ops->set_supported_cpuid(function, entry);
+
 	put_cpu();
 }
 
@@ -1953,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
 			     &nent, cpuid->nent);
+
+
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+		     cpuid->nent);
+
 	r = -E2BIG;
 	if (nent >= cpuid->nent)
 		goto out_free;
@@ -2032,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	int r;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 
+	vcpu_load(vcpu);
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
@@ -2046,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
+	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2105,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 {
 	vcpu_load(vcpu);
 
-	events->exception.injected = vcpu->arch.exception.pending;
+	events->exception.injected =
+		vcpu->arch.exception.pending &&
+		!kvm_exception_is_soft(vcpu->arch.exception.nr);
 	events->exception.nr = vcpu->arch.exception.nr;
 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
 	events->exception.error_code = vcpu->arch.exception.error_code;
 
-	events->interrupt.injected = vcpu->arch.interrupt.pending;
+	events->interrupt.injected =
+		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
 	events->interrupt.nr = vcpu->arch.interrupt.nr;
-	events->interrupt.soft = vcpu->arch.interrupt.soft;
+	events->interrupt.soft = 0;
+	events->interrupt.shadow =
+		kvm_x86_ops->get_interrupt_shadow(vcpu,
+			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
 	events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2121,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->sipi_vector = vcpu->arch.sipi_vector;
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+			 | KVM_VCPUEVENT_VALID_SHADOW);
 
 	vcpu_put(vcpu);
 }
@@ -2130,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 					      struct kvm_vcpu_events *events)
 {
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
-			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
 	vcpu_load(vcpu);
@@ -2145,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	vcpu->arch.interrupt.soft = events->interrupt.soft;
 	if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
 		kvm_pic_clear_isr_ack(vcpu->kvm);
+	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+		kvm_x86_ops->set_interrupt_shadow(vcpu,
+						  events->interrupt.shadow);
 
 	vcpu->arch.nmi_injected = events->nmi.injected;
 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
@@ -2159,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+					     struct kvm_debugregs *dbgregs)
+{
+	vcpu_load(vcpu);
+
+	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+	dbgregs->dr6 = vcpu->arch.dr6;
+	dbgregs->dr7 = vcpu->arch.dr7;
+	dbgregs->flags = 0;
+
+	vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+					    struct kvm_debugregs *dbgregs)
+{
+	if (dbgregs->flags)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+
+	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+	vcpu->arch.dr6 = dbgregs->dr6;
+	vcpu->arch.dr7 = dbgregs->dr7;
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2313,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mce, argp, sizeof mce))
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_VCPU_EVENTS: {
@@ -2337,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 		break;
 	}
+	case KVM_GET_DEBUGREGS: {
+		struct kvm_debugregs dbgregs;
+
+		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &dbgregs,
+				 sizeof(struct kvm_debugregs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_DEBUGREGS: {
+		struct kvm_debugregs dbgregs;
+
+		r = -EFAULT;
+		if (copy_from_user(&dbgregs, argp,
+				   sizeof(struct kvm_debugregs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -2390,7 +2566,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mem_alias *alias;
 	struct kvm_mem_aliases *aliases;
 
-	aliases = rcu_dereference(kvm->arch.aliases);
+	aliases = kvm_aliases(kvm);
 
 	for (i = 0; i < aliases->naliases; ++i) {
 		alias = &aliases->aliases[i];
@@ -2409,7 +2585,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mem_alias *alias;
 	struct kvm_mem_aliases *aliases;
 
-	aliases = rcu_dereference(kvm->arch.aliases);
+	aliases = kvm_aliases(kvm);
 
 	for (i = 0; i < aliases->naliases; ++i) {
 		alias = &aliases->aliases[i];
@@ -2804,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
 			goto out;
+		r = -ENXIO;
 		if (irqchip_in_kernel(kvm)) {
 			__s32 status;
 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 					irq_event.irq, irq_event.level);
 			if (ioctl == KVM_IRQ_LINE_STATUS) {
+				r = -EFAULT;
 				irq_event.status = status;
 				if (copy_to_user(argp, &irq_event,
 							sizeof irq_event))
@@ -3024,6 +3202,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
 }
 
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+			struct kvm_segment *var, int seg)
+{
+	kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+		     struct kvm_segment *var, int seg)
+{
+	kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3104,14 +3294,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
 }
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu, u32 *error)
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+				       unsigned int bytes,
+				       struct kvm_vcpu *vcpu,
+				       u32 *error)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
 	while (bytes) {
-		gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
+		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
+						       PFERR_WRITE_MASK, error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3134,7 +3327,6 @@ out:
 	return r;
 }
 
-
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
@@ -3237,9 +3429,9 @@ mmio:
 }
 
 int emulator_write_emulated(unsigned long addr,
-				   const void *val,
-				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu)
+			    const void *val,
+			    unsigned int bytes,
+			    struct kvm_vcpu *vcpu)
 {
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
@@ -3257,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+#  define CMPXCHG64(ptr, old, new) \
+	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
+#endif
+
 static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *new,
 				     unsigned int bytes,
 				     struct kvm_vcpu *vcpu)
 {
-	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
-	/* guests cmpxchg8b have to be emulated atomically */
-	if (bytes == 8) {
-		gpa_t gpa;
-		struct page *page;
-		char *kaddr;
-		u64 val;
+	gpa_t gpa;
+	struct page *page;
+	char *kaddr;
+	bool exchanged;
 
-		gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+	/* guests cmpxchg8b have to be emulated atomically */
+	if (bytes > 8 || (bytes & (bytes - 1)))
+		goto emul_write;
 
-		if (gpa == UNMAPPED_GVA ||
-		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-			goto emul_write;
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
 
-		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
-			goto emul_write;
+	if (gpa == UNMAPPED_GVA ||
+	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		goto emul_write;
 
-		val = *(u64 *)new;
+	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+		goto emul_write;
 
-		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 
-		kaddr = kmap_atomic(page, KM_USER0);
-		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
-		kunmap_atomic(kaddr, KM_USER0);
-		kvm_release_page_dirty(page);
+	kaddr = kmap_atomic(page, KM_USER0);
+	kaddr += offset_in_page(gpa);
+	switch (bytes) {
+	case 1:
+		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+		break;
+	case 2:
+		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+		break;
+	case 4:
+		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+		break;
+	case 8:
+		exchanged = CMPXCHG64(kaddr, old, new);
+		break;
+	default:
+		BUG();
 	}
+	kunmap_atomic(kaddr, KM_USER0);
+	kvm_release_page_dirty(page);
+
+	if (!exchanged)
+		return X86EMUL_CMPXCHG_FAILED;
+
+	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+	return X86EMUL_CONTINUE;
+
 emul_write:
-#endif
+	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
 	return emulator_write_emulated(addr, new, bytes, vcpu);
 }
 
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+	/* TODO: String I/O for in kernel device */
+	int r;
+
+	if (vcpu->arch.pio.in)
+		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+				    vcpu->arch.pio.size, pd);
+	else
+		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+				     vcpu->arch.pio.port, vcpu->arch.pio.size,
+				     pd);
+	return r;
+}
+
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+			     unsigned int count, struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.pio.count)
+		goto data_avail;
+
+	trace_kvm_pio(1, port, size, 1);
+
+	vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = 1;
+	vcpu->arch.pio.count  = count;
+	vcpu->arch.pio.size = size;
+
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+	data_avail:
+		memcpy(val, vcpu->arch.pio_data, size * count);
+		vcpu->arch.pio.count = 0;
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = KVM_EXIT_IO_IN;
+	vcpu->run->io.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = count;
+	vcpu->run->io.port = port;
+
+	return 0;
+}
+
+static int emulator_pio_out_emulated(int size, unsigned short port,
+			      const void *val, unsigned int count,
+			      struct kvm_vcpu *vcpu)
+{
+	trace_kvm_pio(0, port, size, 1);
+
+	vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = 0;
+	vcpu->arch.pio.count = count;
+	vcpu->arch.pio.size = size;
+
+	memcpy(vcpu->arch.pio_data, val, size * count);
+
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+		vcpu->arch.pio.count = 0;
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+	vcpu->run->io.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = count;
+	vcpu->run->io.port = port;
+
+	return 0;
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3316,14 +3613,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
 
 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
 {
-	return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
+	return kvm_get_dr(ctxt->vcpu, dr, dest);
 }
 
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 {
 	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
 
-	return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
+	return kvm_set_dr(ctxt->vcpu, dr, value & mask);
 }
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3344,12 +3641,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+	unsigned long value;
+
+	switch (cr) {
+	case 0:
+		value = kvm_read_cr0(vcpu);
+		break;
+	case 2:
+		value = vcpu->arch.cr2;
+		break;
+	case 3:
+		value = vcpu->arch.cr3;
+		break;
+	case 4:
+		value = kvm_read_cr4(vcpu);
+		break;
+	case 8:
+		value = kvm_get_cr8(vcpu);
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		return 0;
+	}
+
+	return value;
+}
+
+static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+	switch (cr) {
+	case 0:
+		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		break;
+	case 2:
+		vcpu->arch.cr2 = val;
+		break;
+	case 3:
+		kvm_set_cr3(vcpu, val);
+		break;
+	case 4:
+		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		break;
+	case 8:
+		kvm_set_cr8(vcpu, val & 0xfUL);
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+	}
+}
+
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->get_cpl(vcpu);
+}
+
+static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment var;
+
+	kvm_get_segment(vcpu, &var, seg);
+
+	if (var.unusable)
+		return false;
+
+	if (var.g)
+		var.limit >>= 12;
+	set_desc_limit(desc, var.limit);
+	set_desc_base(desc, (unsigned long)var.base);
+	desc->type = var.type;
+	desc->s = var.s;
+	desc->dpl = var.dpl;
+	desc->p = var.present;
+	desc->avl = var.avl;
+	desc->l = var.l;
+	desc->d = var.db;
+	desc->g = var.g;
+
+	return true;
+}
+
+static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment var;
+
+	/* needed to preserve selector */
+	kvm_get_segment(vcpu, &var, seg);
+
+	var.base = get_desc_base(desc);
+	var.limit = get_desc_limit(desc);
+	if (desc->g)
+		var.limit = (var.limit << 12) | 0xfff;
+	var.type = desc->type;
+	var.present = desc->p;
+	var.dpl = desc->dpl;
+	var.db = desc->d;
+	var.s = desc->s;
+	var.l = desc->l;
+	var.g = desc->g;
+	var.avl = desc->avl;
+	var.present = desc->p;
+	var.unusable = !var.present;
+	var.padding = 0;
+
+	kvm_set_segment(vcpu, &var, seg);
+	return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	kvm_get_segment(vcpu, &kvm_seg, seg);
+	return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+					  struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	kvm_get_segment(vcpu, &kvm_seg, seg);
+	kvm_seg.selector = sel;
+	kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
+static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
+	.write_std           = kvm_write_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.pio_in_emulated     = emulator_pio_in_emulated,
+	.pio_out_emulated    = emulator_pio_out_emulated,
+	.get_cached_descriptor = emulator_get_cached_descriptor,
+	.set_cached_descriptor = emulator_set_cached_descriptor,
+	.get_segment_selector = emulator_get_segment_selector,
+	.set_segment_selector = emulator_set_segment_selector,
+	.get_gdt             = emulator_get_gdt,
+	.get_cr              = emulator_get_cr,
+	.set_cr              = emulator_set_cr,
+	.cpl                 = emulator_get_cpl,
+	.set_rflags          = emulator_set_rflags,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3380,14 +3832,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	cache_all_regs(vcpu);
 
 	vcpu->mmio_is_write = 0;
-	vcpu->arch.pio.string = 0;
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		int cs_db, cs_l;
 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
 		vcpu->arch.emulate_ctxt.mode =
 			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3396,6 +3848,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
 		 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3428,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		++vcpu->stat.insn_emulation;
 		if (r)  {
 			++vcpu->stat.insn_emulation_fail;
+			trace_kvm_emulate_insn_failed(vcpu);
 			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 				return EMULATE_DONE;
 			return EMULATE_FAIL;
@@ -3439,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 	}
 
+restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 
 	if (r == 0)
 		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 
-	if (vcpu->arch.pio.string)
+	if (vcpu->arch.pio.count) {
+		if (!vcpu->arch.pio.in)
+			vcpu->arch.pio.count = 0;
 		return EMULATE_DO_MMIO;
+	}
 
-	if ((r || vcpu->mmio_is_write) && run) {
+	if (r || vcpu->mmio_is_write) {
 		run->exit_reason = KVM_EXIT_MMIO;
 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
 		memcpy(run->mmio.data, vcpu->mmio_data, 8);
@@ -3458,222 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 	if (r) {
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-			return EMULATE_DONE;
+			goto done;
 		if (!vcpu->mmio_needed) {
+			++vcpu->stat.insn_emulation_fail;
+			trace_kvm_emulate_insn_failed(vcpu);
 			kvm_report_emulation_failure(vcpu, "mmio");
 			return EMULATE_FAIL;
 		}
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
 	if (vcpu->mmio_is_write) {
 		vcpu->mmio_needed = 0;
 		return EMULATE_DO_MMIO;
 	}
 
-	return EMULATE_DONE;
-}
-EXPORT_SYMBOL_GPL(emulate_instruction);
-
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
-	void *p = vcpu->arch.pio_data;
-	gva_t q = vcpu->arch.pio.guest_gva;
-	unsigned bytes;
-	int ret;
-	u32 error_code;
-
-	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
-	if (vcpu->arch.pio.in)
-		ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
-	else
-		ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
-
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(vcpu, q, error_code);
-
-	return ret;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->arch.pio;
-	long delta;
-	int r;
-	unsigned long val;
-
-	if (!io->string) {
-		if (io->in) {
-			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-			memcpy(&val, vcpu->arch.pio_data, io->size);
-			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
-		}
-	} else {
-		if (io->in) {
-			r = pio_copy_data(vcpu);
-			if (r)
-				goto out;
-		}
-
-		delta = 1;
-		if (io->rep) {
-			delta *= io->cur_count;
-			/*
-			 * The size of the register should really depend on
-			 * current address size.
-			 */
-			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
-			val -= delta;
-			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
-		}
-		if (io->down)
-			delta = -delta;
-		delta *= io->size;
-		if (io->in) {
-			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
-			val += delta;
-			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
-		} else {
-			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
-			val += delta;
-			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
-		}
-	}
-out:
-	io->count -= io->cur_count;
-	io->cur_count = 0;
-
-	return 0;
-}
-
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
-	/* TODO: String I/O for in kernel device */
-	int r;
-
-	if (vcpu->arch.pio.in)
-		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
-				    vcpu->arch.pio.size, pd);
-	else
-		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-				     vcpu->arch.pio.port, vcpu->arch.pio.size,
-				     pd);
-	return r;
-}
+done:
+	if (vcpu->arch.exception.pending)
+		vcpu->arch.emulate_ctxt.restart = false;
 
-static int pio_string_write(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->arch.pio;
-	void *pd = vcpu->arch.pio_data;
-	int i, r = 0;
+	if (vcpu->arch.emulate_ctxt.restart)
+		goto restart;
 
-	for (i = 0; i < io->cur_count; i++) {
-		if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-				     io->port, io->size, pd)) {
-			r = -EOPNOTSUPP;
-			break;
-		}
-		pd += io->size;
-	}
-	return r;
-}
-
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
-{
-	unsigned long val;
-
-	trace_kvm_pio(!in, port, size, 1);
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->arch.pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
-	vcpu->run->io.port = vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = in;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-
-	if (!vcpu->arch.pio.in) {
-		val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-		memcpy(vcpu->arch.pio_data, &val, 4);
-	}
-
-	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-		complete_pio(vcpu);
-		return 1;
-	}
-	return 0;
+	return EMULATE_DONE;
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
+EXPORT_SYMBOL_GPL(emulate_instruction);
 
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-		  int size, unsigned long count, int down,
-		  gva_t address, int rep, unsigned port)
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 {
-	unsigned now, in_page;
-	int ret = 0;
-
-	trace_kvm_pio(!in, port, size, count);
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->arch.pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
-	vcpu->run->io.port = vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = in;
-	vcpu->arch.pio.string = 1;
-	vcpu->arch.pio.down = down;
-	vcpu->arch.pio.rep = rep;
-
-	if (!count) {
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-		return 1;
-	}
-
-	if (!down)
-		in_page = PAGE_SIZE - offset_in_page(address);
-	else
-		in_page = offset_in_page(address) + size;
-	now = min(count, (unsigned long)in_page / size);
-	if (!now)
-		now = 1;
-	if (down) {
-		/*
-		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
-		 */
-		pr_unimpl(vcpu, "guest string pio down\n");
-		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-	vcpu->run->io.count = now;
-	vcpu->arch.pio.cur_count = now;
-
-	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-	vcpu->arch.pio.guest_gva = address;
-
-	if (!vcpu->arch.pio.in) {
-		/* string PIO write */
-		ret = pio_copy_data(vcpu);
-		if (ret == X86EMUL_PROPAGATE_FAULT)
-			return 1;
-		if (ret == 0 && !pio_string_write(vcpu)) {
-			complete_pio(vcpu);
-			if (vcpu->arch.pio.count == 0)
-				ret = 1;
-		}
-	}
-	/* no string PIO read support yet */
-
+	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+	/* do not return to emulator after return from userspace */
+	vcpu->arch.pio.count = 0;
 	return ret;
 }
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
 static void bounce_off(void *info)
 {
@@ -3996,85 +4273,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	return emulator_write_emulated(rip, instruction, 3, vcpu);
 }
 
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
-	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-	struct descriptor_table dt = { limit, base };
+	struct desc_ptr dt = { limit, base };
 
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 }
 
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-	struct descriptor_table dt = { limit, base };
+	struct desc_ptr dt = { limit, base };
 
 	kvm_x86_ops->set_idt(vcpu, &dt);
 }
 
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags)
-{
-	kvm_lmsw(vcpu, msw);
-	*rflags = kvm_get_rflags(vcpu);
-}
-
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-	unsigned long value;
-
-	switch (cr) {
-	case 0:
-		value = kvm_read_cr0(vcpu);
-		break;
-	case 2:
-		value = vcpu->arch.cr2;
-		break;
-	case 3:
-		value = vcpu->arch.cr3;
-		break;
-	case 4:
-		value = kvm_read_cr4(vcpu);
-		break;
-	case 8:
-		value = kvm_get_cr8(vcpu);
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-		return 0;
-	}
-
-	return value;
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-		     unsigned long *rflags)
-{
-	switch (cr) {
-	case 0:
-		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
-		*rflags = kvm_get_rflags(vcpu);
-		break;
-	case 2:
-		vcpu->arch.cr2 = val;
-		break;
-	case 3:
-		kvm_set_cr3(vcpu, val);
-		break;
-	case 4:
-		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
-		break;
-	case 8:
-		kvm_set_cr8(vcpu, val & 0xfUL);
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-	}
-}
-
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
@@ -4138,9 +4350,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 
+	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+	if (!best || best->eax < 0x80000008)
+		goto not_found;
 	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
 	if (best)
 		return best->eax & 0xff;
+not_found:
 	return 36;
 }
 
@@ -4254,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 {
 	/* try to reinject previous events if any */
 	if (vcpu->arch.exception.pending) {
+		trace_kvm_inj_exception(vcpu->arch.exception.nr,
+					vcpu->arch.exception.has_error_code,
+					vcpu->arch.exception.error_code);
 		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 					  vcpu->arch.exception.has_error_code,
-					  vcpu->arch.exception.error_code);
+					  vcpu->arch.exception.error_code,
+					  vcpu->arch.exception.reinject);
 		return;
 	}
 
@@ -4486,7 +4706,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	}
 
 	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-	post_kvm_run_save(vcpu);
 
 	vapic_exit(vcpu);
 
@@ -4514,26 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.cur_count) {
-		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = complete_pio(vcpu);
-		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r)
-			goto out;
-	}
-	if (vcpu->mmio_needed) {
-		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-		vcpu->mmio_read_completed = 1;
-		vcpu->mmio_needed = 0;
-
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+	    vcpu->arch.emulate_ctxt.restart) {
+		if (vcpu->mmio_needed) {
+			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+			vcpu->mmio_read_completed = 1;
+			vcpu->mmio_needed = 0;
+		}
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
-					EMULTYPE_NO_DECODE);
+		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 		if (r == EMULATE_DO_MMIO) {
-			/*
-			 * Read-modify-write.  Back to userspace.
-			 */
 			r = 0;
 			goto out;
 		}
@@ -4545,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	r = __vcpu_run(vcpu);
 
 out:
+	post_kvm_run_save(vcpu);
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
@@ -4616,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-void kvm_get_segment(struct kvm_vcpu *vcpu,
-		     struct kvm_segment *var, int seg)
-{
-	kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
 	struct kvm_segment cs;
@@ -4635,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 
 	vcpu_load(vcpu);
 
@@ -4650,11 +4855,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
 	kvm_x86_ops->get_idt(vcpu, &dt);
-	sregs->idt.limit = dt.limit;
-	sregs->idt.base = dt.base;
+	sregs->idt.limit = dt.size;
+	sregs->idt.base = dt.address;
 	kvm_x86_ops->get_gdt(vcpu, &dt);
-	sregs->gdt.limit = dt.limit;
-	sregs->gdt.base = dt.base;
+	sregs->gdt.limit = dt.size;
+	sregs->gdt.base = dt.address;
 
 	sregs->cr0 = kvm_read_cr0(vcpu);
 	sregs->cr2 = vcpu->arch.cr2;
@@ -4693,563 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
-{
-	kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
-				   struct kvm_segment *kvm_desct)
-{
-	kvm_desct->base = get_desc_base(seg_desc);
-	kvm_desct->limit = get_desc_limit(seg_desc);
-	if (seg_desc->g) {
-		kvm_desct->limit <<= 12;
-		kvm_desct->limit |= 0xfff;
-	}
-	kvm_desct->selector = selector;
-	kvm_desct->type = seg_desc->type;
-	kvm_desct->present = seg_desc->p;
-	kvm_desct->dpl = seg_desc->dpl;
-	kvm_desct->db = seg_desc->d;
-	kvm_desct->s = seg_desc->s;
-	kvm_desct->l = seg_desc->l;
-	kvm_desct->g = seg_desc->g;
-	kvm_desct->avl = seg_desc->avl;
-	if (!selector)
-		kvm_desct->unusable = 1;
-	else
-		kvm_desct->unusable = 0;
-	kvm_desct->padding = 0;
-}
-
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
-					  u16 selector,
-					  struct descriptor_table *dtable)
-{
-	if (selector & 1 << 2) {
-		struct kvm_segment kvm_seg;
-
-		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
-		if (kvm_seg.unusable)
-			dtable->limit = 0;
-		else
-			dtable->limit = kvm_seg.limit;
-		dtable->base = kvm_seg.base;
-	}
-	else
-		kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-					 struct desc_struct *seg_desc)
-{
-	struct descriptor_table dtable;
-	u16 index = selector >> 3;
-	int ret;
-	u32 err;
-	gva_t addr;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.limit < index * 8 + 7) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-	addr = dtable.base + index * 8;
-	ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
-					 vcpu,  &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(vcpu, addr, err);
-
-       return ret;
-}
-
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-					 struct desc_struct *seg_desc)
-{
-	struct descriptor_table dtable;
-	u16 index = selector >> 3;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.limit < index * 8 + 7)
-		return 1;
-	return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
-}
-
-static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
-			       struct desc_struct *seg_desc)
-{
-	u32 base_addr = get_desc_base(seg_desc);
-
-	return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
-}
-
-static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
-			     struct desc_struct *seg_desc)
-{
-	u32 base_addr = get_desc_base(seg_desc);
-
-	return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
-}
-
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_segment kvm_seg;
-
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	return kvm_seg.selector;
-}
-
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-	struct kvm_segment segvar = {
-		.base = selector << 4,
-		.limit = 0xffff,
-		.selector = selector,
-		.type = 3,
-		.present = 1,
-		.dpl = 3,
-		.db = 0,
-		.s = 1,
-		.l = 0,
-		.g = 0,
-		.avl = 0,
-		.unusable = 0,
-	};
-	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-	return X86EMUL_CONTINUE;
-}
-
-static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code)
 {
-	return (seg != VCPU_SREG_LDTR) &&
-		(seg != VCPU_SREG_TR) &&
-		(kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-	struct kvm_segment kvm_seg;
-	struct desc_struct seg_desc;
-	u8 dpl, rpl, cpl;
-	unsigned err_vec = GP_VECTOR;
-	u32 err_code = 0;
-	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
-	int ret;
+	int cs_db, cs_l, ret;
+	cache_all_regs(vcpu);
 
-	if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
-		return kvm_load_realmode_segment(vcpu, selector, seg);
+	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
-	    && null_selector)
-		goto exception;
+	vcpu->arch.emulate_ctxt.vcpu = vcpu;
+	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+	vcpu->arch.emulate_ctxt.mode =
+		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+		? X86EMUL_MODE_VM86 : cs_l
+		? X86EMUL_MODE_PROT64 :	cs_db
+		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
-	/* TR should be in GDT only */
-	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
-		goto exception;
+	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+				   tss_selector, reason, has_error_code,
+				   error_code);
 
-	ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
 	if (ret)
-		return ret;
-
-	seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
-	if (null_selector) { /* for NULL selector skip all following checks */
-		kvm_seg.unusable = 1;
-		goto load;
-	}
-
-	err_code = selector & 0xfffc;
-	err_vec = GP_VECTOR;
-
-	/* can't load system descriptor into segment selecor */
-	if (seg <= VCPU_SREG_GS && !kvm_seg.s)
-		goto exception;
-
-	if (!kvm_seg.present) {
-		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
-		goto exception;
-	}
-
-	rpl = selector & 3;
-	dpl = kvm_seg.dpl;
-	cpl = kvm_x86_ops->get_cpl(vcpu);
-
-	switch (seg) {
-	case VCPU_SREG_SS:
-		/*
-		 * segment is not a writable data segment or segment
-		 * selector's RPL != CPL or segment selector's RPL != CPL
-		 */
-		if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
-			goto exception;
-		break;
-	case VCPU_SREG_CS:
-		if (!(kvm_seg.type & 8))
-			goto exception;
-
-		if (kvm_seg.type & 4) {
-			/* conforming */
-			if (dpl > cpl)
-				goto exception;
-		} else {
-			/* nonconforming */
-			if (rpl > cpl || dpl != cpl)
-				goto exception;
-		}
-		/* CS(RPL) <- CPL */
-		selector = (selector & 0xfffc) | cpl;
-            break;
-	case VCPU_SREG_TR:
-		if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
-			goto exception;
-		break;
-	case VCPU_SREG_LDTR:
-		if (kvm_seg.s || kvm_seg.type != 2)
-			goto exception;
-		break;
-	default: /*  DS, ES, FS, or GS */
-		/*
-		 * segment is not a data or readable code segment or
-		 * ((segment is a data or nonconforming code segment)
-		 * and (both RPL and CPL > DPL))
-		 */
-		if ((kvm_seg.type & 0xa) == 0x8 ||
-		    (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
-			goto exception;
-		break;
-	}
-
-	if (!kvm_seg.unusable && kvm_seg.s) {
-		/* mark segment as accessed */
-		kvm_seg.type |= 1;
-		seg_desc.type |= 1;
-		save_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	}
-load:
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-	return X86EMUL_CONTINUE;
-exception:
-	kvm_queue_exception_e(vcpu, err_vec, err_code);
-	return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
-				struct tss_segment_32 *tss)
-{
-	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = kvm_rip_read(vcpu);
-	tss->eflags = kvm_get_rflags(vcpu);
-	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
-	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
-	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
-{
-	struct kvm_segment kvm_seg;
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	kvm_seg.selector = sel;
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
-				  struct tss_segment_32 *tss)
-{
-	kvm_set_cr3(vcpu, tss->cr3);
-
-	kvm_rip_write(vcpu, tss->eip);
-	kvm_set_rflags(vcpu, tss->eflags | 2);
-
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
-	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
-	/*
-	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
-	 */
-	kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
-	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
-	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
-	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
-	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-	kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
-	kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
-
-	/*
-	 * Now load segment descriptors. If fault happenes at this stage
-	 * it is handled in a context of new task
-	 */
-	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
-		return 1;
+		return EMULATE_FAIL;
 
-	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
-		return 1;
-	return 0;
-}
-
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
-				struct tss_segment_16 *tss)
-{
-	tss->ip = kvm_rip_read(vcpu);
-	tss->flag = kvm_get_rflags(vcpu);
-	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
-				 struct tss_segment_16 *tss)
-{
-	kvm_rip_write(vcpu, tss->ip);
-	kvm_set_rflags(vcpu, tss->flag | 2);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
-	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
-	/*
-	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
-	 */
-	kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
-	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
-	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
-	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
-	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-
-	/*
-	 * Now load segment descriptors. If fault happenes at this stage
-	 * it is handled in a context of new task
-	 */
-	if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
-		return 1;
-	return 0;
-}
-
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-			      u16 old_tss_sel, u32 old_tss_base,
-			      struct desc_struct *nseg_desc)
-{
-	struct tss_segment_16 tss_segment_16;
-	int ret = 0;
-
-	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-			   sizeof tss_segment_16))
-		goto out;
-
-	save_state_to_tss16(vcpu, &tss_segment_16);
-
-	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-			    sizeof tss_segment_16))
-		goto out;
-
-	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
-			   &tss_segment_16, sizeof tss_segment_16))
-		goto out;
-
-	if (old_tss_sel != 0xffff) {
-		tss_segment_16.prev_task_link = old_tss_sel;
-
-		if (kvm_write_guest(vcpu->kvm,
-				    get_tss_base_addr_write(vcpu, nseg_desc),
-				    &tss_segment_16.prev_task_link,
-				    sizeof tss_segment_16.prev_task_link))
-			goto out;
-	}
-
-	if (load_state_from_tss16(vcpu, &tss_segment_16))
-		goto out;
-
-	ret = 1;
-out:
-	return ret;
-}
-
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u16 old_tss_sel, u32 old_tss_base,
-		       struct desc_struct *nseg_desc)
-{
-	struct tss_segment_32 tss_segment_32;
-	int ret = 0;
-
-	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-			   sizeof tss_segment_32))
-		goto out;
-
-	save_state_to_tss32(vcpu, &tss_segment_32);
-
-	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-			    sizeof tss_segment_32))
-		goto out;
-
-	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
-			   &tss_segment_32, sizeof tss_segment_32))
-		goto out;
-
-	if (old_tss_sel != 0xffff) {
-		tss_segment_32.prev_task_link = old_tss_sel;
-
-		if (kvm_write_guest(vcpu->kvm,
-				    get_tss_base_addr_write(vcpu, nseg_desc),
-				    &tss_segment_32.prev_task_link,
-				    sizeof tss_segment_32.prev_task_link))
-			goto out;
-	}
-
-	if (load_state_from_tss32(vcpu, &tss_segment_32))
-		goto out;
-
-	ret = 1;
-out:
-	return ret;
-}
-
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
-{
-	struct kvm_segment tr_seg;
-	struct desc_struct cseg_desc;
-	struct desc_struct nseg_desc;
-	int ret = 0;
-	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
-	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
-	u32 desc_limit;
-
-	old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
-
-	/* FIXME: Handle errors. Failure to read either TSS or their
-	 * descriptors should generate a pagefault.
-	 */
-	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
-		goto out;
-
-	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
-		goto out;
-
-	if (reason != TASK_SWITCH_IRET) {
-		int cpl;
-
-		cpl = kvm_x86_ops->get_cpl(vcpu);
-		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
-			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-			return 1;
-		}
-	}
-
-	desc_limit = get_desc_limit(&nseg_desc);
-	if (!nseg_desc.p ||
-	    ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
-	     desc_limit < 0x2b)) {
-		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
-		return 1;
-	}
-
-	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
-		cseg_desc.type &= ~(1 << 1); //clear the B flag
-		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
-	}
-
-	if (reason == TASK_SWITCH_IRET) {
-		u32 eflags = kvm_get_rflags(vcpu);
-		kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
-	}
-
-	/* set back link to prev task only if NT bit is set in eflags
-	   note that old_tss_sel is not used afetr this point */
-	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
-		old_tss_sel = 0xffff;
-
-	if (nseg_desc.type & 8)
-		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
-					 old_tss_base, &nseg_desc);
-	else
-		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
-					 old_tss_base, &nseg_desc);
-
-	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
-		u32 eflags = kvm_get_rflags(vcpu);
-		kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
-	}
-
-	if (reason != TASK_SWITCH_IRET) {
-		nseg_desc.type |= (1 << 1);
-		save_guest_segment_descriptor(vcpu, tss_selector,
-					      &nseg_desc);
-	}
-
-	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
-	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
-	tr_seg.type = 11;
-	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
-	return ret;
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
@@ -5258,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
 	int mmu_reset_needed = 0;
 	int pending_vec, max_bits;
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 
 	vcpu_load(vcpu);
 
-	dt.limit = sregs->idt.limit;
-	dt.base = sregs->idt.base;
+	dt.size = sregs->idt.limit;
+	dt.address = sregs->idt.base;
 	kvm_x86_ops->set_idt(vcpu, &dt);
-	dt.limit = sregs->gdt.limit;
-	dt.base = sregs->gdt.base;
+	dt.size = sregs->gdt.limit;
+	dt.address = sregs->gdt.base;
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 
 	vcpu->arch.cr2 = sregs->cr2;
@@ -5365,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
 	}
 
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-		vcpu->arch.singlestep_cs =
-			get_segment_selector(vcpu, VCPU_SREG_CS);
-		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
-	}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+			get_segment_base(vcpu, VCPU_SREG_CS);
 
 	/*
 	 * Trigger an rflags update that will inject or remove the trace
@@ -5860,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+	unsigned long current_rip = kvm_rip_read(vcpu) +
+		get_segment_base(vcpu, VCPU_SREG_CS);
+
+	return current_rip == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
 
 	rflags = kvm_x86_ops->get_rflags(vcpu);
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		rflags &= ~X86_EFLAGS_TF;
 	return rflags;
 }
 EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5874,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
-	    vcpu->arch.singlestep_cs ==
-			get_segment_selector(vcpu, VCPU_SREG_CS) &&
-	    vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
-		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
+		rflags |= X86_EFLAGS_TF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
@@ -5893,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2..f4b54458285 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
+static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
+{
+	return rcu_dereference_check(kvm->arch.aliases,
+			srcu_read_lock_held(&kvm->srcu)
+			|| lockdep_is_held(&kvm->slots_lock));
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 2bdf628066b..9257510b483 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1390,7 +1390,6 @@ __init void lguest_init(void)
 #endif
 #ifdef CONFIG_ACPI
 	acpi_disabled = 1;
-	acpi_ht = 0;
 #endif
 
 	/*
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 550df481acc..787c52ca49c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -3,12 +3,6 @@
 #include <linux/module.h>
 #include <linux/bootmem.h>
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(x...) printk(KERN_DEBUG x)
-#else
-# define DBG(x...)
-#endif
-
 /*
  * Which logical CPUs are on which nodes
  */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8948f47fde0..a7bcc23ef96 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,9 +33,6 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
-DEFINE_PER_CPU(int, node_number) = 0;
-EXPORT_PER_CPU_SYMBOL(node_number);
-
 /*
  * Map cpu index to node index
  */
@@ -809,7 +806,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 	per_cpu(x86_cpu_to_node_map, cpu) = node;
 
 	if (node != NUMA_NO_NODE)
-		per_cpu(node_number, cpu) = node;
+		set_cpu_numa_node(cpu, node);
 }
 
 void __cpuinit numa_clear_node(int cpu)
@@ -867,7 +864,7 @@ void __cpuinit numa_remove_cpu(int cpu)
 	numa_set_cpumask(cpu, 0);
 }
 
-int cpu_to_node(int cpu)
+int __cpu_to_node(int cpu)
 {
 	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
 		printk(KERN_WARNING
@@ -877,7 +874,7 @@ int cpu_to_node(int cpu)
 	}
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
-EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(__cpu_to_node);
 
 /*
  * Same function as cpu_to_node() but used if called before the
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index bbe5502ee1c..acc15b23b74 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -336,6 +336,7 @@ int free_memtype(u64 start, u64 end)
 {
 	int err = -EINVAL;
 	int is_range_ram;
+	struct memtype *entry;
 
 	if (!pat_enabled)
 		return 0;
@@ -355,17 +356,20 @@ int free_memtype(u64 start, u64 end)
 	}
 
 	spin_lock(&memtype_lock);
-	err = rbt_memtype_erase(start, end);
+	entry = rbt_memtype_erase(start, end);
 	spin_unlock(&memtype_lock);
 
-	if (err) {
+	if (!entry) {
 		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
 			current->comm, current->pid, start, end);
+		return -EINVAL;
 	}
 
+	kfree(entry);
+
 	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
 
-	return err;
+	return 0;
 }
 
 
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 4f39eefa3e6..77e5ba153fa 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -28,15 +28,15 @@ static inline char *cattr_name(unsigned long flags)
 #ifdef CONFIG_X86_PAT
 extern int rbt_memtype_check_insert(struct memtype *new,
 					unsigned long *new_type);
-extern int rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
 extern struct memtype *rbt_memtype_lookup(u64 addr);
 extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
 #else
 static inline int rbt_memtype_check_insert(struct memtype *new,
 					unsigned long *new_type)
 { return 0; }
-static inline int rbt_memtype_erase(u64 start, u64 end)
-{ return 0; }
+static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{ return NULL; }
 static inline struct memtype *rbt_memtype_lookup(u64 addr)
 { return NULL; }
 static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 07de4cb8cc3..f537087bb74 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -231,16 +231,17 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
 	return err;
 }
 
-int rbt_memtype_erase(u64 start, u64 end)
+struct memtype *rbt_memtype_erase(u64 start, u64 end)
 {
 	struct memtype *data;
 
 	data = memtype_rb_exact_match(&memtype_rbroot, start, end);
 	if (!data)
-		return -EINVAL;
+		goto out;
 
 	rb_erase(&data->rb, &memtype_rbroot);
-	return 0;
+out:
+	return data;
 }
 
 struct memtype *rbt_memtype_lookup(u64 addr)
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index df3d5c861cd..308e32570d8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -34,7 +34,7 @@
 /* IA32 Manual 3, 2-1 */
 static unsigned char prefix_codes[] = {
 	0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
-	0x65, 0x2E, 0x3E, 0x66, 0x67
+	0x65, 0x66, 0x67
 };
 /* IA32 Manual 3, 3-432*/
 static unsigned int reg_rop[] = {
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 792854003ed..cac71849925 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -9,7 +9,6 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/module.h>
-#include <linux/quicklist.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index b110d97fb92..a0207a7fdf3 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-y				+= common.o early.o
 obj-y				+= amd_bus.o bus_numa.o
 
+obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
+
 ifeq ($(CONFIG_PCI_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
 endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 31930fd30ea..2ec04c424a6 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -207,10 +207,9 @@ get_current_resources(struct acpi_device *device, int busnum,
 	if (!info.res)
 		goto res_alloc_fail;
 
-	info.name = kmalloc(16, GFP_KERNEL);
+	info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
 	if (!info.name)
 		goto name_alloc_fail;
-	sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum);
 
 	info.res_num = 0;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
@@ -224,8 +223,11 @@ res_alloc_fail:
 	return;
 }
 
-struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
+struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 {
+	struct acpi_device *device = root->device;
+	int domain = root->segment;
+	int busnum = root->secondary.start;
 	struct pci_bus *bus;
 	struct pci_sysdata *sd;
 	int node;
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
new file mode 100644
index 00000000000..0846a5bbbfb
--- /dev/null
+++ b/arch/x86/pci/broadcom_bus.c
@@ -0,0 +1,101 @@
+/*
+ * Read address ranges from a Broadcom CNB20LE Host Bridge
+ *
+ * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/pci_x86.h>
+
+#include "bus_numa.h"
+
+static void __devinit cnb20le_res(struct pci_dev *dev)
+{
+	struct pci_root_info *info;
+	struct resource res;
+	u16 word1, word2;
+	u8 fbus, lbus;
+	int i;
+
+	/*
+	 * The x86_pci_root_bus_res_quirks() function already refuses to use
+	 * this information if ACPI _CRS was used. Therefore, we don't bother
+	 * checking if ACPI is enabled, and just generate the information
+	 * for both the ACPI _CRS and no ACPI cases.
+	 */
+
+	info = &pci_root_info[pci_root_num];
+	pci_root_num++;
+
+	/* read the PCI bus numbers */
+	pci_read_config_byte(dev, 0x44, &fbus);
+	pci_read_config_byte(dev, 0x45, &lbus);
+	info->bus_min = fbus;
+	info->bus_max = lbus;
+
+	/*
+	 * Add the legacy IDE ports on bus 0
+	 *
+	 * These do not exist anywhere in the bridge registers, AFAICT. I do
+	 * not have the datasheet, so this is the best I can do.
+	 */
+	if (fbus == 0) {
+		update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0);
+		update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0);
+		update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0);
+		update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0);
+		update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0);
+	}
+
+	/* read the non-prefetchable memory window */
+	pci_read_config_word(dev, 0xc0, &word1);
+	pci_read_config_word(dev, 0xc2, &word2);
+	if (word1 != word2) {
+		res.start = (word1 << 16) | 0x0000;
+		res.end   = (word2 << 16) | 0xffff;
+		res.flags = IORESOURCE_MEM;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* read the prefetchable memory window */
+	pci_read_config_word(dev, 0xc4, &word1);
+	pci_read_config_word(dev, 0xc6, &word2);
+	if (word1 != word2) {
+		res.start = (word1 << 16) | 0x0000;
+		res.end   = (word2 << 16) | 0xffff;
+		res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* read the IO port window */
+	pci_read_config_word(dev, 0xd0, &word1);
+	pci_read_config_word(dev, 0xd2, &word2);
+	if (word1 != word2) {
+		res.start = word1;
+		res.end   = word2;
+		res.flags = IORESOURCE_IO;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* print information about this host bridge */
+	res.start = fbus;
+	res.end   = lbus;
+	res.flags = IORESOURCE_BUS;
+	dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
+			    pci_domain_nr(dev->bus), &res);
+
+	for (i = 0; i < info->res_num; i++)
+		dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
+}
+
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
+			cnb20le_res);
+
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index cf2e93869c4..215a27ae050 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -76,7 +76,7 @@ struct pci_ops pci_root_ops = {
  * This interrupt-safe spinlock protects all accesses to PCI
  * configuration space.
  */
-DEFINE_SPINLOCK(pci_config_lock);
+DEFINE_RAW_SPINLOCK(pci_config_lock);
 
 static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
 {
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 347d882b3bb..bd33620b007 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 		return -EINVAL;
 	}
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
 
@@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
 
@@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 	if (dev & 0x10) 
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb((u8)(0xF0 | (fn << 1)), 0xCF8);
 	outb((u8)bus, 0xCFA);
@@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 
 	outb(0, 0xCF8);
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 	if (dev & 0x10) 
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb((u8)(0xF0 | (fn << 1)), 0xCF8);
 	outb((u8)bus, 0xCFA);
@@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 
 	outb(0, 0xCF8);    
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 97da2ba9344..6fdb3ec30c3 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -96,6 +96,7 @@ EXPORT_SYMBOL(pcibios_align_resource);
  *	  the fact the PCI specs explicitly allow address decoders to be
  *	  shared between expansion ROMs and other resource regions, it's
  *	  at least dangerous)
+ *	- bad resource sizes or overlaps with other regions
  *
  *  Our solution:
  *	(1) Allocate resources for all buses behind PCI-to-PCI bridges.
@@ -136,6 +137,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
 					 * child resource allocations in this
 					 * range.
 					 */
+					r->start = r->end = 0;
 					r->flags = 0;
 				}
 			}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5d362b5ba06..9810a0f76c9 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,8 +589,6 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 	case PCI_DEVICE_ID_INTEL_ICH10_1:
 	case PCI_DEVICE_ID_INTEL_ICH10_2:
 	case PCI_DEVICE_ID_INTEL_ICH10_3:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC1:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC2:
 		r->name = "PIIX/ICH";
 		r->get = pirq_piix_get;
 		r->set = pirq_piix_set;
@@ -605,6 +603,13 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 		return 1;
 	}
 
+	if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && 
+		(device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
+		r->name = "PIIX/ICH";
+		r->get = pirq_piix_get;
+		r->set = pirq_piix_set;
+		return 1;
+	}
 	return 0;
 }
 
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 0db5eaf5456..8d460eaf524 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -11,28 +11,14 @@
  */
 static void __devinit pcibios_fixup_peer_bridges(void)
 {
-	int n, devfn;
-	long node;
+	int n;
 
 	if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
 		return;
 	DBG("PCI: Peer bridge fixup\n");
 
-	for (n=0; n <= pcibios_last_bus; n++) {
-		u32 l;
-		if (pci_find_bus(0, n))
-			continue;
-		node = get_mp_bus_to_node(n);
-		for (devfn = 0; devfn < 256; devfn += 8) {
-			if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) &&
-			    l != 0x0000 && l != 0xffff) {
-				DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l);
-				printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n);
-				pci_scan_bus_on_node(n, &pci_root_ops, node);
-				break;
-			}
-		}
-	}
+	for (n=0; n <= pcibios_last_bus; n++)
+		pcibios_scan_specific_bus(n);
 }
 
 int __init pci_legacy_init(void)
@@ -50,6 +36,28 @@ int __init pci_legacy_init(void)
 	return 0;
 }
 
+void pcibios_scan_specific_bus(int busn)
+{
+	int devfn;
+	long node;
+	u32 l;
+
+	if (pci_find_bus(0, busn))
+		return;
+
+	node = get_mp_bus_to_node(busn);
+	for (devfn = 0; devfn < 256; devfn += 8) {
+		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
+		    l != 0x0000 && l != 0xffff) {
+			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
+			printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn);
+			pci_scan_bus_on_node(busn, &pci_root_ops, node);
+			return;
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
+
 int __init pci_subsys_init(void)
 {
 	/*
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 39b9ebe8f88..a918553ebc7 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -483,16 +483,17 @@ static void __init pci_mmcfg_reject_broken(int early)
 	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
 		int valid = 0;
 
-		if (!early && !acpi_disabled)
+		if (!early && !acpi_disabled) {
 			valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
 
-		if (valid)
-			continue;
-
-		if (!early)
-			printk(KERN_ERR FW_BUG PREFIX
-			       "MMCONFIG at %pR not reserved in "
-			       "ACPI motherboard resources\n", &cfg->res);
+			if (valid)
+				continue;
+			else
+				printk(KERN_ERR FW_BUG PREFIX
+				       "MMCONFIG at %pR not reserved in "
+				       "ACPI motherboard resources\n",
+				       &cfg->res);
+		}
 
 		/* Don't try to do this check unless configuration
 		   type 1 is available. how about type 2 ?*/
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 90d5fd476ed..a3d9c54792a 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -64,7 +64,7 @@ err:		*value = -1;
 	if (!base)
 		goto err;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -79,7 +79,7 @@ err:		*value = -1;
 		*value = mmio_config_readl(mmcfg_virt_addr + reg);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	if (!base)
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 		mmio_config_writel(mmcfg_virt_addr + reg, value);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8223738ad80..5c9e2458df4 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -37,7 +37,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -62,7 +62,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -76,7 +76,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -101,7 +101,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 59a225c17b8..2492d165096 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -162,7 +162,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -213,7 +213,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
@@ -228,7 +228,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -269,7 +269,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0a979f3e5b8..1290ba54b35 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,6 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
 #endif
+	ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
+					       &ctxt->misc_enable);
 }
 
 /* Needed by apm.c */
@@ -152,6 +154,8 @@ static void fix_processor_context(void)
  */
 static void __restore_processor_state(struct saved_context *ctxt)
 {
+	if (ctxt->misc_enable_saved)
+		wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
 	/*
 	 * control registers
 	 */
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 987267f79bf..a9c66110803 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -60,6 +60,6 @@ static void xen_vcpu_notify_restore(void *data)
 
 void xen_arch_resume(void)
 {
-	smp_call_function(xen_vcpu_notify_restore,
-			       (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
+	on_each_cpu(xen_vcpu_notify_restore,
+		    (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
 }