From 4af57b787b4be09419a2bb48aa705fa87ef41cca Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@ksplice.com>
Date: Sat, 20 Feb 2010 01:03:34 +0100
Subject: Rename .data.cacheline_aligned to .data..cacheline_aligned.

Signed-off-by: Tim Abbott <tabbott@ksplice.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/x86/kernel/init_task.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 3a54dcb9cd0..43e9ccf4494 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task);
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
  * no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data.cacheline_aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-- 
cgit v1.2.3-70-g09d2


From 7c74df07f90cabe61d700727bca04682b4e477f3 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@ksplice.com>
Date: Sat, 20 Feb 2010 01:03:38 +0100
Subject: Rename .bss.page_aligned to .bss..page_aligned.

Signed-off-by: Tim Abbott <tabbott@ksplice.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/x86/kernel/vmlinux.lds.S     | 2 +-
 include/asm-generic/vmlinux.lds.h | 2 +-
 include/linux/linkage.h           | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index f92a0da608c..8b6bb4e7f5c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -305,7 +305,7 @@ SECTIONS
 	. = ALIGN(PAGE_SIZE);
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		__bss_start = .;
-		*(.bss.page_aligned)
+		*(.bss..page_aligned)
 		*(.bss)
 		. = ALIGN(4);
 		__bss_stop = .;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 569c25a8555..32cddc15594 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -499,7 +499,7 @@
 #define BSS(bss_align)							\
 	. = ALIGN(bss_align);						\
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {				\
-		*(.bss.page_aligned)					\
+		*(.bss..page_aligned)					\
 		*(.dynbss)						\
 		*(.bss)							\
 		*(COMMON)						\
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
index 05f4406d599..7135ebc8428 100644
--- a/include/linux/linkage.h
+++ b/include/linux/linkage.h
@@ -19,7 +19,7 @@
 #endif
 
 #define __page_aligned_data	__section(.data..page_aligned) __aligned(PAGE_SIZE)
-#define __page_aligned_bss	__section(.bss.page_aligned) __aligned(PAGE_SIZE)
+#define __page_aligned_bss	__section(.bss..page_aligned) __aligned(PAGE_SIZE)
 
 /*
  * For assembly routines.
@@ -28,7 +28,7 @@
  * alignment directives yourself
  */
 #define __PAGE_ALIGNED_DATA	.section ".data..page_aligned", "aw"
-#define __PAGE_ALIGNED_BSS	.section ".bss.page_aligned", "aw"
+#define __PAGE_ALIGNED_BSS	.section ".bss..page_aligned", "aw"
 
 /*
  * This is used by architectures to keep arguments on the stack
-- 
cgit v1.2.3-70-g09d2


From 54cb27a71f51d304342c79e62fd7667f2171062b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 20 Feb 2010 01:03:44 +0100
Subject: Rename .data.read_mostly to .data..read_mostly.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/ia64/include/asm/cache.h     | 2 +-
 arch/ia64/kernel/paravirtentry.S  | 2 +-
 arch/ia64/xen/xensetup.S          | 2 +-
 arch/parisc/include/asm/cache.h   | 2 +-
 arch/parisc/kernel/head.S         | 2 +-
 arch/powerpc/include/asm/cache.h  | 2 +-
 arch/powerpc/kernel/vmlinux.lds.S | 2 +-
 arch/s390/include/asm/cache.h     | 2 +-
 arch/sh/include/asm/cache.h       | 2 +-
 arch/sparc/include/asm/cache.h    | 2 +-
 arch/x86/include/asm/cache.h      | 2 +-
 include/asm-generic/vmlinux.lds.h | 2 +-
 12 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/include/asm/cache.h b/arch/ia64/include/asm/cache.h
index e7482bd628f..988254a7d34 100644
--- a/arch/ia64/include/asm/cache.h
+++ b/arch/ia64/include/asm/cache.h
@@ -24,6 +24,6 @@
 # define SMP_CACHE_BYTES	(1 << 3)
 #endif
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #endif /* _ASM_IA64_CACHE_H */
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
index 6158560d7f1..92d880c4d3d 100644
--- a/arch/ia64/kernel/paravirtentry.S
+++ b/arch/ia64/kernel/paravirtentry.S
@@ -28,7 +28,7 @@
 #include "entry.h"
 
 #define DATA8(sym, init_value)			\
-	.pushsection .data.read_mostly ;	\
+	.pushsection .data..read_mostly ;	\
 	.align 8 ;				\
 	.global sym ;				\
 	sym: ;					\
diff --git a/arch/ia64/xen/xensetup.S b/arch/ia64/xen/xensetup.S
index aff8346ea19..b820ed02ab9 100644
--- a/arch/ia64/xen/xensetup.S
+++ b/arch/ia64/xen/xensetup.S
@@ -14,7 +14,7 @@
 #include <linux/init.h>
 #include <xen/interface/elfnote.h>
 
-	.section .data.read_mostly
+	.section .data..read_mostly
 	.align 8
 	.global xen_domain_type
 xen_domain_type:
diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h
index 32c2cca7434..45effe6978f 100644
--- a/arch/parisc/include/asm/cache.h
+++ b/arch/parisc/include/asm/cache.h
@@ -28,7 +28,7 @@
 
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 void parisc_cache_init(void);	/* initializes cache-flushing */
 void disable_sr_hashing_asm(int); /* low level support for above */
diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S
index 0e3d9f9b9e3..4dbdf0ed6fa 100644
--- a/arch/parisc/kernel/head.S
+++ b/arch/parisc/kernel/head.S
@@ -345,7 +345,7 @@ smp_slave_stext:
 ENDPROC(stext)
 
 #ifndef CONFIG_64BIT
-	.section .data.read_mostly
+	.section .data..read_mostly
 
 	.align	4
 	.export	$global$,data
diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h
index 81de6eb3455..3f41ab9c1e4 100644
--- a/arch/powerpc/include/asm/cache.h
+++ b/arch/powerpc/include/asm/cache.h
@@ -38,7 +38,7 @@ extern struct ppc64_caches ppc64_caches;
 #endif /* __powerpc64__ && ! __ASSEMBLY__ */
 
 #if !defined(__ASSEMBLY__)
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 951e6c5b2c8..8a0deefac08 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -233,7 +233,7 @@ SECTIONS
 		CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
 	}
 
-	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+	.data..read_mostly : AT(ADDR(.data..read_mostly) - LOAD_OFFSET) {
 		READ_MOSTLY_DATA(L1_CACHE_BYTES)
 	}
 
diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h
index 9b866816863..24aafa68b64 100644
--- a/arch/s390/include/asm/cache.h
+++ b/arch/s390/include/asm/cache.h
@@ -14,6 +14,6 @@
 #define L1_CACHE_BYTES     256
 #define L1_CACHE_SHIFT     8
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #endif
diff --git a/arch/sh/include/asm/cache.h b/arch/sh/include/asm/cache.h
index 02df18ea960..455a9a9bd7d 100644
--- a/arch/sh/include/asm/cache.h
+++ b/arch/sh/include/asm/cache.h
@@ -14,7 +14,7 @@
 
 #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #ifndef __ASSEMBLY__
 struct cache_info {
diff --git a/arch/sparc/include/asm/cache.h b/arch/sparc/include/asm/cache.h
index 41f85ae4bd4..2909f0ab6f9 100644
--- a/arch/sparc/include/asm/cache.h
+++ b/arch/sparc/include/asm/cache.h
@@ -19,7 +19,7 @@
 
 #define SMP_CACHE_BYTES (1 << SMP_CACHE_BYTES_SHIFT)
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #ifdef CONFIG_SPARC32
 #include <asm/asi.h>
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 2f9047cfaac..48f99f15452 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -7,7 +7,7 @@
 #define L1_CACHE_SHIFT	(CONFIG_X86_L1_CACHE_SHIFT)
 #define L1_CACHE_BYTES	(1 << L1_CACHE_SHIFT)
 
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 #define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
 #define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e304fcd10bc..6f6da4f080f 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -185,7 +185,7 @@
 
 #define READ_MOSTLY_DATA(align)						\
 	. = ALIGN(align);						\
-	*(.data.read_mostly)
+	*(.data..read_mostly)
 
 #define CACHELINE_ALIGNED_DATA(align)					\
 	. = ALIGN(align);						\
-- 
cgit v1.2.3-70-g09d2


From 041d5f94c4d67444c40584db0d1cacf32a47a25b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 20 Feb 2010 01:03:46 +0100
Subject: Rename .rodata.compressed to .rodata..compressed.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/sh/boot/compressed/vmlinux.scr    | 2 +-
 arch/x86/boot/compressed/mkpiggy.c     | 2 +-
 arch/x86/boot/compressed/vmlinux.lds.S | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sh/boot/compressed/vmlinux.scr b/arch/sh/boot/compressed/vmlinux.scr
index f02382ae5c4..862d7480823 100644
--- a/arch/sh/boot/compressed/vmlinux.scr
+++ b/arch/sh/boot/compressed/vmlinux.scr
@@ -1,6 +1,6 @@
 SECTIONS
 {
-  .rodata.compressed : {
+  .rodata..compressed : {
 	input_len = .;
 	LONG(input_data_end - input_data) input_data = .;
 	*(.data)
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index bcbd36c4143..5c228129d17 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -77,7 +77,7 @@ int main(int argc, char *argv[])
 	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
 	offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
 
-	printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+	printf(".section \".rodata..compressed\",\"a\",@progbits\n");
 	printf(".globl z_input_len\n");
 	printf("z_input_len = %lu\n", ilen);
 	printf(".globl z_output_len\n");
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index a6f1a59a5b0..5ddabceee12 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -26,8 +26,8 @@ SECTIONS
 		HEAD_TEXT
 		_ehead = . ;
 	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
+	.rodata..compressed : {
+		*(.rodata..compressed)
 	}
 	.text :	{
 		_text = .; 	/* Text */
-- 
cgit v1.2.3-70-g09d2


From 819d67621edba0822352f7ae2a7ccb0387223675 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 20 Feb 2010 01:03:49 +0100
Subject: Rename .text.page_aligned to .text..page_aligned.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/x86/kernel/acpi/wakeup_32.S | 2 +-
 arch/x86/kernel/vmlinux.lds.S    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b059..13ab720573e 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
-	.section .text.page_aligned
+	.section .text..page_aligned
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/page_types.h>
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8b6bb4e7f5c..f3d77d729d5 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,7 +97,7 @@ SECTIONS
 		HEAD_TEXT
 #ifdef CONFIG_X86_32
 		. = ALIGN(PAGE_SIZE);
-		*(.text.page_aligned)
+		*(.text..page_aligned)
 #endif
 		. = ALIGN(8);
 		_stext = .;
-- 
cgit v1.2.3-70-g09d2


From c273fb3b5d0490d3058f6cce77a92860671ee7b6 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Sat, 20 Feb 2010 01:03:53 +0100
Subject: Rename .data.init to .data..init.

Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 arch/sparc/boot/btfixupprep.c  | 2 +-
 arch/x86/kernel/setup_percpu.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/sparc/boot/btfixupprep.c b/arch/sparc/boot/btfixupprep.c
index bbf91b9c3d3..e7f2940bd27 100644
--- a/arch/sparc/boot/btfixupprep.c
+++ b/arch/sparc/boot/btfixupprep.c
@@ -325,7 +325,7 @@ main1:
 		(*rr)->next = NULL;
 	}
 	printf("! Generated by btfixupprep. Do not edit.\n\n");
-	printf("\t.section\t\".data.init\",#alloc,#write\n\t.align\t4\n\n");
+	printf("\t.section\t\".data..init\",#alloc,#write\n\t.align\t4\n\n");
 	printf("\t.global\t___btfixup_start\n___btfixup_start:\n\n");
 	for (i = 0; i < last; i++) {
 		f = array + i;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 35abcb8b00e..8c9f68ec97a 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -241,7 +241,7 @@ void __init setup_per_cpu_areas(void)
 #endif
 #endif
 		/*
-		 * Up to this point, the boot CPU has been using .data.init
+		 * Up to this point, the boot CPU has been using .init.data
 		 * area.  Reload any changed state for the boot CPU.
 		 */
 		if (cpu == boot_cpu_id)
-- 
cgit v1.2.3-70-g09d2


From 12387a46bb150f5608de4aa9a90dfdddbf991e3f Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 10 Mar 2010 18:28:55 +0800
Subject: crypto: aesni-intel - Add AES-NI accelerated CTR mode

To take advantage of the hardware pipeline implementation of AES-NI
instructions. CTR mode cryption is implemented in ASM to schedule
multiple AES-NI instructions one after another. This way, some latency
of AES-NI instruction can be eliminated.

Performance testing based on dm-crypt should 50% reduction of
ecryption/decryption time.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_asm.S  | 115 ++++++++++++++++++++++++++++++++
 arch/x86/crypto/aesni-intel_glue.c | 130 +++++++++++++++++++++++++++++++++++--
 2 files changed, 238 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 20bb0e1ac68..822846a9eba 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -32,6 +32,9 @@
 #define IN	IN1
 #define KEY	%xmm2
 #define IV	%xmm3
+#define BSWAP_MASK %xmm10
+#define CTR	%xmm11
+#define INC	%xmm12
 
 #define KEYP	%rdi
 #define OUTP	%rsi
@@ -42,6 +45,7 @@
 #define T1	%r10
 #define TKEYP	T1
 #define T2	%r11
+#define TCTR_LOW T2
 
 _key_expansion_128:
 _key_expansion_256a:
@@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec)
 	movups IV, (IVP)
 .Lcbc_dec_just_ret:
 	ret
+
+.align 16
+.Lbswap_mask:
+	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+/*
+ * _aesni_inc_init:	internal ABI
+ *	setup registers used by _aesni_inc
+ * input:
+ *	IV
+ * output:
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ */
+_aesni_inc_init:
+	movaps .Lbswap_mask, BSWAP_MASK
+	movaps IV, CTR
+	PSHUFB_XMM BSWAP_MASK CTR
+	mov $1, TCTR_LOW
+	movq TCTR_LOW, INC
+	movq CTR, TCTR_LOW
+	ret
+
+/*
+ * _aesni_inc:		internal ABI
+ *	Increase IV by 1, IV is in big endian
+ * input:
+ *	IV
+ *	CTR:	== IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ *	INC:	== 1, in little endian
+ *	BSWAP_MASK == endian swapping mask
+ * output:
+ *	IV:	Increase by 1
+ * changed:
+ *	CTR:	== output IV, in little endian
+ *	TCTR_LOW: == lower qword of CTR
+ */
+_aesni_inc:
+	paddq INC, CTR
+	add $1, TCTR_LOW
+	jnc .Linc_low
+	pslldq $8, INC
+	paddq INC, CTR
+	psrldq $8, INC
+.Linc_low:
+	movaps CTR, IV
+	PSHUFB_XMM BSWAP_MASK IV
+	ret
+
+/*
+ * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src,
+ *		      size_t len, u8 *iv)
+ */
+ENTRY(aesni_ctr_enc)
+	cmp $16, LEN
+	jb .Lctr_enc_just_ret
+	mov 480(KEYP), KLEN
+	movups (IVP), IV
+	call _aesni_inc_init
+	cmp $64, LEN
+	jb .Lctr_enc_loop1
+.align 4
+.Lctr_enc_loop4:
+	movaps IV, STATE1
+	call _aesni_inc
+	movups (INP), IN1
+	movaps IV, STATE2
+	call _aesni_inc
+	movups 0x10(INP), IN2
+	movaps IV, STATE3
+	call _aesni_inc
+	movups 0x20(INP), IN3
+	movaps IV, STATE4
+	call _aesni_inc
+	movups 0x30(INP), IN4
+	call _aesni_enc4
+	pxor IN1, STATE1
+	movups STATE1, (OUTP)
+	pxor IN2, STATE2
+	movups STATE2, 0x10(OUTP)
+	pxor IN3, STATE3
+	movups STATE3, 0x20(OUTP)
+	pxor IN4, STATE4
+	movups STATE4, 0x30(OUTP)
+	sub $64, LEN
+	add $64, INP
+	add $64, OUTP
+	cmp $64, LEN
+	jge .Lctr_enc_loop4
+	cmp $16, LEN
+	jb .Lctr_enc_ret
+.align 4
+.Lctr_enc_loop1:
+	movaps IV, STATE
+	call _aesni_inc
+	movups (INP), IN
+	call _aesni_enc1
+	pxor IN, STATE
+	movups STATE, (OUTP)
+	sub $16, LEN
+	add $16, INP
+	add $16, OUTP
+	cmp $16, LEN
+	jge .Lctr_enc_loop1
+.Lctr_enc_ret:
+	movups IV, (IVP)
+.Lctr_enc_just_ret:
+	ret
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 49c552c060e..2cb3dcc4490 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -18,6 +18,7 @@
 #include <crypto/algapi.h>
 #include <crypto/aes.h>
 #include <crypto/cryptd.h>
+#include <crypto/ctr.h>
 #include <asm/i387.h>
 #include <asm/aes.h>
 
@@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
 asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
 			      const u8 *in, unsigned int len, u8 *iv);
+asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
+			      const u8 *in, unsigned int len, u8 *iv);
 
 static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
 {
@@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = {
 	},
 };
 
+static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
+			    struct blkcipher_walk *walk)
+{
+	u8 *ctrblk = walk->iv;
+	u8 keystream[AES_BLOCK_SIZE];
+	u8 *src = walk->src.virt.addr;
+	u8 *dst = walk->dst.virt.addr;
+	unsigned int nbytes = walk->nbytes;
+
+	aesni_enc(ctx, keystream, ctrblk);
+	crypto_xor(keystream, src, nbytes);
+	memcpy(dst, keystream, nbytes);
+	crypto_inc(ctrblk, AES_BLOCK_SIZE);
+}
+
+static int ctr_crypt(struct blkcipher_desc *desc,
+		     struct scatterlist *dst, struct scatterlist *src,
+		     unsigned int nbytes)
+{
+	struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm));
+	struct blkcipher_walk walk;
+	int err;
+
+	blkcipher_walk_init(&walk, dst, src, nbytes);
+	err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE);
+	desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP;
+
+	kernel_fpu_begin();
+	while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) {
+		aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr,
+			      nbytes & AES_BLOCK_MASK, walk.iv);
+		nbytes &= AES_BLOCK_SIZE - 1;
+		err = blkcipher_walk_done(desc, &walk, nbytes);
+	}
+	if (walk.nbytes) {
+		ctr_crypt_final(ctx, &walk);
+		err = blkcipher_walk_done(desc, &walk, 0);
+	}
+	kernel_fpu_end();
+
+	return err;
+}
+
+static struct crypto_alg blk_ctr_alg = {
+	.cra_name		= "__ctr-aes-aesni",
+	.cra_driver_name	= "__driver-ctr-aes-aesni",
+	.cra_priority		= 0,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1,
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_blkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(blk_ctr_alg.cra_list),
+	.cra_u = {
+		.blkcipher = {
+			.min_keysize	= AES_MIN_KEY_SIZE,
+			.max_keysize	= AES_MAX_KEY_SIZE,
+			.ivsize		= AES_BLOCK_SIZE,
+			.setkey		= aes_set_key,
+			.encrypt	= ctr_crypt,
+			.decrypt	= ctr_crypt,
+		},
+	},
+};
+
 static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
 			unsigned int key_len)
 {
@@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = {
 	},
 };
 
-#ifdef HAS_CTR
 static int ablk_ctr_init(struct crypto_tfm *tfm)
 {
 	struct cryptd_ablkcipher *cryptd_tfm;
 
-	cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))",
-					     0, 0);
+	cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 	ablk_init_common(tfm, cryptd_tfm);
@@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = {
 			.ivsize		= AES_BLOCK_SIZE,
 			.setkey		= ablk_set_key,
 			.encrypt	= ablk_encrypt,
-			.decrypt	= ablk_decrypt,
+			.decrypt	= ablk_encrypt,
 			.geniv		= "chainiv",
 		},
 	},
 };
+
+#ifdef HAS_CTR
+static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm)
+{
+	struct cryptd_ablkcipher *cryptd_tfm;
+
+	cryptd_tfm = cryptd_alloc_ablkcipher(
+		"rfc3686(__driver-ctr-aes-aesni)", 0, 0);
+	if (IS_ERR(cryptd_tfm))
+		return PTR_ERR(cryptd_tfm);
+	ablk_init_common(tfm, cryptd_tfm);
+	return 0;
+}
+
+static struct crypto_alg ablk_rfc3686_ctr_alg = {
+	.cra_name		= "rfc3686(ctr(aes))",
+	.cra_driver_name	= "rfc3686-ctr-aes-aesni",
+	.cra_priority		= 400,
+	.cra_flags		= CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC,
+	.cra_blocksize		= 1,
+	.cra_ctxsize		= sizeof(struct async_aes_ctx),
+	.cra_alignmask		= 0,
+	.cra_type		= &crypto_ablkcipher_type,
+	.cra_module		= THIS_MODULE,
+	.cra_list		= LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list),
+	.cra_init		= ablk_rfc3686_ctr_init,
+	.cra_exit		= ablk_exit,
+	.cra_u = {
+		.ablkcipher = {
+			.min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
+			.max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE,
+			.ivsize	     = CTR_RFC3686_IV_SIZE,
+			.setkey	     = ablk_set_key,
+			.encrypt     = ablk_encrypt,
+			.decrypt     = ablk_decrypt,
+			.geniv	     = "seqiv",
+		},
+	},
+};
 #endif
 
 #ifdef HAS_LRW
@@ -640,13 +746,17 @@ static int __init aesni_init(void)
 		goto blk_ecb_err;
 	if ((err = crypto_register_alg(&blk_cbc_alg)))
 		goto blk_cbc_err;
+	if ((err = crypto_register_alg(&blk_ctr_alg)))
+		goto blk_ctr_err;
 	if ((err = crypto_register_alg(&ablk_ecb_alg)))
 		goto ablk_ecb_err;
 	if ((err = crypto_register_alg(&ablk_cbc_alg)))
 		goto ablk_cbc_err;
-#ifdef HAS_CTR
 	if ((err = crypto_register_alg(&ablk_ctr_alg)))
 		goto ablk_ctr_err;
+#ifdef HAS_CTR
+	if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
+		goto ablk_rfc3686_ctr_err;
 #endif
 #ifdef HAS_LRW
 	if ((err = crypto_register_alg(&ablk_lrw_alg)))
@@ -675,13 +785,17 @@ ablk_pcbc_err:
 ablk_lrw_err:
 #endif
 #ifdef HAS_CTR
+	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
+ablk_rfc3686_ctr_err:
+#endif
 	crypto_unregister_alg(&ablk_ctr_alg);
 ablk_ctr_err:
-#endif
 	crypto_unregister_alg(&ablk_cbc_alg);
 ablk_cbc_err:
 	crypto_unregister_alg(&ablk_ecb_alg);
 ablk_ecb_err:
+	crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
 	crypto_unregister_alg(&blk_cbc_alg);
 blk_cbc_err:
 	crypto_unregister_alg(&blk_ecb_alg);
@@ -705,10 +819,12 @@ static void __exit aesni_exit(void)
 	crypto_unregister_alg(&ablk_lrw_alg);
 #endif
 #ifdef HAS_CTR
-	crypto_unregister_alg(&ablk_ctr_alg);
+	crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
 #endif
+	crypto_unregister_alg(&ablk_ctr_alg);
 	crypto_unregister_alg(&ablk_cbc_alg);
 	crypto_unregister_alg(&ablk_ecb_alg);
+	crypto_unregister_alg(&blk_ctr_alg);
 	crypto_unregister_alg(&blk_cbc_alg);
 	crypto_unregister_alg(&blk_ecb_alg);
 	crypto_unregister_alg(&__aesni_alg);
-- 
cgit v1.2.3-70-g09d2


From 32cbd7dfce93382a70f155bf539871b4c55bed29 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Sat, 13 Mar 2010 16:28:42 +0800
Subject: crypto: aesni-intel - Fix CTR optimization build failure with gas
 2.16.1

Andrew Morton reported that AES-NI CTR optimization failed to compile
with gas 2.16.1, the error message is as follow:

arch/x86/crypto/aesni-intel_asm.S: Assembler messages:
arch/x86/crypto/aesni-intel_asm.S:752: Error: suffix or operands invalid for `movq'
arch/x86/crypto/aesni-intel_asm.S:753: Error: suffix or operands invalid for `movq'

To fix this, a gas macro is defined to assemble movq with 64bit
general purpose registers and XMM registers. The macro will generate
the raw .byte sequence for needed instructions.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_asm.S |  4 +-
 arch/x86/include/asm/inst.h       | 96 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 822846a9eba..ff16756a51c 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -749,8 +749,8 @@ _aesni_inc_init:
 	movaps IV, CTR
 	PSHUFB_XMM BSWAP_MASK CTR
 	mov $1, TCTR_LOW
-	movq TCTR_LOW, INC
-	movq CTR, TCTR_LOW
+	MOVQ_R64_XMM TCTR_LOW INC
+	MOVQ_R64_XMM CTR TCTR_LOW
 	ret
 
 /*
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 14cf526091f..840a399701b 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -7,7 +7,66 @@
 
 #ifdef __ASSEMBLY__
 
+#define REG_NUM_INVALID		100
+
+#define REG_TYPE_R64		0
+#define REG_TYPE_XMM		1
+#define REG_TYPE_INVALID	100
+
+	.macro R64_NUM opd r64
+	\opd = REG_NUM_INVALID
+	.ifc \r64,%rax
+	\opd = 0
+	.endif
+	.ifc \r64,%rcx
+	\opd = 1
+	.endif
+	.ifc \r64,%rdx
+	\opd = 2
+	.endif
+	.ifc \r64,%rbx
+	\opd = 3
+	.endif
+	.ifc \r64,%rsp
+	\opd = 4
+	.endif
+	.ifc \r64,%rbp
+	\opd = 5
+	.endif
+	.ifc \r64,%rsi
+	\opd = 6
+	.endif
+	.ifc \r64,%rdi
+	\opd = 7
+	.endif
+	.ifc \r64,%r8
+	\opd = 8
+	.endif
+	.ifc \r64,%r9
+	\opd = 9
+	.endif
+	.ifc \r64,%r10
+	\opd = 10
+	.endif
+	.ifc \r64,%r11
+	\opd = 11
+	.endif
+	.ifc \r64,%r12
+	\opd = 12
+	.endif
+	.ifc \r64,%r13
+	\opd = 13
+	.endif
+	.ifc \r64,%r14
+	\opd = 14
+	.endif
+	.ifc \r64,%r15
+	\opd = 15
+	.endif
+	.endm
+
 	.macro XMM_NUM opd xmm
+	\opd = REG_NUM_INVALID
 	.ifc \xmm,%xmm0
 	\opd = 0
 	.endif
@@ -58,13 +117,25 @@
 	.endif
 	.endm
 
+	.macro REG_TYPE type reg
+	R64_NUM reg_type_r64 \reg
+	XMM_NUM reg_type_xmm \reg
+	.if reg_type_r64 != REG_NUM_INVALID
+	\type = REG_TYPE_R64
+	.elseif reg_type_xmm != REG_NUM_INVALID
+	\type = REG_TYPE_XMM
+	.else
+	\type = REG_TYPE_INVALID
+	.endif
+	.endm
+
 	.macro PFX_OPD_SIZE
 	.byte 0x66
 	.endm
 
-	.macro PFX_REX opd1 opd2
-	.if (\opd1 | \opd2) & 8
-	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
+	.macro PFX_REX opd1 opd2 W=0
+	.if ((\opd1 | \opd2) & 8) || \W
+	.byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3)
 	.endif
 	.endm
 
@@ -145,6 +216,25 @@
 	.byte 0x0f, 0x38, 0xdf
 	MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
 	.endm
+
+	.macro MOVQ_R64_XMM opd1 opd2
+	REG_TYPE movq_r64_xmm_opd1_type \opd1
+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+	XMM_NUM movq_r64_xmm_opd1 \opd1
+	R64_NUM movq_r64_xmm_opd2 \opd2
+	.else
+	R64_NUM movq_r64_xmm_opd1 \opd1
+	XMM_NUM movq_r64_xmm_opd2 \opd2
+	.endif
+	PFX_OPD_SIZE
+	PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1
+	.if movq_r64_xmm_opd1_type == REG_TYPE_XMM
+	.byte 0x0f, 0x7e
+	.else
+	.byte 0x0f, 0x6e
+	.endif
+	MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
+	.endm
 #endif
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 68ca406930d6380b3be7ada5f15fcf85bfcbd552 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Fri, 19 Feb 2010 00:09:22 -0500
Subject: ACPI: delete the "acpi=ht" boot option

acpi=ht was important in 2003 -- before ACPI was
universally deployed and enabled by default in
the major Linux distributions.

At that time, there were a fair number of people who
or chose to, or needed to, run with acpi=off,
yet also wanted access to Hyper-threading.

Today we find that many invocations of "acpi=ht"
are accidental, and thus is it possible that it
is doing more harm than good.

In 2.6.34, we warn on invocation of acpi=ht.
In 2.6.35, we delete the boot option.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 Documentation/kernel-parameters.txt |  3 +--
 arch/ia64/include/asm/acpi.h        |  1 -
 arch/x86/include/asm/acpi.h         |  2 --
 arch/x86/kernel/acpi/boot.c         | 19 +++----------------
 arch/x86/lguest/boot.c              |  1 -
 drivers/acpi/tables.c               |  4 ++--
 6 files changed, 6 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 3bc48b0bd3a..41b924ff0b5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -143,11 +143,10 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	acpi=		[HW,ACPI,X86]
 			Advanced Configuration and Power Interface
-			Format: { force | off | ht | strict | noirq | rsdt }
+			Format: { force | off | strict | noirq | rsdt }
 			force -- enable ACPI if default was off
 			off -- disable ACPI if default was on
 			noirq -- do not use ACPI for IRQ routing
-			ht -- run only enough ACPI to enable Hyper Threading
 			strict -- Be less tolerant of platforms that are not
 				strictly ACPI specification compliant.
 			rsdt -- prefer RSDT over (default) XSDT
diff --git a/arch/ia64/include/asm/acpi.h b/arch/ia64/include/asm/acpi.h
index 21adbd7f90f..837dc82a013 100644
--- a/arch/ia64/include/asm/acpi.h
+++ b/arch/ia64/include/asm/acpi.h
@@ -94,7 +94,6 @@ ia64_acpi_release_global_lock (unsigned int *lock)
 #define acpi_noirq 0	/* ACPI always enabled on IA64 */
 #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */
 #define acpi_strict 1	/* no ACPI spec workarounds on IA64 */
-#define acpi_ht 0	/* no HT-only mode on IA64 */
 #endif
 #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */
 static inline void disable_acpi(void) { }
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 56f462cf22d..aa2c39d968f 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -85,7 +85,6 @@ extern int acpi_ioapic;
 extern int acpi_noirq;
 extern int acpi_strict;
 extern int acpi_disabled;
-extern int acpi_ht;
 extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
@@ -97,7 +96,6 @@ void acpi_pic_sci_set_trigger(unsigned int, u16);
 static inline void disable_acpi(void)
 {
 	acpi_disabled = 1;
-	acpi_ht = 0;
 	acpi_pci_disabled = 1;
 	acpi_noirq = 1;
 }
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7914ab0ad76..63bcf39f8f2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -62,7 +62,6 @@ EXPORT_SYMBOL(acpi_disabled);
 int acpi_noirq;				/* skip ACPI IRQ initialization */
 int acpi_pci_disabled;		/* skip ACPI PCI scan and IRQ initialization */
 EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1;	/* enable HT */
 
 int acpi_lapic;
 int acpi_ioapic;
@@ -1460,9 +1459,8 @@ void __init acpi_boot_table_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return; 
 
 	/*
@@ -1493,9 +1491,8 @@ int __init early_acpi_boot_init(void)
 {
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	/*
@@ -1513,9 +1510,8 @@ int __init acpi_boot_init(void)
 
 	/*
 	 * If acpi_disabled, bail out
-	 * One exception: acpi=ht continues far enough to enumerate LAPICs
 	 */
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return 1;
 
 	acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1550,21 +1546,12 @@ static int __init parse_acpi(char *arg)
 	/* acpi=force to over-ride black-list */
 	else if (strcmp(arg, "force") == 0) {
 		acpi_force = 1;
-		acpi_ht = 1;
 		acpi_disabled = 0;
 	}
 	/* acpi=strict disables out-of-spec workarounds */
 	else if (strcmp(arg, "strict") == 0) {
 		acpi_strict = 1;
 	}
-	/* Limit ACPI just to boot-time to enable HT */
-	else if (strcmp(arg, "ht") == 0) {
-		if (!acpi_force) {
-			printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
-			disable_acpi();
-		}
-		acpi_ht = 1;
-	}
 	/* acpi=rsdt use RSDT instead of XSDT */
 	else if (strcmp(arg, "rsdt") == 0) {
 		acpi_rsdt_forced = 1;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7e59dc1d3fc..8eb9eed6028 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1391,7 +1391,6 @@ __init void lguest_init(void)
 #endif
 #ifdef CONFIG_ACPI
 	acpi_disabled = 1;
-	acpi_ht = 0;
 #endif
 
 	/*
diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c
index 8a0ed2800e6..f336bca7c45 100644
--- a/drivers/acpi/tables.c
+++ b/drivers/acpi/tables.c
@@ -213,7 +213,7 @@ acpi_table_parse_entries(char *id,
 	unsigned long table_end;
 	acpi_size tbl_size;
 
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return -ENODEV;
 
 	if (!handler)
@@ -280,7 +280,7 @@ int __init acpi_table_parse(char *id, acpi_table_handler handler)
 	struct acpi_table_header *table = NULL;
 	acpi_size tbl_size;
 
-	if (acpi_disabled && !acpi_ht)
+	if (acpi_disabled)
 		return -ENODEV;
 
 	if (!handler)
-- 
cgit v1.2.3-70-g09d2


From 62e7bec49479e0c61e8cfd914f722a9ca6fd52e5 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Wed, 24 Mar 2010 21:37:57 +0800
Subject: crypto: aesni-intel - Fix another CTR build failure with gas 2.16.1

The previous AES-NI CTR optimization compiling failure gas 2.16.1 fix
introduces another compiling failure by itself. This patch fixes that.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/include/asm/inst.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index 840a399701b..280bf7fb6ab 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -120,9 +120,9 @@
 	.macro REG_TYPE type reg
 	R64_NUM reg_type_r64 \reg
 	XMM_NUM reg_type_xmm \reg
-	.if reg_type_r64 != REG_NUM_INVALID
+	.if reg_type_r64 <> REG_NUM_INVALID
 	\type = REG_TYPE_R64
-	.elseif reg_type_xmm != REG_NUM_INVALID
+	.elseif reg_type_xmm <> REG_NUM_INVALID
 	\type = REG_TYPE_XMM
 	.else
 	\type = REG_TYPE_INVALID
-- 
cgit v1.2.3-70-g09d2


From 57283776b2b821ba4d592f61cad04d0293412740 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Thu, 11 Mar 2010 12:20:11 -0700
Subject: ACPI: pci_root: pass acpi_pci_root to arch-specific scan

The acpi_pci_root structure contains all the individual items (acpi_device,
domain, bus number) we pass to pci_acpi_scan_root(), so just pass the
single acpi_pci_root pointer directly.

This will make it easier to add _CBA support later.  For _CBA, we need the
entire downstream bus range, not just the base bus number.  We have that in
the acpi_pci_root structure, so passing the pointer makes it available to
the arch-specific code.

Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Reviewed-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/ia64/pci/pci.c         | 5 ++++-
 arch/x86/pci/acpi.c         | 5 ++++-
 drivers/acpi/pci_root.c     | 2 +-
 include/acpi/acpi_drivers.h | 3 +--
 4 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 64aff520b89..aa2533ae7e9 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -335,8 +335,11 @@ pcibios_setup_root_windows(struct pci_bus *bus, struct pci_controller *ctrl)
 }
 
 struct pci_bus * __devinit
-pci_acpi_scan_root(struct acpi_device *device, int domain, int bus)
+pci_acpi_scan_root(struct acpi_pci_root *root)
 {
+	struct acpi_device *device = root->device;
+	int domain = root->segment;
+	int bus = root->secondary.start;
 	struct pci_controller *controller;
 	unsigned int windows = 0;
 	struct pci_bus *pbus;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index e31160216ef..0b7882dbe78 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -229,8 +229,11 @@ res_alloc_fail:
 	return;
 }
 
-struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum)
+struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
 {
+	struct acpi_device *device = root->device;
+	int domain = root->segment;
+	int busnum = root->secondary.start;
 	struct pci_bus *bus;
 	struct pci_sysdata *sd;
 	int node;
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index bf476fea97a..680450c905b 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -498,7 +498,7 @@ static int __devinit acpi_pci_root_add(struct acpi_device *device)
 	 * PCI namespace does not get created until this call is made (and 
 	 * thus the root bridge's pci_dev does not exist).
 	 */
-	root->bus = pci_acpi_scan_root(device, segment, root->secondary.start);
+	root->bus = pci_acpi_scan_root(root);
 	if (!root->bus) {
 		printk(KERN_ERR PREFIX
 			    "Bus %04x:%02x not present in PCI namespace\n",
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 4f7b44866b7..23d78b4d088 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -104,8 +104,7 @@ int acpi_pci_bind_root(struct acpi_device *device);
 
 /* Arch-defined function to add a bus to the system */
 
-struct pci_bus *pci_acpi_scan_root(struct acpi_device *device, int domain,
-				   int bus);
+struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root);
 void pci_acpi_crs_quirks(void);
 
 /* --------------------------------------------------------------------------
-- 
cgit v1.2.3-70-g09d2


From 7926c09dea682be6f3b2e42f16c50d8554c6bbdc Mon Sep 17 00:00:00 2001
From: Jan III Sobieski <jan3sobi3ski@gmail.com>
Date: Sun, 28 Mar 2010 15:38:31 +0200
Subject: add random binaries to .gitignore

Signed-off-by: Jan III Sobieski <jan3sobi3ski@gmail.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 Documentation/.gitignore | 7 +++++++
 arch/x86/.gitignore      | 3 +++
 2 files changed, 10 insertions(+)
 create mode 100644 Documentation/.gitignore
 create mode 100644 arch/x86/.gitignore

(limited to 'arch/x86')

diff --git a/Documentation/.gitignore b/Documentation/.gitignore
new file mode 100644
index 00000000000..bcd907b4141
--- /dev/null
+++ b/Documentation/.gitignore
@@ -0,0 +1,7 @@
+filesystems/dnotify_test
+laptops/dslm
+timers/hpet_example
+vm/hugepage-mmap
+vm/hugepage-shm
+vm/map_hugetlb
+
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore
new file mode 100644
index 00000000000..028079065af
--- /dev/null
+++ b/arch/x86/.gitignore
@@ -0,0 +1,3 @@
+boot/compressed/vmlinux
+tools/test_get_len
+
-- 
cgit v1.2.3-70-g09d2


From 89a27f4d0e042a2fa3391a76b652aec3e16ef200 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 16 Feb 2010 10:51:48 +0200
Subject: KVM: use desc_ptr struct instead of kvm private descriptor_table

x86 arch defines desc_ptr for idt/gdt pointers, no need to define
another structure in kvm code.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 17 ++++++---------
 arch/x86/kvm/svm.c              | 28 ++++++++++++-------------
 arch/x86/kvm/vmx.c              | 36 ++++++++++++++++----------------
 arch/x86/kvm/x86.c              | 46 ++++++++++++++++++++---------------------
 4 files changed, 61 insertions(+), 66 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06d9e79ca37..cf392dfb800 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -461,11 +461,6 @@ struct kvm_vcpu_stat {
 	u32 nmi_injections;
 };
 
-struct descriptor_table {
-	u16 limit;
-	unsigned long base;
-} __attribute__((packed));
-
 struct kvm_x86_ops {
 	int (*cpu_has_kvm_support)(void);          /* __init */
 	int (*disabled_by_bios)(void);             /* __init */
@@ -503,10 +498,10 @@ struct kvm_x86_ops {
 	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
 	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
-	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
-	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
+	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
 	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
@@ -724,12 +719,12 @@ static inline void kvm_load_ldt(u16 sel)
 	asm("lldt %0" : : "rm"(sel));
 }
 
-static inline void kvm_get_idt(struct descriptor_table *table)
+static inline void kvm_get_idt(struct desc_ptr *table)
 {
 	asm("sidt %0" : "=m"(*table));
 }
 
-static inline void kvm_get_gdt(struct descriptor_table *table)
+static inline void kvm_get_gdt(struct desc_ptr *table)
 {
 	asm("sgdt %0" : "=m"(*table));
 }
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ba58206812..77fa2e3053b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -319,7 +319,7 @@ static int svm_hardware_enable(void *garbage)
 
 	struct svm_cpu_data *sd;
 	uint64_t efer;
-	struct descriptor_table gdt_descr;
+	struct desc_ptr gdt_descr;
 	struct desc_struct *gdt;
 	int me = raw_smp_processor_id();
 
@@ -345,7 +345,7 @@ static int svm_hardware_enable(void *garbage)
 	sd->next_asid = sd->max_asid + 1;
 
 	kvm_get_gdt(&gdt_descr);
-	gdt = (struct desc_struct *)gdt_descr.base;
+	gdt = (struct desc_struct *)gdt_descr.address;
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
 	wrmsrl(MSR_EFER, efer | EFER_SVME);
@@ -936,36 +936,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu)
 	return save->cpl;
 }
 
-static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	dt->limit = svm->vmcb->save.idtr.limit;
-	dt->base = svm->vmcb->save.idtr.base;
+	dt->size = svm->vmcb->save.idtr.limit;
+	dt->address = svm->vmcb->save.idtr.base;
 }
 
-static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.idtr.limit = dt->limit;
-	svm->vmcb->save.idtr.base = dt->base ;
+	svm->vmcb->save.idtr.limit = dt->size;
+	svm->vmcb->save.idtr.base = dt->address ;
 }
 
-static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	dt->limit = svm->vmcb->save.gdtr.limit;
-	dt->base = svm->vmcb->save.gdtr.base;
+	dt->size = svm->vmcb->save.gdtr.limit;
+	dt->address = svm->vmcb->save.gdtr.base;
 }
 
-static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	svm->vmcb->save.gdtr.limit = dt->limit;
-	svm->vmcb->save.gdtr.base = dt->base ;
+	svm->vmcb->save.gdtr.limit = dt->size;
+	svm->vmcb->save.gdtr.base = dt->address ;
 }
 
 static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bc933cfb4e6..68f895b0045 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -600,11 +600,11 @@ static void reload_tss(void)
 	/*
 	 * VT restores TR but not its size.  Useless.
 	 */
-	struct descriptor_table gdt;
+	struct desc_ptr gdt;
 	struct desc_struct *descs;
 
 	kvm_get_gdt(&gdt);
-	descs = (void *)gdt.base;
+	descs = (void *)gdt.address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
 }
@@ -758,7 +758,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (vcpu->cpu != cpu) {
-		struct descriptor_table dt;
+		struct desc_ptr dt;
 		unsigned long sysenter_esp;
 
 		vcpu->cpu = cpu;
@@ -768,7 +768,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
 		kvm_get_gdt(&dt);
-		vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
+		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -1934,28 +1934,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 	*l = (ar >> 13) & 1;
 }
 
-static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_IDTR_BASE);
+	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
+	dt->address = vmcs_readl(GUEST_IDTR_BASE);
 }
 
-static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_IDTR_BASE, dt->base);
+	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
+	vmcs_writel(GUEST_IDTR_BASE, dt->address);
 }
 
-static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
-	dt->base = vmcs_readl(GUEST_GDTR_BASE);
+	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
+	dt->address = vmcs_readl(GUEST_GDTR_BASE);
 }
 
-static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
+static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
 {
-	vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
-	vmcs_writel(GUEST_GDTR_BASE, dt->base);
+	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
+	vmcs_writel(GUEST_GDTR_BASE, dt->address);
 }
 
 static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
@@ -2334,7 +2334,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	u32 junk;
 	u64 host_pat, tsc_this, tsc_base;
 	unsigned long a;
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 	int i;
 	unsigned long kvm_vmx_return;
 	u32 exec_control;
@@ -2416,7 +2416,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
 	kvm_get_idt(&dt);
-	vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
+	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
 	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4ca98ad27..274a8e39bca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -225,7 +225,7 @@ static void drop_user_return_notifiers(void *ignore)
 
 unsigned long segment_base(u16 selector)
 {
-	struct descriptor_table gdt;
+	struct desc_ptr gdt;
 	struct desc_struct *d;
 	unsigned long table_base;
 	unsigned long v;
@@ -234,7 +234,7 @@ unsigned long segment_base(u16 selector)
 		return 0;
 
 	kvm_get_gdt(&gdt);
-	table_base = gdt.base;
+	table_base = gdt.address;
 
 	if (selector & 4) {           /* from ldt */
 		u16 ldt_selector = kvm_read_ldt();
@@ -3949,14 +3949,14 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-	struct descriptor_table dt = { limit, base };
+	struct desc_ptr dt = { limit, base };
 
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 }
 
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
-	struct descriptor_table dt = { limit, base };
+	struct desc_ptr dt = { limit, base };
 
 	kvm_x86_ops->set_idt(vcpu, &dt);
 }
@@ -4581,7 +4581,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 
 	vcpu_load(vcpu);
 
@@ -4596,11 +4596,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
 	kvm_x86_ops->get_idt(vcpu, &dt);
-	sregs->idt.limit = dt.limit;
-	sregs->idt.base = dt.base;
+	sregs->idt.limit = dt.size;
+	sregs->idt.base = dt.address;
 	kvm_x86_ops->get_gdt(vcpu, &dt);
-	sregs->gdt.limit = dt.limit;
-	sregs->gdt.base = dt.base;
+	sregs->gdt.limit = dt.size;
+	sregs->gdt.base = dt.address;
 
 	sregs->cr0 = kvm_read_cr0(vcpu);
 	sregs->cr2 = vcpu->arch.cr2;
@@ -4672,7 +4672,7 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 
 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 					  u16 selector,
-					  struct descriptor_table *dtable)
+					  struct desc_ptr *dtable)
 {
 	if (selector & 1 << 2) {
 		struct kvm_segment kvm_seg;
@@ -4680,10 +4680,10 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
 
 		if (kvm_seg.unusable)
-			dtable->limit = 0;
+			dtable->size = 0;
 		else
-			dtable->limit = kvm_seg.limit;
-		dtable->base = kvm_seg.base;
+			dtable->size = kvm_seg.limit;
+		dtable->address = kvm_seg.base;
 	}
 	else
 		kvm_x86_ops->get_gdt(vcpu, dtable);
@@ -4693,7 +4693,7 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 					 struct desc_struct *seg_desc)
 {
-	struct descriptor_table dtable;
+	struct desc_ptr dtable;
 	u16 index = selector >> 3;
 	int ret;
 	u32 err;
@@ -4701,7 +4701,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 
 	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
-	if (dtable.limit < index * 8 + 7) {
+	if (dtable.size < index * 8 + 7) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
@@ -4718,14 +4718,14 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 					 struct desc_struct *seg_desc)
 {
-	struct descriptor_table dtable;
+	struct desc_ptr dtable;
 	u16 index = selector >> 3;
 
 	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
-	if (dtable.limit < index * 8 + 7)
+	if (dtable.size < index * 8 + 7)
 		return 1;
-	return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
+	return kvm_write_guest_virt(dtable.address + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
 }
 
 static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
@@ -5204,15 +5204,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 {
 	int mmu_reset_needed = 0;
 	int pending_vec, max_bits;
-	struct descriptor_table dt;
+	struct desc_ptr dt;
 
 	vcpu_load(vcpu);
 
-	dt.limit = sregs->idt.limit;
-	dt.base = sregs->idt.base;
+	dt.size = sregs->idt.limit;
+	dt.address = sregs->idt.base;
 	kvm_x86_ops->set_idt(vcpu, &dt);
-	dt.limit = sregs->gdt.limit;
-	dt.base = sregs->gdt.base;
+	dt.size = sregs->gdt.limit;
+	dt.address = sregs->gdt.base;
 	kvm_x86_ops->set_gdt(vcpu, &dt);
 
 	vcpu->arch.cr2 = sregs->cr2;
-- 
cgit v1.2.3-70-g09d2


From 1161624f15f584096a0df3dda70403cd1d00721e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 11 Feb 2010 14:43:14 +0200
Subject: KVM: inject #UD in 64bit mode from instruction that are not valid
 there

Some instruction are obsolete in a long mode. Inject #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dade6ac082..96d4bef06e1 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1015,11 +1015,6 @@ done_prefixes:
 		}
 	}
 
-	if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-		kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
-		return -1;
-	}
-
 	if (c->d & Group) {
 		group = c->d & GroupMask;
 		c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1828,6 +1823,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 	saved_eip = c->eip;
 
+	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		goto done;
+	}
+
 	/* LOCK prefix is allowed only with some instructions */
 	if (c->lock_prefix && !(c->d & Lock)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
-- 
cgit v1.2.3-70-g09d2


From 3e2815e9fa6c06bcb8a9340e43008bbe48437d25 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 12 Feb 2010 15:53:59 +0900
Subject: KVM: x86 emulator: X86EMUL macro replacements: from
 do_fetch_insn_byte() to x86_decode_insn()

This patch just replaces the integer values used inside x86's
decode functions to X86EMUL_*.

By this patch, it becomes clearer that we are using X86EMUL_*
value propagated from ops->read_std() in do_fetch_insn_byte().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 96d4bef06e1..b8aed35ab5f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -647,20 +647,20 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 	if (linear < fc->start || linear >= fc->end) {
 		size = min(15UL, PAGE_SIZE - offset_in_page(linear));
 		rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		fc->start = linear;
 		fc->end = linear + size;
 	}
 	*dest = fc->data[linear - fc->start];
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
 			 unsigned long eip, void *dest, unsigned size)
 {
-	int rc = 0;
+	int rc;
 
 	/* x86 instructions are limited to 15 bytes. */
 	if (eip + size - ctxt->decode.eip_orig > 15)
@@ -668,10 +668,10 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	eip += ctxt->cs_base;
 	while (size--) {
 		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			return rc;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 /*
@@ -782,7 +782,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	u8 sib;
 	int index_reg = 0, base_reg = 0, scale;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	if (c->rex_prefix) {
 		c->modrm_reg = (c->rex_prefix & 4) << 1;	/* REX.R */
@@ -895,7 +895,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
 		      struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	switch (c->ad_bytes) {
 	case 2:
@@ -916,7 +916,7 @@ int
 x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, group;
 
@@ -1041,7 +1041,7 @@ done_prefixes:
 		rc = decode_modrm(ctxt, ops);
 	else if (c->d & MemAbs)
 		rc = decode_abs(ctxt, ops);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
 	if (!c->has_seg_override)
-- 
cgit v1.2.3-70-g09d2


From 1b30eaa84609031c06e417eafd5b68f45e4266f7 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 12 Feb 2010 15:57:56 +0900
Subject: KVM: x86 emulator: X86EMUL macro replacements: x86_emulate_insn() and
 its helpers

This patch just replaces integer values used inside
x86_emulate_insn() and its helper functions to X86EMUL_*.

The purpose of this is to make it clear what will happen
when the variable rc is compared to X86EMUL_* at the end
of x86_emulate_insn().

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 62 +++++++++++++++++++++++---------------------------
 1 file changed, 29 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b8aed35ab5f..ee1a2a2c12e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -702,7 +702,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
 	*address = 0;
 	rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
 			   ctxt->vcpu, NULL);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
 			   ctxt->vcpu, NULL);
@@ -1301,7 +1301,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	int rc;
 
 	rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
-	if (rc != 0)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
 	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
@@ -1327,7 +1327,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
 
 	while (reg >= VCPU_REGS_RAX) {
@@ -1338,7 +1338,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 		}
 
 		rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			break;
 		--reg;
 	}
@@ -1349,12 +1349,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc;
 
-	rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
-	if (rc != 0)
-		return rc;
-	return 0;
+	return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes);
 }
 
 static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
@@ -1390,7 +1386,7 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1437,7 +1433,7 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 		emulate_push(ctxt);
 		break;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
@@ -1468,7 +1464,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 			return rc;
 		ctxt->eflags |= EFLG_ZF;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
@@ -1479,12 +1475,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	unsigned long cs;
 
 	rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	if (c->op_bytes == 4)
 		c->eip = (u32)c->eip;
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
-	if (rc)
+	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
 	return rc;
@@ -1539,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	default:
 		break;
 	}
-	return 0;
+	return X86EMUL_CONTINUE;
 }
 
 static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
@@ -1811,7 +1807,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	struct decode_cache *c = &ctxt->decode;
 	unsigned int port;
 	int io_dir_in;
-	int rc = 0;
+	int rc = X86EMUL_CONTINUE;
 
 	ctxt->interruptibility = 0;
 
@@ -1926,7 +1922,7 @@ special_insn:
 		break;
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x08 ... 0x0d:
@@ -1945,7 +1941,7 @@ special_insn:
 		break;
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x18 ... 0x1d:
@@ -1957,7 +1953,7 @@ special_insn:
 		break;
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x20 ... 0x25:
@@ -1988,7 +1984,7 @@ special_insn:
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 		rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x60:	/* pusha */
@@ -1996,7 +1992,7 @@ special_insn:
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x63:		/* movsxd */
@@ -2141,7 +2137,7 @@ special_insn:
 	}
 	case 0x8f:		/* pop (sole member of Grp1a) */
 		rc = emulate_grp1a(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0x90: /* nop / xchg r8,rax */
@@ -2277,7 +2273,7 @@ special_insn:
 		break;
 	case 0xcb:		/* ret far */
 		rc = emulate_ret_far(ctxt, ops);
-		if (rc)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xd0 ... 0xd1:	/* Grp2 */
@@ -2351,7 +2347,7 @@ special_insn:
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		rc = emulate_grp3(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xf8: /* clc */
@@ -2385,14 +2381,14 @@ special_insn:
 		break;
 	case 0xfe ... 0xff:	/* Grp4/Grp5 */
 		rc = emulate_grp45(ctxt, ops);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	}
 
 writeback:
 	rc = writeback(ctxt, ops);
-	if (rc != 0)
+	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
 	/* Commit shadow register state. */
@@ -2418,7 +2414,7 @@ twobyte_insn:
 				goto cannot_emulate;
 
 			rc = kvm_fix_hypercall(ctxt->vcpu);
-			if (rc)
+			if (rc != X86EMUL_CONTINUE)
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
@@ -2429,7 +2425,7 @@ twobyte_insn:
 		case 2: /* lgdt */
 			rc = read_descriptor(ctxt, ops, c->src.ptr,
 					     &size, &address, c->op_bytes);
-			if (rc)
+			if (rc != X86EMUL_CONTINUE)
 				goto done;
 			realmode_lgdt(ctxt->vcpu, size, address);
 			/* Disable writeback. */
@@ -2440,7 +2436,7 @@ twobyte_insn:
 				switch (c->modrm_rm) {
 				case 1:
 					rc = kvm_fix_hypercall(ctxt->vcpu);
-					if (rc)
+					if (rc != X86EMUL_CONTINUE)
 						goto done;
 					break;
 				default:
@@ -2450,7 +2446,7 @@ twobyte_insn:
 				rc = read_descriptor(ctxt, ops, c->src.ptr,
 						     &size, &address,
 						     c->op_bytes);
-				if (rc)
+				if (rc != X86EMUL_CONTINUE)
 					goto done;
 				realmode_lidt(ctxt->vcpu, size, address);
 			}
@@ -2577,7 +2573,7 @@ twobyte_insn:
 		break;
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xa3:
@@ -2596,7 +2592,7 @@ twobyte_insn:
 		break;
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
 	case 0xab:
@@ -2669,7 +2665,7 @@ twobyte_insn:
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
 		rc = emulate_grp9(ctxt, ops, memop);
-		if (rc != 0)
+		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->dst.type = OP_NONE;
 		break;
-- 
cgit v1.2.3-70-g09d2


From 0e4176a15f9af494ad098cb5a76bcfa17e14282b Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 12 Feb 2010 16:00:55 +0900
Subject: KVM: x86 emulator: Fix x86_emulate_insn() not to use the variable rc
 for non-X86EMUL values

This patch makes non-X86EMUL_* family functions not to use
the variable rc.

Be sure that this changes nothing but makes the purpose of
the variable rc clearer.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ee1a2a2c12e..35f7acd4a91 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2498,9 +2498,9 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
-		if (rc)
+		if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]))
 			goto cannot_emulate;
+		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
@@ -2513,18 +2513,16 @@ twobyte_insn:
 	case 0x23: /* mov from reg to dr */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		rc = emulator_set_dr(ctxt, c->modrm_reg,
-				     c->regs[c->modrm_rm]);
-		if (rc)
+		if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]))
 			goto cannot_emulate;
+		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
 		/* wrmsr */
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
-		if (rc) {
+		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			c->eip = kvm_rip_read(ctxt->vcpu);
 		}
@@ -2533,8 +2531,7 @@ twobyte_insn:
 		break;
 	case 0x32:
 		/* rdmsr */
-		rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
-		if (rc) {
+		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			c->eip = kvm_rip_read(ctxt->vcpu);
 		} else {
-- 
cgit v1.2.3-70-g09d2


From ad91f8ffbb18413e79f9f976a55b4e11d02e6a6d Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Fri, 12 Feb 2010 16:02:54 +0900
Subject: KVM: remove redundant prototype of load_pdptrs()

This patch removes redundant prototype of load_pdptrs().

I found load_pdptrs() twice in kvm_host.h. Let's remove one.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cf392dfb800..d46e791de12 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -670,7 +670,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
 
-int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
-- 
cgit v1.2.3-70-g09d2


From 7597f129d8b6799da7a264e6d6f7401668d3a36d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:00 +0100
Subject: KVM: SVM: Don't use kmap_atomic in nested_svm_map

Use of kmap_atomic disables preemption but if we run in
shadow-shadow mode the vmrun emulation executes kvm_set_cr3
which might sleep or fault. So use kmap instead for
nested_svm_map.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 47 ++++++++++++++++++++++++-----------------------
 1 file changed, 24 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 77fa2e3053b..f9da35b06ec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1423,7 +1423,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 	return 0;
 }
 
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
 
@@ -1431,7 +1431,9 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
 	if (is_error_page(page))
 		goto error;
 
-	return kmap_atomic(page, idx);
+	*_page = page;
+
+	return kmap(page);
 
 error:
 	kvm_release_page_clean(page);
@@ -1440,16 +1442,9 @@ error:
 	return NULL;
 }
 
-static void nested_svm_unmap(void *addr, enum km_type idx)
+static void nested_svm_unmap(struct page *page)
 {
-	struct page *page;
-
-	if (!addr)
-		return;
-
-	page = kmap_atomic_to_page(addr);
-
-	kunmap_atomic(addr, idx);
+	kunmap(page);
 	kvm_release_page_dirty(page);
 }
 
@@ -1457,6 +1452,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 param = svm->vmcb->control.exit_info_1 & 1;
 	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	struct page *page;
 	bool ret = false;
 	u32 t0, t1;
 	u8 *msrpm;
@@ -1464,7 +1460,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return false;
 
-	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
 
 	if (!msrpm)
 		goto out;
@@ -1492,7 +1488,7 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	ret = msrpm[t1] & ((1 << param) << t0);
 
 out:
-	nested_svm_unmap(msrpm, KM_USER0);
+	nested_svm_unmap(page);
 
 	return ret;
 }
@@ -1615,6 +1611,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	struct vmcb *nested_vmcb;
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
 
 	trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
 				       vmcb->control.exit_info_1,
@@ -1622,7 +1619,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 				       vmcb->control.exit_int_info,
 				       vmcb->control.exit_int_info_err);
 
-	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
 	if (!nested_vmcb)
 		return 1;
 
@@ -1712,7 +1709,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	/* Exit nested SVM mode */
 	svm->nested.vmcb = 0;
 
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	kvm_mmu_reset_context(&svm->vcpu);
 	kvm_mmu_load(&svm->vcpu);
@@ -1723,9 +1720,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
 	u32 *nested_msrpm;
+	struct page *page;
 	int i;
 
-	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0);
+	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
 	if (!nested_msrpm)
 		return false;
 
@@ -1734,7 +1732,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 
 	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
 
-	nested_svm_unmap(nested_msrpm, KM_USER0);
+	nested_svm_unmap(page);
 
 	return true;
 }
@@ -1744,8 +1742,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	struct vmcb *nested_vmcb;
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
+	struct page *page;
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return false;
 
@@ -1857,7 +1856,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
 	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	enable_gif(svm);
 
@@ -1883,6 +1882,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 static int vmload_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
+	struct page *page;
 
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1890,12 +1890,12 @@ static int vmload_interception(struct vcpu_svm *svm)
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return 1;
 
 	nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	return 1;
 }
@@ -1903,6 +1903,7 @@ static int vmload_interception(struct vcpu_svm *svm)
 static int vmsave_interception(struct vcpu_svm *svm)
 {
 	struct vmcb *nested_vmcb;
+	struct page *page;
 
 	if (nested_svm_check_permissions(svm))
 		return 1;
@@ -1910,12 +1911,12 @@ static int vmsave_interception(struct vcpu_svm *svm)
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
 
-	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0);
+	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return 1;
 
 	nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
-	nested_svm_unmap(nested_vmcb, KM_USER0);
+	nested_svm_unmap(page);
 
 	return 1;
 }
-- 
cgit v1.2.3-70-g09d2


From b8e88bc8ffba5fe53fb8d8a0a4be3bbcffeebe56 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:02 +0100
Subject: KVM: SVM: Fix schedule-while-atomic on nested exception handling

Move the actual vmexit routine out of code that runs with
irqs and preemption disabled.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f9da35b06ec..c27da0ad040 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -129,6 +129,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm);
+static int nested_svm_intercept(struct vcpu_svm *svm);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code);
@@ -1384,6 +1385,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 				      bool has_error_code, u32 error_code)
 {
+	int vmexit;
+
 	if (!is_nested(svm))
 		return 0;
 
@@ -1392,7 +1395,11 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	svm->vmcb->control.exit_info_1 = error_code;
 	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
-	return nested_svm_exit_handled(svm);
+	vmexit = nested_svm_intercept(svm);
+	if (vmexit == NESTED_EXIT_DONE)
+		svm->nested.exit_required = true;
+
+	return vmexit;
 }
 
 static inline int nested_svm_intr(struct vcpu_svm *svm)
@@ -1521,7 +1528,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 /*
  * If this function returns true, this #vmexit was already handled
  */
-static int nested_svm_exit_handled(struct vcpu_svm *svm)
+static int nested_svm_intercept(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
 	int vmexit = NESTED_EXIT_HOST;
@@ -1567,9 +1574,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
 	}
 	}
 
-	if (vmexit == NESTED_EXIT_DONE) {
+	return vmexit;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+	int vmexit;
+
+	vmexit = nested_svm_intercept(svm);
+
+	if (vmexit == NESTED_EXIT_DONE)
 		nested_svm_vmexit(svm);
-	}
 
 	return vmexit;
 }
-- 
cgit v1.2.3-70-g09d2


From cdbbdc1210223879450555fee04c29ebf116576b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:03 +0100
Subject: KVM: SVM: Sync all control registers on nested vmexit

Currently the vmexit emulation does not sync control
registers were the access is typically intercepted by the
nested hypervisor. But we can not count on that intercepts
to sync these registers too and make the code
architecturally more correct.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c27da0ad040..02f8d491d15 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1647,9 +1647,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.ds     = vmcb->save.ds;
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
+	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
 	if (npt_enabled)
 		nested_vmcb->save.cr3    = vmcb->save.cr3;
+	else
+		nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
+	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = vmcb->save.rflags;
 	nested_vmcb->save.rip    = vmcb->save.rip;
 	nested_vmcb->save.rsp    = vmcb->save.rsp;
-- 
cgit v1.2.3-70-g09d2


From 6c3bd3d7660c35a703073b81eccfd5a3b7c15295 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:04 +0100
Subject: KVM: SVM: Annotate nested_svm_map with might_sleep()

The nested_svm_map() function can sleep and must not be
called from atomic context. So annotate that function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 02f8d491d15..4bc018333d7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1434,6 +1434,8 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
 
+	might_sleep();
+
 	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
 	if (is_error_page(page))
 		goto error;
-- 
cgit v1.2.3-70-g09d2


From 4c7da8cb43c09e71a405b5aeaa58a1dbac3c39e9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:05 +0100
Subject: KVM: SVM: Fix nested msr intercept handling

The nested_svm_exit_handled_msr() function maps only one
page of the guests msr permission bitmap. This patch changes
the code to use kvm_read_guest to fix the bug.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4bc018333d7..4459c477af9 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1461,19 +1461,13 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 param = svm->vmcb->control.exit_info_1 & 1;
 	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	struct page *page;
 	bool ret = false;
 	u32 t0, t1;
-	u8 *msrpm;
+	u8 val;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return false;
 
-	msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
-
-	if (!msrpm)
-		goto out;
-
 	switch (msr) {
 	case 0 ... 0x1fff:
 		t0 = (msr * 2) % 8;
@@ -1494,11 +1488,10 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 		goto out;
 	}
 
-	ret = msrpm[t1] & ((1 << param) << t0);
+	if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
+		ret = val & ((1 << param) << t0);
 
 out:
-	nested_svm_unmap(page);
-
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 88ab24adc7142506c8583ac36a34fa388300b750 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:06 +0100
Subject: KVM: SVM: Don't sync nested cr8 to lapic and back

This patch makes syncing of the guest tpr to the lapic
conditional on !nested. Otherwise a nested guest using the
TPR could freeze the guest.
Another important change this patch introduces is that the
cr8 intercept bits are no longer ORed at vmrun emulation if
the guest sets VINTR_MASKING in its VMCB. The reason is that
nested cr8 accesses need alway be handled by the nested
hypervisor because they change the shadow version of the
tpr.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 46 +++++++++++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4459c477af9..481bd0ee5f7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1832,21 +1832,6 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
-	/* We don't want a nested guest to be more powerful than the guest,
-	   so all intercepts are ORed */
-	svm->vmcb->control.intercept_cr_read |=
-		nested_vmcb->control.intercept_cr_read;
-	svm->vmcb->control.intercept_cr_write |=
-		nested_vmcb->control.intercept_cr_write;
-	svm->vmcb->control.intercept_dr_read |=
-		nested_vmcb->control.intercept_dr_read;
-	svm->vmcb->control.intercept_dr_write |=
-		nested_vmcb->control.intercept_dr_write;
-	svm->vmcb->control.intercept_exceptions |=
-		nested_vmcb->control.intercept_exceptions;
-
-	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
-
 	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
 
 	/* cache intercepts */
@@ -1864,6 +1849,28 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	else
 		svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
+	if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
+		/* We only want the cr8 intercept bits of the guest */
+		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
+		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+	}
+
+	/* We don't want a nested guest to be more powerful than the guest,
+	   so all intercepts are ORed */
+	svm->vmcb->control.intercept_cr_read |=
+		nested_vmcb->control.intercept_cr_read;
+	svm->vmcb->control.intercept_cr_write |=
+		nested_vmcb->control.intercept_cr_write;
+	svm->vmcb->control.intercept_dr_read |=
+		nested_vmcb->control.intercept_dr_read;
+	svm->vmcb->control.intercept_dr_write |=
+		nested_vmcb->control.intercept_dr_write;
+	svm->vmcb->control.intercept_exceptions |=
+		nested_vmcb->control.intercept_exceptions;
+
+	svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+
+	svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
 	svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
 	svm->vmcb->control.int_state = nested_vmcb->control.int_state;
 	svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
@@ -2526,6 +2533,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	if (irr == -1)
 		return;
 
@@ -2629,6 +2639,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
 		kvm_set_cr8(vcpu, cr8);
@@ -2640,6 +2653,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
+	if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+		return;
+
 	cr8 = kvm_get_cr8(vcpu);
 	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-- 
cgit v1.2.3-70-g09d2


From 06fc7772690dec2a0e3814633357babf8f63af41 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:07 +0100
Subject: KVM: SVM: Activate nested state only when guest state is complete

Certain functions called during the emulated world switch
behave differently when the vcpu is running nested. This is
not the expected behavior during a world switch emulation.
This patch ensures that the nested state is activated only
if the vcpu is completly in nested state.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 481bd0ee5f7..8ace0b0da93 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1633,6 +1633,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return 1;
 
+	/* Exit nested SVM mode */
+	svm->nested.vmcb = 0;
+
 	/* Give the current vmcb to the guest */
 	disable_gif(svm);
 
@@ -1720,9 +1723,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	svm->vmcb->save.cpl = 0;
 	svm->vmcb->control.exit_int_info = 0;
 
-	/* Exit nested SVM mode */
-	svm->nested.vmcb = 0;
-
 	nested_svm_unmap(page);
 
 	kvm_mmu_reset_context(&svm->vcpu);
@@ -1757,14 +1757,14 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	struct vmcb *hsave = svm->nested.hsave;
 	struct vmcb *vmcb = svm->vmcb;
 	struct page *page;
+	u64 vmcb_gpa;
+
+	vmcb_gpa = svm->vmcb->save.rax;
 
 	nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
 	if (!nested_vmcb)
 		return false;
 
-	/* nested_vmcb is our indicator if nested SVM is activated */
-	svm->nested.vmcb = svm->vmcb->save.rax;
-
 	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
@@ -1879,6 +1879,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 
 	nested_svm_unmap(page);
 
+	/* nested_vmcb is our indicator if nested SVM is activated */
+	svm->nested.vmcb = vmcb_gpa;
+
 	enable_gif(svm);
 
 	return true;
-- 
cgit v1.2.3-70-g09d2


From 66a562f7e2576cde384ec813b481404d8f54f4c6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:08 +0100
Subject: KVM: SVM: Make lazy FPU switching work with nested svm

The new lazy fpu switching code may disable cr0 intercepts
when running nested. This is a bug because the nested
hypervisor may still want to intercept cr0 which will break
in this situation. This patch fixes this issue and makes
lazy fpu switching working with nested svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8ace0b0da93..a8ec53fe74f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -979,6 +979,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 
 static void update_cr0_intercept(struct vcpu_svm *svm)
 {
+	struct vmcb *vmcb = svm->vmcb;
 	ulong gcr0 = svm->vcpu.arch.cr0;
 	u64 *hcr0 = &svm->vmcb->save.cr0;
 
@@ -990,11 +991,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
 
 
 	if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
-		svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
-		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+		vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
+		vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+		if (is_nested(svm)) {
+			struct vmcb *hsave = svm->nested.hsave;
+
+			hsave->control.intercept_cr_read  &= ~INTERCEPT_CR0_MASK;
+			hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
+			vmcb->control.intercept_cr_read  |= svm->nested.intercept_cr_read;
+			vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
+		}
 	} else {
 		svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
 		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+		if (is_nested(svm)) {
+			struct vmcb *hsave = svm->nested.hsave;
+
+			hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
+			hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
+		}
 	}
 }
 
@@ -1269,7 +1284,22 @@ static int ud_interception(struct vcpu_svm *svm)
 static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
+	u32 excp;
+
+	if (is_nested(svm)) {
+		u32 h_excp, n_excp;
+
+		h_excp  = svm->nested.hsave->control.intercept_exceptions;
+		n_excp  = svm->nested.intercept_exceptions;
+		h_excp &= ~(1 << NM_VECTOR);
+		excp    = h_excp | n_excp;
+	} else {
+		excp  = svm->vmcb->control.intercept_exceptions;
+	        excp &= ~(1 << NM_VECTOR);
+	}
+
+	svm->vmcb->control.intercept_exceptions = excp;
+
 	svm->vcpu.fpu_active = 1;
 	update_cr0_intercept(svm);
 }
@@ -1513,6 +1543,9 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 		if (!npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
+	case SVM_EXIT_EXCP_BASE + NM_VECTOR:
+		nm_interception(svm);
+		break;
 	default:
 		break;
 	}
@@ -2980,8 +3013,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	update_cr0_intercept(svm);
 	svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
+	if (is_nested(svm))
+		svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+	update_cr0_intercept(svm);
 }
 
 static struct kvm_x86_ops svm_x86_ops = {
-- 
cgit v1.2.3-70-g09d2


From 052ce6211c4f7309988068fccdb7204c721871df Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:09 +0100
Subject: KVM: SVM: Remove newlines from nested trace points

The tracing infrastructure adds its own newlines. Remove
them from the trace point printk format strings.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/trace.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6ad30a29f04..12f8d2dee98 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -413,7 +413,7 @@ TRACE_EVENT(kvm_nested_vmrun,
 	),
 
 	TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
-		  "event_inj: 0x%08x npt: %s\n",
+		  "event_inj: 0x%08x npt: %s",
 		__entry->rip, __entry->vmcb, __entry->nested_rip,
 		__entry->int_ctl, __entry->event_inj,
 		__entry->npt ? "on" : "off")
@@ -447,7 +447,7 @@ TRACE_EVENT(kvm_nested_vmexit,
 		__entry->exit_int_info_err	= exit_int_info_err;
 	),
 	TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
-		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  __entry->rip,
 		  ftrace_print_symbols_seq(p, __entry->exit_code,
 					   kvm_x86_ops->exit_reasons_str),
@@ -482,7 +482,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
 	),
 
 	TP_printk("reason: %s ext_inf1: 0x%016llx "
-		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
+		  "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
 		  ftrace_print_symbols_seq(p, __entry->exit_code,
 					   kvm_x86_ops->exit_reasons_str),
 		__entry->exit_info1, __entry->exit_info2,
@@ -504,7 +504,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit,
 		__entry->rip	=	rip
 	),
 
-	TP_printk("rip: 0x%016llx\n", __entry->rip)
+	TP_printk("rip: 0x%016llx", __entry->rip)
 );
 
 /*
@@ -526,7 +526,7 @@ TRACE_EVENT(kvm_invlpga,
 		__entry->address	=	address;
 	),
 
-	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
+	TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx",
 		  __entry->rip, __entry->asid, __entry->address)
 );
 
@@ -547,7 +547,7 @@ TRACE_EVENT(kvm_skinit,
 		__entry->slb		=	slb;
 	),
 
-	TP_printk("rip: 0x%016llx slb: 0x%08x\n",
+	TP_printk("rip: 0x%016llx slb: 0x%08x",
 		  __entry->rip, __entry->slb)
 );
 
-- 
cgit v1.2.3-70-g09d2


From 112592da0dc2460c95e8a89d0c5657c6a30286aa Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 21 Feb 2010 15:00:47 +0200
Subject: KVM: drop unneeded kvm_run check in emulate_instruction()

vcpu->run is initialized on vcpu creation and can never be NULL
here.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 274a8e39bca..1d27a57026a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3443,7 +3443,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
 
-	if ((r || vcpu->mmio_is_write) && run) {
+	if (r || vcpu->mmio_is_write) {
 		run->exit_reason = KVM_EXIT_MMIO;
 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
 		memcpy(run->mmio.data, vcpu->mmio_data, 8);
-- 
cgit v1.2.3-70-g09d2


From 8fe546547cf6857a9d984bfe2f2194910f3fc5d0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 19 Feb 2010 16:23:01 +0100
Subject: KVM: SVM: Fix wrong interrupt injection in enable_irq_windows

The nested_svm_intr() function does not execute the vmexit
anymore. Therefore we may still be in the nested state after
that function ran. This patch changes the nested_svm_intr()
function to return wether the irq window could be enabled.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a8ec53fe74f..294bbca3417 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1432,16 +1432,17 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
 	return vmexit;
 }
 
-static inline int nested_svm_intr(struct vcpu_svm *svm)
+/* This function returns true if it is save to enable the irq window */
+static inline bool nested_svm_intr(struct vcpu_svm *svm)
 {
 	if (!is_nested(svm))
-		return 0;
+		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
-		return 0;
+		return true;
 
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
-		return 0;
+		return false;
 
 	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
 
@@ -1454,10 +1455,10 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 		 */
 		svm->nested.exit_required = true;
 		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
-		return 1;
+		return false;
 	}
 
-	return 0;
+	return true;
 }
 
 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
@@ -2629,13 +2630,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	nested_svm_intr(svm);
-
 	/* In case GIF=0 we can't rely on the CPU to tell us when
 	 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
 	 * The next time we get that intercept, this function will be
 	 * called again though and we'll get the vintr intercept. */
-	if (gif_set(svm)) {
+	if (gif_set(svm) && nested_svm_intr(svm)) {
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
 	}
-- 
cgit v1.2.3-70-g09d2


From 03b82a30ea8b26199901b219848d706dbd70c609 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 15 Feb 2010 10:45:41 +0100
Subject: KVM: x86: Do not return soft events in vcpu_events

To avoid that user space migrates a pending software exception or
interrupt, mask them out on KVM_GET_VCPU_EVENTS. Without this, user
space would try to reinject them, and we would have to reconstruct the
proper instruction length for VMX event injection. Now the pending event
will be reinjected via executing the triggering instruction again.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1d27a57026a..2b1c9f2fb8d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2100,14 +2100,17 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 {
 	vcpu_load(vcpu);
 
-	events->exception.injected = vcpu->arch.exception.pending;
+	events->exception.injected =
+		vcpu->arch.exception.pending &&
+		!kvm_exception_is_soft(vcpu->arch.exception.nr);
 	events->exception.nr = vcpu->arch.exception.nr;
 	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
 	events->exception.error_code = vcpu->arch.exception.error_code;
 
-	events->interrupt.injected = vcpu->arch.interrupt.pending;
+	events->interrupt.injected =
+		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
 	events->interrupt.nr = vcpu->arch.interrupt.nr;
-	events->interrupt.soft = vcpu->arch.interrupt.soft;
+	events->interrupt.soft = 0;
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
 	events->nmi.pending = vcpu->arch.nmi_pending;
-- 
cgit v1.2.3-70-g09d2


From 48005f64d0ea965d454e38b5181af4aba9bdef5b Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 19 Feb 2010 19:38:07 +0100
Subject: KVM: x86: Save&restore interrupt shadow mask

The interrupt shadow created by STI or MOV-SS-like operations is part of
the VCPU state and must be preserved across migration. Transfer it in
the spare padding field of kvm_vcpu_events.interrupt.

As a side effect we now have to make vmx_set_interrupt_shadow robust
against both shadow types being set. Give MOV SS a higher priority and
skip STI in that case to avoid that VMX throws a fault on next entry.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt          | 11 ++++++++++-
 arch/x86/include/asm/kvm.h         |  7 ++++++-
 arch/x86/include/asm/kvm_emulate.h |  3 ---
 arch/x86/kvm/emulate.c             |  4 ++--
 arch/x86/kvm/svm.c                 |  2 +-
 arch/x86/kvm/vmx.c                 |  8 ++++----
 arch/x86/kvm/x86.c                 | 12 ++++++++++--
 include/linux/kvm.h                |  1 +
 8 files changed, 34 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index beb444a9501..9e5de5a1c4e 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -656,6 +656,7 @@ struct kvm_clock_data {
 4.29 KVM_GET_VCPU_EVENTS
 
 Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_vcpu_event (out)
@@ -676,7 +677,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
@@ -688,9 +689,13 @@ struct kvm_vcpu_events {
 	__u32 flags;
 };
 
+KVM_VCPUEVENT_VALID_SHADOW may be set in the flags field to signal that
+interrupt.shadow contains a valid state. Otherwise, this field is undefined.
+
 4.30 KVM_SET_VCPU_EVENTS
 
 Capability: KVM_CAP_VCPU_EVENTS
+Extended by: KVM_CAP_INTR_SHADOW
 Architectures: x86
 Type: vm ioctl
 Parameters: struct kvm_vcpu_event (in)
@@ -709,6 +714,10 @@ current in-kernel state. The bits are:
 KVM_VCPUEVENT_VALID_NMI_PENDING - transfer nmi.pending to the kernel
 KVM_VCPUEVENT_VALID_SIPI_VECTOR - transfer sipi_vector
 
+If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
+the flags field to signal that interrupt.shadow contains a valid state and
+shall be written into the VCPU.
+
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index f46b79f6c16..fb6117063ea 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -257,6 +257,11 @@ struct kvm_reinject_control {
 /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
 #define KVM_VCPUEVENT_VALID_NMI_PENDING	0x00000001
 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR	0x00000002
+#define KVM_VCPUEVENT_VALID_SHADOW	0x00000004
+
+/* Interrupt shadow states */
+#define KVM_X86_SHADOW_INT_MOV_SS	0x01
+#define KVM_X86_SHADOW_INT_STI		0x02
 
 /* for KVM_GET/SET_VCPU_EVENTS */
 struct kvm_vcpu_events {
@@ -271,7 +276,7 @@ struct kvm_vcpu_events {
 		__u8 injected;
 		__u8 nr;
 		__u8 soft;
-		__u8 pad;
+		__u8 shadow;
 	} interrupt;
 	struct {
 		__u8 injected;
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7a6f54fa13b..2666d7ac322 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -153,9 +153,6 @@ struct decode_cache {
 	struct fetch_cache fetch;
 };
 
-#define X86_SHADOW_INT_MOV_SS  1
-#define X86_SHADOW_INT_STI     2
-
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 35f7acd4a91..c9f604b0819 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2128,7 +2128,7 @@ special_insn:
 		}
 
 		if (c->modrm_reg == VCPU_SREG_SS)
-			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
 
 		rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
 
@@ -2366,7 +2366,7 @@ special_insn:
 		if (emulator_bad_iopl(ctxt))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
-			toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
+			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
 			ctxt->eflags |= X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 294bbca3417..bd8f52f0823 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -265,7 +265,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	u32 ret = 0;
 
 	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
-		ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+		ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
 	return ret & mask;
 }
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 68f895b0045..61f03980ada 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -846,9 +846,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 	int ret = 0;
 
 	if (interruptibility & GUEST_INTR_STATE_STI)
-		ret |= X86_SHADOW_INT_STI;
+		ret |= KVM_X86_SHADOW_INT_STI;
 	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
-		ret |= X86_SHADOW_INT_MOV_SS;
+		ret |= KVM_X86_SHADOW_INT_MOV_SS;
 
 	return ret & mask;
 }
@@ -860,9 +860,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
 
 	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
 
-	if (mask & X86_SHADOW_INT_MOV_SS)
+	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
 		interruptibility |= GUEST_INTR_STATE_MOV_SS;
-	if (mask & X86_SHADOW_INT_STI)
+	else if (mask & KVM_X86_SHADOW_INT_STI)
 		interruptibility |= GUEST_INTR_STATE_STI;
 
 	if ((interruptibility != interruptibility_old))
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b1c9f2fb8d..84ffd95ee19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2111,6 +2111,9 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
 	events->interrupt.nr = vcpu->arch.interrupt.nr;
 	events->interrupt.soft = 0;
+	events->interrupt.shadow =
+		kvm_x86_ops->get_interrupt_shadow(vcpu,
+			KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
 
 	events->nmi.injected = vcpu->arch.nmi_injected;
 	events->nmi.pending = vcpu->arch.nmi_pending;
@@ -2119,7 +2122,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->sipi_vector = vcpu->arch.sipi_vector;
 
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
-			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
+			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+			 | KVM_VCPUEVENT_VALID_SHADOW);
 
 	vcpu_put(vcpu);
 }
@@ -2128,7 +2132,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 					      struct kvm_vcpu_events *events)
 {
 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
-			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
+			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
+			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
 	vcpu_load(vcpu);
@@ -2143,6 +2148,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	vcpu->arch.interrupt.soft = events->interrupt.soft;
 	if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
 		kvm_pic_clear_isr_ack(vcpu->kvm);
+	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
+		kvm_x86_ops->set_interrupt_shadow(vcpu,
+						  events->interrupt.shadow);
 
 	vcpu->arch.nmi_injected = events->nmi.injected;
 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 360f85e8c43..48516a2a0b8 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -502,6 +502,7 @@ struct kvm_ioeventfd {
 #define KVM_CAP_HYPERV_SPIN 46
 #define KVM_CAP_PCI_SEGMENT 47
 #define KVM_CAP_PPC_PAIRED_SINGLES 48
+#define KVM_CAP_INTR_SHADOW 49
 #define KVM_CAP_X86_ROBUST_SINGLESTEP 51
 
 #ifdef KVM_CAP_IRQ_ROUTING
-- 
cgit v1.2.3-70-g09d2


From a1efbe77c1fd7c34a97a76a61520bf23fb3663f6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 15 Feb 2010 10:45:43 +0100
Subject: KVM: x86: Add support for saving&restoring debug registers

So far user space was not able to save and restore debug registers for
migration or after reset. Plug this hole.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt  | 31 ++++++++++++++++++++++++++
 arch/x86/include/asm/kvm.h | 10 +++++++++
 arch/x86/kvm/x86.c         | 54 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm.h        |  6 ++++++
 4 files changed, 101 insertions(+)

(limited to 'arch/x86')

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 9e5de5a1c4e..d170cb43554 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -718,6 +718,37 @@ If KVM_CAP_INTR_SHADOW is available, KVM_VCPUEVENT_VALID_SHADOW can be set in
 the flags field to signal that interrupt.shadow contains a valid state and
 shall be written into the VCPU.
 
+4.32 KVM_GET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (out)
+Returns: 0 on success, -1 on error
+
+Reads debug registers from the vcpu.
+
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
+4.33 KVM_SET_DEBUGREGS
+
+Capability: KVM_CAP_DEBUGREGS
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_debugregs (in)
+Returns: 0 on success, -1 on error
+
+Writes debug registers into the vcpu.
+
+See KVM_GET_DEBUGREGS for the data structure. The flags field is unused
+yet and must be cleared on entry.
+
 
 5. The kvm_run structure
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index fb6117063ea..ff90055c7f0 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -21,6 +21,7 @@
 #define __KVM_HAVE_PIT_STATE2
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
+#define __KVM_HAVE_DEBUGREGS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -289,4 +290,13 @@ struct kvm_vcpu_events {
 	__u32 reserved[10];
 };
 
+/* for KVM_GET/SET_DEBUGREGS */
+struct kvm_debugregs {
+	__u64 db[4];
+	__u64 dr6;
+	__u64 dr7;
+	__u64 flags;
+	__u64 reserved[9];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84ffd95ee19..efeeabd84ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1548,6 +1548,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_HYPERV_VAPIC:
 	case KVM_CAP_HYPERV_SPIN:
 	case KVM_CAP_PCI_SEGMENT:
+	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 		r = 1;
 		break;
@@ -2165,6 +2166,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
+					     struct kvm_debugregs *dbgregs)
+{
+	vcpu_load(vcpu);
+
+	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
+	dbgregs->dr6 = vcpu->arch.dr6;
+	dbgregs->dr7 = vcpu->arch.dr7;
+	dbgregs->flags = 0;
+
+	vcpu_put(vcpu);
+}
+
+static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
+					    struct kvm_debugregs *dbgregs)
+{
+	if (dbgregs->flags)
+		return -EINVAL;
+
+	vcpu_load(vcpu);
+
+	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+	vcpu->arch.dr6 = dbgregs->dr6;
+	vcpu->arch.dr7 = dbgregs->dr7;
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2343,6 +2374,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
 		break;
 	}
+	case KVM_GET_DEBUGREGS: {
+		struct kvm_debugregs dbgregs;
+
+		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, &dbgregs,
+				 sizeof(struct kvm_debugregs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_DEBUGREGS: {
+		struct kvm_debugregs dbgregs;
+
+		r = -EFAULT;
+		if (copy_from_user(&dbgregs, argp,
+				   sizeof(struct kvm_debugregs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 48516a2a0b8..ce2876717a8 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -503,6 +503,9 @@ struct kvm_ioeventfd {
 #define KVM_CAP_PCI_SEGMENT 47
 #define KVM_CAP_PPC_PAIRED_SINGLES 48
 #define KVM_CAP_INTR_SHADOW 49
+#ifdef __KVM_HAVE_DEBUGREGS
+#define KVM_CAP_DEBUGREGS 50
+#endif
 #define KVM_CAP_X86_ROBUST_SINGLESTEP 51
 
 #ifdef KVM_CAP_IRQ_ROUTING
@@ -690,6 +693,9 @@ struct kvm_clock_data {
 /* Available with KVM_CAP_VCPU_EVENTS */
 #define KVM_GET_VCPU_EVENTS       _IOR(KVMIO,  0x9f, struct kvm_vcpu_events)
 #define KVM_SET_VCPU_EVENTS       _IOW(KVMIO,  0xa0, struct kvm_vcpu_events)
+/* Available with KVM_CAP_DEBUGREGS */
+#define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
+#define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3-70-g09d2


From 50a085bdd48af08cc7e3178ba0d7c1d5d8191698 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 24 Feb 2010 10:41:58 +0100
Subject: KVM: x86: Kick VCPU outside PIC lock again

This restores the deferred VCPU kicking before 956f97cf. We need this
over -rt as wake_up* requires non-atomic context in this configuration.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 53 +++++++++++++++++++++++++++++++++++++---------------
 arch/x86/kvm/irq.h   |  1 +
 2 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index a790fa128a9..93825ff3338 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -33,6 +33,29 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+static void pic_lock(struct kvm_pic *s)
+	__acquires(&s->lock)
+{
+	raw_spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+	__releases(&s->lock)
+{
+	bool wakeup = s->wakeup_needed;
+	struct kvm_vcpu *vcpu;
+
+	s->wakeup_needed = false;
+
+	raw_spin_unlock(&s->lock);
+
+	if (wakeup) {
+		vcpu = s->kvm->bsp_vcpu;
+		if (vcpu)
+			kvm_vcpu_kick(vcpu);
+	}
+}
+
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
@@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 	 * Other interrupt may be delivered to PIC while lock is dropped but
 	 * it should be safe since PIC state is already updated at this stage.
 	 */
-	raw_spin_unlock(&s->pics_state->lock);
+	pic_unlock(s->pics_state);
 	kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
-	raw_spin_lock(&s->pics_state->lock);
+	pic_lock(s->pics_state);
 }
 
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
 {
 	struct kvm_pic *s = pic_irqchip(kvm);
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	s->pics[0].isr_ack = 0xff;
 	s->pics[1].isr_ack = 0xff;
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 }
 
 /*
@@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	pic_update_irq(s);
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 }
 
 int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 	struct kvm_pic *s = opaque;
 	int ret = -1;
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
 		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
 				      s->pics[irq >> 3].imr, ret == 0);
 	}
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 
 	return ret;
 }
@@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	int irq, irq2, intno;
 	struct kvm_pic *s = pic_irqchip(kvm);
 
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
 		pic_intack(&s->pics[0], irq);
@@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 
 	return intno;
 }
@@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte write\n");
 		return 0;
 	}
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this,
 		elcr_ioport_write(&s->pics[addr & 1], addr, data);
 		break;
 	}
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 	return 0;
 }
 
@@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte read\n");
 		return 0;
 	}
-	raw_spin_lock(&s->lock);
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this,
 		break;
 	}
 	*(unsigned char *)val = data;
-	raw_spin_unlock(&s->lock);
+	pic_unlock(s);
 	return 0;
 }
 
@@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level)
 	s->output = level;
 	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
 		s->pics[0].isr_ack &= ~(1 << irq);
-		kvm_vcpu_kick(vcpu);
+		s->wakeup_needed = true;
 	}
 }
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 34b15915754..cd1f362f413 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,6 +63,7 @@ struct kvm_kpic_state {
 
 struct kvm_pic {
 	raw_spinlock_t lock;
+	bool wakeup_needed;
 	unsigned pending_acks;
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-- 
cgit v1.2.3-70-g09d2


From 116a4752c82f767a59c27b2630a8474b936a776a Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 23 Feb 2010 17:47:54 +0100
Subject: KVM: SVM: Move svm_queue_exception

Move svm_queue_exception past skip_emulated_instruction to allow calling
it later on.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bd8f52f0823..908ec61895c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -236,23 +236,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vcpu->arch.efer = efer;
 }
 
-static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	/* If we are within a nested VM we'd better #VMEXIT and let the
-	   guest handle the exception */
-	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
-		return;
-
-	svm->vmcb->control.event_inj = nr
-		| SVM_EVTINJ_VALID
-		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
-		| SVM_EVTINJ_TYPE_EXEPT;
-	svm->vmcb->control.event_inj_err = error_code;
-}
-
 static int is_external_interrupt(u32 info)
 {
 	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -298,6 +281,23 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	svm_set_interrupt_shadow(vcpu, 0);
 }
 
+static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
+				bool has_error_code, u32 error_code)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/* If we are within a nested VM we'd better #VMEXIT and let the
+	   guest handle the exception */
+	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+		return;
+
+	svm->vmcb->control.event_inj = nr
+		| SVM_EVTINJ_VALID
+		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+		| SVM_EVTINJ_TYPE_EXEPT;
+	svm->vmcb->control.event_inj_err = error_code;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
-- 
cgit v1.2.3-70-g09d2


From f92653eeb496fe8624ac4b0d628c916a06a3d25c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 23 Feb 2010 17:47:55 +0100
Subject: KVM: x86: Add kvm_is_linear_rip

Based on Gleb's suggestion: Add a helper kvm_is_linear_rip that matches
a given linear RIP against the current one. Use this for guest
single-stepping, more users will follow.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/x86.c              | 21 +++++++++++++--------
 2 files changed, 16 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d46e791de12..502fff123e2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -362,8 +362,8 @@ struct kvm_vcpu_arch {
 	u64 *mce_banks;
 
 	/* used for guest single stepping over the given code position */
-	u16 singlestep_cs;
 	unsigned long singlestep_rip;
+
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
 };
@@ -820,4 +820,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index efeeabd84ec..f2f246ae4a4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5376,11 +5376,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
 	}
 
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
-		vcpu->arch.singlestep_cs =
-			get_segment_selector(vcpu, VCPU_SREG_CS);
-		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
-	}
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
+			get_segment_base(vcpu, VCPU_SREG_CS);
 
 	/*
 	 * Trigger an rflags update that will inject or remove the trace
@@ -5871,6 +5869,15 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->interrupt_allowed(vcpu);
 }
 
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+	unsigned long current_rip = kvm_rip_read(vcpu) +
+		get_segment_base(vcpu, VCPU_SREG_CS);
+
+	return current_rip == linear_rip;
+}
+EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
+
 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
@@ -5885,9 +5892,7 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags);
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
-	    vcpu->arch.singlestep_cs ==
-			get_segment_selector(vcpu, VCPU_SREG_CS) &&
-	    vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
+	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
 }
-- 
cgit v1.2.3-70-g09d2


From 66b7138f913635b40822890ba8913548f8b285b8 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 23 Feb 2010 17:47:56 +0100
Subject: KVM: SVM: Emulate nRIP feature when reinjecting INT3

When in guest debugging mode, we have to reinject those #BP software
exceptions that are caused by guest-injected INT3. As older AMD
processors do not support the required nRIP VMCB field, try to emulate
it by moving RIP past the instruction on exception injection. Fix it up
again in case the injection failed and we were able to catch this. This
does not work for unintercepted faults, but it is better than doing
nothing.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 908ec61895c..468ff6e721c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,7 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_NRIP (1 << 3)
 #define SVM_FEATURE_PAUSE_FILTER (1 << 10)
 
 #define NESTED_EXIT_HOST	0	/* Exit handled on host level */
@@ -110,6 +111,9 @@ struct vcpu_svm {
 	struct nested_state nested;
 
 	bool nmi_singlestep;
+
+	unsigned int3_injected;
+	unsigned long int3_rip;
 };
 
 /* enable NPT for AMD64 and X86 with PAE */
@@ -291,6 +295,22 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
+	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
+
+		/*
+		 * For guest debugging where we have to reinject #BP if some
+		 * INT3 is guest-owned:
+		 * Emulate nRIP by moving RIP forward. Will fail if injection
+		 * raises a fault that is not intercepted. Still better than
+		 * failing in all cases.
+		 */
+		skip_emulated_instruction(&svm->vcpu);
+		rip = kvm_rip_read(&svm->vcpu);
+		svm->int3_rip = rip + svm->vmcb->save.cs.base;
+		svm->int3_injected = rip - old_rip;
+	}
+
 	svm->vmcb->control.event_inj = nr
 		| SVM_EVTINJ_VALID
 		| (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
@@ -2701,6 +2721,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	u8 vector;
 	int type;
 	u32 exitintinfo = svm->vmcb->control.exit_int_info;
+	unsigned int3_injected = svm->int3_injected;
+
+	svm->int3_injected = 0;
 
 	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
 		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
@@ -2720,12 +2743,21 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		svm->vcpu.arch.nmi_injected = true;
 		break;
 	case SVM_EXITINTINFO_TYPE_EXEPT:
-		/* In case of software exception do not reinject an exception
-		   vector, but re-execute and instruction instead */
 		if (is_nested(svm))
 			break;
-		if (kvm_exception_is_soft(vector))
+		/*
+		 * In case of software exceptions, do not reinject the vector,
+		 * but re-execute the instruction instead. Rewind RIP first
+		 * if we emulated INT3 before.
+		 */
+		if (kvm_exception_is_soft(vector)) {
+			if (vector == BP_VECTOR && int3_injected &&
+			    kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
+				kvm_rip_write(&svm->vcpu,
+					      kvm_rip_read(&svm->vcpu) -
+					      int3_injected);
 			break;
+		}
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
 			kvm_queue_exception_e(&svm->vcpu, vector, err);
-- 
cgit v1.2.3-70-g09d2


From c310bac5a20fc37f761bd7297ba2e52cf40d79c6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 23 Feb 2010 17:47:58 +0100
Subject: KVM: x86: Drop RF manipulation for guest single-stepping

RF is not required for injecting TF as the latter will trigger only
after an instruction execution anyway. So do not touch RF when arming or
disarming guest single-step mode.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f2f246ae4a4..a519fc6ed05 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5884,7 +5884,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 
 	rflags = kvm_x86_ops->get_rflags(vcpu);
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		rflags &= ~X86_EFLAGS_TF;
 	return rflags;
 }
 EXPORT_SYMBOL_GPL(kvm_get_rflags);
@@ -5893,7 +5893,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
-		rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
+		rflags |= X86_EFLAGS_TF;
 	kvm_x86_ops->set_rflags(vcpu, rflags);
 }
 EXPORT_SYMBOL_GPL(kvm_set_rflags);
-- 
cgit v1.2.3-70-g09d2


From 83bf0002c91b65744db78df36d4f1af27bd9099b Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 23 Feb 2010 17:47:59 +0100
Subject: KVM: x86: Preserve injected TF across emulation

Call directly into the vendor services for getting/setting rflags in
emulate_instruction to ensure injected TF survives the emulation.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a519fc6ed05..3a367f35ceb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3447,7 +3447,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
-		vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
 		vcpu->arch.emulate_ctxt.mode =
 			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
@@ -3526,7 +3526,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
 	if (vcpu->mmio_is_write) {
 		vcpu->mmio_needed = 0;
-- 
cgit v1.2.3-70-g09d2


From e02317153e77150fed9609c3212c98204ec3ea74 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:10 +0100
Subject: KVM: SVM: Coding style cleanup

This patch removes whitespace errors, fixes comment formats
and most of checkpatch warnings. Now vim does not show
c-space-errors anymore.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 148 +++++++++++++++++++++++++++++------------------------
 1 file changed, 81 insertions(+), 67 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 468ff6e721c..44679530ad5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -120,7 +120,7 @@ struct vcpu_svm {
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
 #else
-static bool npt_enabled = false;
+static bool npt_enabled;
 #endif
 static int npt = 1;
 
@@ -168,8 +168,8 @@ static unsigned long iopm_base;
 struct kvm_ldttss_desc {
 	u16 limit0;
 	u16 base0;
-	unsigned base1 : 8, type : 5, dpl : 2, p : 1;
-	unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
+	unsigned base1:8, type:5, dpl:2, p:1;
+	unsigned limit1:4, zero0:3, g:1, base2:8;
 	u32 base3;
 	u32 zero1;
 } __attribute__((packed));
@@ -218,7 +218,7 @@ static inline void stgi(void)
 
 static inline void invlpga(unsigned long addr, u32 asid)
 {
-	asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
+	asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
 }
 
 static inline void force_new_asid(struct kvm_vcpu *vcpu)
@@ -290,8 +290,10 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	/* If we are within a nested VM we'd better #VMEXIT and let the
-	   guest handle the exception */
+	/*
+	 * If we are within a nested VM we'd better #VMEXIT and let the guest
+	 * handle the exception
+	 */
 	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
@@ -544,7 +546,7 @@ static void init_seg(struct vmcb_seg *seg)
 {
 	seg->selector = 0;
 	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
-		SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
+		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
 	seg->limit = 0xffff;
 	seg->base = 0;
 }
@@ -564,16 +566,16 @@ static void init_vmcb(struct vcpu_svm *svm)
 
 	svm->vcpu.fpu_active = 1;
 
-	control->intercept_cr_read = 	INTERCEPT_CR0_MASK |
+	control->intercept_cr_read =	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
 					INTERCEPT_CR4_MASK;
 
-	control->intercept_cr_write = 	INTERCEPT_CR0_MASK |
+	control->intercept_cr_write =	INTERCEPT_CR0_MASK |
 					INTERCEPT_CR3_MASK |
 					INTERCEPT_CR4_MASK |
 					INTERCEPT_CR8_MASK;
 
-	control->intercept_dr_read = 	INTERCEPT_DR0_MASK |
+	control->intercept_dr_read =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
 					INTERCEPT_DR2_MASK |
 					INTERCEPT_DR3_MASK |
@@ -582,7 +584,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 					INTERCEPT_DR6_MASK |
 					INTERCEPT_DR7_MASK;
 
-	control->intercept_dr_write = 	INTERCEPT_DR0_MASK |
+	control->intercept_dr_write =	INTERCEPT_DR0_MASK |
 					INTERCEPT_DR1_MASK |
 					INTERCEPT_DR2_MASK |
 					INTERCEPT_DR3_MASK |
@@ -596,7 +598,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 					(1 << MC_VECTOR);
 
 
-	control->intercept = 	(1ULL << INTERCEPT_INTR) |
+	control->intercept =	(1ULL << INTERCEPT_INTR) |
 				(1ULL << INTERCEPT_NMI) |
 				(1ULL << INTERCEPT_SMI) |
 				(1ULL << INTERCEPT_SELECTIVE_CR0) |
@@ -657,7 +659,8 @@ static void init_vmcb(struct vcpu_svm *svm)
 	save->rip = 0x0000fff0;
 	svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
-	/* This is the guest-visible cr0 value.
+	/*
+	 * This is the guest-visible cr0 value.
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
 	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
@@ -903,7 +906,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
 	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
 
-	/* AMD's VMCB does not have an explicit unusable field, so emulate it
+	/*
+	 * AMD's VMCB does not have an explicit unusable field, so emulate it
 	 * for cross vendor migration purposes by "not present"
 	 */
 	var->unusable = !var->present || (var->type == 0);
@@ -939,7 +943,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 			var->type |= 0x1;
 		break;
 	case VCPU_SREG_SS:
-		/* On AMD CPUs sometimes the DB bit in the segment
+		/*
+		 * On AMD CPUs sometimes the DB bit in the segment
 		 * descriptor is left as 1, although the whole segment has
 		 * been made unusable. Clear it here to pass an Intel VMX
 		 * entry check when cross vendor migrating.
@@ -1270,7 +1275,7 @@ static int db_interception(struct vcpu_svm *svm)
 	}
 
 	if (svm->vcpu.guest_debug &
-	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
 		kvm_run->debug.arch.pc =
 			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
@@ -1315,7 +1320,7 @@ static void svm_fpu_activate(struct kvm_vcpu *vcpu)
 		excp    = h_excp | n_excp;
 	} else {
 		excp  = svm->vmcb->control.intercept_exceptions;
-	        excp &= ~(1 << NM_VECTOR);
+		excp &= ~(1 << NM_VECTOR);
 	}
 
 	svm->vmcb->control.intercept_exceptions = excp;
@@ -1554,13 +1559,13 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 	case SVM_EXIT_INTR:
 	case SVM_EXIT_NMI:
 		return NESTED_EXIT_HOST;
-		/* For now we are always handling NPFs when using them */
 	case SVM_EXIT_NPF:
+		/* For now we are always handling NPFs when using them */
 		if (npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
-	/* When we're shadowing, trap PFs */
 	case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+		/* When we're shadowing, trap PFs */
 		if (!npt_enabled)
 			return NESTED_EXIT_HOST;
 		break;
@@ -1795,7 +1800,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 	if (!nested_msrpm)
 		return false;
 
-	for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
+	for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
 		svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
 
 	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
@@ -1829,8 +1834,10 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
 
-	/* Save the old vmcb, so we don't need to pick what we save, but
-	   can restore everything when a VMEXIT occurs */
+	/*
+	 * Save the old vmcb, so we don't need to pick what we save, but can
+	 * restore everything when a VMEXIT occurs
+	 */
 	hsave->save.es     = vmcb->save.es;
 	hsave->save.cs     = vmcb->save.cs;
 	hsave->save.ss     = vmcb->save.ss;
@@ -1878,6 +1885,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+
 	/* In case we don't even reach vcpu_run, the fields are not updated */
 	svm->vmcb->save.rax = nested_vmcb->save.rax;
 	svm->vmcb->save.rsp = nested_vmcb->save.rsp;
@@ -1909,8 +1917,10 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 	}
 
-	/* We don't want a nested guest to be more powerful than the guest,
-	   so all intercepts are ORed */
+	/*
+	 * We don't want a nested guest to be more powerful than the guest, so
+	 * all intercepts are ORed
+	 */
 	svm->vmcb->control.intercept_cr_read |=
 		nested_vmcb->control.intercept_cr_read;
 	svm->vmcb->control.intercept_cr_write |=
@@ -2224,9 +2234,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 	case MSR_IA32_SYSENTER_ESP:
 		*data = svm->sysenter_esp;
 		break;
-	/* Nobody will change the following 5 values in the VMCB so
-	   we can safely return them on rdmsr. They will always be 0
-	   until LBRV is implemented. */
+	/*
+	 * Nobody will change the following 5 values in the VMCB so we can
+	 * safely return them on rdmsr. They will always be 0 until LBRV is
+	 * implemented.
+	 */
 	case MSR_IA32_DEBUGCTLMSR:
 		*data = svm->vmcb->save.dbgctl;
 		break;
@@ -2405,16 +2417,16 @@ static int pause_interception(struct vcpu_svm *svm)
 }
 
 static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
-	[SVM_EXIT_READ_CR0]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR3]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR4]           		= emulate_on_interception,
-	[SVM_EXIT_READ_CR8]           		= emulate_on_interception,
+	[SVM_EXIT_READ_CR0]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR3]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR4]			= emulate_on_interception,
+	[SVM_EXIT_READ_CR8]			= emulate_on_interception,
 	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR0]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR3]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR4]          		= emulate_on_interception,
-	[SVM_EXIT_WRITE_CR8]          		= cr8_write_interception,
-	[SVM_EXIT_READ_DR0] 			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR0]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR3]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR4]			= emulate_on_interception,
+	[SVM_EXIT_WRITE_CR8]			= cr8_write_interception,
+	[SVM_EXIT_READ_DR0]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR1]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR2]			= emulate_on_interception,
 	[SVM_EXIT_READ_DR3]			= emulate_on_interception,
@@ -2433,15 +2445,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_EXCP_BASE + DB_VECTOR]	= db_interception,
 	[SVM_EXIT_EXCP_BASE + BP_VECTOR]	= bp_interception,
 	[SVM_EXIT_EXCP_BASE + UD_VECTOR]	= ud_interception,
-	[SVM_EXIT_EXCP_BASE + PF_VECTOR] 	= pf_interception,
-	[SVM_EXIT_EXCP_BASE + NM_VECTOR] 	= nm_interception,
-	[SVM_EXIT_EXCP_BASE + MC_VECTOR] 	= mc_interception,
-	[SVM_EXIT_INTR] 			= intr_interception,
+	[SVM_EXIT_EXCP_BASE + PF_VECTOR]	= pf_interception,
+	[SVM_EXIT_EXCP_BASE + NM_VECTOR]	= nm_interception,
+	[SVM_EXIT_EXCP_BASE + MC_VECTOR]	= mc_interception,
+	[SVM_EXIT_INTR]				= intr_interception,
 	[SVM_EXIT_NMI]				= nmi_interception,
 	[SVM_EXIT_SMI]				= nop_on_interception,
 	[SVM_EXIT_INIT]				= nop_on_interception,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
-	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
 	[SVM_EXIT_CPUID]			= cpuid_interception,
 	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
@@ -2449,7 +2460,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_HLT]				= halt_interception,
 	[SVM_EXIT_INVLPG]			= invlpg_interception,
 	[SVM_EXIT_INVLPGA]			= invlpga_interception,
-	[SVM_EXIT_IOIO] 		  	= io_interception,
+	[SVM_EXIT_IOIO]				= io_interception,
 	[SVM_EXIT_MSR]				= msr_interception,
 	[SVM_EXIT_TASK_SWITCH]			= task_switch_interception,
 	[SVM_EXIT_SHUTDOWN]			= shutdown_interception,
@@ -2650,10 +2661,12 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	/* In case GIF=0 we can't rely on the CPU to tell us when
-	 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
-	 * The next time we get that intercept, this function will be
-	 * called again though and we'll get the vintr intercept. */
+	/*
+	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
+	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
+	 * get that intercept, this function will be called again though and
+	 * we'll get the vintr intercept.
+	 */
 	if (gif_set(svm) && nested_svm_intr(svm)) {
 		svm_set_vintr(svm);
 		svm_inject_irq(svm, 0x0);
@@ -2668,9 +2681,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	    == HF_NMI_MASK)
 		return; /* IRET will cause a vm exit */
 
-	/* Something prevents NMI from been injected. Single step over
-	   possible problem (IRET or exception injection or interrupt
-	   shadow) */
+	/*
+	 * Something prevents NMI from been injected. Single step over possible
+	 * problem (IRET or exception injection or interrupt shadow)
+	 */
 	svm->nmi_singlestep = true;
 	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
 	update_db_intercept(vcpu);
@@ -2978,24 +2992,24 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 }
 
 static const struct trace_print_flags svm_exit_reasons_str[] = {
-	{ SVM_EXIT_READ_CR0,           		"read_cr0" },
-	{ SVM_EXIT_READ_CR3,	      		"read_cr3" },
-	{ SVM_EXIT_READ_CR4,	      		"read_cr4" },
-	{ SVM_EXIT_READ_CR8,  	      		"read_cr8" },
-	{ SVM_EXIT_WRITE_CR0,          		"write_cr0" },
-	{ SVM_EXIT_WRITE_CR3,	      		"write_cr3" },
-	{ SVM_EXIT_WRITE_CR4,          		"write_cr4" },
-	{ SVM_EXIT_WRITE_CR8, 	      		"write_cr8" },
-	{ SVM_EXIT_READ_DR0, 	      		"read_dr0" },
-	{ SVM_EXIT_READ_DR1,	      		"read_dr1" },
-	{ SVM_EXIT_READ_DR2,	      		"read_dr2" },
-	{ SVM_EXIT_READ_DR3,	      		"read_dr3" },
-	{ SVM_EXIT_WRITE_DR0,	      		"write_dr0" },
-	{ SVM_EXIT_WRITE_DR1,	      		"write_dr1" },
-	{ SVM_EXIT_WRITE_DR2,	      		"write_dr2" },
-	{ SVM_EXIT_WRITE_DR3,	      		"write_dr3" },
-	{ SVM_EXIT_WRITE_DR5,	      		"write_dr5" },
-	{ SVM_EXIT_WRITE_DR7,	      		"write_dr7" },
+	{ SVM_EXIT_READ_CR0,			"read_cr0" },
+	{ SVM_EXIT_READ_CR3,			"read_cr3" },
+	{ SVM_EXIT_READ_CR4,			"read_cr4" },
+	{ SVM_EXIT_READ_CR8,			"read_cr8" },
+	{ SVM_EXIT_WRITE_CR0,			"write_cr0" },
+	{ SVM_EXIT_WRITE_CR3,			"write_cr3" },
+	{ SVM_EXIT_WRITE_CR4,			"write_cr4" },
+	{ SVM_EXIT_WRITE_CR8,			"write_cr8" },
+	{ SVM_EXIT_READ_DR0,			"read_dr0" },
+	{ SVM_EXIT_READ_DR1,			"read_dr1" },
+	{ SVM_EXIT_READ_DR2,			"read_dr2" },
+	{ SVM_EXIT_READ_DR3,			"read_dr3" },
+	{ SVM_EXIT_WRITE_DR0,			"write_dr0" },
+	{ SVM_EXIT_WRITE_DR1,			"write_dr1" },
+	{ SVM_EXIT_WRITE_DR2,			"write_dr2" },
+	{ SVM_EXIT_WRITE_DR3,			"write_dr3" },
+	{ SVM_EXIT_WRITE_DR5,			"write_dr5" },
+	{ SVM_EXIT_WRITE_DR7,			"write_dr7" },
 	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" },
 	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" },
 	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" },
-- 
cgit v1.2.3-70-g09d2


From 0e5cbe368b366f02cf3fe707aea2c0efac1bf70e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:11 +0100
Subject: KVM: SVM: Reset MMU on nested_svm_vmrun for NPT too

Without resetting the MMU the gva_to_pga function will not
work reliably when the vcpu is running in nested context.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 44679530ad5..ef40b90219f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1877,10 +1877,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (npt_enabled) {
 		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
 		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
-	} else {
+	} else
 		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
-		kvm_mmu_reset_context(&svm->vcpu);
-	}
+
+	/* Guest paging mode is active - reset mmu */
+	kvm_mmu_reset_context(&svm->vcpu);
+
 	svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
-- 
cgit v1.2.3-70-g09d2


From 887f500ca1a721cac237ba56374b9c0f578251ec Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:12 +0100
Subject: KVM: SVM: Check for nested intercepts on NMI injection

This patch implements the NMI intercept checking for nested
svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ef40b90219f..9f1143105e0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1486,6 +1486,21 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	return true;
 }
 
+/* This function returns true if it is save to enable the nmi window */
+static inline bool nested_svm_nmi(struct vcpu_svm *svm)
+{
+	if (!is_nested(svm))
+		return true;
+
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
+		return true;
+
+	svm->vmcb->control.exit_code = SVM_EXIT_NMI;
+	svm->nested.exit_required = true;
+
+	return false;
+}
+
 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
 {
 	struct page *page;
@@ -2687,9 +2702,11 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
 	 */
-	svm->nmi_singlestep = true;
-	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
-	update_db_intercept(vcpu);
+	if (gif_set(svm) && nested_svm_nmi(svm)) {
+		svm->nmi_singlestep = true;
+		svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+		update_db_intercept(vcpu);
+	}
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-- 
cgit v1.2.3-70-g09d2


From ecf1405df235da3efea213427ac7da7f816e9d06 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:13 +0100
Subject: KVM: SVM: Restore tracing of nested vmcb address

A recent change broke tracing of the nested vmcb address. It
was reported as 0 all the time. This patch fixes it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9f1143105e0..2e4e089646a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1839,7 +1839,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	if (!nested_vmcb)
 		return false;
 
-	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+	trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa,
 			       nested_vmcb->save.rip,
 			       nested_vmcb->control.int_ctl,
 			       nested_vmcb->control.event_inj,
-- 
cgit v1.2.3-70-g09d2


From 2e554e8d67926024b01e97d2fe652810165354e2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:14 +0100
Subject: KVM: SVM: Add kvm_nested_intercepts tracepoint

This patch adds a tracepoint to get information about the
most important intercept bitmasks from the nested vmcb.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c   |  5 +++++
 arch/x86/kvm/trace.h | 22 ++++++++++++++++++++++
 arch/x86/kvm/x86.c   |  1 +
 3 files changed, 28 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2e4e089646a..cac761c6d1d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1845,6 +1845,11 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 			       nested_vmcb->control.event_inj,
 			       nested_vmcb->control.nested_ctl);
 
+	trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
+				    nested_vmcb->control.intercept_cr_write,
+				    nested_vmcb->control.intercept_exceptions,
+				    nested_vmcb->control.intercept);
+
 	/* Clear internal status */
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 12f8d2dee98..17b52ccd977 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -419,6 +419,28 @@ TRACE_EVENT(kvm_nested_vmrun,
 		__entry->npt ? "on" : "off")
 );
 
+TRACE_EVENT(kvm_nested_intercepts,
+	    TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept),
+	    TP_ARGS(cr_read, cr_write, exceptions, intercept),
+
+	TP_STRUCT__entry(
+		__field(	__u16,		cr_read		)
+		__field(	__u16,		cr_write	)
+		__field(	__u32,		exceptions	)
+		__field(	__u64,		intercept	)
+	),
+
+	TP_fast_assign(
+		__entry->cr_read	= cr_read;
+		__entry->cr_write	= cr_write;
+		__entry->exceptions	= exceptions;
+		__entry->intercept	= intercept;
+	),
+
+	TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx",
+		__entry->cr_read, __entry->cr_write, __entry->exceptions,
+		__entry->intercept)
+);
 /*
  * Tracepoint for #VMEXIT while nested
  */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a367f35ceb..1aa4d6e26ba 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5909,3 +5909,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
-- 
cgit v1.2.3-70-g09d2


From 4a810181c8bcd73112f5c62b205b5583fd4a197f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:15 +0100
Subject: KVM: SVM: Implement emulation of vm_cr msr

This patch implements the emulation of the vm_cr msr for
nested svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/svm.h |  4 ++++
 arch/x86/kvm/svm.c         | 29 ++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 38638cd2fa4..b26a38d8535 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -115,6 +115,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
 #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
 
+#define SVM_VM_CR_VALID_MASK	0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK  0x0010ULL
+
 struct __attribute__ ((__packed__)) vmcb_seg {
 	u16 selector;
 	u16 attrib;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cac761c6d1d..b4aac5c7ad8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -71,6 +71,7 @@ struct kvm_vcpu;
 struct nested_state {
 	struct vmcb *hsave;
 	u64 hsave_msr;
+	u64 vm_cr_msr;
 	u64 vmcb;
 
 	/* These are the merged vectors */
@@ -2280,7 +2281,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 		*data = svm->nested.hsave_msr;
 		break;
 	case MSR_VM_CR:
-		*data = 0;
+		*data = svm->nested.vm_cr_msr;
 		break;
 	case MSR_IA32_UCODE_REV:
 		*data = 0x01000065;
@@ -2310,6 +2311,31 @@ static int rdmsr_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	int svm_dis, chg_mask;
+
+	if (data & ~SVM_VM_CR_VALID_MASK)
+		return 1;
+
+	chg_mask = SVM_VM_CR_VALID_MASK;
+
+	if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
+		chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
+
+	svm->nested.vm_cr_msr &= ~chg_mask;
+	svm->nested.vm_cr_msr |= (data & chg_mask);
+
+	svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
+
+	/* check for svm_disable while efer.svme is set */
+	if (svm_dis && (vcpu->arch.efer & EFER_SVME))
+		return 1;
+
+	return 0;
+}
+
 static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2376,6 +2402,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 		svm->nested.hsave_msr = data;
 		break;
 	case MSR_VM_CR:
+		return svm_set_vm_cr(vcpu, data);
 	case MSR_VM_IGNNE:
 		pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
 		break;
-- 
cgit v1.2.3-70-g09d2


From 82494028dff648c29e3a40915f1fceca51b4490a Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:16 +0100
Subject: KVM: SVM: Ignore write of hwcr.ignne

Hyper-V as a guest wants to write this bit. This patch
ignores it.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1aa4d6e26ba..81b7af5558f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1090,6 +1090,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		break;
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
+		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
 		if (data != 0) {
 			pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
 				data);
-- 
cgit v1.2.3-70-g09d2


From b44ea385d8cb187e04ec8d901d4c320c8b07c40b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:17 +0100
Subject: KVM: x86: Don't set arch.cr0 in kvm_set_cr0

The vcpu->arch.cr0 variable is already set in the
architecture specific set_cr0 callbacks. There is no need to
set it in the common code.
This allows the architecture code to keep the old arch.cr0
value if it wants. This is required for nested svm to decide
if a selective_cr0 exit needs to be injected.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 81b7af5558f..d0b184b5c24 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -475,7 +475,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
-	vcpu->arch.cr0 = cr0;
 
 	kvm_mmu_reset_context(vcpu);
 	return;
-- 
cgit v1.2.3-70-g09d2


From 7f5d8b5600b5294137886b46bf00ef811d0fdf32 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:18 +0100
Subject: KVM: SVM: Handle nested selective_cr0 intercept correctly

If we have the following situation with nested svm:

1. Host KVM intercepts cr0 writes
2. Guest hypervisor intercepts only selective cr0 writes

Then we get an cr0 write intercept which is handled on the
host. But that intercepts may actually be a selective cr0
intercept for the guest. This patch checks for this
condition and injects a selective cr0 intercept if needed.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b4aac5c7ad8..631d2e54449 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1043,6 +1043,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (is_nested(svm)) {
+		/*
+		 * We are here because we run in nested mode, the host kvm
+		 * intercepts cr0 writes but the l1 hypervisor does not.
+		 * But the L1 hypervisor may intercept selective cr0 writes.
+		 * This needs to be checked here.
+		 */
+		unsigned long old, new;
+
+		/* Remove bits that would trigger a real cr0 write intercept */
+		old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
+		new = cr0 & SVM_CR0_SELECTIVE_MASK;
+
+		if (old == new) {
+			/* cr0 write with ts and mp unchanged */
+			svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
+			if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
+				return;
+		}
+	}
+
 #ifdef CONFIG_X86_64
 	if (vcpu->arch.efer & EFER_LME) {
 		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
-- 
cgit v1.2.3-70-g09d2


From 197717d5813fc39a7185a3177b76f4a3b2405df7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 24 Feb 2010 18:59:19 +0100
Subject: KVM: SVM: Clear exit_info for injected INTR exits

When injecting an vmexit.intr into the nested hypervisor
there might be leftover values in the exit_info fields.
Clear them to not confuse nested hypervisors.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 631d2e54449..38f1fceefea 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1491,7 +1491,9 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
 	if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
 		return false;
 
-	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
+	svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
+	svm->vmcb->control.exit_info_1 = 0;
+	svm->vmcb->control.exit_info_2 = 0;
 
 	if (svm->nested.intercept & 1ULL) {
 		/*
-- 
cgit v1.2.3-70-g09d2


From d6ab1ed44627c91d0a857a430b7ec4ed8648c7a5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 25 Feb 2010 12:43:07 +0200
Subject: KVM: Drop kvm_get_gdt() in favor of generic linux function

Linux now has native_store_gdt() to do the same. Use it instead of
kvm local version.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 5 -----
 arch/x86/kvm/svm.c              | 2 +-
 arch/x86/kvm/vmx.c              | 4 ++--
 arch/x86/kvm/x86.c              | 2 +-
 4 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 502fff123e2..e3167224d93 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -723,11 +723,6 @@ static inline void kvm_get_idt(struct desc_ptr *table)
 	asm("sidt %0" : "=m"(*table));
 }
 
-static inline void kvm_get_gdt(struct desc_ptr *table)
-{
-	asm("sgdt %0" : "=m"(*table));
-}
-
 static inline unsigned long kvm_read_tr_base(void)
 {
 	u16 tr;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 38f1fceefea..6882be5b926 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -368,7 +368,7 @@ static int svm_hardware_enable(void *garbage)
 	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
 	sd->next_asid = sd->max_asid + 1;
 
-	kvm_get_gdt(&gdt_descr);
+	native_store_gdt(&gdt_descr);
 	gdt = (struct desc_struct *)gdt_descr.address;
 	sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 61f03980ada..68712bdf040 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -603,7 +603,7 @@ static void reload_tss(void)
 	struct desc_ptr gdt;
 	struct desc_struct *descs;
 
-	kvm_get_gdt(&gdt);
+	native_store_gdt(&gdt);
 	descs = (void *)gdt.address;
 	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
 	load_TR_desc();
@@ -767,7 +767,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		 * processors.
 		 */
 		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
-		kvm_get_gdt(&dt);
+		native_store_gdt(&dt);
 		vmcs_writel(HOST_GDTR_BASE, dt.address);   /* 22.2.4 */
 
 		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d0b184b5c24..e07b243055f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -233,7 +233,7 @@ unsigned long segment_base(u16 selector)
 	if (selector == 0)
 		return 0;
 
-	kvm_get_gdt(&gdt);
+	native_store_gdt(&gdt);
 	table_base = gdt.address;
 
 	if (selector & 4) {           /* from ldt */
-- 
cgit v1.2.3-70-g09d2


From 254d4d48a56925622a5592ad590a738735b66135 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 25 Feb 2010 12:43:08 +0200
Subject: KVM: fix segment_base() error checking

fix segment_base() to properly check for null segment selector and
avoid accessing NULL pointer if ldt selector in null.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e07b243055f..814e72a02ef 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -230,7 +230,7 @@ unsigned long segment_base(u16 selector)
 	unsigned long table_base;
 	unsigned long v;
 
-	if (selector == 0)
+	if (!(selector & ~3))
 		return 0;
 
 	native_store_gdt(&gdt);
@@ -239,6 +239,8 @@ unsigned long segment_base(u16 selector)
 	if (selector & 4) {           /* from ldt */
 		u16 ldt_selector = kvm_read_ldt();
 
+		if (!(ldt_selector & ~3))
+			return 0;
 		table_base = segment_base(ldt_selector);
 	}
 	d = (struct desc_struct *)(table_base + (selector & ~7));
-- 
cgit v1.2.3-70-g09d2


From 2d49ec72d3fab0aa90510a64a973d594c48b1fd1 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 25 Feb 2010 12:43:09 +0200
Subject: KVM: move segment_base() into vmx.c

segment_base() is used only by vmx so move it there.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  9 ---------
 arch/x86/kvm/vmx.c              | 37 +++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 30 ------------------------------
 3 files changed, 37 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e3167224d93..ec891a2ce86 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -644,8 +644,6 @@ int emulator_write_emulated(unsigned long addr,
 			    unsigned int bytes,
 			    struct kvm_vcpu *vcpu);
 
-unsigned long segment_base(u16 selector);
-
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
@@ -723,13 +721,6 @@ static inline void kvm_get_idt(struct desc_ptr *table)
 	asm("sidt %0" : "=m"(*table));
 }
 
-static inline unsigned long kvm_read_tr_base(void)
-{
-	u16 tr;
-	asm("str %0" : "=g"(tr));
-	return segment_base(tr);
-}
-
 #ifdef CONFIG_X86_64
 static inline unsigned long read_msr(unsigned long msr)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 68712bdf040..8e2a24693be 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -634,6 +634,43 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 	return true;
 }
 
+static unsigned long segment_base(u16 selector)
+{
+	struct desc_ptr gdt;
+	struct desc_struct *d;
+	unsigned long table_base;
+	unsigned long v;
+
+	if (!(selector & ~3))
+		return 0;
+
+	native_store_gdt(&gdt);
+	table_base = gdt.address;
+
+	if (selector & 4) {           /* from ldt */
+		u16 ldt_selector = kvm_read_ldt();
+
+		if (!(ldt_selector & ~3))
+			return 0;
+
+		table_base = segment_base(ldt_selector);
+	}
+	d = (struct desc_struct *)(table_base + (selector & ~7));
+	v = get_desc_base(d);
+#ifdef CONFIG_X86_64
+       if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+               v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
+#endif
+	return v;
+}
+
+static inline unsigned long kvm_read_tr_base(void)
+{
+	u16 tr;
+	asm("str %0" : "=g"(tr));
+	return segment_base(tr);
+}
+
 static void vmx_save_host_state(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 814e72a02ef..2b34dc705cf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -223,36 +223,6 @@ static void drop_user_return_notifiers(void *ignore)
 		kvm_on_user_return(&smsr->urn);
 }
 
-unsigned long segment_base(u16 selector)
-{
-	struct desc_ptr gdt;
-	struct desc_struct *d;
-	unsigned long table_base;
-	unsigned long v;
-
-	if (!(selector & ~3))
-		return 0;
-
-	native_store_gdt(&gdt);
-	table_base = gdt.address;
-
-	if (selector & 4) {           /* from ldt */
-		u16 ldt_selector = kvm_read_ldt();
-
-		if (!(ldt_selector & ~3))
-			return 0;
-		table_base = segment_base(ldt_selector);
-	}
-	d = (struct desc_struct *)(table_base + (selector & ~7));
-	v = get_desc_base(d);
-#ifdef CONFIG_X86_64
-	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
-	return v;
-}
-EXPORT_SYMBOL_GPL(segment_base);
-
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
 {
 	if (irqchip_in_kernel(vcpu->kvm))
-- 
cgit v1.2.3-70-g09d2


From e35b7b9c9e7d8768ee34e5904fed4cb0f2c2cb5d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 25 Feb 2010 16:36:42 +0200
Subject: KVM: x86 emulator: Add decoding of 16bit second in memory argument

Add decoding of Ep type of argument used by callf/jmpf.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c9f604b0819..97a740368b3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -85,6 +85,9 @@
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
 #define Src2Imm16   (4<<29)
+#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be
+			       in memory and second argument is located
+			       immediately after the first one in memory. */
 #define Src2Mask    (7<<29)
 
 enum {
@@ -1163,6 +1166,10 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
 		break;
+	case Src2Mem16:
+		c->src2.bytes = 2;
+		c->src2.type = OP_MEM;
+		break;
 	}
 
 	/* Decode and fetch the destination operand: register or memory. */
@@ -1881,6 +1888,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		c->src.orig_val = c->src.val;
 	}
 
+	if (c->src2.type == OP_MEM) {
+		c->src2.ptr = (unsigned long *)(memop + c->src.bytes);
+		c->src2.val = 0;
+		rc = ops->read_emulated((unsigned long)c->src2.ptr,
+					&c->src2.val,
+					c->src2.bytes,
+					ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
+	}
+
 	if ((c->d & DstMask) == ImplicitOps)
 		goto special_insn;
 
-- 
cgit v1.2.3-70-g09d2


From ea79849d4c8461034b75acb19c8041b6fddee2a5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 25 Feb 2010 16:36:43 +0200
Subject: KVM: x86 emulator: Implement jmp far opcode ff/5

Implement jmp far opcode ff/5. It is used by multiboot loader.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97a740368b3..5b6794adaa2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -346,7 +346,8 @@ static u32 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
+	SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+	SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
@@ -2322,6 +2323,7 @@ special_insn:
 	case 0xe9: /* jmp rel */
 		goto jmp;
 	case 0xea: /* jmp far */
+	jump_far:
 		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
 						VCPU_SREG_CS))
 			goto done;
@@ -2397,11 +2399,16 @@ special_insn:
 		ctxt->eflags |= EFLG_DF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
-	case 0xfe ... 0xff:	/* Grp4/Grp5 */
+	case 0xfe: /* Grp4 */
+	grp45:
 		rc = emulate_grp45(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
+	case 0xff: /* Grp5 */
+		if (c->modrm_reg == 5)
+			goto jump_far;
+		goto grp45;
 	}
 
 writeback:
-- 
cgit v1.2.3-70-g09d2


From 696e409dbd1ce325129c5030267365619364dfa0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Thu, 23 Jul 2009 06:57:45 -0300
Subject: edac_mce: Add an interface driver to report mce errors via edac

edac_mce module is an interface module that gets mcelog data and
forwards to any registered edac module that expects to receive data via
mce.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 10 +++++++
 drivers/edac/Kconfig             |  8 +++++-
 drivers/edac/Makefile            |  1 +
 drivers/edac/edac_mce.c          | 58 ++++++++++++++++++++++++++++++++++++++++
 include/linux/edac_mce.h         | 31 +++++++++++++++++++++
 5 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 drivers/edac/edac_mce.c
 create mode 100644 include/linux/edac_mce.h

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767..6585ff07ddf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/debugfs.h>
+#include <linux/edac_mce.h>
 
 #include <asm/processor.h>
 #include <asm/hw_irq.h>
@@ -168,6 +169,15 @@ void mce_log(struct mce *mce)
 	for (;;) {
 		entry = rcu_dereference_check_mce(mcelog.next);
 		for (;;) {
+			/*
+			 * If edac_mce is enabled, it will check the error type
+			 * and will process it, if it is a known error.
+			 * Otherwise, the error will be sent through mcelog
+			 * interface
+			 */
+			if (edac_mce_parse(mce))
+				return;
+
 			/*
 			 * When the buffer fills up discard new entries.
 			 * Assume that the earlier errors are the more
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 391ddbfb2a3..5b7fbc5aec8 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -69,6 +69,9 @@ config EDAC_MM_EDAC
 	  occurred so that a particular failing memory module can be
 	  replaced.  If unsure, select 'Y'.
 
+config EDAC_MCE
+	tristate
+
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
 	depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && EDAC_DECODE_MCE
@@ -169,9 +172,12 @@ config EDAC_I5400
 config EDAC_I7CORE
 	tristate "Intel i7 Core (Nehalem) processors"
 	depends on EDAC_MM_EDAC && PCI && X86
+	select EDAC_MCE
 	help
 	  Support for error detection and correction the Intel
-	  i7 Core (Nehalem) Integrated Memory Controller
+	  i7 Core (Nehalem) Integrated Memory Controller that exists on
+	  newer processors like i7 Core, i7 Core Extreme, Xeon 35xx
+	  and Xeon 55xx processors.
 
 config EDAC_I82860
 	tristate "Intel 82860"
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index b9996195b23..ca6b1bb24cc 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -8,6 +8,7 @@
 
 obj-$(CONFIG_EDAC)			:= edac_stub.o
 obj-$(CONFIG_EDAC_MM_EDAC)		+= edac_core.o
+obj-$(CONFIG_EDAC_MCE)			+= edac_mce.o
 
 edac_core-objs	:= edac_mc.o edac_device.o edac_mc_sysfs.o edac_pci_sysfs.o
 edac_core-objs	+= edac_module.o edac_device_sysfs.o
diff --git a/drivers/edac/edac_mce.c b/drivers/edac/edac_mce.c
new file mode 100644
index 00000000000..b1efa8e5192
--- /dev/null
+++ b/drivers/edac/edac_mce.c
@@ -0,0 +1,58 @@
+/* Provides edac interface to mcelog events
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License version 2.
+ *
+ * Copyright (c) 2009 by:
+ *	 Mauro Carvalho Chehab <mchehab@redhat.com>
+ *
+ * Red Hat Inc. http://www.redhat.com
+ */
+
+#include <linux/module.h>
+#include <linux/edac_mce.h>
+#include <asm/mce.h>
+
+int edac_mce_enabled;
+EXPORT_SYMBOL_GPL(edac_mce_enabled);
+
+
+/*
+ * Extension interface
+ */
+
+static LIST_HEAD(edac_mce_list);
+static DEFINE_MUTEX(edac_mce_lock);
+
+int edac_mce_register(struct edac_mce *edac_mce)
+{
+	mutex_lock(&edac_mce_lock);
+	list_add_tail(&edac_mce->list, &edac_mce_list);
+	mutex_unlock(&edac_mce_lock);
+	return 0;
+}
+EXPORT_SYMBOL(edac_mce_register);
+
+void edac_mce_unregister(struct edac_mce *edac_mce)
+{
+	mutex_lock(&edac_mce_lock);
+	list_del(&edac_mce->list);
+	mutex_unlock(&edac_mce_lock);
+}
+EXPORT_SYMBOL(edac_mce_unregister);
+
+
+
+int edac_mce_queue(struct mce *mce)
+{
+	struct edac_mce *edac_mce;
+
+	list_for_each_entry(edac_mce, &edac_mce_list, list) {
+		if (edac_mce->check_error(edac_mce->priv, mce))
+			return 1;
+	}
+
+	/* Nobody queued the error */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(edac_mce_queue);
diff --git a/include/linux/edac_mce.h b/include/linux/edac_mce.h
new file mode 100644
index 00000000000..f974fc03536
--- /dev/null
+++ b/include/linux/edac_mce.h
@@ -0,0 +1,31 @@
+/* Provides edac interface to mcelog events
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License version 2.
+ *
+ * Copyright (c) 2009 by:
+ *	 Mauro Carvalho Chehab <mchehab@redhat.com>
+ *
+ * Red Hat Inc. http://www.redhat.com
+ */
+
+#if defined(CONFIG_EDAC_MCE) || \
+			(defined(CONFIG_EDAC_MCE_MODULE) && defined(MODULE))
+
+#include <asm/mce.h>
+#include <linux/list.h>
+
+struct edac_mce {
+	struct list_head list;
+
+	void *priv;
+	int (*check_error)(void *priv, struct mce *mce);
+};
+
+int edac_mce_register(struct edac_mce *edac_mce);
+void edac_mce_unregister(struct edac_mce *edac_mce);
+int edac_mce_parse(struct mce *mce);
+
+#else
+#define edac_mce_parse(mce) (0)
+#endif
-- 
cgit v1.2.3-70-g09d2


From 5707b24a50b40582226618c56692af932db9fe02 Mon Sep 17 00:00:00 2001
From: Aristeu Rozanski <aris@redhat.com>
Date: Thu, 9 Jul 2009 22:21:13 -0300
Subject: pci: Add a probing code that seeks for an specific bus

This patch adds a probing code that seeks for an specific pci bus. It
still needs testing, but it is hoped that this will help to identify the
memory controller with Xeon 55xx series.

Signed-off-by: Aristeu Sergio <arozansk@redhat.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 arch/x86/include/asm/pci_x86.h |  2 ++
 arch/x86/pci/legacy.c          | 42 +++++++++++++++++++++++++-----------------
 2 files changed, 27 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 1a0422348d6..ff5404b82fd 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -53,6 +53,8 @@ extern int pcibios_last_bus;
 extern struct pci_bus *pci_root_bus;
 extern struct pci_ops pci_root_ops;
 
+void pcibios_scan_specific_bus(int busn);
+
 /* pci-irq.c */
 
 struct irq_info {
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 0db5eaf5456..c734c277b11 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -11,28 +11,14 @@
  */
 static void __devinit pcibios_fixup_peer_bridges(void)
 {
-	int n, devfn;
-	long node;
+	int n;
 
 	if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
 		return;
 	DBG("PCI: Peer bridge fixup\n");
 
-	for (n=0; n <= pcibios_last_bus; n++) {
-		u32 l;
-		if (pci_find_bus(0, n))
-			continue;
-		node = get_mp_bus_to_node(n);
-		for (devfn = 0; devfn < 256; devfn += 8) {
-			if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) &&
-			    l != 0x0000 && l != 0xffff) {
-				DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l);
-				printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n);
-				pci_scan_bus_on_node(n, &pci_root_ops, node);
-				break;
-			}
-		}
-	}
+	for (n=0; n <= pcibios_last_bus; n++)
+		pcibios_scan_specific_bus(n);
 }
 
 int __init pci_legacy_init(void)
@@ -49,6 +35,28 @@ int __init pci_legacy_init(void)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(pci_legacy_init);
+
+void pcibios_scan_specific_bus(int busn)
+{
+	int devfn;
+	long node;
+	u32 l;
+
+	if (pci_find_bus(0, busn))
+		return;
+
+	node = get_mp_bus_to_node(busn);
+	for (devfn = 0; devfn < 256; devfn += 8) {
+		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
+		    l != 0x0000 && l != 0xffff) {
+			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
+			printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn);
+			pci_scan_bus_on_node(busn, &pci_root_ops, node);
+			return;
+		}
+	}
+}
 
 int __init pci_subsys_init(void)
 {
-- 
cgit v1.2.3-70-g09d2


From d1fd4fb69eeeb7db0693df58b9116db498d5bfe1 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@redhat.com>
Date: Fri, 10 Jul 2009 18:39:53 -0300
Subject: i7core_edac: Add a code to probe Xeon 55xx bus

This code changes the detection procedure of i7core_edac. Instead of
directly probing for MC registers, it probes for another register found
on Nehalem. If found, it tries to pick the first MC PCI BUS. This should
work fine with Xeon 35xx, but, on Xeon 55xx, this is at bus 254 and 255
that are not properly detected by the non-legacy PCI methods.

The new detection code scans specifically at buses 254 and 255 for the
Xeon 55xx devices.

This code has not tested yet. After working, a change at the code will
be needed, since the i7core is not yet ready for working with 2 sets of
MC.

Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 arch/x86/pci/legacy.c      |  1 +
 drivers/edac/i7core_edac.c | 17 +++++++++++++----
 include/linux/pci.h        |  1 +
 include/linux/pci_ids.h    |  1 +
 4 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index c734c277b11..d6cc2eddf33 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -57,6 +57,7 @@ void pcibios_scan_specific_bus(int busn)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus);
 
 int __init pci_subsys_init(void)
 {
diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c
index 26cd5c924d5..eec0c13c020 100644
--- a/drivers/edac/i7core_edac.c
+++ b/drivers/edac/i7core_edac.c
@@ -221,15 +221,15 @@ struct i7core_dev_info {
 	.dev_id = (device_id)
 
 struct pci_id_descr pci_devs[] = {
+		/* Generic Non-core registers */
+	{ PCI_DESCR(0, 0, PCI_DEVICE_ID_INTEL_I7_NOCORE)  },
+
 		/* Memory controller */
 	{ PCI_DESCR(3, 0, PCI_DEVICE_ID_INTEL_I7_MCR)     },
 	{ PCI_DESCR(3, 1, PCI_DEVICE_ID_INTEL_I7_MC_TAD)  },
 	{ PCI_DESCR(3, 2, PCI_DEVICE_ID_INTEL_I7_MC_RAS)  }, /* if RDIMM is supported */
 	{ PCI_DESCR(3, 4, PCI_DEVICE_ID_INTEL_I7_MC_TEST) },
 
-		/* Generic Non-core registers */
-	{ PCI_DESCR(0, 0, PCI_DEVICE_ID_INTEL_I7_NOCORE)  },
-
 		/* Channel 0 */
 	{ PCI_DESCR(4, 0, PCI_DEVICE_ID_INTEL_I7_MC_CH0_CTRL) },
 	{ PCI_DESCR(4, 1, PCI_DEVICE_ID_INTEL_I7_MC_CH0_ADDR) },
@@ -255,7 +255,7 @@ struct pci_id_descr pci_devs[] = {
  * This should match the first device at pci_devs table
  */
 static const struct pci_device_id i7core_pci_tbl[] __devinitdata = {
-	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_I7_MCR)},
+	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_X58_HUB_MGMT)},
 	{0,}			/* 0 terminated list. */
 };
 
@@ -1069,6 +1069,15 @@ static int i7core_get_devices(void)
 	for (i = 0; i < N_DEVS; i++) {
 		pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
 					pci_devs[i].dev_id, NULL);
+
+		if (!pdev && !i) {
+			pcibios_scan_specific_bus(254);
+			pcibios_scan_specific_bus(255);
+
+			pdev = pci_get_device(PCI_VENDOR_ID_INTEL,
+						pci_devs[i].dev_id, NULL);
+		}
+
 		if (likely(pdev))
 			pci_devs[i].pdev = pdev;
 		else {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a788fa12ff3..5e2c7e15187 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -621,6 +621,7 @@ void pci_fixup_cardbus(struct pci_bus *);
 
 /* Generic PCI functions used internally */
 
+void pcibios_scan_specific_bus(int busn);
 extern struct pci_bus *pci_find_bus(int domain, int busnr);
 void pci_bus_add_devices(const struct pci_bus *bus);
 struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus,
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 9d5bfe86ba7..12c3da6ef14 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2554,6 +2554,7 @@
 #define PCI_DEVICE_ID_INTEL_IOAT_TBG5	0x342a
 #define PCI_DEVICE_ID_INTEL_IOAT_TBG6	0x342b
 #define PCI_DEVICE_ID_INTEL_IOAT_TBG7	0x342c
+#define PCI_DEVICE_ID_INTEL_X58_HUB_MGMT 0x342e
 #define PCI_DEVICE_ID_INTEL_IOAT_TBG0	0x3430
 #define PCI_DEVICE_ID_INTEL_IOAT_TBG1	0x3431
 #define PCI_DEVICE_ID_INTEL_IOAT_TBG2	0x3432
-- 
cgit v1.2.3-70-g09d2


From 4f7b9e7cbe68c97dbe1266709ecfc8b807b0d0ee Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 4 Dec 2009 16:49:34 -0200
Subject: i7core_edac: do not export static functions

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
---
 arch/x86/pci/legacy.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index d6cc2eddf33..8d460eaf524 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -35,7 +35,6 @@ int __init pci_legacy_init(void)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(pci_legacy_init);
 
 void pcibios_scan_specific_bus(int busn)
 {
-- 
cgit v1.2.3-70-g09d2


From d19f61f098ae9315b76a97962007f687683916d4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 17 Feb 2010 14:35:25 +0000
Subject: x86/PCI: Convert pci_config_lock to raw_spinlock

pci_config_lock must be a real spinlock in preempt-rt. Convert it to
raw_spinlock. No change for !RT kernels.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/include/asm/pci_x86.h |  2 +-
 arch/x86/pci/common.c          |  2 +-
 arch/x86/pci/direct.c          | 16 ++++++++--------
 arch/x86/pci/mmconfig_32.c     |  8 ++++----
 arch/x86/pci/numaq_32.c        |  8 ++++----
 arch/x86/pci/pcbios.c          |  8 ++++----
 6 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 1a0422348d6..8d8797eae5d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -83,7 +83,7 @@ struct irq_routing_table {
 
 extern unsigned int pcibios_irq_mask;
 
-extern spinlock_t pci_config_lock;
+extern raw_spinlock_t pci_config_lock;
 
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index cf2e93869c4..215a27ae050 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -76,7 +76,7 @@ struct pci_ops pci_root_ops = {
  * This interrupt-safe spinlock protects all accesses to PCI
  * configuration space.
  */
-DEFINE_SPINLOCK(pci_config_lock);
+DEFINE_RAW_SPINLOCK(pci_config_lock);
 
 static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d)
 {
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 347d882b3bb..bd33620b007 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 		return -EINVAL;
 	}
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
 
@@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 4095))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8);
 
@@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 	if (dev & 0x10) 
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb((u8)(0xF0 | (fn << 1)), 0xCF8);
 	outb((u8)bus, 0xCFA);
@@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus,
 
 	outb(0, 0xCF8);
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 	if (dev & 0x10) 
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb((u8)(0xF0 | (fn << 1)), 0xCF8);
 	outb((u8)bus, 0xCFA);
@@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 
 	outb(0, 0xCF8);    
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index 90d5fd476ed..a3d9c54792a 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -64,7 +64,7 @@ err:		*value = -1;
 	if (!base)
 		goto err;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -79,7 +79,7 @@ err:		*value = -1;
 		*value = mmio_config_readl(mmcfg_virt_addr + reg);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 	if (!base)
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	pci_exp_set_dev_base(base, bus, devfn);
 
@@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus,
 		mmio_config_writel(mmcfg_virt_addr + reg, value);
 		break;
 	}
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8223738ad80..5c9e2458df4 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -37,7 +37,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -62,7 +62,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
@@ -76,7 +76,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 	if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	write_cf8(bus, devfn, reg);
 
@@ -101,7 +101,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return 0;
 }
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 59a225c17b8..2492d165096 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -162,7 +162,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 	if (!value || (bus > 255) || (devfn > 255) || (reg > 255))
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -213,7 +213,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
@@ -228,7 +228,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 	if ((bus > 255) || (devfn > 255) || (reg > 255)) 
 		return -EINVAL;
 
-	spin_lock_irqsave(&pci_config_lock, flags);
+	raw_spin_lock_irqsave(&pci_config_lock, flags);
 
 	switch (len) {
 	case 1:
@@ -269,7 +269,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus,
 		break;
 	}
 
-	spin_unlock_irqrestore(&pci_config_lock, flags);
+	raw_spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return (int)((result & 0xff00) >> 8);
 }
-- 
cgit v1.2.3-70-g09d2


From 33852cb03ee4cdb05dc6e3a21ec19a4ee63511a4 Mon Sep 17 00:00:00 2001
From: Seth Heasley <seth.heasley@intel.com>
Date: Thu, 25 Mar 2010 16:11:37 -0700
Subject: x86/PCI: irq and pci_ids patch for additional Intel Cougar Point
 DeviceIDs

This patch adds additional LPC Controller DeviceIDs for the Intel Cougar
Point PCH.

The DeviceIDs are defined and referenced as a range of values, the same
way Ibex Peak was implemented.

Signed-off-by: Seth Heasley <seth.heasley@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/irq.c      | 9 +++++++--
 include/linux/pci_ids.h | 4 ++--
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 5d362b5ba06..9810a0f76c9 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,8 +589,6 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 	case PCI_DEVICE_ID_INTEL_ICH10_1:
 	case PCI_DEVICE_ID_INTEL_ICH10_2:
 	case PCI_DEVICE_ID_INTEL_ICH10_3:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC1:
-	case PCI_DEVICE_ID_INTEL_CPT_LPC2:
 		r->name = "PIIX/ICH";
 		r->get = pirq_piix_get;
 		r->set = pirq_piix_set;
@@ -605,6 +603,13 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
 		return 1;
 	}
 
+	if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && 
+		(device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
+		r->name = "PIIX/ICH";
+		r->get = pirq_piix_get;
+		r->set = pirq_piix_set;
+		return 1;
+	}
 	return 0;
 }
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 9f688d243b8..ae66851870b 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2419,8 +2419,8 @@
 #define PCI_DEVICE_ID_INTEL_82845_HB	0x1a30
 #define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
 #define PCI_DEVICE_ID_INTEL_CPT_SMBUS	0x1c22
-#define PCI_DEVICE_ID_INTEL_CPT_LPC1	0x1c42
-#define PCI_DEVICE_ID_INTEL_CPT_LPC2	0x1c43
+#define PCI_DEVICE_ID_INTEL_CPT_LPC_MIN	0x1c41
+#define PCI_DEVICE_ID_INTEL_CPT_LPC_MAX	0x1c5f
 #define PCI_DEVICE_ID_INTEL_82801AA_0	0x2410
 #define PCI_DEVICE_ID_INTEL_82801AA_1	0x2411
 #define PCI_DEVICE_ID_INTEL_82801AA_3	0x2413
-- 
cgit v1.2.3-70-g09d2


From b6dacf63e9fb2e7a1369843d6cef332f76fca6a3 Mon Sep 17 00:00:00 2001
From: Matthew Garrett <mjg@redhat.com>
Date: Tue, 11 May 2010 13:49:25 -0400
Subject: ACPI: Unconditionally set SCI_EN on resume

The ACPI spec tells us that the firmware will reenable SCI_EN on resume.
Reality disagrees in some cases. The ACPI spec tells us that the only way
to set SCI_EN is via an SMM call.
https://bugzilla.kernel.org/show_bug.cgi?id=13745 shows us that doing so
may break machines. Tracing the ACPI calls made by Windows shows that it
unconditionally sets SCI_EN on resume with a direct register write, and
therefore the overwhelming probability is that everything is fine with
this behaviour.

Signed-off-by: Matthew Garrett <mjg@redhat.com>
Tested-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/acpi/sleep.c |   2 -
 drivers/acpi/sleep.c         | 157 +------------------------------------------
 include/linux/acpi.h         |   1 -
 3 files changed, 2 insertions(+), 158 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e55..82e508677b9 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str)
 #endif
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
-		if (strncmp(str, "sci_force_enable", 16) == 0)
-			acpi_set_sci_en_on_resume();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index baa76bbf244..4ab2275b446 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -80,22 +80,6 @@ static int acpi_sleep_prepare(u32 acpi_state)
 
 #ifdef CONFIG_ACPI_SLEEP
 static u32 acpi_target_sleep_state = ACPI_STATE_S0;
-/*
- * According to the ACPI specification the BIOS should make sure that ACPI is
- * enabled and SCI_EN bit is set on wake-up from S1 - S3 sleep states.  Still,
- * some BIOSes don't do that and therefore we use acpi_enable() to enable ACPI
- * on such systems during resume.  Unfortunately that doesn't help in
- * particularly pathological cases in which SCI_EN has to be set directly on
- * resume, although the specification states very clearly that this flag is
- * owned by the hardware.  The set_sci_en_on_resume variable will be set in such
- * cases.
- */
-static bool set_sci_en_on_resume;
-
-void __init acpi_set_sci_en_on_resume(void)
-{
-	set_sci_en_on_resume = true;
-}
 
 /*
  * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the
@@ -253,11 +237,8 @@ static int acpi_suspend_enter(suspend_state_t pm_state)
 		break;
 	}
 
-	/* If ACPI is not enabled by the BIOS, we need to enable it here. */
-	if (set_sci_en_on_resume)
-		acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1);
-	else
-		acpi_enable();
+	/* This violates the spec but is required for bug compatibility. */
+	acpi_write_bit_register(ACPI_BITREG_SCI_ENABLE, 1);
 
 	/* Reprogram control registers and execute _BFS */
 	acpi_leave_sleep_state_prep(acpi_state);
@@ -346,12 +327,6 @@ static int __init init_old_suspend_ordering(const struct dmi_system_id *d)
 	return 0;
 }
 
-static int __init init_set_sci_en_on_resume(const struct dmi_system_id *d)
-{
-	set_sci_en_on_resume = true;
-	return 0;
-}
-
 static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 	{
 	.callback = init_old_suspend_ordering,
@@ -370,22 +345,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 		},
 	},
 	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Apple MacBook 1,1",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Apple Computer, Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "MacBook1,1"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Apple MacMini 1,1",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Apple Computer, Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Macmini1,1"),
-		},
-	},
-	{
 	.callback = init_old_suspend_ordering,
 	.ident = "Asus Pundit P1-AH2 (M2N8L motherboard)",
 	.matches = {
@@ -394,94 +353,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 		},
 	},
 	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Toshiba Satellite L300",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Satellite L300"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard HP G7000 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP G7000 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard HP Pavilion dv3 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv3 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Pavilion dv4",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv4"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Pavilion dv7",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion dv7"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Compaq Presario C700 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Compaq Presario C700 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Hewlett-Packard Compaq Presario CQ40 Notebook PC",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Compaq Presario CQ40 Notebook PC"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad T410",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T410"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad T510",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad T510"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad W510",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad W510"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Lenovo ThinkPad X201[s]",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"),
-		DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad X201"),
-		},
-	},
-	{
 	.callback = init_old_suspend_ordering,
 	.ident = "Panasonic CF51-2L",
 	.matches = {
@@ -490,30 +361,6 @@ static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 		DMI_MATCH(DMI_BOARD_NAME, "CF51-2L"),
 		},
 	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Dell Studio 1558",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1558"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Dell Studio 1557",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1557"),
-		},
-	},
-	{
-	.callback = init_set_sci_en_on_resume,
-	.ident = "Dell Studio 1555",
-	.matches = {
-		DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
-		DMI_MATCH(DMI_PRODUCT_NAME, "Studio 1555"),
-		},
-	},
 	{},
 };
 #endif /* CONFIG_SUSPEND */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b926afe8c03..87ca4913294 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -251,7 +251,6 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n,
 void __init acpi_no_s4_hw_signature(void);
 void __init acpi_old_suspend_ordering(void);
 void __init acpi_s4_no_nvs(void);
-void __init acpi_set_sci_en_on_resume(void);
 #endif /* CONFIG_PM_SLEEP */
 
 struct acpi_osc_context {
-- 
cgit v1.2.3-70-g09d2


From 0fc5c3a54d68d0e6c2f3b346dcc924ba928c4d0e Mon Sep 17 00:00:00 2001
From: Andrea Gelmini <andrea.gelmini@gelma.net>
Date: Sat, 27 Feb 2010 17:51:43 +0100
Subject: KVM: arch/x86/kvm/kvm_timer.h checkpatch cleanup

arch/x86/kvm/kvm_timer.h:13: ERROR: code indent should use tabs where possible

Signed-off-by: Andrea Gelmini <andrea.gelmini@gelma.net>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/kvm_timer.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 55c7524dda5..64bc6ea78d9 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -10,9 +10,7 @@ struct kvm_timer {
 };
 
 struct kvm_timer_ops {
-        bool (*is_periodic)(struct kvm_timer *);
+	bool (*is_periodic)(struct kvm_timer *);
 };
 
-
 enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
-
-- 
cgit v1.2.3-70-g09d2


From d24778265ac9b2602889a5e99c6e7ba777a236df Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:34 +0100
Subject: KVM: SVM: Return correct values in nested_svm_exit_handled_msr

The nested_svm_exit_handled_msr() returned an bool which is
a bug. I worked by accident because the exected integer
return values match with the true and false values. This
patch changes the return value to int and let the function
return the correct values.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6882be5b926..07437ca1278 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1552,16 +1552,16 @@ static void nested_svm_unmap(struct page *page)
 	kvm_release_page_dirty(page);
 }
 
-static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 param = svm->vmcb->control.exit_info_1 & 1;
 	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	bool ret = false;
 	u32 t0, t1;
+	int ret;
 	u8 val;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
-		return false;
+		return NESTED_EXIT_HOST;
 
 	switch (msr) {
 	case 0 ... 0x1fff:
@@ -1579,12 +1579,12 @@ static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 		t0 %= 8;
 		break;
 	default:
-		ret = true;
+		ret = NESTED_EXIT_DONE;
 		goto out;
 	}
 
 	if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
-		ret = val & ((1 << param) << t0);
+		ret = val & ((1 << param) << t0) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 
 out:
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 455716fa941ec7a03c04bd54e1b906698171b15c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:35 +0100
Subject: KVM: SVM: Move msrpm offset calculation to seperate function

The algorithm to find the offset in the msrpm for a given
msr is needed at other places too. Move that logic to its
own function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 53 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 07437ca1278..429a24435c6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -117,6 +117,8 @@ struct vcpu_svm {
 	unsigned long int3_rip;
 };
 
+#define MSR_INVALID			0xffffffffU
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -200,6 +202,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
 #define MSRS_RANGE_SIZE 2048
 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
 
+static u32 svm_msrpm_offset(u32 msr)
+{
+	u32 offset;
+	int i;
+
+	for (i = 0; i < NUM_MSR_MAPS; i++) {
+		if (msr < msrpm_ranges[i] ||
+		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
+			continue;
+
+		offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
+		offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
+
+		/* Now we have the u8 offset - but need the u32 offset */
+		return offset / 4;
+	}
+
+	/* MSR not in any range */
+	return MSR_INVALID;
+}
+
 #define MAX_INST_SIZE 15
 
 static inline u32 svm_has(u32 feat)
@@ -418,23 +441,21 @@ err_1:
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 				 int read, int write)
 {
-	int i;
+	u8 bit_read, bit_write;
+	unsigned long tmp;
+	u32 offset;
 
-	for (i = 0; i < NUM_MSR_MAPS; i++) {
-		if (msr >= msrpm_ranges[i] &&
-		    msr < msrpm_ranges[i] + MSRS_IN_RANGE) {
-			u32 msr_offset = (i * MSRS_IN_RANGE + msr -
-					  msrpm_ranges[i]) * 2;
-
-			u32 *base = msrpm + (msr_offset / 32);
-			u32 msr_shift = msr_offset % 32;
-			u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1);
-			*base = (*base & ~(0x3 << msr_shift)) |
-				(mask << msr_shift);
-			return;
-		}
-	}
-	BUG();
+	offset    = svm_msrpm_offset(msr);
+	bit_read  = 2 * (msr & 0x0f);
+	bit_write = 2 * (msr & 0x0f) + 1;
+	tmp       = msrpm[offset];
+
+	BUG_ON(offset == MSR_INVALID);
+
+	read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
+	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
+
+	msrpm[offset] = tmp;
 }
 
 static void svm_vcpu_init_msrpm(u32 *msrpm)
-- 
cgit v1.2.3-70-g09d2


From ac72a9b733995cb3ef538000f6309b5e724aa469 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:36 +0100
Subject: KVM: SVM: Introduce direct access msr list

This patch introduces a list with all msrs a guest might
have direct access to and changes the svm_vcpu_init_msrpm
function to use this list.
It also adds a check to set_msr_interception which triggers
a warning if a developer changes a msr intercept that is not
in the list.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 56 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 46 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 429a24435c6..a079550d388 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -119,6 +119,27 @@ struct vcpu_svm {
 
 #define MSR_INVALID			0xffffffffU
 
+static struct svm_direct_access_msrs {
+	u32 index;   /* Index of the MSR */
+	bool always; /* True if intercept is always on */
+} direct_access_msrs[] = {
+	{ .index = MSR_K6_STAR,				.always = true  },
+	{ .index = MSR_IA32_SYSENTER_CS,		.always = true  },
+#ifdef CONFIG_X86_64
+	{ .index = MSR_GS_BASE,				.always = true  },
+	{ .index = MSR_FS_BASE,				.always = true  },
+	{ .index = MSR_KERNEL_GS_BASE,			.always = true  },
+	{ .index = MSR_LSTAR,				.always = true  },
+	{ .index = MSR_CSTAR,				.always = true  },
+	{ .index = MSR_SYSCALL_MASK,			.always = true  },
+#endif
+	{ .index = MSR_IA32_LASTBRANCHFROMIP,		.always = false },
+	{ .index = MSR_IA32_LASTBRANCHTOIP,		.always = false },
+	{ .index = MSR_IA32_LASTINTFROMIP,		.always = false },
+	{ .index = MSR_IA32_LASTINTTOIP,		.always = false },
+	{ .index = MSR_INVALID,				.always = false },
+};
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
@@ -438,6 +459,17 @@ err_1:
 
 }
 
+static bool valid_msr_intercept(u32 index)
+{
+	int i;
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
+		if (direct_access_msrs[i].index == index)
+			return true;
+
+	return false;
+}
+
 static void set_msr_interception(u32 *msrpm, unsigned msr,
 				 int read, int write)
 {
@@ -445,6 +477,12 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
 	unsigned long tmp;
 	u32 offset;
 
+	/*
+	 * If this warning triggers extend the direct_access_msrs list at the
+	 * beginning of the file
+	 */
+	WARN_ON(!valid_msr_intercept(msr));
+
 	offset    = svm_msrpm_offset(msr);
 	bit_read  = 2 * (msr & 0x0f);
 	bit_write = 2 * (msr & 0x0f) + 1;
@@ -460,18 +498,16 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
 
 static void svm_vcpu_init_msrpm(u32 *msrpm)
 {
+	int i;
+
 	memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
 
-#ifdef CONFIG_X86_64
-	set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
-	set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
-	set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
-	set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
-#endif
-	set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
-	set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+		if (!direct_access_msrs[i].always)
+			continue;
+
+		set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
+	}
 }
 
 static void svm_enable_lbrv(struct vcpu_svm *svm)
-- 
cgit v1.2.3-70-g09d2


From 323c3d809b8bd42d6d557c734d4bdfdefa110445 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:37 +0100
Subject: KVM: SVM: Optimize nested svm msrpm merging

This patch optimizes the way the msrpm of the host and the
guest are merged. The old code merged the 2 msrpm pages
completly. This code needed to touch 24kb of memory for that
operation. The optimized variant this patch introduces
merges only the parts where the host msrpm may contain zero
bits. This reduces the amount of memory which is touched to
48 bytes.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 71 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a079550d388..45a287e51e1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -93,6 +93,9 @@ struct nested_state {
 
 };
 
+#define MSRPM_OFFSETS	16
+static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
+
 struct vcpu_svm {
 	struct kvm_vcpu vcpu;
 	struct vmcb *vmcb;
@@ -510,6 +513,49 @@ static void svm_vcpu_init_msrpm(u32 *msrpm)
 	}
 }
 
+static void add_msr_offset(u32 offset)
+{
+	int i;
+
+	for (i = 0; i < MSRPM_OFFSETS; ++i) {
+
+		/* Offset already in list? */
+		if (msrpm_offsets[i] == offset)
+			return;
+
+		/* Slot used by another offset? */
+		if (msrpm_offsets[i] != MSR_INVALID)
+			continue;
+
+		/* Add offset to list */
+		msrpm_offsets[i] = offset;
+
+		return;
+	}
+
+	/*
+	 * If this BUG triggers the msrpm_offsets table has an overflow. Just
+	 * increase MSRPM_OFFSETS in this case.
+	 */
+	BUG();
+}
+
+static void init_msrpm_offsets(void)
+{
+	int i;
+
+	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
+
+	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
+		u32 offset;
+
+		offset = svm_msrpm_offset(direct_access_msrs[i].index);
+		BUG_ON(offset == MSR_INVALID);
+
+		add_msr_offset(offset);
+	}
+}
+
 static void svm_enable_lbrv(struct vcpu_svm *svm)
 {
 	u32 *msrpm = svm->msrpm;
@@ -548,6 +594,8 @@ static __init int svm_hardware_setup(void)
 	memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
 	iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
 
+	init_msrpm_offsets();
+
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
@@ -811,6 +859,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm_vcpu_init_msrpm(svm->msrpm);
 
 	svm->nested.msrpm = page_address(nested_msrpm_pages);
+	svm_vcpu_init_msrpm(svm->nested.msrpm);
 
 	svm->vmcb = page_address(page);
 	clear_page(svm->vmcb);
@@ -1888,20 +1937,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 
 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
-	u32 *nested_msrpm;
-	struct page *page;
+	/*
+	 * This function merges the msr permission bitmaps of kvm and the
+	 * nested vmcb. It is omptimized in that it only merges the parts where
+	 * the kvm msr permission bitmap may contain zero bits
+	 */
 	int i;
 
-	nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, &page);
-	if (!nested_msrpm)
-		return false;
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
+		return true;
 
-	for (i = 0; i < PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++)
-		svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i];
+	for (i = 0; i < MSRPM_OFFSETS; i++) {
+		u32 value, p;
+		u64 offset;
 
-	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+		if (msrpm_offsets[i] == 0xffffffff)
+			break;
 
-	nested_svm_unmap(page);
+		offset = svm->nested.vmcb_msrpm + msrpm_offsets[i];
+		p      = msrpm_offsets[i] / 4;
+
+		if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+			return false;
+
+		svm->nested.msrpm[p] = svm->msrpm[p] | value;
+	}
+
+	svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
 
 	return true;
 }
-- 
cgit v1.2.3-70-g09d2


From 0d6b35378e80c555e20ca3aa3d3cc609b403cbb6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:38 +0100
Subject: KVM: SVM: Use svm_msrpm_offset in nested_svm_exit_handled_msr

There is a generic function now to calculate msrpm offsets.
Use that function in nested_svm_exit_handled_msr() remove
the duplicate logic (which had a bug anyway).

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 47 +++++++++++++++++------------------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 45a287e51e1..7cb2eb906ec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1660,40 +1660,27 @@ static void nested_svm_unmap(struct page *page)
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-	u32 param = svm->vmcb->control.exit_info_1 & 1;
-	u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	u32 t0, t1;
-	int ret;
-	u8 val;
+	u32 offset, msr, value;
+	int write, mask;
 
 	if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
 		return NESTED_EXIT_HOST;
 
-	switch (msr) {
-	case 0 ... 0x1fff:
-		t0 = (msr * 2) % 8;
-		t1 = msr / 8;
-		break;
-	case 0xc0000000 ... 0xc0001fff:
-		t0 = (8192 + msr - 0xc0000000) * 2;
-		t1 = (t0 / 8);
-		t0 %= 8;
-		break;
-	case 0xc0010000 ... 0xc0011fff:
-		t0 = (16384 + msr - 0xc0010000) * 2;
-		t1 = (t0 / 8);
-		t0 %= 8;
-		break;
-	default:
-		ret = NESTED_EXIT_DONE;
-		goto out;
-	}
+	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	offset = svm_msrpm_offset(msr);
+	write  = svm->vmcb->control.exit_info_1 & 1;
+	mask   = 1 << ((2 * (msr & 0xf)) + write);
 
-	if (!kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + t1, &val, 1))
-		ret = val & ((1 << param) << t0) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+	if (offset == MSR_INVALID)
+		return NESTED_EXIT_DONE;
 
-out:
-	return ret;
+	/* Offset is in 32 bit units but need in 8 bit units */
+	offset *= 4;
+
+	if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+		return NESTED_EXIT_DONE;
+
+	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
 static int nested_svm_exit_special(struct vcpu_svm *svm)
@@ -1954,8 +1941,8 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 		if (msrpm_offsets[i] == 0xffffffff)
 			break;
 
-		offset = svm->nested.vmcb_msrpm + msrpm_offsets[i];
-		p      = msrpm_offsets[i] / 4;
+		p      = msrpm_offsets[i];
+		offset = svm->nested.vmcb_msrpm + (p * 4);
 
 		if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
 			return false;
-- 
cgit v1.2.3-70-g09d2


From ce2ac085ff1500aad157cabd8221b7c38eb751bd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:39 +0100
Subject: KVM; SVM: Add correct handling of nested iopm

This patch adds the correct handling of the nested io
permission bitmap. Old behavior was to not lookup the port
in the iopm but only reinject an io intercept to the guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7cb2eb906ec..ddea391e78c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -79,6 +79,7 @@ struct nested_state {
 
 	/* gpa pointers to the real vectors */
 	u64 vmcb_msrpm;
+	u64 vmcb_iopm;
 
 	/* A VMEXIT is required but not yet emulated */
 	bool exit_required;
@@ -1658,6 +1659,26 @@ static void nested_svm_unmap(struct page *page)
 	kvm_release_page_dirty(page);
 }
 
+static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
+{
+	unsigned port;
+	u8 val, bit;
+	u64 gpa;
+
+	if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
+		return NESTED_EXIT_HOST;
+
+	port = svm->vmcb->control.exit_info_1 >> 16;
+	gpa  = svm->nested.vmcb_iopm + (port / 8);
+	bit  = port % 8;
+	val  = 0;
+
+	if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1))
+		val &= (1 << bit);
+
+	return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+}
+
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
 	u32 offset, msr, value;
@@ -1723,6 +1744,9 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 	case SVM_EXIT_MSR:
 		vmexit = nested_svm_exit_handled_msr(svm);
 		break;
+	case SVM_EXIT_IOIO:
+		vmexit = nested_svm_intercept_ioio(svm);
+		break;
 	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
 		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
 		if (svm->nested.intercept_cr_read & cr_bits)
@@ -2047,6 +2071,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
 	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
 	/* cache intercepts */
 	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
-- 
cgit v1.2.3-70-g09d2


From f71385383fb86246ab3c69cc17a09ef9883590d8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 1 Mar 2010 15:34:40 +0100
Subject: KVM: SVM: Ignore lower 12 bit of nested msrpm_pa

These bits are ignored by the hardware too. Implement this
for nested svm too.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ddea391e78c..490bec19988 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2070,7 +2070,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 	svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
 	svm->vmcb->save.cpl = nested_vmcb->save.cpl;
 
-	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
+	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
 	svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
 
 	/* cache intercepts */
-- 
cgit v1.2.3-70-g09d2


From 835e6b80478e59820cff127adba3e518ae5a43f5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 3 Mar 2010 17:53:05 +0200
Subject: KVM: x86 emulator mark VMMCALL and LMSW as privileged

LMSW is present in both group tables. It was marked privileged only in
one of them. Intel analog of VMMCALL is already marked privileged.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5b6794adaa2..2832a8c07c6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -362,9 +362,9 @@ static u32 group_table[] = {
 
 static u32 group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
+	SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv,
 	SrcNone | ModRM | DstMem | Mov, 0,
-	SrcMem16 | ModRM | Mov, 0,
+	SrcMem16 | ModRM | Mov | Priv, 0,
 	[Group9*8] =
 	0, 0, 0, 0, 0, 0, 0, 0,
 };
-- 
cgit v1.2.3-70-g09d2


From 2ed152afc7ed61830b848b32936e1541a1a57799 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 10 Mar 2010 19:00:43 +0800
Subject: KVM: cleanup kvm trace

This patch does:

 - no need call tracepoint_synchronize_unregister() when kvm module
   is unloaded since ftrace can handle it

 - cleanup ftrace's macro

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 1 -
 arch/x86/kvm/mmutrace.h    | 7 +++++--
 arch/x86/kvm/trace.h       | 7 +++++--
 arch/x86/kvm/x86.c         | 2 +-
 include/trace/events/kvm.h | 1 -
 virt/kvm/kvm_main.c        | 1 -
 6 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 19a8906bcaa..3af2dfd8778 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644);
 
 #include <trace/events/kvm.h>
 
-#undef TRACE_INCLUDE_FILE
 #define CREATE_TRACE_POINTS
 #include "mmutrace.h"
 
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3e4a5c6ca2a..1fe956ab761 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -6,8 +6,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvmmmu
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE mmutrace
 
 #define KVM_MMU_PAGE_FIELDS \
 	__field(__u64, gfn) \
@@ -216,5 +214,10 @@ TRACE_EVENT(
 
 #endif /* _TRACE_KVMMMU_H */
 
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE mmutrace
+
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 17b52ccd977..b75efef79e5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,8 +5,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_PATH arch/x86/kvm
-#define TRACE_INCLUDE_FILE trace
 
 /*
  * Tracepoint for guest mode entry.
@@ -575,5 +573,10 @@ TRACE_EVENT(kvm_skinit,
 
 #endif /* _TRACE_KVM_H */
 
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH arch/x86/kvm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
 /* This part must be outside protection */
 #include <trace/define_trace.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b34dc705cf..74e70d975ff 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -41,7 +41,7 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <trace/events/kvm.h>
-#undef TRACE_INCLUDE_FILE
+
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index b17d49dfc3e..6dd3a51ab1c 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -5,7 +5,6 @@
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kvm
-#define TRACE_INCLUDE_FILE kvm
 
 #if defined(__KVM_HAVE_IOAPIC)
 TRACE_EVENT(kvm_set_irq,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5bac6eb0f0a..b152b23cd09 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2279,7 +2279,6 @@ EXPORT_SYMBOL_GPL(kvm_init);
 
 void kvm_exit(void)
 {
-	tracepoint_synchronize_unregister();
 	kvm_exit_debug();
 	misc_deregister(&kvm_dev);
 	kmem_cache_destroy(kvm_vcpu_cache);
-- 
cgit v1.2.3-70-g09d2


From d4f64b6cad0fc0fb4cec868c6ca6b1325949d08b Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan.kim@gmail.com>
Date: Wed, 10 Mar 2010 23:31:22 +0900
Subject: KVM: remove redundant initialization of page->private

The prep_new_page() in page allocator calls set_page_private(page, 0).
So we don't need to reinitialize private of page.

Signed-off-by: Minchan Kim <minchan.kim@gmail.com>
Cc: Avi Kivity<avi@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3af2dfd8778..4455ddbe36f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -326,7 +326,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 		page = alloc_page(GFP_KERNEL);
 		if (!page)
 			return -ENOMEM;
-		set_page_private(page, 0);
 		cache->objects[cache->nobjs++] = page_address(page);
 	}
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 5bfd8b5455e69b37af16a2df1edae2c3b567648c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 11 Mar 2010 10:50:44 +0200
Subject: KVM: Move kvm_exit tracepoint rip reading inside tracepoint

Reading rip is expensive on vmx, so move it inside the tracepoint so we only
incur the cost if tracing is enabled.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c   | 2 +-
 arch/x86/kvm/trace.h | 6 +++---
 arch/x86/kvm/vmx.c   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 490bec19988..abbc3f9d03b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2686,7 +2686,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 	struct kvm_run *kvm_run = vcpu->run;
 	u32 exit_code = svm->vmcb->control.exit_code;
 
-	trace_kvm_exit(exit_code, svm->vmcb->save.rip);
+	trace_kvm_exit(exit_code, vcpu);
 
 	if (unlikely(svm->nested.exit_required)) {
 		nested_svm_vmexit(svm);
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index b75efef79e5..d10b359a21f 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -182,8 +182,8 @@ TRACE_EVENT(kvm_apic,
  * Tracepoint for kvm guest exit:
  */
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, unsigned long guest_rip),
-	TP_ARGS(exit_reason, guest_rip),
+	TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
+	TP_ARGS(exit_reason, vcpu),
 
 	TP_STRUCT__entry(
 		__field(	unsigned int,	exit_reason	)
@@ -192,7 +192,7 @@ TRACE_EVENT(kvm_exit,
 
 	TP_fast_assign(
 		__entry->exit_reason	= exit_reason;
-		__entry->guest_rip	= guest_rip;
+		__entry->guest_rip	= kvm_rip_read(vcpu);
 	),
 
 	TP_printk("reason %s rip 0x%lx",
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e2a24693be..3dbfc20824b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3612,7 +3612,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
-	trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
+	trace_kvm_exit(exit_reason, vcpu);
 
 	/* If guest state is invalid, start emulating */
 	if (vmx->emulation_required && emulate_invalid_guest_state)
-- 
cgit v1.2.3-70-g09d2


From 5c1c85d08da5c257b21b0423b96fa6554aa4cb6f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 11 Mar 2010 13:01:59 +0200
Subject: KVM: Trace exception injection

Often an exception can help point out where things start to go wrong.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/trace.h | 32 ++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c   |  3 +++
 2 files changed, 35 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index d10b359a21f..32c912c40bf 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -219,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq,
 	TP_printk("irq %u", __entry->irq)
 );
 
+#define EXS(x) { x##_VECTOR, "#" #x }
+
+#define kvm_trace_sym_exc						\
+	EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM),	\
+	EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF),		\
+	EXS(MF), EXS(MC)
+
+/*
+ * Tracepoint for kvm interrupt injection:
+ */
+TRACE_EVENT(kvm_inj_exception,
+	TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
+	TP_ARGS(exception, has_error, error_code),
+
+	TP_STRUCT__entry(
+		__field(	u8,	exception	)
+		__field(	u8,	has_error	)
+		__field(	u32,	error_code	)
+	),
+
+	TP_fast_assign(
+		__entry->exception	= exception;
+		__entry->has_error	= has_error;
+		__entry->error_code	= error_code;
+	),
+
+	TP_printk("%s (0x%x)",
+		  __print_symbolic(__entry->exception, kvm_trace_sym_exc),
+		  /* FIXME: don't print error_code if not present */
+		  __entry->has_error ? __entry->error_code : 0)
+);
+
 /*
  * Tracepoint for page fault.
  */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 74e70d975ff..a1cf87fe9f3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4237,6 +4237,9 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 {
 	/* try to reinject previous events if any */
 	if (vcpu->arch.exception.pending) {
+		trace_kvm_inj_exception(vcpu->arch.exception.nr,
+					vcpu->arch.exception.has_error_code,
+					vcpu->arch.exception.error_code);
 		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 					  vcpu->arch.exception.has_error_code,
 					  vcpu->arch.exception.error_code);
-- 
cgit v1.2.3-70-g09d2


From ec68798c8fd0f01cdbd3f3e1a970e76a644cf08e Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 5 Mar 2010 12:11:48 +0800
Subject: KVM: x86: Use native_store_idt() instead of kvm_get_idt()

This patch use generic linux function native_store_idt()
instead of kvm_get_idt(), and also removed the useless
function kvm_get_idt().

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 5 -----
 arch/x86/kvm/vmx.c              | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ec891a2ce86..ea1b6c615f9 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -716,11 +716,6 @@ static inline void kvm_load_ldt(u16 sel)
 	asm("lldt %0" : : "rm"(sel));
 }
 
-static inline void kvm_get_idt(struct desc_ptr *table)
-{
-	asm("sidt %0" : "=m"(*table));
-}
-
 #ifdef CONFIG_X86_64
 static inline unsigned long read_msr(unsigned long msr)
 {
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3dbfc20824b..33d88e0a060 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2452,7 +2452,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
 
-	kvm_get_idt(&dt);
+	native_store_idt(&dt);
 	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
 
 	asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
-- 
cgit v1.2.3-70-g09d2


From 160d2f6c0c90713aa3bb93dd344fe0d527342e26 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 12 Mar 2010 10:09:45 +0800
Subject: KVM: x86: fix the error of ioctl KVM_IRQ_LINE if no irq chip

If no irq chip in kernel, ioctl KVM_IRQ_LINE will return -EFAULT.
But I see in other place such as KVM_[GET|SET]IRQCHIP, -ENXIO is
return. So this patch used -ENXIO instead of -EFAULT.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a1cf87fe9f3..7a8fe9051ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2836,11 +2836,13 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
 			goto out;
+		r = -ENXIO;
 		if (irqchip_in_kernel(kvm)) {
 			__s32 status;
 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
 					irq_event.irq, irq_event.level);
 			if (ioctl == KVM_IRQ_LINE_STATUS) {
+				r = -EFAULT;
 				irq_event.status = status;
 				if (copy_to_user(argp, &irq_event,
 							sizeof irq_event))
-- 
cgit v1.2.3-70-g09d2


From 72016f3a4221799a0b1fdf443ef6e29db572a9bb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 15 Mar 2010 13:59:53 +0200
Subject: KVM: MMU: Consolidate two guest pte reads in kvm_mmu_pte_write()

kvm_mmu_pte_write() reads guest ptes in two different occasions, both to
allow a 32-bit pae guest to update a pte with 4-byte writes.  Consolidate
these into a single read, which also allows us to consolidate another read
from an invlpg speculating a gpte into the shadow page table.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 69 ++++++++++++++++++++++++------------------------------
 1 file changed, 31 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4455ddbe36f..91f8b171c82 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2560,36 +2560,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
 }
 
 static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-					  const u8 *new, int bytes)
+					  u64 gpte)
 {
 	gfn_t gfn;
-	int r;
-	u64 gpte = 0;
 	pfn_t pfn;
 
-	if (bytes != 4 && bytes != 8)
-		return;
-
-	/*
-	 * Assume that the pte write on a page table of the same type
-	 * as the current vcpu paging mode.  This is nearly always true
-	 * (might be false while changing modes).  Note it is verified later
-	 * by update_pte().
-	 */
-	if (is_pae(vcpu)) {
-		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		if ((bytes == 4) && (gpa % 4 == 0)) {
-			r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8);
-			if (r)
-				return;
-			memcpy((void *)&gpte + (gpa % 8), new, 4);
-		} else if ((bytes == 8) && (gpa % 8 == 0)) {
-			memcpy((void *)&gpte, new, 8);
-		}
-	} else {
-		if ((bytes == 4) && (gpa % 4 == 0))
-			memcpy((void *)&gpte, new, 4);
-	}
 	if (!is_present_gpte(gpte))
 		return;
 	gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -2640,7 +2615,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int r;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
-	mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
+
+	switch (bytes) {
+	case 4:
+		gentry = *(const u32 *)new;
+		break;
+	case 8:
+		gentry = *(const u64 *)new;
+		break;
+	default:
+		gentry = 0;
+		break;
+	}
+
+	/*
+	 * Assume that the pte write on a page table of the same type
+	 * as the current vcpu paging mode.  This is nearly always true
+	 * (might be false while changing modes).  Note it is verified later
+	 * by update_pte().
+	 */
+	if (is_pae(vcpu) && bytes == 4) {
+		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
+		gpa &= ~(gpa_t)7;
+		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+		if (r)
+			gentry = 0;
+	}
+
+	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
@@ -2705,20 +2707,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 				continue;
 		}
 		spte = &sp->spt[page_offset / sizeof(*spte)];
-		if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
-			gentry = 0;
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  gpa & ~(u64)(pte_size - 1),
-						  &gentry, pte_size);
-			new = (const void *)&gentry;
-			if (r < 0)
-				new = NULL;
-		}
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			if (new)
-				mmu_pte_write_new_pte(vcpu, sp, spte, new);
+			if (gentry)
+				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
 			++spte;
 		}
-- 
cgit v1.2.3-70-g09d2


From daea3e73cb4ac971bee97f333ae027861d00fc0b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 15 Mar 2010 13:59:54 +0200
Subject: KVM: Make locked operations truly atomic

Once upon a time, locked operations were emulated while holding the mmu mutex.
Since mmu pages were write protected, it was safe to emulate the writes in
a non-atomic manner, since there could be no other writer, either in the
guest or in the kernel.

These days emulation takes place without holding the mmu spinlock, so the
write could be preempted by an unshadowing event, which exposes the page
to writes by the guest.  This may cause corruption of guest page tables.

Fix by using an atomic cmpxchg for these operations.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 69 +++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 48 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a8fe9051ec..855f3ea9edd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3291,41 +3291,68 @@ int emulator_write_emulated(unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
+#define CMPXCHG_TYPE(t, ptr, old, new) \
+	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
+
+#ifdef CONFIG_X86_64
+#  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
+#else
+#  define CMPXCHG64(ptr, old, new) \
+	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u *)(new)) == *(u64 *)(old))
+#endif
+
 static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *new,
 				     unsigned int bytes,
 				     struct kvm_vcpu *vcpu)
 {
-	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
-#ifndef CONFIG_X86_64
-	/* guests cmpxchg8b have to be emulated atomically */
-	if (bytes == 8) {
-		gpa_t gpa;
-		struct page *page;
-		char *kaddr;
-		u64 val;
+	gpa_t gpa;
+	struct page *page;
+	char *kaddr;
+	bool exchanged;
 
-		gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
+	/* guests cmpxchg8b have to be emulated atomically */
+	if (bytes > 8 || (bytes & (bytes - 1)))
+		goto emul_write;
 
-		if (gpa == UNMAPPED_GVA ||
-		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
-			goto emul_write;
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
 
-		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
-			goto emul_write;
+	if (gpa == UNMAPPED_GVA ||
+	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+		goto emul_write;
 
-		val = *(u64 *)new;
+	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+		goto emul_write;
 
-		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 
-		kaddr = kmap_atomic(page, KM_USER0);
-		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
-		kunmap_atomic(kaddr, KM_USER0);
-		kvm_release_page_dirty(page);
+	kaddr = kmap_atomic(page, KM_USER0);
+	kaddr += offset_in_page(gpa);
+	switch (bytes) {
+	case 1:
+		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+		break;
+	case 2:
+		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+		break;
+	case 4:
+		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+		break;
+	case 8:
+		exchanged = CMPXCHG64(kaddr, old, new);
+		break;
+	default:
+		BUG();
 	}
+	kunmap_atomic(kaddr, KM_USER0);
+	kvm_release_page_dirty(page);
+
+	if (!exchanged)
+		return X86EMUL_CMPXCHG_FAILED;
+
 emul_write:
-#endif
+	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
 	return emulator_write_emulated(addr, new, bytes, vcpu);
 }
-- 
cgit v1.2.3-70-g09d2


From 4a5f48f666ccc4ffdbc54241d9cab06806ed7922 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 15 Mar 2010 13:59:55 +0200
Subject: KVM: Don't follow an atomic operation by a non-atomic one

Currently emulated atomic operations are immediately followed by a non-atomic
operation, so that kvm_mmu_pte_write() can be invoked.  This updates the mmu
but undoes the whole point of doing things atomically.

Fix by only performing the atomic operation and the mmu update, and avoiding
the non-atomic write.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 855f3ea9edd..dd4a7ad63af 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3234,7 +3234,8 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
-					   struct kvm_vcpu *vcpu)
+					   struct kvm_vcpu *vcpu,
+					   bool mmu_only)
 {
 	gpa_t                 gpa;
 	u32 error_code;
@@ -3250,6 +3251,10 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto mmio;
 
+	if (mmu_only) {
+		kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+		return X86EMUL_CONTINUE;
+	}
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
@@ -3270,24 +3275,35 @@ mmio:
 	return X86EMUL_CONTINUE;
 }
 
-int emulator_write_emulated(unsigned long addr,
+int __emulator_write_emulated(unsigned long addr,
 				   const void *val,
 				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu)
+				   struct kvm_vcpu *vcpu,
+				   bool mmu_only)
 {
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+		rc = emulator_write_emulated_onepage(addr, val, now, vcpu,
+						     mmu_only);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+	return emulator_write_emulated_onepage(addr, val, bytes, vcpu,
+					       mmu_only);
+}
+
+int emulator_write_emulated(unsigned long addr,
+				   const void *val,
+				   unsigned int bytes,
+				   struct kvm_vcpu *vcpu)
+{
+	return __emulator_write_emulated(addr, val, bytes, vcpu, false);
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
@@ -3351,6 +3367,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
+	return __emulator_write_emulated(addr, new, bytes, vcpu, true);
+
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
@@ -4005,7 +4023,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return emulator_write_emulated(rip, instruction, 3, vcpu);
+	return __emulator_write_emulated(rip, instruction, 3, vcpu, false);
 }
 
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-- 
cgit v1.2.3-70-g09d2


From fbc5d139bb92e6822e4c000f97631a072d8babf9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 15 Mar 2010 13:59:56 +0200
Subject: KVM: MMU: Do not instantiate nontrapping spte on unsync page

The update_pte() path currently uses a nontrapping spte when a nonpresent
(or nonaccessed) gpte is written.  This is fine since at present it is only
used on sync pages.  However, on an unsync page this will cause an endless
fault loop as the guest is under no obligation to invlpg a gpte that
transitions from nonpresent to present.

Needed for the next patch which reinstates update_pte() on invlpg.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 81eab9a50e6..4b37e1acd37 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	pt_element_t gpte;
 	unsigned pte_access;
 	pfn_t pfn;
+	u64 new_spte;
 
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
-		if (!is_present_gpte(gpte))
-			__set_spte(spte, shadow_notrap_nonpresent_pte);
+		if (!is_present_gpte(gpte)) {
+			if (page->unsync)
+				new_spte = shadow_trap_nonpresent_pte;
+			else
+				new_spte = shadow_notrap_nonpresent_pte;
+			__set_spte(spte, new_spte);
+		}
 		return;
 	}
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-- 
cgit v1.2.3-70-g09d2


From 08e850c6536db302050c0287649e68e3bbdfe2c7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 15 Mar 2010 13:59:57 +0200
Subject: KVM: MMU: Reinstate pte prefetch on invlpg

Commit fb341f57 removed the pte prefetch on guest invlpg, citing guest races.
However, the SDM is adamant that prefetch is allowed:

  "The processor may create entries in paging-structure caches for
   translations required for prefetches and for accesses that are a
   result of speculative execution that would never actually occur
   in the executed code path."

And, in fact, there was a race in the prefetch code: we picked up the pte
without the mmu lock held, so an older invlpg could install the pte over
a newer invlpg.

Reinstate the prefetch logic, but this time note whether another invlpg has
executed using a counter.  If a race occured, do not install the pte.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 37 +++++++++++++++++++++++--------------
 arch/x86/kvm/paging_tmpl.h      | 15 +++++++++++++++
 3 files changed, 39 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea1b6c615f9..28826c82d1e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -389,6 +389,7 @@ struct kvm_arch {
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
+	atomic_t invlpg_counter;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 91f8b171c82..064c3efb49d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2613,20 +2613,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int flooded = 0;
 	int npte;
 	int r;
+	int invlpg_counter;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
-	switch (bytes) {
-	case 4:
-		gentry = *(const u32 *)new;
-		break;
-	case 8:
-		gentry = *(const u64 *)new;
-		break;
-	default:
-		gentry = 0;
-		break;
-	}
+	invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
 
 	/*
 	 * Assume that the pte write on a page table of the same type
@@ -2634,16 +2625,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	 * (might be false while changing modes).  Note it is verified later
 	 * by update_pte().
 	 */
-	if (is_pae(vcpu) && bytes == 4) {
+	if ((is_pae(vcpu) && bytes == 4) || !new) {
 		/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-		gpa &= ~(gpa_t)7;
-		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+		if (is_pae(vcpu)) {
+			gpa &= ~(gpa_t)7;
+			bytes = 8;
+		}
+		r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
 		if (r)
 			gentry = 0;
+		new = (const u8 *)&gentry;
+	}
+
+	switch (bytes) {
+	case 4:
+		gentry = *(const u32 *)new;
+		break;
+	case 8:
+		gentry = *(const u64 *)new;
+		break;
+	default:
+		gentry = 0;
+		break;
 	}
 
 	mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
 	spin_lock(&vcpu->kvm->mmu_lock);
+	if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+		gentry = 0;
 	kvm_mmu_access_page(vcpu, gfn);
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4b37e1acd37..067797a7276 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -463,6 +463,7 @@ out_unlock:
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
+	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
 	int need_flush = 0;
@@ -476,6 +477,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		if (level == PT_PAGE_TABLE_LEVEL  ||
 		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
 		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+			struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+			pte_gpa = (sp->gfn << PAGE_SHIFT);
+			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
 				rmap_remove(vcpu->kvm, sptep);
@@ -493,7 +498,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
 	if (need_flush)
 		kvm_flush_remote_tlbs(vcpu->kvm);
+
+	atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
 	spin_unlock(&vcpu->kvm->mmu_lock);
+
+	if (pte_gpa == -1)
+		return;
+
+	if (mmu_topup_memory_caches(vcpu))
+		return;
+	kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
-- 
cgit v1.2.3-70-g09d2


From d6d367d6783e38634377bc66b62bff3ffd717e5f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 15 Mar 2010 16:38:28 +0200
Subject: KVM: x86 emulator: Fix DstAcc decoding.

Set correct operation length. Add RAX (64bit) handling.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2832a8c07c6..0b70a364f0f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1194,9 +1194,9 @@ done_prefixes:
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
-		c->dst.bytes = c->op_bytes;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-		switch (c->op_bytes) {
+		switch (c->dst.bytes) {
 			case 1:
 				c->dst.val = *(u8 *)c->dst.ptr;
 				break;
@@ -1206,6 +1206,9 @@ done_prefixes:
 			case 4:
 				c->dst.val = *(u32 *)c->dst.ptr;
 				break;
+			case 8:
+				c->dst.val = *(u64 *)c->dst.ptr;
+				break;
 		}
 		c->dst.orig_val = c->dst.val;
 		break;
-- 
cgit v1.2.3-70-g09d2


From c73e197bc525e67b71578126b679446f5b88b508 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 15 Mar 2010 16:38:29 +0200
Subject: KVM: x86 emulator: fix RCX access during rep emulation

During rep emulation access length to RCX depends on current address
mode.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0b70a364f0f..4dce80560d2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1852,7 +1852,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
-		if (c->regs[VCPU_REGS_RCX] == 0) {
+		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
 			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
@@ -1876,7 +1876,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 				goto done;
 			}
 		}
-		c->regs[VCPU_REGS_RCX]--;
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 		c->eip = kvm_rip_read(ctxt->vcpu);
 	}
 
-- 
cgit v1.2.3-70-g09d2


From af5b4f7ff7ec76400b89db9538accd9aeb996da4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 15 Mar 2010 16:38:30 +0200
Subject: KVM: x86 emulator: check return value against correct define

Check return value against correct define instead of open code
the value.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4dce80560d2..670ca8f151d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -566,7 +566,7 @@ static u32 group2_table[] = {
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\
 	rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size));		\
-	if (rc != 0)							\
+	if (rc != X86EMUL_CONTINUE)					\
 		goto done;						\
 	(_eip) += (_size);						\
 	(_type)_x;							\
-- 
cgit v1.2.3-70-g09d2


From 49c6799a2ce3a6a4dd66021dabeb468901c7a700 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 15 Mar 2010 16:38:31 +0200
Subject: KVM: Remove pointer to rflags from realmode_set_cr parameters.

Mov reg, cr instruction doesn't change flags in any meaningful way, so
no need to update rflags after instruction execution.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 3 +--
 arch/x86/kvm/emulate.c          | 3 +--
 arch/x86/kvm/x86.c              | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 28826c82d1e..53f52025947 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -587,8 +587,7 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 		   unsigned long *rflags);
 
 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
-		     unsigned long *rflags);
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value);
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 670ca8f151d..91450b5cd49 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2534,8 +2534,7 @@ twobyte_insn:
 	case 0x22: /* mov reg, cr */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		realmode_set_cr(ctxt->vcpu,
-				c->modrm_reg, c->modrm_val, &ctxt->eflags);
+		realmode_set_cr(ctxt->vcpu, c->modrm_reg, c->modrm_val);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd4a7ad63af..35db4f0db4e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4080,13 +4080,11 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
 	return value;
 }
 
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
-		     unsigned long *rflags)
+void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val)
 {
 	switch (cr) {
 	case 0:
 		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
-		*rflags = kvm_get_rflags(vcpu);
 		break;
 	case 2:
 		vcpu->arch.cr2 = val;
-- 
cgit v1.2.3-70-g09d2


From 31299944584fd62df8b0cfa30ad2c56f445b8cf2 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Mon, 15 Mar 2010 17:29:09 +0800
Subject: KVM: VMX: change to use bool return values

Make use of bool as return values, and remove some useless
bool value converting. Thanks Avi to point this out.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 54 +++++++++++++++++++++++++++---------------------------
 1 file changed, 27 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 33d88e0a060..87b3c6843aa 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -234,56 +234,56 @@ static const u32 vmx_msr_index[] = {
 };
 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
 
-static inline int is_page_fault(u32 intr_info)
+static inline bool is_page_fault(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_no_device(u32 intr_info)
+static inline bool is_no_device(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_invalid_opcode(u32 intr_info)
+static inline bool is_invalid_opcode(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_external_interrupt(u32 intr_info)
+static inline bool is_external_interrupt(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
-static inline int is_machine_check(u32 intr_info)
+static inline bool is_machine_check(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
 			     INTR_INFO_VALID_MASK)) ==
 		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
 }
 
-static inline int cpu_has_vmx_msr_bitmap(void)
+static inline bool cpu_has_vmx_msr_bitmap(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 }
 
-static inline int cpu_has_vmx_tpr_shadow(void)
+static inline bool cpu_has_vmx_tpr_shadow(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 
-static inline int vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool vm_need_tpr_shadow(struct kvm *kvm)
 {
 	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 }
 
-static inline int cpu_has_secondary_exec_ctrls(void)
+static inline bool cpu_has_secondary_exec_ctrls(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl &
 		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -303,80 +303,80 @@ static inline bool cpu_has_vmx_flexpriority(void)
 
 static inline bool cpu_has_vmx_ept_execute_only(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT);
+	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
 }
 
 static inline bool cpu_has_vmx_eptp_uncacheable(void)
 {
-	return !!(vmx_capability.ept & VMX_EPTP_UC_BIT);
+	return vmx_capability.ept & VMX_EPTP_UC_BIT;
 }
 
 static inline bool cpu_has_vmx_eptp_writeback(void)
 {
-	return !!(vmx_capability.ept & VMX_EPTP_WB_BIT);
+	return vmx_capability.ept & VMX_EPTP_WB_BIT;
 }
 
 static inline bool cpu_has_vmx_ept_2m_page(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
+	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
 }
 
 static inline bool cpu_has_vmx_ept_1g_page(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
+	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 }
 
-static inline int cpu_has_vmx_invept_individual_addr(void)
+static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
 }
 
-static inline int cpu_has_vmx_invept_context(void)
+static inline bool cpu_has_vmx_invept_context(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
 }
 
-static inline int cpu_has_vmx_invept_global(void)
+static inline bool cpu_has_vmx_invept_global(void)
 {
-	return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
+	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 }
 
-static inline int cpu_has_vmx_ept(void)
+static inline bool cpu_has_vmx_ept(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_EPT;
 }
 
-static inline int cpu_has_vmx_unrestricted_guest(void)
+static inline bool cpu_has_vmx_unrestricted_guest(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_UNRESTRICTED_GUEST;
 }
 
-static inline int cpu_has_vmx_ple(void)
+static inline bool cpu_has_vmx_ple(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
-static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return flexpriority_enabled && irqchip_in_kernel(kvm);
 }
 
-static inline int cpu_has_vmx_vpid(void)
+static inline bool cpu_has_vmx_vpid(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_ENABLE_VPID;
 }
 
-static inline int cpu_has_vmx_rdtscp(void)
+static inline bool cpu_has_vmx_rdtscp(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 		SECONDARY_EXEC_RDTSCP;
 }
 
-static inline int cpu_has_virtual_nmis(void)
+static inline bool cpu_has_virtual_nmis(void)
 {
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
-- 
cgit v1.2.3-70-g09d2


From 52a4661737ecc918633f6b05c611a4af4b5eae5a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:03 +0200
Subject: KVM: Provide callback to get/set control registers in emulator ops.

Use this callback instead of directly call kvm function. Also rename
realmode_(set|get)_cr to emulator_(set|get)_cr since function has nothing
to do with real mode.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   3 +-
 arch/x86/include/asm/kvm_host.h    |   2 -
 arch/x86/kvm/emulate.c             |   7 +--
 arch/x86/kvm/x86.c                 | 114 +++++++++++++++++++------------------
 4 files changed, 63 insertions(+), 63 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 2666d7ac322..0c5caa469eb 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -108,7 +108,8 @@ struct x86_emulate_ops {
 				const void *new,
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
-
+	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
+	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 53f52025947..9d474c7ae26 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -586,8 +586,6 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 		   unsigned long *rflags);
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value);
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 91450b5cd49..5b060e4be0e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2483,7 +2483,7 @@ twobyte_insn:
 			break;
 		case 4: /* smsw */
 			c->dst.bytes = 2;
-			c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
+			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
 			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
@@ -2519,8 +2519,7 @@ twobyte_insn:
 	case 0x20: /* mov cr, reg */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		c->regs[c->modrm_rm] =
-				realmode_get_cr(ctxt->vcpu, c->modrm_reg);
+		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
@@ -2534,7 +2533,7 @@ twobyte_insn:
 	case 0x22: /* mov reg, cr */
 		if (c->modrm_mod != 3)
 			goto cannot_emulate;
-		realmode_set_cr(ctxt->vcpu, c->modrm_reg, c->modrm_val);
+		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35db4f0db4e..94a29759ab2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3423,12 +3423,70 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
 }
 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
+static u64 mk_cr_64(u64 curr_cr, u32 new_val)
+{
+	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
+}
+
+static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
+{
+	unsigned long value;
+
+	switch (cr) {
+	case 0:
+		value = kvm_read_cr0(vcpu);
+		break;
+	case 2:
+		value = vcpu->arch.cr2;
+		break;
+	case 3:
+		value = vcpu->arch.cr3;
+		break;
+	case 4:
+		value = kvm_read_cr4(vcpu);
+		break;
+	case 8:
+		value = kvm_get_cr8(vcpu);
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		return 0;
+	}
+
+	return value;
+}
+
+static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+{
+	switch (cr) {
+	case 0:
+		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		break;
+	case 2:
+		vcpu->arch.cr2 = val;
+		break;
+	case 3:
+		kvm_set_cr3(vcpu, val);
+		break;
+	case 4:
+		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		break;
+	case 8:
+		kvm_set_cr8(vcpu, val & 0xfUL);
+		break;
+	default:
+		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+	}
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.get_cr              = emulator_get_cr,
+	.set_cr              = emulator_set_cr,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -4026,11 +4084,6 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 	return __emulator_write_emulated(rip, instruction, 3, vcpu, false);
 }
 
-static u64 mk_cr_64(u64 curr_cr, u32 new_val)
-{
-	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
-}
-
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 {
 	struct desc_ptr dt = { limit, base };
@@ -4052,57 +4105,6 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
 	*rflags = kvm_get_rflags(vcpu);
 }
 
-unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
-{
-	unsigned long value;
-
-	switch (cr) {
-	case 0:
-		value = kvm_read_cr0(vcpu);
-		break;
-	case 2:
-		value = vcpu->arch.cr2;
-		break;
-	case 3:
-		value = vcpu->arch.cr3;
-		break;
-	case 4:
-		value = kvm_read_cr4(vcpu);
-		break;
-	case 8:
-		value = kvm_get_cr8(vcpu);
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-		return 0;
-	}
-
-	return value;
-}
-
-void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val)
-{
-	switch (cr) {
-	case 0:
-		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
-		break;
-	case 2:
-		vcpu->arch.cr2 = val;
-		break;
-	case 3:
-		kvm_set_cr3(vcpu, val);
-		break;
-	case 4:
-		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
-		break;
-	case 8:
-		kvm_set_cr8(vcpu, val & 0xfUL);
-		break;
-	default:
-		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
-	}
-}
-
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-- 
cgit v1.2.3-70-g09d2


From 93a152be5af3d651ff0ab5459f5e0f9662b22438 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:04 +0200
Subject: KVM: remove realmode_lmsw function.

Use (get|set)_cr callback to emulate lmsw inside emulator.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 arch/x86/kvm/emulate.c          | 4 ++--
 arch/x86/kvm/x86.c              | 7 -------
 3 files changed, 2 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d474c7ae26..b99cec1547c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -583,8 +583,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags);
 
 void kvm_enable_efer_bits(u64);
 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5b060e4be0e..5e2fa61e810 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2486,8 +2486,8 @@ twobyte_insn:
 			c->dst.val = ops->get_cr(0, ctxt->vcpu);
 			break;
 		case 6: /* lmsw */
-			realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
-				      &ctxt->eflags);
+			ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
+				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
 		case 7: /* invlpg*/
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 94a29759ab2..c382e972109 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4098,13 +4098,6 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 	kvm_x86_ops->set_idt(vcpu, &dt);
 }
 
-void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
-		   unsigned long *rflags)
-{
-	kvm_lmsw(vcpu, msw);
-	*rflags = kvm_get_rflags(vcpu);
-}
-
 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 {
 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
-- 
cgit v1.2.3-70-g09d2


From 9c5372445c1ad4fcdb4128957ec89334223b8113 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:05 +0200
Subject: KVM: Provide x86_emulate_ctxt callback to get current cpl

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 15 ++++++++-------
 arch/x86/kvm/x86.c                 |  6 ++++++
 3 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0c5caa469eb..b048fd21c54 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -110,6 +110,7 @@ struct x86_emulate_ops {
 				struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+	int (*cpl)(struct kvm_vcpu *vcpu);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5e2fa61e810..8bd05571672 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1257,7 +1257,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	int rc;
 	unsigned long val, change_mask;
 	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
+	int cpl = ops->cpl(ctxt->vcpu);
 
 	rc = emulate_pop(ctxt, ops, &val, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1758,7 +1758,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
+static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops)
 {
 	int iopl;
 	if (ctxt->mode == X86EMUL_MODE_REAL)
@@ -1766,7 +1767,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_VM86)
 		return true;
 	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
-	return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
+	return ops->cpl(ctxt->vcpu) > iopl;
 }
 
 static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1803,7 +1804,7 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 				 struct x86_emulate_ops *ops,
 				 u16 port, u16 len)
 {
-	if (emulator_bad_iopl(ctxt))
+	if (emulator_bad_iopl(ctxt, ops))
 		if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
 			return false;
 	return true;
@@ -1842,7 +1843,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
-	if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
+	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 		kvm_inject_gp(ctxt->vcpu, 0);
 		goto done;
 	}
@@ -2378,7 +2379,7 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt))
+		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
 			ctxt->eflags &= ~X86_EFLAGS_IF;
@@ -2386,7 +2387,7 @@ special_insn:
 		}
 		break;
 	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt))
+		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
 			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c382e972109..9cb28a943c9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3479,6 +3479,11 @@ static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 	}
 }
 
+static int emulator_get_cpl(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->get_cpl(vcpu);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
@@ -3487,6 +3492,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
+	.cpl                 = emulator_get_cpl,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3-70-g09d2


From 063db061b9b3472c925f09ae3a0a8359b80c2295 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:06 +0200
Subject: KVM: Provide current eip as part of emulator context.

Eliminate the need to call back into KVM to get it from emulator.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/kvm/emulate.c             | 12 ++++++------
 arch/x86/kvm/x86.c                 |  1 +
 3 files changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b048fd21c54..07657258af8 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -141,7 +141,7 @@ struct decode_cache {
 	u8 seg_override;
 	unsigned int d;
 	unsigned long regs[NR_VCPU_REGS];
-	unsigned long eip, eip_orig;
+	unsigned long eip;
 	/* modrm */
 	u8 modrm;
 	u8 modrm_mod;
@@ -160,6 +160,7 @@ struct x86_emulate_ctxt {
 	struct kvm_vcpu *vcpu;
 
 	unsigned long eflags;
+	unsigned long eip; /* eip before instruction emulation */
 	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
 	int mode;
 	u32 cs_base;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8bd05571672..2c27aa466cf 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -667,7 +667,7 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	int rc;
 
 	/* x86 instructions are limited to 15 bytes. */
-	if (eip + size - ctxt->decode.eip_orig > 15)
+	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
 	eip += ctxt->cs_base;
 	while (size--) {
@@ -927,7 +927,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* Shadow copy of register state. Committed on successful emulation. */
 
 	memset(c, 0, sizeof(struct decode_cache));
-	c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
+	c->eip = ctxt->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1878,7 +1878,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 			}
 		}
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		c->eip = kvm_rip_read(ctxt->vcpu);
+		c->eip = ctxt->eip;
 	}
 
 	if (c->src.type == OP_MEM) {
@@ -2447,7 +2447,7 @@ twobyte_insn:
 				goto done;
 
 			/* Let the processor re-execute the fixed hypercall */
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -2551,7 +2551,7 @@ twobyte_insn:
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
@@ -2560,7 +2560,7 @@ twobyte_insn:
 		/* rdmsr */
 		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = kvm_rip_read(ctxt->vcpu);
+			c->eip = ctxt->eip;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9cb28a943c9..0ecd37ac9d3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3531,6 +3531,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
 		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+		vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
 		vcpu->arch.emulate_ctxt.mode =
 			(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
 			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
-- 
cgit v1.2.3-70-g09d2


From 5e3ae6c5407ffb23bc4d9871e09d1b222e1b31a4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:07 +0200
Subject: KVM: x86 emulator: fix mov r/m, sreg emulation.

mov r/m, sreg generates #UD ins sreg is incorrect.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2c27aa466cf..c3b9334eb24 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2126,12 +2126,11 @@ special_insn:
 	case 0x8c: { /* mov r/m, sreg */
 		struct kvm_segment segreg;
 
-		if (c->modrm_reg <= 5)
+		if (c->modrm_reg <= VCPU_SREG_GS)
 			kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
 		else {
-			printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
-			       c->modrm);
-			goto cannot_emulate;
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
 		}
 		c->dst.val = segreg.selector;
 		break;
-- 
cgit v1.2.3-70-g09d2


From 6e1e5ffee8d95f9bce71eaa029cb5247b0f2f673 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:08 +0200
Subject: KVM: x86 emulator: fix 0f 01 /5 emulation

It is undefined and should generate #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c3b9334eb24..7c7debb424d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2490,6 +2490,9 @@ twobyte_insn:
 				    (c->src.val & 0x0f), ctxt->vcpu);
 			c->dst.type = OP_NONE;
 			break;
+		case 5: /* not defined */
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
 		case 7: /* invlpg*/
 			emulate_invlpg(ctxt->vcpu, memop);
 			/* Disable writeback. */
-- 
cgit v1.2.3-70-g09d2


From ab8557b2b361c8bb2e2421c791c8f6c4f6ba3d08 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:09 +0200
Subject: KVM: x86 emulator: 0f (20|21|22|23) ignore mod bits.

Resent spec says that for 0f (20|21|22|23) the 2 bits in the mod field
are ignored. Interestingly enough older spec says that 11 is only valid
encoding.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7c7debb424d..fa4604e0325 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2520,28 +2520,20 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]))
 			goto cannot_emulate;
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (c->modrm_mod != 3)
-			goto cannot_emulate;
 		if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]))
 			goto cannot_emulate;
 		rc = X86EMUL_CONTINUE;
-- 
cgit v1.2.3-70-g09d2


From 6aebfa6ea75f9a02a0339e733090dd40d6f2edfd Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:10 +0200
Subject: KVM: x86 emulator: inject #UD on access to non-existing CR

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index fa4604e0325..836e97ba45d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2520,6 +2520,13 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x20: /* mov cr, reg */
+		switch (c->modrm_reg) {
+		case 1:
+		case 5 ... 7:
+		case 9 ... 15:
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
-- 
cgit v1.2.3-70-g09d2


From 1e470be5a10801cb1c5c145f2cd9e0f5ebaf4f2e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:11 +0200
Subject: KVM: x86 emulator: fix mov dr to inject #UD when needed.

If CR4.DE=1 access to registers DR4/DR5 cause #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 836e97ba45d..5afddcfa1a7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2531,9 +2531,12 @@ twobyte_insn:
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x21: /* mov from dr to reg */
-		if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]))
-			goto cannot_emulate;
-		rc = X86EMUL_CONTINUE;
+		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
@@ -2541,9 +2544,12 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
-		if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]))
-			goto cannot_emulate;
-		rc = X86EMUL_CONTINUE;
+		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
+		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
+			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			goto done;
+		}
+		emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
-- 
cgit v1.2.3-70-g09d2


From 2e901c4cf4b550ad37840870246e835889cf7322 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:12 +0200
Subject: KVM: x86 emulator: fix return values of syscall/sysenter/sysexit
 emulations

Return X86EMUL_PROPAGATE_FAULT is fault was injected. Also inject #UD
for those instruction when appropriate.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5afddcfa1a7..1393bf03424 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1600,8 +1600,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 	u64 msr_data;
 
 	/* syscall is not available in real mode */
-	if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
-		return X86EMUL_UNHANDLEABLE;
+	if (ctxt->mode == X86EMUL_MODE_REAL ||
+	    ctxt->mode == X86EMUL_MODE_VM86) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
@@ -1651,14 +1654,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
 		kvm_inject_gp(ctxt->vcpu, 0);
-		return X86EMUL_UNHANDLEABLE;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	/* XXX sysenter/sysexit have not been tested in 64bit mode.
 	* Therefore, we inject an #UD.
 	*/
-	if (ctxt->mode == X86EMUL_MODE_PROT64)
-		return X86EMUL_UNHANDLEABLE;
+	if (ctxt->mode == X86EMUL_MODE_PROT64) {
+		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
@@ -1713,7 +1718,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
 		kvm_inject_gp(ctxt->vcpu, 0);
-		return X86EMUL_UNHANDLEABLE;
+		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
-- 
cgit v1.2.3-70-g09d2


From fd5253658b403d51fc19e56ecb44c54a3071fded Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:13 +0200
Subject: KVM: x86 emulator: do not call writeback if msr access fails.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1393bf03424..b89a8f21733 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2563,7 +2563,7 @@ twobyte_insn:
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->eip;
+			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
 		c->dst.type = OP_NONE;
@@ -2572,7 +2572,7 @@ twobyte_insn:
 		/* rdmsr */
 		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
-			c->eip = ctxt->eip;
+			goto done;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RDX] = msr_data >> 32;
-- 
cgit v1.2.3-70-g09d2


From a41ffb7540cb37426759e688083502d6463421b2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:14 +0200
Subject: KVM: x86 emulator: If LOCK prefix is used dest arg should be memory.

If LOCK prefix is used dest arg should be memory, otherwise instruction
should generate #UD.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b89a8f21733..46a7ee3040a 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1842,7 +1842,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
-	if (c->lock_prefix && !(c->d & Lock)) {
+	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 		goto done;
 	}
-- 
cgit v1.2.3-70-g09d2


From aca06a83071e4e4c9150751db7ea6a46240734fc Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:15 +0200
Subject: KVM: x86 emulator: cleanup grp3 return value

When x86_emulate_insn() does not know how to emulate instruction it
exits via cannot_emulate label in all cases except when emulating
grp3. Fix that.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 46a7ee3040a..d696cbd6ff7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1397,7 +1397,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	int rc = X86EMUL_CONTINUE;
 
 	switch (c->modrm_reg) {
 	case 0 ... 1:	/* test */
@@ -1410,11 +1409,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
 		emulate_1op("neg", c->dst, ctxt->eflags);
 		break;
 	default:
-		DPRINTF("Cannot emulate %02x\n", c->b);
-		rc = X86EMUL_UNHANDLEABLE;
-		break;
+		return 0;
 	}
-	return rc;
+	return 1;
 }
 
 static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
@@ -2374,9 +2371,8 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
-		rc = emulate_grp3(ctxt, ops);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
+		if (!emulate_grp3(ctxt, ops))
+			goto cannot_emulate;
 		break;
 	case 0xf8: /* clc */
 		ctxt->eflags &= ~EFLG_CF;
-- 
cgit v1.2.3-70-g09d2


From 2dafc6c234b6064189405f42e1602e9a0abe5a44 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:16 +0200
Subject: KVM: x86 emulator: Provide more callbacks for x86 emulator.

Provide get_cached_descriptor(), set_cached_descriptor(),
get_segment_selector(), set_segment_selector(), get_gdt(),
write_std() callbacks.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  16 +++++
 arch/x86/kvm/x86.c                 | 130 ++++++++++++++++++++++++++++++++-----
 2 files changed, 131 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 07657258af8..f901467a18b 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -62,6 +62,15 @@ struct x86_emulate_ops {
 	int (*read_std)(unsigned long addr, void *val,
 			unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
 
+	/*
+	 * write_std: Write bytes of standard (non-emulated/special) memory.
+	 *            Used for descriptor writing.
+	 *  @addr:  [IN ] Linear address to which to write.
+	 *  @val:   [OUT] Value write to memory, zero-extended to 'u_long'.
+	 *  @bytes: [IN ] Number of bytes to write to memory.
+	 */
+	int (*write_std)(unsigned long addr, void *val,
+			 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
 	/*
 	 * fetch: Read bytes of standard (non-emulated/special) memory.
 	 *        Used for instruction fetch.
@@ -108,6 +117,13 @@ struct x86_emulate_ops {
 				const void *new,
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
+	bool (*get_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	void (*set_cached_descriptor)(struct desc_struct *desc,
+				      int seg, struct kvm_vcpu *vcpu);
+	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
+	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ecd37ac9d3..fbee8fbb33b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3058,6 +3058,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
 }
 
+static void kvm_set_segment(struct kvm_vcpu *vcpu,
+			struct kvm_segment *var, int seg)
+{
+	kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+void kvm_get_segment(struct kvm_vcpu *vcpu,
+		     struct kvm_segment *var, int seg)
+{
+	kvm_x86_ops->get_segment(vcpu, var, seg);
+}
+
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
 {
 	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
@@ -3138,14 +3150,18 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
 }
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu, u32 *error)
+static int kvm_write_guest_virt_helper(gva_t addr, void *val,
+				       unsigned int bytes,
+				       struct kvm_vcpu *vcpu, u32 access,
+				       u32 *error)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
+	access |= PFERR_WRITE_MASK;
+
 	while (bytes) {
-		gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
+		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3168,6 +3184,19 @@ out:
 	return r;
 }
 
+static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
+				struct kvm_vcpu *vcpu, u32 *error)
+{
+	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, access, error);
+}
+
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
+				       unsigned int bytes,
+				       struct kvm_vcpu *vcpu, u32 *error)
+{
+	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+}
 
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
@@ -3484,12 +3513,95 @@ static int emulator_get_cpl(struct kvm_vcpu *vcpu)
 	return kvm_x86_ops->get_cpl(vcpu);
 }
 
+static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->get_gdt(vcpu, dt);
+}
+
+static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment var;
+
+	kvm_get_segment(vcpu, &var, seg);
+
+	if (var.unusable)
+		return false;
+
+	if (var.g)
+		var.limit >>= 12;
+	set_desc_limit(desc, var.limit);
+	set_desc_base(desc, (unsigned long)var.base);
+	desc->type = var.type;
+	desc->s = var.s;
+	desc->dpl = var.dpl;
+	desc->p = var.present;
+	desc->avl = var.avl;
+	desc->l = var.l;
+	desc->d = var.db;
+	desc->g = var.g;
+
+	return true;
+}
+
+static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
+					   struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment var;
+
+	/* needed to preserve selector */
+	kvm_get_segment(vcpu, &var, seg);
+
+	var.base = get_desc_base(desc);
+	var.limit = get_desc_limit(desc);
+	if (desc->g)
+		var.limit = (var.limit << 12) | 0xfff;
+	var.type = desc->type;
+	var.present = desc->p;
+	var.dpl = desc->dpl;
+	var.db = desc->d;
+	var.s = desc->s;
+	var.l = desc->l;
+	var.g = desc->g;
+	var.avl = desc->avl;
+	var.present = desc->p;
+	var.unusable = !var.present;
+	var.padding = 0;
+
+	kvm_set_segment(vcpu, &var, seg);
+	return;
+}
+
+static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	kvm_get_segment(vcpu, &kvm_seg, seg);
+	return kvm_seg.selector;
+}
+
+static void emulator_set_segment_selector(u16 sel, int seg,
+					  struct kvm_vcpu *vcpu)
+{
+	struct kvm_segment kvm_seg;
+
+	kvm_get_segment(vcpu, &kvm_seg, seg);
+	kvm_seg.selector = sel;
+	kvm_set_segment(vcpu, &kvm_seg, seg);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
+	.write_std           = kvm_write_guest_virt_system,
 	.fetch               = kvm_fetch_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.get_cached_descriptor = emulator_get_cached_descriptor,
+	.set_cached_descriptor = emulator_set_cached_descriptor,
+	.get_segment_selector = emulator_get_segment_selector,
+	.set_segment_selector = emulator_set_segment_selector,
+	.get_gdt             = emulator_get_gdt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
@@ -4649,12 +4761,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	return 0;
 }
 
-void kvm_get_segment(struct kvm_vcpu *vcpu,
-		     struct kvm_segment *var, int seg)
-{
-	kvm_x86_ops->get_segment(vcpu, var, seg);
-}
-
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
 {
 	struct kvm_segment cs;
@@ -4726,12 +4832,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void kvm_set_segment(struct kvm_vcpu *vcpu,
-			struct kvm_segment *var, int seg)
-{
-	kvm_x86_ops->set_segment(vcpu, var, seg);
-}
-
 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 				   struct kvm_segment *kvm_desct)
 {
-- 
cgit v1.2.3-70-g09d2


From 38ba30ba51a003360f177d5b8349439fe44fc55b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:17 +0200
Subject: KVM: x86 emulator: Emulate task switch in emulator.c

Implement emulation of 16/32 bit task switch in emulator.c

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   5 +
 arch/x86/kvm/emulate.c             | 563 +++++++++++++++++++++++++++++++++++++
 2 files changed, 568 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index f901467a18b..bd469296f5e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -11,6 +11,8 @@
 #ifndef _ASM_X86_KVM_X86_EMULATE_H
 #define _ASM_X86_KVM_X86_EMULATE_H
 
+#include <asm/desc_defs.h>
+
 struct x86_emulate_ctxt;
 
 /*
@@ -210,5 +212,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
 		    struct x86_emulate_ops *ops);
 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
 		     struct x86_emulate_ops *ops);
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d696cbd6ff7..db4776c6b50 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -33,6 +33,7 @@
 #include <asm/kvm_emulate.h>
 
 #include "x86.h"
+#include "tss.h"
 
 /*
  * Opcode effective-address decode tables.
@@ -1221,6 +1222,198 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static u32 desc_limit_scaled(struct desc_struct *desc)
+{
+	u32 limit = get_desc_limit(desc);
+
+	return desc->g ? (limit << 12) | 0xfff : limit;
+}
+
+static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
+				     struct x86_emulate_ops *ops,
+				     u16 selector, struct desc_ptr *dt)
+{
+	if (selector & 1 << 2) {
+		struct desc_struct desc;
+		memset (dt, 0, sizeof *dt);
+		if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu))
+			return;
+
+		dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
+		dt->address = get_desc_base(&desc);
+	} else
+		ops->get_gdt(dt, ctxt->vcpu);
+}
+
+/* allowed just for 8 bytes segments */
+static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	int ret;
+	u32 err;
+	ulong addr;
+
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+	if (dt.size < index * 8 + 7) {
+		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+	addr = dt.address + index * 8;
+	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+       return ret;
+}
+
+/* allowed just for 8 bytes segments */
+static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops *ops,
+				    u16 selector, struct desc_struct *desc)
+{
+	struct desc_ptr dt;
+	u16 index = selector >> 3;
+	u32 err;
+	ulong addr;
+	int ret;
+
+	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
+
+	if (dt.size < index * 8 + 7) {
+		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	addr = dt.address + index * 8;
+	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT)
+		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+
+	return ret;
+}
+
+static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
+				   struct x86_emulate_ops *ops,
+				   u16 selector, int seg)
+{
+	struct desc_struct seg_desc;
+	u8 dpl, rpl, cpl;
+	unsigned err_vec = GP_VECTOR;
+	u32 err_code = 0;
+	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
+	int ret;
+
+	memset(&seg_desc, 0, sizeof seg_desc);
+
+	if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86)
+	    || ctxt->mode == X86EMUL_MODE_REAL) {
+		/* set real mode segment descriptor */
+		set_desc_base(&seg_desc, selector << 4);
+		set_desc_limit(&seg_desc, 0xffff);
+		seg_desc.type = 3;
+		seg_desc.p = 1;
+		seg_desc.s = 1;
+		goto load;
+	}
+
+	/* NULL selector is not valid for TR, CS and SS */
+	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
+	    && null_selector)
+		goto exception;
+
+	/* TR should be in GDT only */
+	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
+		goto exception;
+
+	if (null_selector) /* for NULL selector skip all following checks */
+		goto load;
+
+	ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	err_code = selector & 0xfffc;
+	err_vec = GP_VECTOR;
+
+	/* can't load system descriptor into segment selecor */
+	if (seg <= VCPU_SREG_GS && !seg_desc.s)
+		goto exception;
+
+	if (!seg_desc.p) {
+		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+		goto exception;
+	}
+
+	rpl = selector & 3;
+	dpl = seg_desc.dpl;
+	cpl = ops->cpl(ctxt->vcpu);
+
+	switch (seg) {
+	case VCPU_SREG_SS:
+		/*
+		 * segment is not a writable data segment or segment
+		 * selector's RPL != CPL or segment selector's RPL != CPL
+		 */
+		if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl)
+			goto exception;
+		break;
+	case VCPU_SREG_CS:
+		if (!(seg_desc.type & 8))
+			goto exception;
+
+		if (seg_desc.type & 4) {
+			/* conforming */
+			if (dpl > cpl)
+				goto exception;
+		} else {
+			/* nonconforming */
+			if (rpl > cpl || dpl != cpl)
+				goto exception;
+		}
+		/* CS(RPL) <- CPL */
+		selector = (selector & 0xfffc) | cpl;
+		break;
+	case VCPU_SREG_TR:
+		if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
+			goto exception;
+		break;
+	case VCPU_SREG_LDTR:
+		if (seg_desc.s || seg_desc.type != 2)
+			goto exception;
+		break;
+	default: /*  DS, ES, FS, or GS */
+		/*
+		 * segment is not a data or readable code segment or
+		 * ((segment is a data or nonconforming code segment)
+		 * and (both RPL and CPL > DPL))
+		 */
+		if ((seg_desc.type & 0xa) == 0x8 ||
+		    (((seg_desc.type & 0xc) != 0xc) &&
+		     (rpl > dpl && cpl > dpl)))
+			goto exception;
+		break;
+	}
+
+	if (seg_desc.s) {
+		/* mark segment as accessed */
+		seg_desc.type |= 1;
+		ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc);
+		if (ret != X86EMUL_CONTINUE)
+			return ret;
+	}
+load:
+	ops->set_segment_selector(selector, seg, ctxt->vcpu);
+	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
+	return X86EMUL_CONTINUE;
+exception:
+	kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+	return X86EMUL_PROPAGATE_FAULT;
+}
+
 static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 {
 	struct decode_cache *c = &ctxt->decode;
@@ -1812,6 +2005,376 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 	return true;
 }
 
+static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
+				      struct x86_emulate_ops *ops,
+				      int seg)
+{
+	struct desc_struct desc;
+	if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
+		return get_desc_base(&desc);
+	else
+		return ~0;
+}
+
+static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops,
+				struct tss_segment_16 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	tss->ip = c->eip;
+	tss->flag = ctxt->eflags;
+	tss->ax = c->regs[VCPU_REGS_RAX];
+	tss->cx = c->regs[VCPU_REGS_RCX];
+	tss->dx = c->regs[VCPU_REGS_RDX];
+	tss->bx = c->regs[VCPU_REGS_RBX];
+	tss->sp = c->regs[VCPU_REGS_RSP];
+	tss->bp = c->regs[VCPU_REGS_RBP];
+	tss->si = c->regs[VCPU_REGS_RSI];
+	tss->di = c->regs[VCPU_REGS_RDI];
+
+	tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+	tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+	tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+	tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
+				 struct x86_emulate_ops *ops,
+				 struct tss_segment_16 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int ret;
+
+	c->eip = tss->ip;
+	ctxt->eflags = tss->flag | 2;
+	c->regs[VCPU_REGS_RAX] = tss->ax;
+	c->regs[VCPU_REGS_RCX] = tss->cx;
+	c->regs[VCPU_REGS_RDX] = tss->dx;
+	c->regs[VCPU_REGS_RBX] = tss->bx;
+	c->regs[VCPU_REGS_RSP] = tss->sp;
+	c->regs[VCPU_REGS_RBP] = tss->bp;
+	c->regs[VCPU_REGS_RSI] = tss->si;
+	c->regs[VCPU_REGS_RDI] = tss->di;
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu);
+	ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+	ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+
+	/*
+	 * Now load segment descriptors. If fault happenes at this stage
+	 * it is handled in a context of new task
+	 */
+	ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int task_switch_16(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops,
+			  u16 tss_selector, u16 old_tss_sel,
+			  ulong old_tss_base, struct desc_struct *new_desc)
+{
+	struct tss_segment_16 tss_seg;
+	int ret;
+	u32 err, new_tss_base = get_desc_base(new_desc);
+
+	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	save_state_to_tss16(ctxt, ops, &tss_seg);
+
+	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			     &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		return ret;
+	}
+
+	if (old_tss_sel != 0xffff) {
+		tss_seg.prev_task_link = old_tss_sel;
+
+		ret = ops->write_std(new_tss_base,
+				     &tss_seg.prev_task_link,
+				     sizeof tss_seg.prev_task_link,
+				     ctxt->vcpu, &err);
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			/* FIXME: need to provide precise fault address */
+			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			return ret;
+		}
+	}
+
+	return load_state_from_tss16(ctxt, ops, &tss_seg);
+}
+
+static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops,
+				struct tss_segment_32 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+
+	tss->cr3 = ops->get_cr(3, ctxt->vcpu);
+	tss->eip = c->eip;
+	tss->eflags = ctxt->eflags;
+	tss->eax = c->regs[VCPU_REGS_RAX];
+	tss->ecx = c->regs[VCPU_REGS_RCX];
+	tss->edx = c->regs[VCPU_REGS_RDX];
+	tss->ebx = c->regs[VCPU_REGS_RBX];
+	tss->esp = c->regs[VCPU_REGS_RSP];
+	tss->ebp = c->regs[VCPU_REGS_RBP];
+	tss->esi = c->regs[VCPU_REGS_RSI];
+	tss->edi = c->regs[VCPU_REGS_RDI];
+
+	tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu);
+	tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu);
+	tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu);
+	tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu);
+	tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu);
+	tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu);
+	tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu);
+}
+
+static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
+				 struct x86_emulate_ops *ops,
+				 struct tss_segment_32 *tss)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int ret;
+
+	ops->set_cr(3, tss->cr3, ctxt->vcpu);
+	c->eip = tss->eip;
+	ctxt->eflags = tss->eflags | 2;
+	c->regs[VCPU_REGS_RAX] = tss->eax;
+	c->regs[VCPU_REGS_RCX] = tss->ecx;
+	c->regs[VCPU_REGS_RDX] = tss->edx;
+	c->regs[VCPU_REGS_RBX] = tss->ebx;
+	c->regs[VCPU_REGS_RSP] = tss->esp;
+	c->regs[VCPU_REGS_RBP] = tss->ebp;
+	c->regs[VCPU_REGS_RSI] = tss->esi;
+	c->regs[VCPU_REGS_RDI] = tss->edi;
+
+	/*
+	 * SDM says that segment selectors are loaded before segment
+	 * descriptors
+	 */
+	ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu);
+	ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu);
+	ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu);
+	ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu);
+	ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu);
+
+	/*
+	 * Now load segment descriptors. If fault happenes at this stage
+	 * it is handled in a context of new task
+	 */
+	ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	return X86EMUL_CONTINUE;
+}
+
+static int task_switch_32(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops,
+			  u16 tss_selector, u16 old_tss_sel,
+			  ulong old_tss_base, struct desc_struct *new_desc)
+{
+	struct tss_segment_32 tss_seg;
+	int ret;
+	u32 err, new_tss_base = get_desc_base(new_desc);
+
+	ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	save_state_to_tss32(ctxt, ops, &tss_seg);
+
+	ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			     &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		return ret;
+	}
+
+	ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
+			    &err);
+	if (ret == X86EMUL_PROPAGATE_FAULT) {
+		/* FIXME: need to provide precise fault address */
+		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		return ret;
+	}
+
+	if (old_tss_sel != 0xffff) {
+		tss_seg.prev_task_link = old_tss_sel;
+
+		ret = ops->write_std(new_tss_base,
+				     &tss_seg.prev_task_link,
+				     sizeof tss_seg.prev_task_link,
+				     ctxt->vcpu, &err);
+		if (ret == X86EMUL_PROPAGATE_FAULT) {
+			/* FIXME: need to provide precise fault address */
+			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			return ret;
+		}
+	}
+
+	return load_state_from_tss32(ctxt, ops, &tss_seg);
+}
+
+static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
+				    struct x86_emulate_ops *ops,
+				    u16 tss_selector, int reason)
+{
+	struct desc_struct curr_tss_desc, next_tss_desc;
+	int ret;
+	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
+	ulong old_tss_base =
+		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+
+	/* FIXME: old_tss_base == ~0 ? */
+
+	ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+	ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
+
+	/* FIXME: check that next_tss_desc is tss */
+
+	if (reason != TASK_SWITCH_IRET) {
+		if ((tss_selector & 3) > next_tss_desc.dpl ||
+		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			return X86EMUL_PROPAGATE_FAULT;
+		}
+	}
+
+	if (!next_tss_desc.p || desc_limit_scaled(&next_tss_desc) < 0x67) {
+		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
+				      tss_selector & 0xfffc);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
+
+	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+		curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */
+		write_segment_descriptor(ctxt, ops, old_tss_sel,
+					 &curr_tss_desc);
+	}
+
+	if (reason == TASK_SWITCH_IRET)
+		ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT;
+
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
+	if (next_tss_desc.type & 8)
+		ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel,
+				     old_tss_base, &next_tss_desc);
+	else
+		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
+				     old_tss_base, &next_tss_desc);
+
+	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
+		ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
+
+	if (reason != TASK_SWITCH_IRET) {
+		next_tss_desc.type |= (1 << 1); /* set busy flag */
+		write_segment_descriptor(ctxt, ops, tss_selector,
+					 &next_tss_desc);
+	}
+
+	ops->set_cr(0,  ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
+	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
+	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
+
+	return ret;
+}
+
+int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 u16 tss_selector, int reason)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	memset(c, 0, sizeof(struct decode_cache));
+	c->eip = ctxt->eip;
+	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+
+	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason);
+
+	if (rc == X86EMUL_CONTINUE) {
+		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
+		kvm_rip_write(ctxt->vcpu, c->eip);
+	}
+
+	return rc;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
-- 
cgit v1.2.3-70-g09d2


From 2e873022f511b82a5318c7af179f588f08d68cb9 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:18 +0200
Subject: KVM: x86 emulator: Use load_segment_descriptor() instead of
 kvm_load_segment_descriptor()

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index db4776c6b50..702bffffd27 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1508,7 +1508,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
+	rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg);
 	return rc;
 }
 
@@ -1683,7 +1683,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
-	rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
+	rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
 	return rc;
 }
 
@@ -2717,7 +2717,7 @@ special_insn:
 		if (c->modrm_reg == VCPU_SREG_SS)
 			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
 
-		rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
+		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 
 		c->dst.type = OP_NONE;  /* Disable writeback. */
 		break;
@@ -2892,8 +2892,8 @@ special_insn:
 		goto jmp;
 	case 0xea: /* jmp far */
 	jump_far:
-		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
-						VCPU_SREG_CS))
+		if (load_segment_descriptor(ctxt, ops, c->src2.val,
+					    VCPU_SREG_CS))
 			goto done;
 
 		c->eip = c->src.val;
-- 
cgit v1.2.3-70-g09d2


From ceffb4597253b2420d2f171d8b1cdf2cd3137989 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:19 +0200
Subject: KVM: Use task switch from emulator.c

Remove old task switch code from x86.c

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c |   6 +-
 arch/x86/kvm/x86.c     | 561 ++-----------------------------------------------
 2 files changed, 22 insertions(+), 545 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 702bffffd27..8225ec26efe 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2291,6 +2291,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
 	ulong old_tss_base =
 		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+	u32 desc_limit;
 
 	/* FIXME: old_tss_base == ~0 ? */
 
@@ -2311,7 +2312,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		}
 	}
 
-	if (!next_tss_desc.p || desc_limit_scaled(&next_tss_desc) < 0x67) {
+	desc_limit = desc_limit_scaled(&next_tss_desc);
+	if (!next_tss_desc.p ||
+	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
+	     desc_limit < 0x2b)) {
 		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
 				      tss_selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fbee8fbb33b..f69854c8f33 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4832,557 +4832,30 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
-				   struct kvm_segment *kvm_desct)
-{
-	kvm_desct->base = get_desc_base(seg_desc);
-	kvm_desct->limit = get_desc_limit(seg_desc);
-	if (seg_desc->g) {
-		kvm_desct->limit <<= 12;
-		kvm_desct->limit |= 0xfff;
-	}
-	kvm_desct->selector = selector;
-	kvm_desct->type = seg_desc->type;
-	kvm_desct->present = seg_desc->p;
-	kvm_desct->dpl = seg_desc->dpl;
-	kvm_desct->db = seg_desc->d;
-	kvm_desct->s = seg_desc->s;
-	kvm_desct->l = seg_desc->l;
-	kvm_desct->g = seg_desc->g;
-	kvm_desct->avl = seg_desc->avl;
-	if (!selector)
-		kvm_desct->unusable = 1;
-	else
-		kvm_desct->unusable = 0;
-	kvm_desct->padding = 0;
-}
-
-static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
-					  u16 selector,
-					  struct desc_ptr *dtable)
-{
-	if (selector & 1 << 2) {
-		struct kvm_segment kvm_seg;
-
-		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
-
-		if (kvm_seg.unusable)
-			dtable->size = 0;
-		else
-			dtable->size = kvm_seg.limit;
-		dtable->address = kvm_seg.base;
-	}
-	else
-		kvm_x86_ops->get_gdt(vcpu, dtable);
-}
-
-/* allowed just for 8 bytes segments */
-static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-					 struct desc_struct *seg_desc)
-{
-	struct desc_ptr dtable;
-	u16 index = selector >> 3;
-	int ret;
-	u32 err;
-	gva_t addr;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.size < index * 8 + 7) {
-		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
-		return X86EMUL_PROPAGATE_FAULT;
-	}
-	addr = dtable.base + index * 8;
-	ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
-					 vcpu,  &err);
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(vcpu, addr, err);
-
-       return ret;
-}
-
-/* allowed just for 8 bytes segments */
-static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
-					 struct desc_struct *seg_desc)
-{
-	struct desc_ptr dtable;
-	u16 index = selector >> 3;
-
-	get_segment_descriptor_dtable(vcpu, selector, &dtable);
-
-	if (dtable.size < index * 8 + 7)
-		return 1;
-	return kvm_write_guest_virt(dtable.address + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
-}
-
-static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
-			       struct desc_struct *seg_desc)
-{
-	u32 base_addr = get_desc_base(seg_desc);
-
-	return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
-}
-
-static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
-			     struct desc_struct *seg_desc)
-{
-	u32 base_addr = get_desc_base(seg_desc);
-
-	return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
-}
-
-static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
-{
-	struct kvm_segment kvm_seg;
-
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	return kvm_seg.selector;
-}
-
-static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-	struct kvm_segment segvar = {
-		.base = selector << 4,
-		.limit = 0xffff,
-		.selector = selector,
-		.type = 3,
-		.present = 1,
-		.dpl = 3,
-		.db = 0,
-		.s = 1,
-		.l = 0,
-		.g = 0,
-		.avl = 0,
-		.unusable = 0,
-	};
-	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
-	return X86EMUL_CONTINUE;
-}
-
-static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
-{
-	return (seg != VCPU_SREG_LDTR) &&
-		(seg != VCPU_SREG_TR) &&
-		(kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
-}
-
-int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
-{
-	struct kvm_segment kvm_seg;
-	struct desc_struct seg_desc;
-	u8 dpl, rpl, cpl;
-	unsigned err_vec = GP_VECTOR;
-	u32 err_code = 0;
-	bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
-	int ret;
-
-	if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
-		return kvm_load_realmode_segment(vcpu, selector, seg);
-
-	/* NULL selector is not valid for TR, CS and SS */
-	if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
-	    && null_selector)
-		goto exception;
-
-	/* TR should be in GDT only */
-	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
-		goto exception;
-
-	ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	if (ret)
-		return ret;
-
-	seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
-
-	if (null_selector) { /* for NULL selector skip all following checks */
-		kvm_seg.unusable = 1;
-		goto load;
-	}
-
-	err_code = selector & 0xfffc;
-	err_vec = GP_VECTOR;
-
-	/* can't load system descriptor into segment selecor */
-	if (seg <= VCPU_SREG_GS && !kvm_seg.s)
-		goto exception;
-
-	if (!kvm_seg.present) {
-		err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
-		goto exception;
-	}
-
-	rpl = selector & 3;
-	dpl = kvm_seg.dpl;
-	cpl = kvm_x86_ops->get_cpl(vcpu);
-
-	switch (seg) {
-	case VCPU_SREG_SS:
-		/*
-		 * segment is not a writable data segment or segment
-		 * selector's RPL != CPL or segment selector's RPL != CPL
-		 */
-		if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
-			goto exception;
-		break;
-	case VCPU_SREG_CS:
-		if (!(kvm_seg.type & 8))
-			goto exception;
-
-		if (kvm_seg.type & 4) {
-			/* conforming */
-			if (dpl > cpl)
-				goto exception;
-		} else {
-			/* nonconforming */
-			if (rpl > cpl || dpl != cpl)
-				goto exception;
-		}
-		/* CS(RPL) <- CPL */
-		selector = (selector & 0xfffc) | cpl;
-            break;
-	case VCPU_SREG_TR:
-		if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
-			goto exception;
-		break;
-	case VCPU_SREG_LDTR:
-		if (kvm_seg.s || kvm_seg.type != 2)
-			goto exception;
-		break;
-	default: /*  DS, ES, FS, or GS */
-		/*
-		 * segment is not a data or readable code segment or
-		 * ((segment is a data or nonconforming code segment)
-		 * and (both RPL and CPL > DPL))
-		 */
-		if ((kvm_seg.type & 0xa) == 0x8 ||
-		    (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
-			goto exception;
-		break;
-	}
-
-	if (!kvm_seg.unusable && kvm_seg.s) {
-		/* mark segment as accessed */
-		kvm_seg.type |= 1;
-		seg_desc.type |= 1;
-		save_guest_segment_descriptor(vcpu, selector, &seg_desc);
-	}
-load:
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-	return X86EMUL_CONTINUE;
-exception:
-	kvm_queue_exception_e(vcpu, err_vec, err_code);
-	return X86EMUL_PROPAGATE_FAULT;
-}
-
-static void save_state_to_tss32(struct kvm_vcpu *vcpu,
-				struct tss_segment_32 *tss)
-{
-	tss->cr3 = vcpu->arch.cr3;
-	tss->eip = kvm_rip_read(vcpu);
-	tss->eflags = kvm_get_rflags(vcpu);
-	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
-	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
-	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
-{
-	struct kvm_segment kvm_seg;
-	kvm_get_segment(vcpu, &kvm_seg, seg);
-	kvm_seg.selector = sel;
-	kvm_set_segment(vcpu, &kvm_seg, seg);
-}
-
-static int load_state_from_tss32(struct kvm_vcpu *vcpu,
-				  struct tss_segment_32 *tss)
-{
-	kvm_set_cr3(vcpu, tss->cr3);
-
-	kvm_rip_write(vcpu, tss->eip);
-	kvm_set_rflags(vcpu, tss->eflags | 2);
-
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
-	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
-
-	/*
-	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
-	 */
-	kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
-	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
-	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
-	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
-	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-	kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
-	kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
-
-	/*
-	 * Now load segment descriptors. If fault happenes at this stage
-	 * it is handled in a context of new task
-	 */
-	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
-		return 1;
-	return 0;
-}
-
-static void save_state_to_tss16(struct kvm_vcpu *vcpu,
-				struct tss_segment_16 *tss)
-{
-	tss->ip = kvm_rip_read(vcpu);
-	tss->flag = kvm_get_rflags(vcpu);
-	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
-	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
-	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
-	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
-	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
-	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
-	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
-	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
-
-	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
-	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
-	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
-	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
-	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-}
-
-static int load_state_from_tss16(struct kvm_vcpu *vcpu,
-				 struct tss_segment_16 *tss)
-{
-	kvm_rip_write(vcpu, tss->ip);
-	kvm_set_rflags(vcpu, tss->flag | 2);
-	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
-	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
-	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
-	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
-	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
-	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
-	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
-	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
-
-	/*
-	 * SDM says that segment selectors are loaded before segment
-	 * descriptors
-	 */
-	kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
-	kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
-	kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
-	kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
-	kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
-
-	/*
-	 * Now load segment descriptors. If fault happenes at this stage
-	 * it is handled in a context of new task
-	 */
-	if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
-		return 1;
-
-	if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
-		return 1;
-	return 0;
-}
-
-static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-			      u16 old_tss_sel, u32 old_tss_base,
-			      struct desc_struct *nseg_desc)
-{
-	struct tss_segment_16 tss_segment_16;
-	int ret = 0;
-
-	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-			   sizeof tss_segment_16))
-		goto out;
-
-	save_state_to_tss16(vcpu, &tss_segment_16);
-
-	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
-			    sizeof tss_segment_16))
-		goto out;
-
-	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
-			   &tss_segment_16, sizeof tss_segment_16))
-		goto out;
-
-	if (old_tss_sel != 0xffff) {
-		tss_segment_16.prev_task_link = old_tss_sel;
-
-		if (kvm_write_guest(vcpu->kvm,
-				    get_tss_base_addr_write(vcpu, nseg_desc),
-				    &tss_segment_16.prev_task_link,
-				    sizeof tss_segment_16.prev_task_link))
-			goto out;
-	}
-
-	if (load_state_from_tss16(vcpu, &tss_segment_16))
-		goto out;
-
-	ret = 1;
-out:
-	return ret;
-}
-
-static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u16 old_tss_sel, u32 old_tss_base,
-		       struct desc_struct *nseg_desc)
-{
-	struct tss_segment_32 tss_segment_32;
-	int ret = 0;
-
-	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-			   sizeof tss_segment_32))
-		goto out;
-
-	save_state_to_tss32(vcpu, &tss_segment_32);
-
-	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
-			    sizeof tss_segment_32))
-		goto out;
-
-	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
-			   &tss_segment_32, sizeof tss_segment_32))
-		goto out;
-
-	if (old_tss_sel != 0xffff) {
-		tss_segment_32.prev_task_link = old_tss_sel;
-
-		if (kvm_write_guest(vcpu->kvm,
-				    get_tss_base_addr_write(vcpu, nseg_desc),
-				    &tss_segment_32.prev_task_link,
-				    sizeof tss_segment_32.prev_task_link))
-			goto out;
-	}
-
-	if (load_state_from_tss32(vcpu, &tss_segment_32))
-		goto out;
-
-	ret = 1;
-out:
-	return ret;
-}
-
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 {
-	struct kvm_segment tr_seg;
-	struct desc_struct cseg_desc;
-	struct desc_struct nseg_desc;
-	int ret = 0;
-	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
-	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
-	u32 desc_limit;
-
-	old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
-
-	/* FIXME: Handle errors. Failure to read either TSS or their
-	 * descriptors should generate a pagefault.
-	 */
-	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
-		goto out;
-
-	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
-		goto out;
-
-	if (reason != TASK_SWITCH_IRET) {
-		int cpl;
-
-		cpl = kvm_x86_ops->get_cpl(vcpu);
-		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
-			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
-			return 1;
-		}
-	}
-
-	desc_limit = get_desc_limit(&nseg_desc);
-	if (!nseg_desc.p ||
-	    ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
-	     desc_limit < 0x2b)) {
-		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
-		return 1;
-	}
-
-	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
-		cseg_desc.type &= ~(1 << 1); //clear the B flag
-		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
-	}
-
-	if (reason == TASK_SWITCH_IRET) {
-		u32 eflags = kvm_get_rflags(vcpu);
-		kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
-	}
+	int cs_db, cs_l, ret;
+	cache_all_regs(vcpu);
 
-	/* set back link to prev task only if NT bit is set in eflags
-	   note that old_tss_sel is not used afetr this point */
-	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
-		old_tss_sel = 0xffff;
+	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 
-	if (nseg_desc.type & 8)
-		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
-					 old_tss_base, &nseg_desc);
-	else
-		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
-					 old_tss_base, &nseg_desc);
+	vcpu->arch.emulate_ctxt.vcpu = vcpu;
+	vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
+	vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
+	vcpu->arch.emulate_ctxt.mode =
+		(!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
+		(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
+		? X86EMUL_MODE_VM86 : cs_l
+		? X86EMUL_MODE_PROT64 :	cs_db
+		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
-	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
-		u32 eflags = kvm_get_rflags(vcpu);
-		kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
-	}
+	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
+				   tss_selector, reason);
 
-	if (reason != TASK_SWITCH_IRET) {
-		nseg_desc.type |= (1 << 1);
-		save_guest_segment_descriptor(vcpu, tss_selector,
-					      &nseg_desc);
-	}
+	if (ret == X86EMUL_CONTINUE)
+		kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 
-	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
-	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
-	tr_seg.type = 11;
-	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
-out:
-	return ret;
+	return (ret != X86EMUL_CONTINUE);
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
-- 
cgit v1.2.3-70-g09d2


From 69f55cb11e8d789433d111ac3a0f60be37a1ae01 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:20 +0200
Subject: KVM: x86 emulator: populate OP_MEM operand during decoding.

All struct operand fields are initialized during decoding for all
operand types except OP_MEM, but there is no reason for that. Move
OP_MEM operand initialization into decoding stage for consistency.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 66 ++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8225ec26efe..0eed6839619 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1057,6 +1057,10 @@ done_prefixes:
 
 	if (c->ad_bytes != 8)
 		c->modrm_ea = (u32)c->modrm_ea;
+
+	if (c->rip_relative)
+		c->modrm_ea += c->eip;
+
 	/*
 	 * Decode and fetch the source operand: register, memory
 	 * or immediate.
@@ -1091,6 +1095,8 @@ done_prefixes:
 			break;
 		}
 		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.val = 0;
 		break;
 	case SrcImm:
 	case SrcImmU:
@@ -1169,8 +1175,10 @@ done_prefixes:
 		c->src2.val = 1;
 		break;
 	case Src2Mem16:
-		c->src2.bytes = 2;
 		c->src2.type = OP_MEM;
+		c->src2.bytes = 2;
+		c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
+		c->src2.val = 0;
 		break;
 	}
 
@@ -1192,6 +1200,15 @@ done_prefixes:
 			break;
 		}
 		c->dst.type = OP_MEM;
+		c->dst.ptr = (unsigned long *)c->modrm_ea;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.val = 0;
+		if (c->d & BitOp) {
+			unsigned long mask = ~(c->dst.bytes * 8 - 1);
+
+			c->dst.ptr = (void *)c->dst.ptr +
+						   (c->src.val & mask) / 8;
+		}
 		break;
 	case DstAcc:
 		c->dst.type = OP_REG;
@@ -1215,9 +1232,6 @@ done_prefixes:
 		break;
 	}
 
-	if (c->rip_relative)
-		c->modrm_ea += c->eip;
-
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
@@ -1638,14 +1652,13 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 }
 
 static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
-			       struct x86_emulate_ops *ops,
-			       unsigned long memop)
+			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	u64 old, new;
 	int rc;
 
-	rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
+	rc = ops->read_emulated(c->modrm_ea, &old, 8, ctxt->vcpu);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -1660,7 +1673,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
 		       (u32) c->regs[VCPU_REGS_RBX];
 
-		rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
+		rc = ops->cmpxchg_emulated(c->modrm_ea, &old, &new, 8, ctxt->vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		ctxt->eflags |= EFLG_ZF;
@@ -2382,7 +2395,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
-	unsigned long memop = 0;
 	u64 msr_data;
 	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
@@ -2417,9 +2429,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		goto done;
 	}
 
-	if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
-		memop = c->modrm_ea;
-
 	if (c->rep_prefix && (c->d & String)) {
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
@@ -2451,8 +2460,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->src.type == OP_MEM) {
-		c->src.ptr = (unsigned long *)memop;
-		c->src.val = 0;
 		rc = ops->read_emulated((unsigned long)c->src.ptr,
 					&c->src.val,
 					c->src.bytes,
@@ -2463,8 +2470,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->src2.type == OP_MEM) {
-		c->src2.ptr = (unsigned long *)(memop + c->src.bytes);
-		c->src2.val = 0;
 		rc = ops->read_emulated((unsigned long)c->src2.ptr,
 					&c->src2.val,
 					c->src2.bytes,
@@ -2477,25 +2482,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		goto special_insn;
 
 
-	if (c->dst.type == OP_MEM) {
-		c->dst.ptr = (unsigned long *)memop;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.val = 0;
-		if (c->d & BitOp) {
-			unsigned long mask = ~(c->dst.bytes * 8 - 1);
-
-			c->dst.ptr = (void *)c->dst.ptr +
-						   (c->src.val & mask) / 8;
-		}
-		if (!(c->d & Mov)) {
-			/* optimisation - avoid slow emulated read */
-			rc = ops->read_emulated((unsigned long)c->dst.ptr,
-						&c->dst.val,
-						c->dst.bytes,
-						ctxt->vcpu);
-			if (rc != X86EMUL_CONTINUE)
-				goto done;
-		}
+	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
+		/* optimisation - avoid slow emulated read if Mov */
+		rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
+					c->dst.bytes, ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
 	}
 	c->dst.orig_val = c->dst.val;
 
@@ -3062,7 +3054,7 @@ twobyte_insn:
 			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 			goto done;
 		case 7: /* invlpg*/
-			emulate_invlpg(ctxt->vcpu, memop);
+			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
 			/* Disable writeback. */
 			c->dst.type = OP_NONE;
 			break;
@@ -3263,7 +3255,7 @@ twobyte_insn:
 							(u64) c->src.val;
 		break;
 	case 0xc7:		/* Grp9 (cmpxchg8b) */
-		rc = emulate_grp9(ctxt, ops, memop);
+		rc = emulate_grp9(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->dst.type = OP_NONE;
-- 
cgit v1.2.3-70-g09d2


From a682e35449abc83d260a8219015c7cb4b25ecced Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:21 +0200
Subject: KVM: x86 emulator: add decoding of X,Y parameters from Intel SDM

Add decoding of X,Y parameters from Intel SDM which are used by string
instruction to specify source and destination. Use this new decoding
to implement movs, cmps, stos, lods in a generic way.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 125 +++++++++++++++++--------------------------------
 1 file changed, 44 insertions(+), 81 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0eed6839619..3b32270a20d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -51,6 +51,7 @@
 #define DstReg      (2<<1)	/* Register operand. */
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)      /* Destination Accumulator */
+#define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
@@ -64,6 +65,7 @@
 #define SrcOne      (7<<4)	/* Implied '1' */
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
+#define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -177,12 +179,12 @@ static u32 opcode_table[256] = {
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
 	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
-	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
+	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	/* 0xA8 - 0xAF */
-	0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
-	ByteOp | ImplicitOps | String, ImplicitOps | String,
+	0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
+	ByteOp | DstDI | String, DstDI | String,
 	/* 0xB0 - 0xB7 */
 	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
 	ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
@@ -1145,6 +1147,14 @@ done_prefixes:
 		c->src.bytes = 1;
 		c->src.val = 1;
 		break;
+	case SrcSI:
+		c->src.type = OP_MEM;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = (unsigned long *)
+			register_address(c,  seg_override_base(ctxt, c),
+					 c->regs[VCPU_REGS_RSI]);
+		c->src.val = 0;
+		break;
 	}
 
 	/*
@@ -1230,6 +1240,14 @@ done_prefixes:
 		}
 		c->dst.orig_val = c->dst.val;
 		break;
+	case DstDI:
+		c->dst.type = OP_MEM;
+		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->dst.ptr = (unsigned long *)
+			register_address(c, es_base(ctxt),
+					 c->regs[VCPU_REGS_RDI]);
+		c->dst.val = 0;
+		break;
 	}
 
 done:
@@ -2392,6 +2410,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+			    int reg, unsigned long **ptr)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
+
+	register_address_increment(c, &c->regs[reg], df * c->src.bytes);
+	*ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -2754,89 +2782,16 @@ special_insn:
 		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
 		break;
 	case 0xa4 ... 0xa5:	/* movs */
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
-		rc = ops->read_emulated(register_address(c,
-						seg_override_base(ctxt, c),
-						c->regs[VCPU_REGS_RSI]),
-					&c->dst.val,
-					c->dst.bytes, ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		break;
+		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
-		c->src.type = OP_NONE; /* Disable writeback. */
-		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->src.ptr = (unsigned long *)register_address(c,
-				       seg_override_base(ctxt, c),
-						   c->regs[VCPU_REGS_RSI]);
-		rc = ops->read_emulated((unsigned long)c->src.ptr,
-					&c->src.val,
-					c->src.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
-		rc = ops->read_emulated((unsigned long)c->dst.ptr,
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
-
-		emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
-
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->src.bytes
-								  : c->src.bytes);
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-								  : c->dst.bytes);
-
-		break;
+		goto cmp;
 	case 0xaa ... 0xab:	/* stos */
-		c->dst.type = OP_MEM;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)register_address(c,
-						   es_base(ctxt),
-						   c->regs[VCPU_REGS_RDI]);
 		c->dst.val = c->regs[VCPU_REGS_RAX];
-		register_address_increment(c, &c->regs[VCPU_REGS_RDI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
 		break;
 	case 0xac ... 0xad:	/* lods */
-		c->dst.type = OP_REG;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
-		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-		rc = ops->read_emulated(register_address(c,
-						seg_override_base(ctxt, c),
-						c->regs[VCPU_REGS_RSI]),
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			goto done;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSI],
-				       (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
-							   : c->dst.bytes);
-		break;
+		goto mov;
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
@@ -2979,6 +2934,14 @@ writeback:
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
+	if ((c->d & SrcMask) == SrcSI)
+		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
+				&c->src.ptr);
+
+	if ((c->d & DstMask) == DstDI)
+		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI,
+				&c->dst.ptr);
+
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(ctxt->vcpu, c->eip);
-- 
cgit v1.2.3-70-g09d2


From d9271123a46011af26da680baeb7fdf67b498abf Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:22 +0200
Subject: KVM: x86 emulator: during rep emulation decrement ECX only if
 emulation succeeded

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 3b32270a20d..594574d8b9e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2411,13 +2411,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 }
 
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
-			    int reg, unsigned long **ptr)
+			    int reg, struct operand *op)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
 
-	register_address_increment(c, &c->regs[reg], df * c->src.bytes);
-	*ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
+	register_address_increment(c, &c->regs[reg], df * op->bytes);
+	op->ptr = (unsigned long *)register_address(c,  base, c->regs[reg]);
 }
 
 int
@@ -2483,7 +2483,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 				goto done;
 			}
 		}
-		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 		c->eip = ctxt->eip;
 	}
 
@@ -2936,11 +2935,13 @@ writeback:
 
 	if ((c->d & SrcMask) == SrcSI)
 		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
-				&c->src.ptr);
+				&c->src);
 
 	if ((c->d & DstMask) == DstDI)
-		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI,
-				&c->dst.ptr);
+		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+
+	if (c->rep_prefix && (c->d & String))
+		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
 
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-- 
cgit v1.2.3-70-g09d2


From cf8f70bfe38b326bb80b10f76d6544f571040229 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:23 +0200
Subject: KVM: x86 emulator: fix in/out emulation.

in/out emulation is broken now. The breakage is different depending
on where IO device resides. If it is in userspace emulator reports
emulation failure since it incorrectly interprets kvm_emulate_pio()
return value. If IO device is in the kernel emulation of 'in' will do
nothing since kvm_emulate_pio() stores result directly into vcpu
registers, so emulator will overwrite result of emulation during
commit of shadowed register.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   7 ++
 arch/x86/include/asm/kvm_host.h    |   3 +-
 arch/x86/kvm/emulate.c             |  50 +++++----
 arch/x86/kvm/svm.c                 |  20 ++--
 arch/x86/kvm/vmx.c                 |  18 ++--
 arch/x86/kvm/x86.c                 | 213 +++++++++++++++++++++++--------------
 6 files changed, 178 insertions(+), 133 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index bd469296f5e..679245c9a55 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -119,6 +119,13 @@ struct x86_emulate_ops {
 				const void *new,
 				unsigned int bytes,
 				struct kvm_vcpu *vcpu);
+
+	int (*pio_in_emulated)(int size, unsigned short port, void *val,
+			       unsigned int count, struct kvm_vcpu *vcpu);
+
+	int (*pio_out_emulated)(int size, unsigned short port, const void *val,
+				unsigned int count, struct kvm_vcpu *vcpu);
+
 	bool (*get_cached_descriptor)(struct desc_struct *desc,
 				      int seg, struct kvm_vcpu *vcpu);
 	void (*set_cached_descriptor)(struct desc_struct *desc,
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b99cec1547c..776d3e202b5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -590,8 +590,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 
 struct x86_emulate_ctxt;
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
-		     int size, unsigned port);
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 			   int size, unsigned long count, int down,
 			    gva_t address, int rep, unsigned port);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 594574d8b9e..2d095ce9dc8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -210,13 +210,13 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0,
-	ByteOp | SrcImmUByte, SrcImmUByte,
-	ByteOp | SrcImmUByte, SrcImmUByte,
+	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
+	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
+	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	/* 0xF0 - 0xF7 */
 	0, 0, 0, 0,
 	ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
@@ -2426,8 +2426,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u64 msr_data;
 	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
-	unsigned int port;
-	int io_dir_in;
 	int rc = X86EMUL_CONTINUE;
 
 	ctxt->interruptibility = 0;
@@ -2823,14 +2821,10 @@ special_insn:
 		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
-		port = c->src.val;
-		io_dir_in = 1;
-		goto do_io;
+		goto do_io_in;
 	case 0xe6: /* outb */
 	case 0xe7: /* out */
-		port = c->src.val;
-		io_dir_in = 0;
-		goto do_io;
+		goto do_io_out;
 	case 0xe8: /* call (near) */ {
 		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
@@ -2855,25 +2849,29 @@ special_insn:
 		break;
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
-		port = c->regs[VCPU_REGS_RDX];
-		io_dir_in = 1;
-		goto do_io;
+		c->src.val = c->regs[VCPU_REGS_RDX];
+	do_io_in:
+		c->dst.bytes = min(c->dst.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			goto done;
+		}
+		if (!ops->pio_in_emulated(c->dst.bytes, c->src.val,
+					  &c->dst.val, 1, ctxt->vcpu))
+			goto done; /* IO is needed */
+		break;
 	case 0xee: /* out al,dx */
 	case 0xef: /* out (e/r)ax,dx */
-		port = c->regs[VCPU_REGS_RDX];
-		io_dir_in = 0;
-	do_io:
-		if (!emulator_io_permited(ctxt, ops, port,
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+		c->src.val = c->regs[VCPU_REGS_RDX];
+	do_io_out:
+		c->dst.bytes = min(c->dst.bytes, 4u);
+		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
-				   (c->d & ByteOp) ? 1 : c->op_bytes,
-				   port) != 0) {
-			c->eip = saved_eip;
-			goto cannot_emulate;
-		}
+		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
+				      ctxt->vcpu);
+		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index abbc3f9d03b..e9f79619e18 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1494,29 +1494,23 @@ static int shutdown_interception(struct vcpu_svm *svm)
 
 static int io_interception(struct vcpu_svm *svm)
 {
+	struct kvm_vcpu *vcpu = &svm->vcpu;
 	u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
 	int size, in, string;
 	unsigned port;
 
 	++svm->vcpu.stat.io_exits;
-
-	svm->next_rip = svm->vmcb->control.exit_info_2;
-
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
-
-	if (string) {
-		if (emulate_instruction(&svm->vcpu,
-					0, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
-
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
+	if (string || in)
+		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
-
+	svm->next_rip = svm->vmcb->control.exit_info_2;
 	skip_emulated_instruction(&svm->vcpu);
-	return kvm_emulate_pio(&svm->vcpu, in, size, port);
+
+	return kvm_fast_pio_out(vcpu, size, port);
 }
 
 static int nmi_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 87b3c6843aa..1cceca1c59b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2985,22 +2985,20 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	int size, in, string;
 	unsigned port;
 
-	++vcpu->stat.io_exits;
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	string = (exit_qualification & 16) != 0;
+	in = (exit_qualification & 8) != 0;
 
-	if (string) {
-		if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
-			return 0;
-		return 1;
-	}
+	++vcpu->stat.io_exits;
 
-	size = (exit_qualification & 7) + 1;
-	in = (exit_qualification & 8) != 0;
-	port = exit_qualification >> 16;
+	if (string || in)
+		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
 
+	port = exit_qualification >> 16;
+	size = (exit_qualification & 7) + 1;
 	skip_emulated_instruction(vcpu);
-	return kvm_emulate_pio(vcpu, in, size, port);
+
+	return kvm_fast_pio_out(vcpu, size, port);
 }
 
 static void
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f69854c8f33..6624ad13ee9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3404,6 +3404,86 @@ emul_write:
 	return emulator_write_emulated(addr, new, bytes, vcpu);
 }
 
+static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
+{
+	/* TODO: String I/O for in kernel device */
+	int r;
+
+	if (vcpu->arch.pio.in)
+		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+				    vcpu->arch.pio.size, pd);
+	else
+		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+				     vcpu->arch.pio.port, vcpu->arch.pio.size,
+				     pd);
+	return r;
+}
+
+
+static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
+			     unsigned int count, struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.pio.cur_count)
+		goto data_avail;
+
+	trace_kvm_pio(1, port, size, 1);
+
+	vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = 1;
+	vcpu->arch.pio.string = 0;
+	vcpu->arch.pio.down = 0;
+	vcpu->arch.pio.rep = 0;
+	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.size = size;
+
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+	data_avail:
+		memcpy(val, vcpu->arch.pio_data, size * count);
+		vcpu->arch.pio.cur_count = 0;
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = KVM_EXIT_IO_IN;
+	vcpu->run->io.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = count;
+	vcpu->run->io.port = port;
+
+	return 0;
+}
+
+static int emulator_pio_out_emulated(int size, unsigned short port,
+			      const void *val, unsigned int count,
+			      struct kvm_vcpu *vcpu)
+{
+	trace_kvm_pio(0, port, size, 1);
+
+	vcpu->arch.pio.port = port;
+	vcpu->arch.pio.in = 0;
+	vcpu->arch.pio.string = 0;
+	vcpu->arch.pio.down = 0;
+	vcpu->arch.pio.rep = 0;
+	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.size = size;
+
+	memcpy(vcpu->arch.pio_data, val, size * count);
+
+	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
+		vcpu->arch.pio.cur_count = 0;
+		return 1;
+	}
+
+	vcpu->run->exit_reason = KVM_EXIT_IO;
+	vcpu->run->io.direction = KVM_EXIT_IO_OUT;
+	vcpu->run->io.size = size;
+	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
+	vcpu->run->io.count = count;
+	vcpu->run->io.port = port;
+
+	return 0;
+}
+
 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
 {
 	return kvm_x86_ops->get_segment_base(vcpu, seg);
@@ -3597,6 +3677,8 @@ static struct x86_emulate_ops emulate_ops = {
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
+	.pio_in_emulated     = emulator_pio_in_emulated,
+	.pio_out_emulated    = emulator_pio_out_emulated,
 	.get_cached_descriptor = emulator_get_cached_descriptor,
 	.set_cached_descriptor = emulator_set_cached_descriptor,
 	.get_segment_selector = emulator_get_segment_selector,
@@ -3704,6 +3786,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
 
+	if (vcpu->arch.pio.cur_count && !vcpu->arch.pio.string) {
+		if (!vcpu->arch.pio.in)
+			vcpu->arch.pio.cur_count = 0;
+		return EMULATE_DO_MMIO;
+	}
+
 	if (r || vcpu->mmio_is_write) {
 		run->exit_reason = KVM_EXIT_MMIO;
 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
@@ -3760,43 +3848,36 @@ int complete_pio(struct kvm_vcpu *vcpu)
 	int r;
 	unsigned long val;
 
-	if (!io->string) {
-		if (io->in) {
-			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-			memcpy(&val, vcpu->arch.pio_data, io->size);
-			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
-		}
-	} else {
-		if (io->in) {
-			r = pio_copy_data(vcpu);
-			if (r)
-				goto out;
-		}
+	if (io->in) {
+		r = pio_copy_data(vcpu);
+		if (r)
+			goto out;
+	}
 
-		delta = 1;
-		if (io->rep) {
-			delta *= io->cur_count;
-			/*
-			 * The size of the register should really depend on
-			 * current address size.
-			 */
-			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
-			val -= delta;
-			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
-		}
-		if (io->down)
-			delta = -delta;
-		delta *= io->size;
-		if (io->in) {
-			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
-			val += delta;
-			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
-		} else {
-			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
-			val += delta;
-			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
-		}
+	delta = 1;
+	if (io->rep) {
+		delta *= io->cur_count;
+		/*
+		 * The size of the register should really depend on
+		 * current address size.
+		 */
+		val = kvm_register_read(vcpu, VCPU_REGS_RCX);
+		val -= delta;
+		kvm_register_write(vcpu, VCPU_REGS_RCX, val);
+	}
+	if (io->down)
+		delta = -delta;
+	delta *= io->size;
+	if (io->in) {
+		val = kvm_register_read(vcpu, VCPU_REGS_RDI);
+		val += delta;
+		kvm_register_write(vcpu, VCPU_REGS_RDI, val);
+	} else {
+		val = kvm_register_read(vcpu, VCPU_REGS_RSI);
+		val += delta;
+		kvm_register_write(vcpu, VCPU_REGS_RSI, val);
 	}
+
 out:
 	io->count -= io->cur_count;
 	io->cur_count = 0;
@@ -3804,21 +3885,6 @@ out:
 	return 0;
 }
 
-static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
-{
-	/* TODO: String I/O for in kernel device */
-	int r;
-
-	if (vcpu->arch.pio.in)
-		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
-				    vcpu->arch.pio.size, pd);
-	else
-		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-				     vcpu->arch.pio.port, vcpu->arch.pio.size,
-				     pd);
-	return r;
-}
-
 static int pio_string_write(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pio_request *io = &vcpu->arch.pio;
@@ -3836,36 +3902,6 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
 	return r;
 }
 
-int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
-{
-	unsigned long val;
-
-	trace_kvm_pio(!in, port, size, 1);
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->arch.pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
-	vcpu->run->io.port = vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = in;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-
-	if (!vcpu->arch.pio.in) {
-		val = kvm_register_read(vcpu, VCPU_REGS_RAX);
-		memcpy(vcpu->arch.pio_data, &val, 4);
-	}
-
-	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-		complete_pio(vcpu);
-		return 1;
-	}
-	return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio);
-
 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 		  int size, unsigned long count, int down,
 		  gva_t address, int rep, unsigned port)
@@ -3931,6 +3967,16 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
+int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
+{
+	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
+	int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
+	/* do not return to emulator after return from userspace */
+	vcpu->arch.pio.cur_count = 0;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
+
 static void bounce_off(void *info)
 {
 	/* nothing */
@@ -4661,9 +4707,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (vcpu->arch.pio.cur_count) {
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = complete_pio(vcpu);
+		if (!vcpu->arch.pio.string)
+			r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+		else
+			r = complete_pio(vcpu);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r)
+		if (r == EMULATE_DO_MMIO)
 			goto out;
 	}
 	if (vcpu->mmio_needed) {
-- 
cgit v1.2.3-70-g09d2


From 7972995b0c346de76fe260ce0fd6bcc8ffab724a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:24 +0200
Subject: KVM: x86 emulator: Move string pio emulation into emulator.c

Currently emulation is done outside of emulator so things like doing
ins/outs to/from mmio are broken it also makes it hard (if not impossible)
to implement single stepping in the future. The implementation in this
patch is not efficient since it exits to userspace for each IO while
previous implementation did 'ins' in batches. Further patch that
implements pio in string read ahead address this problem.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   8 --
 arch/x86/kvm/emulate.c          |  48 +++-------
 arch/x86/kvm/x86.c              | 206 ++++------------------------------------
 3 files changed, 32 insertions(+), 230 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 776d3e202b5..26c629a062d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -224,14 +224,9 @@ struct kvm_pv_mmu_op_buffer {
 
 struct kvm_pio_request {
 	unsigned long count;
-	int cur_count;
-	gva_t guest_gva;
 	int in;
 	int port;
 	int size;
-	int string;
-	int down;
-	int rep;
 };
 
 /*
@@ -591,9 +586,6 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
 struct x86_emulate_ctxt;
 
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-			   int size, unsigned long count, int down,
-			    gva_t address, int rep, unsigned port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2d095ce9dc8..2c66e097d91 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -153,8 +153,8 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0,
 	/* 0x68 - 0x6F */
 	SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
-	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
+	DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
+	SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
 	/* 0x70 - 0x77 */
 	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
@@ -2615,47 +2615,29 @@ special_insn:
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
+		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+					  c->dst.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio_string(ctxt->vcpu,
-				1,
-				(c->d & ByteOp) ? 1 : c->op_bytes,
-				c->rep_prefix ?
-				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
-				(ctxt->eflags & EFLG_DF),
-				register_address(c, es_base(ctxt),
-						 c->regs[VCPU_REGS_RDI]),
-				c->rep_prefix,
-				c->regs[VCPU_REGS_RDX]) == 0) {
-			c->eip = saved_eip;
-			return -1;
-		}
-		return 0;
+		if (!ops->pio_in_emulated(c->dst.bytes, c->regs[VCPU_REGS_RDX],
+					  &c->dst.val, 1, ctxt->vcpu))
+			goto done; /* IO is needed, skip writeback */
+		break;
 	case 0x6e:		/* outsb */
 	case 0x6f:		/* outsw/outsd */
+		c->src.bytes = min(c->src.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
-					  (c->d & ByteOp) ? 1 : c->op_bytes)) {
+					  c->src.bytes)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (kvm_emulate_pio_string(ctxt->vcpu,
-				0,
-				(c->d & ByteOp) ? 1 : c->op_bytes,
-				c->rep_prefix ?
-				address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
-				(ctxt->eflags & EFLG_DF),
-					 register_address(c,
-					  seg_override_base(ctxt, c),
-						 c->regs[VCPU_REGS_RSI]),
-				c->rep_prefix,
-				c->regs[VCPU_REGS_RDX]) == 0) {
-			c->eip = saved_eip;
-			return -1;
-		}
-		return 0;
+		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
+				      &c->src.val, 1, ctxt->vcpu);
+
+		c->dst.type = OP_NONE; /* nothing to writeback */
+		break;
 	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
 			jmp_rel(c, c->src.val);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6624ad13ee9..658e8e8155c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3150,18 +3150,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
 }
 
-static int kvm_write_guest_virt_helper(gva_t addr, void *val,
+static int kvm_write_guest_virt_system(gva_t addr, void *val,
 				       unsigned int bytes,
-				       struct kvm_vcpu *vcpu, u32 access,
+				       struct kvm_vcpu *vcpu,
 				       u32 *error)
 {
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
-	access |= PFERR_WRITE_MASK;
-
 	while (bytes) {
-		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
+		gpa_t gpa =  vcpu->arch.mmu.gva_to_gpa(vcpu, addr,
+						       PFERR_WRITE_MASK, error);
 		unsigned offset = addr & (PAGE_SIZE-1);
 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
 		int ret;
@@ -3184,20 +3183,6 @@ out:
 	return r;
 }
 
-static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
-				struct kvm_vcpu *vcpu, u32 *error)
-{
-	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
-	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, access, error);
-}
-
-static int kvm_write_guest_virt_system(gva_t addr, void *val,
-				       unsigned int bytes,
-				       struct kvm_vcpu *vcpu, u32 *error)
-{
-	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
-}
-
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
@@ -3423,23 +3408,20 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
 			     unsigned int count, struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.pio.cur_count)
+	if (vcpu->arch.pio.count)
 		goto data_avail;
 
 	trace_kvm_pio(1, port, size, 1);
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 1;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.count  = count;
 	vcpu->arch.pio.size = size;
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
 	data_avail:
 		memcpy(val, vcpu->arch.pio_data, size * count);
-		vcpu->arch.pio.cur_count = 0;
+		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
@@ -3461,16 +3443,13 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
 
 	vcpu->arch.pio.port = port;
 	vcpu->arch.pio.in = 0;
-	vcpu->arch.pio.string = 0;
-	vcpu->arch.pio.down = 0;
-	vcpu->arch.pio.rep = 0;
-	vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
+	vcpu->arch.pio.count = count;
 	vcpu->arch.pio.size = size;
 
 	memcpy(vcpu->arch.pio_data, val, size * count);
 
 	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
-		vcpu->arch.pio.cur_count = 0;
+		vcpu->arch.pio.count = 0;
 		return 1;
 	}
 
@@ -3717,7 +3696,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	cache_all_regs(vcpu);
 
 	vcpu->mmio_is_write = 0;
-	vcpu->arch.pio.string = 0;
 
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		int cs_db, cs_l;
@@ -3783,12 +3761,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	if (r == 0)
 		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 
-	if (vcpu->arch.pio.string)
-		return EMULATE_DO_MMIO;
-
-	if (vcpu->arch.pio.cur_count && !vcpu->arch.pio.string) {
+	if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
-			vcpu->arch.pio.cur_count = 0;
+			vcpu->arch.pio.count = 0;
 		return EMULATE_DO_MMIO;
 	}
 
@@ -3821,158 +3796,12 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
 
-static int pio_copy_data(struct kvm_vcpu *vcpu)
-{
-	void *p = vcpu->arch.pio_data;
-	gva_t q = vcpu->arch.pio.guest_gva;
-	unsigned bytes;
-	int ret;
-	u32 error_code;
-
-	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
-	if (vcpu->arch.pio.in)
-		ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
-	else
-		ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
-
-	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(vcpu, q, error_code);
-
-	return ret;
-}
-
-int complete_pio(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->arch.pio;
-	long delta;
-	int r;
-	unsigned long val;
-
-	if (io->in) {
-		r = pio_copy_data(vcpu);
-		if (r)
-			goto out;
-	}
-
-	delta = 1;
-	if (io->rep) {
-		delta *= io->cur_count;
-		/*
-		 * The size of the register should really depend on
-		 * current address size.
-		 */
-		val = kvm_register_read(vcpu, VCPU_REGS_RCX);
-		val -= delta;
-		kvm_register_write(vcpu, VCPU_REGS_RCX, val);
-	}
-	if (io->down)
-		delta = -delta;
-	delta *= io->size;
-	if (io->in) {
-		val = kvm_register_read(vcpu, VCPU_REGS_RDI);
-		val += delta;
-		kvm_register_write(vcpu, VCPU_REGS_RDI, val);
-	} else {
-		val = kvm_register_read(vcpu, VCPU_REGS_RSI);
-		val += delta;
-		kvm_register_write(vcpu, VCPU_REGS_RSI, val);
-	}
-
-out:
-	io->count -= io->cur_count;
-	io->cur_count = 0;
-
-	return 0;
-}
-
-static int pio_string_write(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pio_request *io = &vcpu->arch.pio;
-	void *pd = vcpu->arch.pio_data;
-	int i, r = 0;
-
-	for (i = 0; i < io->cur_count; i++) {
-		if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
-				     io->port, io->size, pd)) {
-			r = -EOPNOTSUPP;
-			break;
-		}
-		pd += io->size;
-	}
-	return r;
-}
-
-int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
-		  int size, unsigned long count, int down,
-		  gva_t address, int rep, unsigned port)
-{
-	unsigned now, in_page;
-	int ret = 0;
-
-	trace_kvm_pio(!in, port, size, count);
-
-	vcpu->run->exit_reason = KVM_EXIT_IO;
-	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
-	vcpu->run->io.size = vcpu->arch.pio.size = size;
-	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
-	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
-	vcpu->run->io.port = vcpu->arch.pio.port = port;
-	vcpu->arch.pio.in = in;
-	vcpu->arch.pio.string = 1;
-	vcpu->arch.pio.down = down;
-	vcpu->arch.pio.rep = rep;
-
-	if (!count) {
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-		return 1;
-	}
-
-	if (!down)
-		in_page = PAGE_SIZE - offset_in_page(address);
-	else
-		in_page = offset_in_page(address) + size;
-	now = min(count, (unsigned long)in_page / size);
-	if (!now)
-		now = 1;
-	if (down) {
-		/*
-		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
-		 */
-		pr_unimpl(vcpu, "guest string pio down\n");
-		kvm_inject_gp(vcpu, 0);
-		return 1;
-	}
-	vcpu->run->io.count = now;
-	vcpu->arch.pio.cur_count = now;
-
-	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
-		kvm_x86_ops->skip_emulated_instruction(vcpu);
-
-	vcpu->arch.pio.guest_gva = address;
-
-	if (!vcpu->arch.pio.in) {
-		/* string PIO write */
-		ret = pio_copy_data(vcpu);
-		if (ret == X86EMUL_PROPAGATE_FAULT)
-			return 1;
-		if (ret == 0 && !pio_string_write(vcpu)) {
-			complete_pio(vcpu);
-			if (vcpu->arch.pio.count == 0)
-				ret = 1;
-		}
-	}
-	/* no string PIO read support yet */
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
-
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
 {
 	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu);
 	/* do not return to emulator after return from userspace */
-	vcpu->arch.pio.cur_count = 0;
+	vcpu->arch.pio.count = 0;
 	return ret;
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
@@ -4705,15 +4534,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.cur_count) {
+	if (vcpu->arch.pio.count) {
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		if (!vcpu->arch.pio.string)
-			r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
-		else
-			r = complete_pio(vcpu);
+		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO)
+		if (r == EMULATE_DO_MMIO) {
+			r = 0;
 			goto out;
+		}
 	}
 	if (vcpu->mmio_needed) {
 		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-- 
cgit v1.2.3-70-g09d2


From cb404fe0898779ec5fe5e06e90aaddcf40aefad8 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:25 +0200
Subject: KVM: x86 emulator: remove saved_eip

c->eip is never written back in case of emulation failure, so no need to
set it to old value.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2c66e097d91..0579d9dd9aa 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2424,7 +2424,6 @@ int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	u64 msr_data;
-	unsigned long saved_eip = 0;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
 
@@ -2436,7 +2435,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	 */
 
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-	saved_eip = c->eip;
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
@@ -2928,11 +2926,7 @@ writeback:
 	kvm_rip_write(ctxt->vcpu, c->eip);
 
 done:
-	if (rc == X86EMUL_UNHANDLEABLE) {
-		c->eip = saved_eip;
-		return -1;
-	}
-	return 0;
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 
 twobyte_insn:
 	switch (c->b) {
@@ -3209,6 +3203,5 @@ twobyte_insn:
 
 cannot_emulate:
 	DPRINTF("Cannot emulate %02x\n", c->b);
-	c->eip = saved_eip;
 	return -1;
 }
-- 
cgit v1.2.3-70-g09d2


From 5cd21917da245fbe98bd443de2c7f519b3df6814 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:26 +0200
Subject: KVM: x86 emulator: restart string instruction without going back to a
 guest.

Currently when string instruction is only partially complete we go back
to a guest mode, guest tries to reexecute instruction and exits again
and at this point emulation continues. Avoid all of this by restarting
instruction without going back to a guest mode, but return to a guest
mode each 1024 iterations to allow interrupt injection. Pending
exception causes immediate guest entry too.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 34 +++++++++++++++++++++++-----------
 arch/x86/kvm/x86.c                 | 19 ++++++++++++++++++-
 3 files changed, 42 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 679245c9a55..7fda16f89cc 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -193,6 +193,7 @@ struct x86_emulate_ctxt {
 	/* interruptibility state, as a result of execution of STI or MOV SS */
 	int interruptibility;
 
+	bool restart; /* restart string instruction after writeback */
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0579d9dd9aa..6de6ad1610d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -927,8 +927,11 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int mode = ctxt->mode;
 	int def_op_bytes, def_ad_bytes, group;
 
-	/* Shadow copy of register state. Committed on successful emulation. */
 
+	/* we cannot decode insn before we complete previous rep insn */
+	WARN_ON(ctxt->restart);
+
+	/* Shadow copy of register state. Committed on successful emulation. */
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
@@ -2426,6 +2429,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	u64 msr_data;
 	struct decode_cache *c = &ctxt->decode;
 	int rc = X86EMUL_CONTINUE;
+	int saved_dst_type = c->dst.type;
 
 	ctxt->interruptibility = 0;
 
@@ -2454,8 +2458,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->rep_prefix && (c->d & String)) {
+		ctxt->restart = true;
 		/* All REP prefixes have the same first termination condition */
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
+		string_done:
+			ctxt->restart = false;
 			kvm_rip_write(ctxt->vcpu, c->eip);
 			goto done;
 		}
@@ -2467,17 +2474,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		 * 	- if REPNE/REPNZ and ZF = 1 then done
 		 */
 		if ((c->b == 0xa6) || (c->b == 0xa7) ||
-				(c->b == 0xae) || (c->b == 0xaf)) {
+		    (c->b == 0xae) || (c->b == 0xaf)) {
 			if ((c->rep_prefix == REPE_PREFIX) &&
-				((ctxt->eflags & EFLG_ZF) == 0)) {
-					kvm_rip_write(ctxt->vcpu, c->eip);
-					goto done;
-			}
+			    ((ctxt->eflags & EFLG_ZF) == 0))
+				goto string_done;
 			if ((c->rep_prefix == REPNE_PREFIX) &&
-				((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-				kvm_rip_write(ctxt->vcpu, c->eip);
-				goto done;
-			}
+			    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
+				goto string_done;
 		}
 		c->eip = ctxt->eip;
 	}
@@ -2911,6 +2914,12 @@ writeback:
 	if (rc != X86EMUL_CONTINUE)
 		goto done;
 
+	/*
+	 * restore dst type in case the decoding will be reused
+	 * (happens for string instruction )
+	 */
+	c->dst.type = saved_dst_type;
+
 	if ((c->d & SrcMask) == SrcSI)
 		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
 				&c->src);
@@ -2918,8 +2927,11 @@ writeback:
 	if ((c->d & DstMask) == DstDI)
 		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
 
-	if (c->rep_prefix && (c->d & String))
+	if (c->rep_prefix && (c->d & String)) {
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
+		if (!(c->regs[VCPU_REGS_RCX] & 0x3ff))
+			ctxt->restart = false;
+	}
 
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 658e8e8155c..c88cb814528 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3755,6 +3755,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 	}
 
+restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 
@@ -3777,7 +3778,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 	if (r) {
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-			return EMULATE_DONE;
+			goto done;
 		if (!vcpu->mmio_needed) {
 			kvm_report_emulation_failure(vcpu, "mmio");
 			return EMULATE_FAIL;
@@ -3792,6 +3793,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DO_MMIO;
 	}
 
+done:
+	if (vcpu->arch.exception.pending)
+		vcpu->arch.emulate_ctxt.restart = false;
+
+	if (vcpu->arch.emulate_ctxt.restart)
+		goto restart;
+
 	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(emulate_instruction);
@@ -4560,6 +4568,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			goto out;
 		}
 	}
+	if (vcpu->arch.emulate_ctxt.restart) {
+		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+		if (r == EMULATE_DO_MMIO) {
+			r = 0;
+			goto out;
+		}
+	}
 	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
 		kvm_register_write(vcpu, VCPU_REGS_RAX,
 				     kvm_run->hypercall.ret);
-- 
cgit v1.2.3-70-g09d2


From 7b262e90fc20a49fddf3dad94c8cead1f0439751 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:27 +0200
Subject: KVM: x86 emulator: introduce pio in string read ahead.

To optimize "rep ins" instruction do IO in big chunks ahead of time
instead of doing it only when required during instruction emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  7 ++++++
 arch/x86/kvm/emulate.c             | 46 +++++++++++++++++++++++++++++++++-----
 2 files changed, 48 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 7fda16f89cc..b5e12c58386 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -151,6 +151,12 @@ struct fetch_cache {
 	unsigned long end;
 };
 
+struct read_cache {
+	u8 data[1024];
+	unsigned long pos;
+	unsigned long end;
+};
+
 struct decode_cache {
 	u8 twobyte;
 	u8 b;
@@ -178,6 +184,7 @@ struct decode_cache {
 	void *modrm_ptr;
 	unsigned long modrm_val;
 	struct fetch_cache fetch;
+	struct read_cache io_read;
 };
 
 struct x86_emulate_ctxt {
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6de6ad1610d..ab3fff5bf7c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1257,6 +1257,36 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+			   struct x86_emulate_ops *ops,
+			   unsigned int size, unsigned short port,
+			   void *dest)
+{
+	struct read_cache *rc = &ctxt->decode.io_read;
+
+	if (rc->pos == rc->end) { /* refill pio read ahead */
+		struct decode_cache *c = &ctxt->decode;
+		unsigned int in_page, n;
+		unsigned int count = c->rep_prefix ?
+			address_mask(c, c->regs[VCPU_REGS_RCX]) : 1;
+		in_page = (ctxt->eflags & EFLG_DF) ?
+			offset_in_page(c->regs[VCPU_REGS_RDI]) :
+			PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]);
+		n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size,
+			count);
+		if (n == 0)
+			n = 1;
+		rc->pos = rc->end = 0;
+		if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu))
+			return 0;
+		rc->end = n * size;
+	}
+
+	memcpy(dest, rc->data + rc->pos, size);
+	rc->pos += size;
+	return 1;
+}
+
 static u32 desc_limit_scaled(struct desc_struct *desc)
 {
 	u32 limit = get_desc_limit(desc);
@@ -2622,8 +2652,8 @@ special_insn:
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (!ops->pio_in_emulated(c->dst.bytes, c->regs[VCPU_REGS_RDX],
-					  &c->dst.val, 1, ctxt->vcpu))
+		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
+				     c->regs[VCPU_REGS_RDX], &c->dst.val))
 			goto done; /* IO is needed, skip writeback */
 		break;
 	case 0x6e:		/* outsb */
@@ -2839,8 +2869,8 @@ special_insn:
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
-		if (!ops->pio_in_emulated(c->dst.bytes, c->src.val,
-					  &c->dst.val, 1, ctxt->vcpu))
+		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
+				     &c->dst.val))
 			goto done; /* IO is needed */
 		break;
 	case 0xee: /* out al,dx */
@@ -2928,8 +2958,14 @@ writeback:
 		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
+		struct read_cache *rc = &ctxt->decode.io_read;
 		register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
-		if (!(c->regs[VCPU_REGS_RCX] & 0x3ff))
+		/*
+		 * Re-enter guest when pio read ahead buffer is empty or,
+		 * if it is not used, after each 1024 iteration.
+		 */
+		if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) ||
+		    (rc->end != 0 && rc->end == rc->pos))
 			ctxt->restart = false;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 92bf9748b5bc381070f6adf0b56efd3428e4a97b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 18 Mar 2010 15:20:28 +0200
Subject: KVM: small kvm_arch_vcpu_ioctl_run() cleanup.

Unify all conditions that get us back into emulator after returning from
userspace.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 32 ++++++--------------------------
 1 file changed, 6 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c88cb814528..cc540f27053 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4542,33 +4542,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_set_cr8(vcpu, kvm_run->cr8);
 
-	if (vcpu->arch.pio.count) {
-		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
-		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO) {
-			r = 0;
-			goto out;
-		}
-	}
-	if (vcpu->mmio_needed) {
-		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
-		vcpu->mmio_read_completed = 1;
-		vcpu->mmio_needed = 0;
-
-		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
-		r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
-					EMULTYPE_NO_DECODE);
-		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO) {
-			/*
-			 * Read-modify-write.  Back to userspace.
-			 */
-			r = 0;
-			goto out;
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+	    vcpu->arch.emulate_ctxt.restart) {
+		if (vcpu->mmio_needed) {
+			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
+			vcpu->mmio_read_completed = 1;
+			vcpu->mmio_needed = 0;
 		}
-	}
-	if (vcpu->arch.emulate_ctxt.restart) {
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-- 
cgit v1.2.3-70-g09d2


From 9749a6c0f0a4f88ae7bad4f65d7da32769e9b2b7 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Sat, 20 Mar 2010 10:14:13 +0100
Subject: KVM: x86: Fix 32-bit build breakage due to typo

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cc540f27053..b4d3363b78e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3328,7 +3328,7 @@ EXPORT_SYMBOL_GPL(emulator_write_emulated);
 #  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
 #else
 #  define CMPXCHG64(ptr, old, new) \
-	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u *)(new)) == *(u64 *)(old))
+	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
 #endif
 
 static int emulator_cmpxchg_emulated(unsigned long addr,
-- 
cgit v1.2.3-70-g09d2


From 482ac18ae293a3a0b1e1eea95c10dcc9ceeb4708 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 21 Mar 2010 13:08:20 +0200
Subject: KVM: x86 emulator: commit rflags as part of registers commit

Make sure that rflags is committed only after successful instruction
emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 1 +
 arch/x86/kvm/emulate.c             | 1 +
 arch/x86/kvm/x86.c                 | 8 ++++++--
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b5e12c58386..a1319c82050 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -136,6 +136,7 @@ struct x86_emulate_ops {
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
+	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ab3fff5bf7c..48de4b89005 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2972,6 +2972,7 @@ writeback:
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(ctxt->vcpu, c->eip);
+	ops->set_rflags(ctxt->vcpu, ctxt->eflags);
 
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4d3363b78e..247e805a041 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3649,6 +3649,11 @@ static void emulator_set_segment_selector(u16 sel, int seg,
 	kvm_set_segment(vcpu, &kvm_seg, seg);
 }
 
+static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
+{
+	kvm_x86_ops->set_rflags(vcpu, rflags);
+}
+
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
@@ -3666,6 +3671,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
+	.set_rflags          = emulator_set_rflags,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3786,8 +3792,6 @@ restart:
 		return EMULATE_DO_MMIO;
 	}
 
-	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-
 	if (vcpu->mmio_is_write) {
 		vcpu->mmio_needed = 0;
 		return EMULATE_DO_MMIO;
-- 
cgit v1.2.3-70-g09d2


From 6550e1f165f384f3a46b60a1be9aba4bc3c2adad Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 21 Mar 2010 13:08:21 +0200
Subject: KVM: x86 emulator: add decoding of CMPXCHG8B dst operand

Decode CMPXCHG8B destination operand in decoding stage. Fixes regression
introduced by "If LOCK prefix is used dest arg should be memory" commit.
This commit relies on dst operand be decoded at the beginning of an
instruction emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 48de4b89005..b8ce53861f6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -52,6 +52,7 @@
 #define DstMem      (3<<1)	/* Memory operand. */
 #define DstAcc      (4<<1)      /* Destination Accumulator */
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
+#define DstMem64    (6<<1)	/* 64bit memory operand */
 #define DstMask     (7<<1)
 /* Source operand type. */
 #define SrcNone     (0<<4)	/* No source operand. */
@@ -360,7 +361,7 @@ static u32 group_table[] = {
 	DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
 	DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
 	[Group9*8] =
-	0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
+	0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
 };
 
 static u32 group2_table[] = {
@@ -1205,6 +1206,7 @@ done_prefixes:
 			 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
 		break;
 	case DstMem:
+	case DstMem64:
 		if ((c->d & ModRM) && c->modrm_mod == 3) {
 			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 			c->dst.type = OP_REG;
@@ -1214,7 +1216,10 @@ done_prefixes:
 		}
 		c->dst.type = OP_MEM;
 		c->dst.ptr = (unsigned long *)c->modrm_ea;
-		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		if ((c->d & DstMask) == DstMem64)
+			c->dst.bytes = 8;
+		else
+			c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.val = 0;
 		if (c->d & BitOp) {
 			unsigned long mask = ~(c->dst.bytes * 8 - 1);
@@ -1706,12 +1711,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 			       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	u64 old, new;
-	int rc;
-
-	rc = ops->read_emulated(c->modrm_ea, &old, 8, ctxt->vcpu);
-	if (rc != X86EMUL_CONTINUE)
-		return rc;
+	u64 old = c->dst.orig_val;
 
 	if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
 	    ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) {
@@ -1719,15 +1719,12 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		c->regs[VCPU_REGS_RAX] = (u32) (old >> 0);
 		c->regs[VCPU_REGS_RDX] = (u32) (old >> 32);
 		ctxt->eflags &= ~EFLG_ZF;
-
 	} else {
-		new = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
+		c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) |
 		       (u32) c->regs[VCPU_REGS_RBX];
 
-		rc = ops->cmpxchg_emulated(c->modrm_ea, &old, &new, 8, ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
 		ctxt->eflags |= EFLG_ZF;
+		c->lock_prefix = 1;
 	}
 	return X86EMUL_CONTINUE;
 }
@@ -3245,7 +3242,6 @@ twobyte_insn:
 		rc = emulate_grp9(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
-		c->dst.type = OP_NONE;
 		break;
 	}
 	goto writeback;
-- 
cgit v1.2.3-70-g09d2


From de3e6480f76804fe06d460ddb1920c7daa07f29b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 21 Mar 2010 16:58:36 +0200
Subject: KVM: x86 emulator: fix unlocked CMPXCHG8B emulation

When CMPXCHG8B is executed without LOCK prefix it is racy. Preserve this
behaviour in emulator too.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b8ce53861f6..64c9854f045 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1724,7 +1724,6 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
 		       (u32) c->regs[VCPU_REGS_RBX];
 
 		ctxt->eflags |= EFLG_ZF;
-		c->lock_prefix = 1;
 	}
 	return X86EMUL_CONTINUE;
 }
-- 
cgit v1.2.3-70-g09d2


From 041b1359a29b9301bc8baed6efcdbad29f80f6af Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 23 Mar 2010 14:15:53 -0300
Subject: KVM: x86: document KVM_REQ_PENDING_TIMER usage

Document that KVM_REQ_PENDING_TIMER is implicitly used during guest
entry.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/timer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index eea40439066..4ddadb1a5ff 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 	/*
 	 * There is a race window between reading and incrementing, but we do
 	 * not care about potentially loosing timer events in the !reinject
-	 * case anyway.
+	 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
+	 * in vcpu_enter_guest.
 	 */
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
-- 
cgit v1.2.3-70-g09d2


From f815bce8940bc28e42e6b924825bb31df2a4bff5 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 19 Mar 2010 17:58:53 +0800
Subject: KVM: MMU: check reserved bits only if CR4.PSE=1 or CR4.PAE=1

- Check reserved bits only if CR4.PAE=1 or CR4.PSE=1 when guest #PF occurs
- Fix a typo in reset_rsvds_bits_mask()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 064c3efb49d..83d2ebce9ea 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2297,13 +2297,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		/* no rsvd bits for 2 level 4K page table entries */
 		context->rsvd_bits_mask[0][1] = 0;
 		context->rsvd_bits_mask[0][0] = 0;
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
+
+		if (!is_pse(vcpu)) {
+			context->rsvd_bits_mask[1][1] = 0;
+			break;
+		}
+
 		if (is_cpuid_PSE36())
 			/* 36bits PSE 4MB page */
 			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
 		else
 			/* 32 bits PSE 4MB page */
 			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
 		break;
 	case PT32E_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][2] =
@@ -2316,7 +2322,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	case PT64_ROOT_LEVEL:
 		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
@@ -2334,7 +2340,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) |
 			rsvd_bits(13, 20);		/* large page */
-		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0];
+		context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0];
 		break;
 	}
 }
-- 
cgit v1.2.3-70-g09d2


From 84b0c8c6a6f87b62bca93727dee12ec59e32e597 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 14 Mar 2010 10:16:40 +0200
Subject: KVM: MMU: Disassociate direct maps from guest levels

Direct maps are linear translations for a section of memory, used for
real mode or with large pages.  As such, they are independent of the guest
levels.

Teach the mmu about this by making page->role.glevels = 0 for direct maps.
This allows direct maps to be shared among real mode and the various paging
modes.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 83d2ebce9ea..1cc60d3f445 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1329,6 +1329,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
 	role.direct = direct;
+	if (role.direct)
+		role.glevels = 0;
 	role.access = access;
 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
-- 
cgit v1.2.3-70-g09d2


From 805d32dea4dfb8319aaf7e73a681ad410a5e331a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 1 Apr 2010 16:50:45 +0800
Subject: KVM: MMU: cleanup/fix mmu audit code

This patch does:
- 'sp' parameter in inspect_spte_fn() is not used, so remove it
- fix 'kvm' and 'slots' is not defined in count_rmaps()
- fix a bug in inspect_spte_has_rmap()

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1cc60d3f445..6ed7d633aef 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3182,8 +3182,7 @@ static gva_t canonicalize(gva_t gva)
 }
 
 
-typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
-				 u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
 
 static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
 			    inspect_spte_fn fn)
@@ -3199,7 +3198,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
 				child = page_header(ent & PT64_BASE_ADDR_MASK);
 				__mmu_spte_walk(kvm, child, fn);
 			} else
-				fn(kvm, sp, &sp->spt[i]);
+				fn(kvm, &sp->spt[i]);
 		}
 	}
 }
@@ -3290,6 +3289,8 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
 
 static int count_rmaps(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_memslots *slots;
 	int nmaps = 0;
 	int i, j, k, idx;
 
@@ -3323,7 +3324,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 	return nmaps;
 }
 
-void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 {
 	unsigned long *rmapp;
 	struct kvm_mmu_page *rev_sp;
@@ -3339,14 +3340,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep)
 			printk(KERN_ERR "%s: no memslot for gfn %ld\n",
 					 audit_msg, gfn);
 			printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
-					audit_msg, sptep - rev_sp->spt,
+			       audit_msg, (long int)(sptep - rev_sp->spt),
 					rev_sp->gfn);
 			dump_stack();
 			return;
 		}
 
 		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-				    is_large_pte(*sptep));
+				    rev_sp->role.level);
 		if (!*rmapp) {
 			if (!printk_ratelimit())
 				return;
@@ -3381,7 +3382,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 				continue;
 			if (!(ent & PT_WRITABLE_MASK))
 				continue;
-			inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]);
+			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
 	}
 	return;
-- 
cgit v1.2.3-70-g09d2


From f84cbb0561d7cefb5fc050ee99d8c82ec9ce883d Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 6 Apr 2010 18:29:05 +0800
Subject: KVM: MMU: remove unused field

kvm_mmu_page.oos_link is not used, so remove it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 arch/x86/kvm/mmu.c              | 1 -
 2 files changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26c629a062d..0c49c888be6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -187,8 +187,6 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 
-	struct list_head oos_link;
-
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6ed7d633aef..ec8900b6692 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -923,7 +923,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&sp->oos_link);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
-- 
cgit v1.2.3-70-g09d2


From 24222c2fec75df30d56d1e2014d45d2559c94f1a Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 6 Apr 2010 18:31:13 +0800
Subject: KVM: MMU: remove unnecessary NX check in walk_addr

After is_rsvd_bits_set() checks, EFER.NXE must be enabled if NX bit is seted

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 067797a7276..d9dea288e51 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -170,7 +170,7 @@ walk:
 			goto access_error;
 
 #if PTTYPE == 64
-		if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK))
+		if (fetch_fault && (pte & PT64_NX_MASK))
 			goto access_error;
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 2fb53ad811e238d5dec8716b99986c3f234e3337 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 11 Apr 2010 13:05:15 +0300
Subject: KVM: x86 emulator: Don't overwrite decode cache

Currently if we an instruction spans a page boundary, when we fetch the
second half we overwrite the first half.  This prevents us from tracing
the full instruction opcodes.

Fix by appending the second half to the first.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 64c9854f045..083b269a83e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -646,21 +646,22 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
-			      unsigned long linear, u8 *dest)
+			      unsigned long eip, u8 *dest)
 {
 	struct fetch_cache *fc = &ctxt->decode.fetch;
 	int rc;
-	int size;
+	int size, cur_size;
 
-	if (linear < fc->start || linear >= fc->end) {
-		size = min(15UL, PAGE_SIZE - offset_in_page(linear));
-		rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
+	if (eip == fc->end) {
+		cur_size = fc->end - fc->start;
+		size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+		rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
+				size, ctxt->vcpu, NULL);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
-		fc->start = linear;
-		fc->end = linear + size;
+		fc->end += size;
 	}
-	*dest = fc->data[linear - fc->start];
+	*dest = fc->data[eip - fc->start];
 	return X86EMUL_CONTINUE;
 }
 
@@ -673,7 +674,6 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
 	/* x86 instructions are limited to 15 bytes. */
 	if (eip + size - ctxt->eip > 15)
 		return X86EMUL_UNHANDLEABLE;
-	eip += ctxt->cs_base;
 	while (size--) {
 		rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
 		if (rc != X86EMUL_CONTINUE)
@@ -935,6 +935,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* Shadow copy of register state. Committed on successful emulation. */
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
+	c->fetch.start = c->fetch.end = c->eip;
 	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
-- 
cgit v1.2.3-70-g09d2


From e46479f852adab6027e4950d69400d967bf7bc6f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 11 Apr 2010 13:05:16 +0300
Subject: KVM: Trace emulated instructions

Log emulated instructions in ftrace, especially if they failed.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/trace.h | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c   |  4 +++
 2 files changed, 90 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 32c912c40bf..a6544b8e7c0 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -603,6 +603,92 @@ TRACE_EVENT(kvm_skinit,
 		  __entry->rip, __entry->slb)
 );
 
+#define __print_insn(insn, ilen) ({		                 \
+	int i;							 \
+	const char *ret = p->buffer + p->len;			 \
+								 \
+	for (i = 0; i < ilen; ++i)				 \
+		trace_seq_printf(p, " %02x", insn[i]);		 \
+	trace_seq_printf(p, "%c", 0);				 \
+	ret;							 \
+	})
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D   (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L   (1 << 3)
+
+#define kvm_trace_symbol_emul_flags	                  \
+	{ 0,   			    "real" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_EFL_VM, "vm16" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE,   "prot16" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_CS_D,   "prot32" },		  \
+	{ KVM_EMUL_INSN_F_CR0_PE			  \
+	  | KVM_EMUL_INSN_F_CS_L,   "prot64" }
+
+#define kei_decode_mode(mode) ({			\
+	u8 flags = 0xff;				\
+	switch (mode) {					\
+	case X86EMUL_MODE_REAL:				\
+		flags = 0;				\
+		break;					\
+	case X86EMUL_MODE_VM86:				\
+		flags = KVM_EMUL_INSN_F_EFL_VM;		\
+		break;					\
+	case X86EMUL_MODE_PROT16:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE;		\
+		break;					\
+	case X86EMUL_MODE_PROT32:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE		\
+			| KVM_EMUL_INSN_F_CS_D;		\
+		break;					\
+	case X86EMUL_MODE_PROT64:			\
+		flags = KVM_EMUL_INSN_F_CR0_PE		\
+			| KVM_EMUL_INSN_F_CS_L;		\
+		break;					\
+	}						\
+	flags;						\
+	})
+
+TRACE_EVENT(kvm_emulate_insn,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed),
+	TP_ARGS(vcpu, failed),
+
+	TP_STRUCT__entry(
+		__field(    __u64, rip                       )
+		__field(    __u32, csbase                    )
+		__field(    __u8,  len                       )
+		__array(    __u8,  insn,    15	             )
+		__field(    __u8,  flags       	   	     )
+		__field(    __u8,  failed                    )
+		),
+
+	TP_fast_assign(
+		__entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start;
+		__entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS);
+		__entry->len = vcpu->arch.emulate_ctxt.decode.eip
+			       - vcpu->arch.emulate_ctxt.decode.fetch.start;
+		memcpy(__entry->insn,
+		       vcpu->arch.emulate_ctxt.decode.fetch.data,
+		       15);
+		__entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode);
+		__entry->failed = failed;
+		),
+
+	TP_printk("%x:%llx:%s (%s)%s",
+		  __entry->csbase, __entry->rip,
+		  __print_insn(__entry->insn, __entry->len),
+		  __print_symbolic(__entry->flags,
+				   kvm_trace_symbol_emul_flags),
+		  __entry->failed ? " failed" : ""
+		)
+	);
+
+#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
+#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 247e805a041..33a40c544c7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3718,6 +3718,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
 		 * (namely VMMCALL, sysenter, sysexit, syscall)*/
@@ -3750,6 +3751,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		++vcpu->stat.insn_emulation;
 		if (r)  {
 			++vcpu->stat.insn_emulation_fail;
+			trace_kvm_emulate_insn_failed(vcpu);
 			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 				return EMULATE_DONE;
 			return EMULATE_FAIL;
@@ -3786,6 +3788,8 @@ restart:
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 			goto done;
 		if (!vcpu->mmio_needed) {
+			++vcpu->stat.insn_emulation_fail;
+			trace_kvm_emulate_insn_failed(vcpu);
 			kvm_report_emulation_failure(vcpu, "mmio");
 			return EMULATE_FAIL;
 		}
-- 
cgit v1.2.3-70-g09d2


From f7a711971edd952352a89698db1d36f469e25f77 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 11 Apr 2010 15:33:32 +0300
Subject: KVM: Fix MAXPHYADDR calculation when cpuid does not support it

MAXPHYADDR is derived from cpuid 0x80000008, but when that isn't present, we
get some random value.

Fix by checking first that cpuid 0x80000008 is supported.

Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 33a40c544c7..d65e481c5fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4171,9 +4171,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 
+	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+	if (!best || best->eax < 0x80000008)
+		goto not_found;
 	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
 	if (best)
 		return best->eax & 0xff;
+not_found:
 	return 36;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 6bc31bdc55cad6609b1610b4cecad312664f2808 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Sun, 11 Apr 2010 23:07:28 +0200
Subject: KVM: SVM: implement NEXTRIPsave SVM feature

On SVM we set the instruction length of skipped instructions
to hard-coded, well known values, which could be wrong when (bogus,
but valid) prefixes (REX, segment override) are used.
Newer AMD processors (Fam10h 45nm and better, aka. PhenomII or
AthlonII) have an explicit NEXTRIP field in the VMCB containing the
desired information.
Since it is cheap to do so, we use this field to override the guessed
value on newer processors.
A fix for older CPUs would be rather expensive, as it would require
to fetch and partially decode the instruction. As the problem is not
a security issue and needs special, handcrafted code to trigger
(no compiler will ever generate such code), I omit a fix for older
CPUs.
If someone is interested, I have both a patch for these CPUs as well as
demo code triggering this issue: It segfaults under KVM, but runs
perfectly on native Linux.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/svm.h |  4 +++-
 arch/x86/kvm/svm.c         | 13 ++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index b26a38d8535..1d91d05f936 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
 	u32 event_inj_err;
 	u64 nested_cr3;
 	u64 lbr_ctl;
-	u8 reserved_5[832];
+	u64 reserved_5;
+	u64 next_rip;
+	u8 reserved_6[816];
 };
 
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e9f79619e18..64b7f60dc5b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -44,11 +44,11 @@ MODULE_LICENSE("GPL");
 #define SEG_TYPE_LDT 2
 #define SEG_TYPE_BUSY_TSS16 3
 
-#define SVM_FEATURE_NPT  (1 << 0)
-#define SVM_FEATURE_LBRV (1 << 1)
-#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
-#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
+#define SVM_FEATURE_NPT            (1 <<  0)
+#define SVM_FEATURE_LBRV           (1 <<  1)
+#define SVM_FEATURE_SVML           (1 <<  2)
+#define SVM_FEATURE_NRIP           (1 <<  3)
+#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)
 
 #define NESTED_EXIT_HOST	0	/* Exit handled on host level */
 #define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
@@ -320,6 +320,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
+	if (svm->vmcb->control.next_rip != 0)
+		svm->next_rip = svm->vmcb->control.next_rip;
+
 	if (!svm->next_rip) {
 		if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
 				EMULATE_DONE)
-- 
cgit v1.2.3-70-g09d2


From 020df0794f5764e742feaa718be88b8f1b4ce04f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 13 Apr 2010 10:05:23 +0300
Subject: KVM: move DR register access handling into generic code

Currently both SVM and VMX have their own DR handling code. Move it to
x86.c.

Acked-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +--
 arch/x86/kvm/svm.c              | 66 ++--------------------------------
 arch/x86/kvm/vmx.c              | 78 ++++++-----------------------------------
 arch/x86/kvm/x86.c              | 78 +++++++++++++++++++++++++++++++++++++++--
 4 files changed, 93 insertions(+), 134 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c49c888be6..5d5e0a9afcf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -496,8 +496,7 @@ struct kvm_x86_ops {
 	void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
 	void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt);
-	int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
-	int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
+	void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value);
 	void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
 	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
@@ -602,6 +601,8 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 64b7f60dc5b..87b36fbbfec 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1307,70 +1307,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
 	svm->vmcb->control.asid = sd->next_asid++;
 }
 
-static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
+static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	switch (dr) {
-	case 0 ... 3:
-		*dest = vcpu->arch.db[dr];
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 6:
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-			*dest = vcpu->arch.dr6;
-		else
-			*dest = svm->vmcb->save.dr6;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 7:
-		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
-			*dest = vcpu->arch.dr7;
-		else
-			*dest = svm->vmcb->save.dr7;
-		break;
-	}
-
-	return EMULATE_DONE;
-}
-
-static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	switch (dr) {
-	case 0 ... 3:
-		vcpu->arch.db[dr] = value;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-			vcpu->arch.eff_db[dr] = value;
-		break;
-	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 6:
-		vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
-		break;
-	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
-			return EMULATE_FAIL; /* will re-inject UD */
-		/* fall through */
-	case 7:
-		vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
-		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-			svm->vmcb->save.dr7 = vcpu->arch.dr7;
-			vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
-		}
-		break;
-	}
-
-	return EMULATE_DONE;
+	svm->vmcb->save.dr7 = value;
 }
 
 static int pf_interception(struct vcpu_svm *svm)
@@ -3302,8 +3243,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_idt = svm_set_idt,
 	.get_gdt = svm_get_gdt,
 	.set_gdt = svm_set_gdt,
-	.get_dr = svm_get_dr,
-	.set_dr = svm_set_dr,
+	.set_dr7 = svm_set_dr7,
 	.cache_reg = svm_cache_reg,
 	.get_rflags = svm_get_rflags,
 	.set_rflags = svm_set_rflags,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1cceca1c59b..fb4a8869bb9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3089,19 +3089,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int check_dr_alias(struct kvm_vcpu *vcpu)
-{
-	if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
-		return -1;
-	}
-	return 0;
-}
-
 static int handle_dr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
-	unsigned long val;
 	int dr, reg;
 
 	/* Do not handle if the CPL > 0, will trigger GP on re-entry */
@@ -3136,67 +3126,20 @@ static int handle_dr(struct kvm_vcpu *vcpu)
 	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
 	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
 	if (exit_qualification & TYPE_MOV_FROM_DR) {
-		switch (dr) {
-		case 0 ... 3:
-			val = vcpu->arch.db[dr];
-			break;
-		case 4:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		case 6:
-			val = vcpu->arch.dr6;
-			break;
-		case 5:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		default: /* 7 */
-			val = vcpu->arch.dr7;
-			break;
-		}
-		kvm_register_write(vcpu, reg, val);
-	} else {
-		val = vcpu->arch.regs[reg];
-		switch (dr) {
-		case 0 ... 3:
-			vcpu->arch.db[dr] = val;
-			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
-				vcpu->arch.eff_db[dr] = val;
-			break;
-		case 4:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		case 6:
-			if (val & 0xffffffff00000000ULL) {
-				kvm_inject_gp(vcpu, 0);
-				return 1;
-			}
-			vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
-			break;
-		case 5:
-			if (check_dr_alias(vcpu) < 0)
-				return 1;
-			/* fall through */
-		default: /* 7 */
-			if (val & 0xffffffff00000000ULL) {
-				kvm_inject_gp(vcpu, 0);
-				return 1;
-			}
-			vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
-			if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
-				vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
-				vcpu->arch.switch_db_regs =
-					(val & DR7_BP_EN_MASK);
-			}
-			break;
-		}
-	}
+		unsigned long val;
+		if (!kvm_get_dr(vcpu, dr, &val))
+			kvm_register_write(vcpu, reg, val);
+	} else
+		kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]);
 	skip_emulated_instruction(vcpu);
 	return 1;
 }
 
+static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
+{
+	vmcs_writel(GUEST_DR7, val);
+}
+
 static int handle_cpuid(struct kvm_vcpu *vcpu)
 {
 	kvm_emulate_cpuid(vcpu);
@@ -4187,6 +4130,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_idt = vmx_set_idt,
 	.get_gdt = vmx_get_gdt,
 	.set_gdt = vmx_set_gdt,
+	.set_dr7 = vmx_set_dr7,
 	.cache_reg = vmx_cache_reg,
 	.get_rflags = vmx_get_rflags,
 	.set_rflags = vmx_set_rflags,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d65e481c5fa..09dccac2df7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -562,6 +562,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+	switch (dr) {
+	case 0 ... 3:
+		vcpu->arch.db[dr] = val;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
+			vcpu->arch.eff_db[dr] = val;
+		break;
+	case 4:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	case 6:
+		if (val & 0xffffffff00000000ULL) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
+		break;
+	case 5:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	default: /* 7 */
+		if (val & 0xffffffff00000000ULL) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
+		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
+			vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK);
+		}
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_dr);
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+	switch (dr) {
+	case 0 ... 3:
+		*val = vcpu->arch.db[dr];
+		break;
+	case 4:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	case 6:
+		*val = vcpu->arch.dr6;
+		break;
+	case 5:
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
+		/* fall through */
+	default: /* 7 */
+		*val = vcpu->arch.dr7;
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_get_dr);
+
 static inline u32 bit(int bitno)
 {
 	return 1 << (bitno & 31);
@@ -3483,14 +3557,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
 
 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
 {
-	return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
+	return kvm_get_dr(ctxt->vcpu, dr, dest);
 }
 
 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
 {
 	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
 
-	return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
+	return kvm_set_dr(ctxt->vcpu, dr, value & mask);
 }
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-- 
cgit v1.2.3-70-g09d2


From 8f6abd06f521112a0a3bc906df273fa3ce0a9387 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 13 Apr 2010 10:21:56 +0300
Subject: KVM: x86: get rid of mmu_only parameter in emulator_write_emulated()

We can call kvm_mmu_pte_write() directly from
emulator_cmpxchg_emulated() instead of passing mmu_only down to
emulator_write_emulated_onepage() and call it there.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 36 +++++++++++-------------------------
 1 file changed, 11 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 09dccac2df7..40991527f54 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3322,8 +3322,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
-					   struct kvm_vcpu *vcpu,
-					   bool mmu_only)
+					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
 	u32 error_code;
@@ -3339,10 +3338,6 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 		goto mmio;
 
-	if (mmu_only) {
-		kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
-		return X86EMUL_CONTINUE;
-	}
 	if (emulator_write_phys(vcpu, gpa, val, bytes))
 		return X86EMUL_CONTINUE;
 
@@ -3363,35 +3358,24 @@ mmio:
 	return X86EMUL_CONTINUE;
 }
 
-int __emulator_write_emulated(unsigned long addr,
-				   const void *val,
-				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu,
-				   bool mmu_only)
+int emulator_write_emulated(unsigned long addr,
+			    const void *val,
+			    unsigned int bytes,
+			    struct kvm_vcpu *vcpu)
 {
 	/* Crossing a page boundary? */
 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu,
-						     mmu_only);
+		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu,
-					       mmu_only);
-}
-
-int emulator_write_emulated(unsigned long addr,
-				   const void *val,
-				   unsigned int bytes,
-				   struct kvm_vcpu *vcpu)
-{
-	return __emulator_write_emulated(addr, val, bytes, vcpu, false);
+	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
 }
 EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
@@ -3455,7 +3439,9 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 	if (!exchanged)
 		return X86EMUL_CMPXCHG_FAILED;
 
-	return __emulator_write_emulated(addr, new, bytes, vcpu, true);
+	kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1);
+
+	return X86EMUL_CONTINUE;
 
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
@@ -4165,7 +4151,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return __emulator_write_emulated(rip, instruction, 3, vcpu, false);
+	return emulator_write_emulated(rip, instruction, 3, vcpu);
 }
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-- 
cgit v1.2.3-70-g09d2


From 0760d44868f351ba30fc9a08cf1830e46aa72466 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 14 Apr 2010 15:50:57 +0200
Subject: KVM: x86: Terminate early if task_switch_16/32 failed

Stop the switch immediately if task_switch_16/32 returned an error. Only
if that step succeeded, the switch should actually take place and update
any register states.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 083b269a83e..aace5659bbe 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2402,6 +2402,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	else
 		ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel,
 				     old_tss_base, &next_tss_desc);
+	if (ret != X86EMUL_CONTINUE)
+		return ret;
 
 	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE)
 		ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT;
-- 
cgit v1.2.3-70-g09d2


From e269fb2189fb86d79d64c0ca74c6c1a549ad4aa3 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Wed, 14 Apr 2010 15:51:09 +0200
Subject: KVM: x86: Push potential exception error code on task switches

When a fault triggers a task switch, the error code, if existent, has to
be pushed on the new task's stack. Implement the missing bits.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  3 ++-
 arch/x86/include/asm/kvm_host.h    |  3 ++-
 arch/x86/include/asm/svm.h         |  1 +
 arch/x86/kvm/emulate.c             | 22 ++++++++++++++++++----
 arch/x86/kvm/svm.c                 | 11 ++++++++++-
 arch/x86/kvm/vmx.c                 | 12 +++++++++++-
 arch/x86/kvm/x86.c                 |  6 ++++--
 7 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a1319c82050..0b2729bf207 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -230,6 +230,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
 		     struct x86_emulate_ops *ops);
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
-			 u16 tss_selector, int reason);
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code);
 
 #endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5d5e0a9afcf..3602728d54d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -595,7 +595,8 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code);
 
 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 1d91d05f936..0e831059ac5 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -244,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb {
 
 #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
 
 #define	SVM_EXIT_READ_CR0 	0x000
 #define	SVM_EXIT_READ_CR3 	0x003
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index aace5659bbe..585d0ef4a5f 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2344,8 +2344,9 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 }
 
 static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
-				    struct x86_emulate_ops *ops,
-				    u16 tss_selector, int reason)
+				   struct x86_emulate_ops *ops,
+				   u16 tss_selector, int reason,
+				   bool has_error_code, u32 error_code)
 {
 	struct desc_struct curr_tss_desc, next_tss_desc;
 	int ret;
@@ -2418,12 +2419,22 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu);
 	ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
 
+	if (has_error_code) {
+		struct decode_cache *c = &ctxt->decode;
+
+		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
+		c->lock_prefix = 0;
+		c->src.val = (unsigned long) error_code;
+		emulate_push(ctxt);
+	}
+
 	return ret;
 }
 
 int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 			 struct x86_emulate_ops *ops,
-			 u16 tss_selector, int reason)
+			 u16 tss_selector, int reason,
+			 bool has_error_code, u32 error_code)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
@@ -2431,12 +2442,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+	c->dst.type = OP_NONE;
 
-	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason);
+	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
+				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE) {
 		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 		kvm_rip_write(ctxt->vcpu, c->eip);
+		rc = writeback(ctxt, ops);
 	}
 
 	return rc;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 87b36fbbfec..78af52222fd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2222,6 +2222,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
 		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
 	uint32_t idt_v =
 		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
+	bool has_error_code = false;
+	u32 error_code = 0;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -2242,6 +2244,12 @@ static int task_switch_interception(struct vcpu_svm *svm)
 			svm->vcpu.arch.nmi_injected = false;
 			break;
 		case SVM_EXITINTINFO_TYPE_EXEPT:
+			if (svm->vmcb->control.exit_info_2 &
+			    (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
+				has_error_code = true;
+				error_code =
+					(u32)svm->vmcb->control.exit_info_2;
+			}
 			kvm_clear_exception_queue(&svm->vcpu);
 			break;
 		case SVM_EXITINTINFO_TYPE_INTR:
@@ -2258,7 +2266,8 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
+	return kvm_task_switch(&svm->vcpu, tss_selector, reason,
+			       has_error_code, error_code);
 }
 
 static int cpuid_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fb4a8869bb9..1b38d8a88cf 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3271,6 +3271,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
+	bool has_error_code = false;
+	u32 error_code = 0;
 	u16 tss_selector;
 	int reason, type, idt_v;
 
@@ -3293,6 +3295,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 			kvm_clear_interrupt_queue(vcpu);
 			break;
 		case INTR_TYPE_HARD_EXCEPTION:
+			if (vmx->idt_vectoring_info &
+			    VECTORING_INFO_DELIVER_CODE_MASK) {
+				has_error_code = true;
+				error_code =
+					vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			}
+			/* fall through */
 		case INTR_TYPE_SOFT_EXCEPTION:
 			kvm_clear_exception_queue(vcpu);
 			break;
@@ -3307,7 +3316,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 
-	if (!kvm_task_switch(vcpu, tss_selector, reason))
+	if (!kvm_task_switch(vcpu, tss_selector, reason, has_error_code,
+			     error_code))
 		return 0;
 
 	/* clear all local breakpoint enable flags */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 40991527f54..58a295c6bf6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4778,7 +4778,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
+		    bool has_error_code, u32 error_code)
 {
 	int cs_db, cs_l, ret;
 	cache_all_regs(vcpu);
@@ -4796,7 +4797,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 
 	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
-				   tss_selector, reason);
+				   tss_selector, reason, has_error_code,
+				   error_code);
 
 	if (ret == X86EMUL_CONTINUE)
 		kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
-- 
cgit v1.2.3-70-g09d2


From 5b7e0102ae744e9175b905f4267a81393bdb7a75 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 14 Apr 2010 19:20:03 +0300
Subject: KVM: MMU: Replace role.glevels with role.cr4_pae

There is no real distinction between glevels=3 and glevels=4; both have
exactly the same format and the code is treated exactly the same way.  Drop
role.glevels and replace is with role.cr4_pae (which is meaningful).  This
simplifies the code a bit.

As a side effect, it allows sharing shadow page tables between pae and
longmode guest page tables at the same guest page.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              | 12 ++++++------
 arch/x86/kvm/mmutrace.h         |  5 +++--
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3602728d54d..707d272ae4a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -171,8 +171,8 @@ struct kvm_pte_chain {
 union kvm_mmu_page_role {
 	unsigned word;
 	struct {
-		unsigned glevels:4;
 		unsigned level:4;
+		unsigned cr4_pae:1;
 		unsigned quadrant:2;
 		unsigned pad_for_nice_hex_output:6;
 		unsigned direct:1;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ec8900b6692..51aa580d0aa 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1206,7 +1206,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
 static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		return 1;
 	}
@@ -1329,7 +1329,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	role.level = level;
 	role.direct = direct;
 	if (role.direct)
-		role.glevels = 0;
+		role.cr4_pae = 0;
 	role.access = access;
 	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
@@ -2443,7 +2443,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 	else
 		r = paging32_init_context(vcpu);
 
-	vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level;
+	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
 
 	return r;
 }
@@ -2532,7 +2532,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
         }
 
 	++vcpu->kvm->stat.mmu_pte_updated;
-	if (sp->role.glevels == PT32_ROOT_LEVEL)
+	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
 	else
 		paging64_update_pte(vcpu, sp, spte, new);
@@ -2681,7 +2681,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
 		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
 			continue;
-		pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
 		if (misaligned || flooded) {
@@ -2705,7 +2705,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		page_offset = offset;
 		level = sp->role.level;
 		npte = 1;
-		if (sp->role.glevels == PT32_ROOT_LEVEL) {
+		if (!sp->role.cr4_pae) {
 			page_offset <<= 1;	/* 32->64 */
 			/*
 			 * A 32-bit pde maps 4MB while the shadow pdes map
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 1fe956ab761..3851f1f3030 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -28,9 +28,10 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge"	\
+	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s %spge"		\
 			 " %snxe root %u %s%c",				\
-			 __entry->gfn, role.level, role.glevels,	\
+			 __entry->gfn, role.level,			\
+			 role.cr4_pae ? " pae" : "",			\
 			 role.quadrant,					\
 			 role.direct ? " direct" : "",			\
 			 access_str[role.access],			\
-- 
cgit v1.2.3-70-g09d2


From 19d04437267f00c7b50343513693b7a3174ff908 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 15 Apr 2010 12:29:50 +0300
Subject: KVM: fix emulator_task_switch() return value.

emulator_task_switch() should return -1 for failure and 0 for success to
the caller, just like x86_emulate_insn() does.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 arch/x86/kvm/x86.c     | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 585d0ef4a5f..5ac0bb465ed 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2453,7 +2453,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 		rc = writeback(ctxt, ops);
 	}
 
-	return rc;
+	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 58a295c6bf6..30efeead451 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4800,10 +4800,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 				   tss_selector, reason, has_error_code,
 				   error_code);
 
-	if (ret == X86EMUL_CONTINUE)
-		kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	if (ret)
+		return EMULATE_FAIL;
 
-	return (ret != X86EMUL_CONTINUE);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	return EMULATE_DONE;
 }
 EXPORT_SYMBOL_GPL(kvm_task_switch);
 
-- 
cgit v1.2.3-70-g09d2


From 1b8c7934a4063653095fb9fa88f9169f5b52f52b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 21:23:41 +0800
Subject: KVM: MMU: remove unused struct kvm_unsync_walk

Remove 'struct kvm_unsync_walk' since it's not used.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51aa580d0aa..b57be03d442 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -173,11 +173,6 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-
-struct kvm_unsync_walk {
-	int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
 typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
-- 
cgit v1.2.3-70-g09d2


From 0571d366e0be571be14581cb5e28d9c3f6e0d0b1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 21:27:54 +0800
Subject: KVM: MMU: reduce 'struct kvm_mmu_page' size

Define 'multimapped' as 'bool'.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 707d272ae4a..3c31c5ad37a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -202,9 +202,9 @@ struct kvm_mmu_page {
 	 * in this shadow page.
 	 */
 	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
-	int multimapped;         /* More than one parent_pte? */
-	int root_count;          /* Currently serving as active root */
+	bool multimapped;         /* More than one parent_pte? */
 	bool unsync;
+	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
-- 
cgit v1.2.3-70-g09d2


From 6b18493d60e7f54a0feb771fa141b5fcb72ce1e4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 21:29:17 +0800
Subject: KVM: MMU: remove unused parameter in mmu_parent_walk()

'vcpu' is unused, remove it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b57be03d442..45f3aa5213c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -1001,8 +1001,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 }
 
 
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			    mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
 	struct kvm_pte_chain *pte_chain;
 	struct hlist_node *node;
@@ -1011,8 +1010,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	if (!sp->multimapped && sp->parent_pte) {
 		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(vcpu, parent_sp);
-		mmu_parent_walk(vcpu, parent_sp, fn);
+		fn(parent_sp);
+		mmu_parent_walk(parent_sp, fn);
 		return;
 	}
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
@@ -1020,8 +1019,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			if (!pte_chain->parent_ptes[i])
 				break;
 			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-			fn(vcpu, parent_sp);
-			mmu_parent_walk(vcpu, parent_sp, fn);
+			fn(parent_sp);
+			mmu_parent_walk(parent_sp, fn);
 		}
 }
 
@@ -1058,16 +1057,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 		}
 }
 
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int unsync_walk_fn(struct kvm_mmu_page *sp)
 {
 	kvm_mmu_update_parents_unsync(sp);
 	return 1;
 }
 
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
-					struct kvm_mmu_page *sp)
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+	mmu_parent_walk(sp, unsync_walk_fn);
 	kvm_mmu_update_parents_unsync(sp);
 }
 
@@ -1345,7 +1343,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			if (sp->unsync_children) {
 				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-				kvm_mmu_mark_parents_unsync(vcpu, sp);
+				kvm_mmu_mark_parents_unsync(sp);
 			}
 			trace_kvm_mmu_get_page(sp, false);
 			return sp;
@@ -1759,7 +1757,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-	kvm_mmu_mark_parents_unsync(vcpu, sp);
+	kvm_mmu_mark_parents_unsync(sp);
 
 	mmu_convert_notrap(sp);
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From acb5451789f21ad51215897bb8f9306a05e8acd4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 15 Apr 2010 21:03:50 +0300
Subject: KVM: prevent spurious exit to userspace during task switch emulation.

If kvm_task_switch() fails code exits to userspace without specifying
exit reason, so the previous exit reason is reused by userspace. Fix
this by specifying exit reason correctly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 10 ++++++++--
 arch/x86/kvm/vmx.c |  8 ++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 78af52222fd..ab78eb8ba89 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2266,8 +2266,14 @@ static int task_switch_interception(struct vcpu_svm *svm)
 	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
 		skip_emulated_instruction(&svm->vcpu);
 
-	return kvm_task_switch(&svm->vcpu, tss_selector, reason,
-			       has_error_code, error_code);
+	if (kvm_task_switch(&svm->vcpu, tss_selector, reason,
+				has_error_code, error_code) == EMULATE_FAIL) {
+		svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		svm->vcpu.run->internal.ndata = 0;
+		return 0;
+	}
+	return 1;
 }
 
 static int cpuid_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1b38d8a88cf..6e5e75e0d7d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3316,9 +3316,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
 		       type != INTR_TYPE_NMI_INTR))
 		skip_emulated_instruction(vcpu);
 
-	if (!kvm_task_switch(vcpu, tss_selector, reason, has_error_code,
-			     error_code))
+	if (kvm_task_switch(vcpu, tss_selector, reason,
+				has_error_code, error_code) == EMULATE_FAIL) {
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+		vcpu->run->internal.ndata = 0;
 		return 0;
+	}
 
 	/* clear all local breakpoint enable flags */
 	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55);
-- 
cgit v1.2.3-70-g09d2


From 3246af0ece6c61689847417977733f0b12dc4b6f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 16:35:54 +0800
Subject: KVM: MMU: cleanup for hlist walk restart

Quote from Avi:

|Just change the assignment to a 'goto restart;' please,
|I don't like playing with list_for_each internals.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 45f3aa5213c..7a17db1cdcd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1565,13 +1565,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	r = 0;
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
+restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
 		if (sp->gfn == gfn && !sp->role.direct) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
 			r = 1;
 			if (kvm_mmu_zap_page(kvm, sp))
-				n = bucket->first;
+				goto restart;
 		}
 	return r;
 }
@@ -1585,13 +1586,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &kvm->arch.mmu_page_hash[index];
+restart:
 	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
 		if (sp->gfn == gfn && !sp->role.direct
 		    && !sp->role.invalid) {
 			pgprintk("%s: zap %lx %x\n",
 				 __func__, gfn, sp->role.word);
 			if (kvm_mmu_zap_page(kvm, sp))
-				nn = bucket->first;
+				goto restart;
 		}
 	}
 }
@@ -2671,6 +2673,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+
+restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
 		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
 			continue;
@@ -2691,7 +2695,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 			if (kvm_mmu_zap_page(vcpu->kvm, sp))
-				n = bucket->first;
+				goto restart;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
@@ -2900,10 +2904,11 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 	struct kvm_mmu_page *sp, *node;
 
 	spin_lock(&kvm->mmu_lock);
+restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 		if (kvm_mmu_zap_page(kvm, sp))
-			node = container_of(kvm->arch.active_mmu_pages.next,
-					    struct kvm_mmu_page, link);
+			goto restart;
+
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
-- 
cgit v1.2.3-70-g09d2


From 90d83dc3d49f5101addae962ccc1b4aff66b68d8 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 19 Apr 2010 17:41:23 +0800
Subject: KVM: use the correct RCU API for PROVE_RCU=y

The RCU/SRCU API have already changed for proving RCU usage.

I got the following dmesg when PROVE_RCU=y because we used incorrect API.
This patch coverts rcu_deference() to srcu_dereference() or family API.

===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
arch/x86/kvm/mmu.c:3020 invoked rcu_dereference_check() without protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
2 locks held by qemu-system-x86/8550:
 #0:  (&kvm->slots_lock){+.+.+.}, at: [<ffffffffa011a6ac>] kvm_set_memory_region+0x29/0x50 [kvm]
 #1:  (&(&kvm->mmu_lock)->rlock){+.+...}, at: [<ffffffffa012262d>] kvm_arch_commit_memory_region+0xa6/0xe2 [kvm]

stack backtrace:
Pid: 8550, comm: qemu-system-x86 Not tainted 2.6.34-rc4-tip-01028-g939eab1 #27
Call Trace:
 [<ffffffff8106c59e>] lockdep_rcu_dereference+0xaa/0xb3
 [<ffffffffa012f6c1>] kvm_mmu_calculate_mmu_pages+0x44/0x7d [kvm]
 [<ffffffffa012263e>] kvm_arch_commit_memory_region+0xb7/0xe2 [kvm]
 [<ffffffffa011a5d7>] __kvm_set_memory_region+0x636/0x6e2 [kvm]
 [<ffffffffa011a6ba>] kvm_set_memory_region+0x37/0x50 [kvm]
 [<ffffffffa015e956>] vmx_set_tss_addr+0x46/0x5a [kvm_intel]
 [<ffffffffa0126592>] kvm_arch_vm_ioctl+0x17a/0xcf8 [kvm]
 [<ffffffff810a8692>] ? unlock_page+0x27/0x2c
 [<ffffffff810bf879>] ? __do_fault+0x3a9/0x3e1
 [<ffffffffa011b12f>] kvm_vm_ioctl+0x364/0x38d [kvm]
 [<ffffffff81060cfa>] ? up_read+0x23/0x3d
 [<ffffffff810f3587>] vfs_ioctl+0x32/0xa6
 [<ffffffff810f3b19>] do_vfs_ioctl+0x495/0x4db
 [<ffffffff810e6b2f>] ? fget_light+0xc2/0x241
 [<ffffffff810e416c>] ? do_sys_open+0x104/0x116
 [<ffffffff81382d6d>] ? retint_swapgs+0xe/0x13
 [<ffffffff810f3ba6>] sys_ioctl+0x47/0x6a
 [<ffffffff810021db>] system_call_fastpath+0x16/0x1b

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c |  2 +-
 arch/s390/kvm/kvm-s390.h |  2 +-
 arch/x86/kvm/mmu.c       |  7 ++++---
 arch/x86/kvm/vmx.c       |  2 +-
 arch/x86/kvm/x86.c       |  4 ++--
 arch/x86/kvm/x86.h       |  7 +++++++
 include/linux/kvm_host.h |  7 +++++++
 virt/kvm/iommu.c         |  4 ++--
 virt/kvm/kvm_main.c      | 13 ++++++++-----
 9 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d7bac1f75af..d5f4e916120 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1381,7 +1381,7 @@ static void kvm_release_vm_pages(struct kvm *kvm)
 	int i, j;
 	unsigned long base_gfn;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 	for (i = 0; i < slots->nmemslots; i++) {
 		memslot = &slots->memslots[i];
 		base_gfn = memslot->base_gfn;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 60f09ab3672..cfa9d177745 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -72,7 +72,7 @@ static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu)
 	struct kvm_memslots *memslots;
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
-	memslots = rcu_dereference(vcpu->kvm->memslots);
+	memslots = kvm_memslots(vcpu->kvm);
 
 	mem = &memslots->memslots[0];
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7a17db1cdcd..0682a393ad9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -787,7 +787,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 	int retval = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -3016,7 +3016,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
 	unsigned int  nr_pages = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
+
 	for (i = 0; i < slots->nmemslots; i++)
 		nr_pages += slots->memslots[i].npages;
 
@@ -3292,7 +3293,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
 	int i, j, k, idx;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *m = &slots->memslots[i];
 		struct kvm_rmap_desc *d;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0b896ac7e4b..d0a10b5612e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1558,7 +1558,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 		struct kvm_memslots *slots;
 		gfn_t base_gfn;
 
-		slots = rcu_dereference(kvm->memslots);
+		slots = kvm_memslots(kvm);
 		base_gfn = kvm->memslots->memslots[0].base_gfn +
 				 kvm->memslots->memslots[0].npages - 3;
 		return base_gfn << PAGE_SHIFT;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 58a96e6a234..638248c9699 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2497,7 +2497,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mem_alias *alias;
 	struct kvm_mem_aliases *aliases;
 
-	aliases = rcu_dereference(kvm->arch.aliases);
+	aliases = kvm_aliases(kvm);
 
 	for (i = 0; i < aliases->naliases; ++i) {
 		alias = &aliases->aliases[i];
@@ -2516,7 +2516,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 	struct kvm_mem_alias *alias;
 	struct kvm_mem_aliases *aliases;
 
-	aliases = rcu_dereference(kvm->arch.aliases);
+	aliases = kvm_aliases(kvm);
 
 	for (i = 0; i < aliases->naliases; ++i) {
 		alias = &aliases->aliases[i];
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2..f4b54458285 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
+static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
+{
+	return rcu_dereference_check(kvm->arch.aliases,
+			srcu_read_lock_held(&kvm->srcu)
+			|| lockdep_is_held(&kvm->slots_lock));
+}
+
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 55830638aeb..1ed030bad59 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -250,6 +250,13 @@ void kvm_exit(void);
 void kvm_get_kvm(struct kvm *kvm);
 void kvm_put_kvm(struct kvm *kvm);
 
+static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
+{
+	return rcu_dereference_check(kvm->memslots,
+			srcu_read_lock_held(&kvm->srcu)
+			|| lockdep_is_held(&kvm->slots_lock));
+}
+
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 80fd3ad3b2d..37ca71ebdba 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -78,7 +78,7 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
 	int i, r = 0;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		r = kvm_iommu_map_pages(kvm, &slots->memslots[i]);
@@ -217,7 +217,7 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 	int i;
 	struct kvm_memslots *slots;
 
-	slots = rcu_dereference(kvm->memslots);
+	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d6351a34b29..4901ec5061b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -834,7 +834,7 @@ EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
@@ -856,7 +856,7 @@ struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 
 	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
@@ -900,7 +900,7 @@ out:
 int memslot_id(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
-	struct kvm_memslots *slots = rcu_dereference(kvm->memslots);
+	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = NULL;
 
 	gfn = unalias_gfn(kvm, gfn);
@@ -1994,7 +1994,9 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
 	int i;
-	struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+	struct kvm_io_bus *bus;
+
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
 		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
 			return 0;
@@ -2006,8 +2008,9 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
 	int i;
-	struct kvm_io_bus *bus = rcu_dereference(kvm->buses[bus_idx]);
+	struct kvm_io_bus *bus;
 
+	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
 		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
 			return 0;
-- 
cgit v1.2.3-70-g09d2


From 87bc3bf972af0585ba5415aebbc8bd09b6a2ee94 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 19 Apr 2010 17:25:53 +0300
Subject: KVM: MMU: Drop cr4.pge from shadow page role

Since commit bf47a760f66ad, we no longer handle ptes with the global bit
set specially, so there is no reason to distinguish between shadow pages
created with cr4.gpe set and clear.

Such tracking is expensive when the guest toggles cr4.pge, so drop it.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/mmutrace.h         | 3 +--
 arch/x86/kvm/x86.c              | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3c31c5ad37a..d47d087568f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -178,7 +178,6 @@ union kvm_mmu_page_role {
 		unsigned direct:1;
 		unsigned access:3;
 		unsigned invalid:1;
-		unsigned cr4_pge:1;
 		unsigned nxe:1;
 	};
 };
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3851f1f3030..bc4f7f0be2b 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -28,7 +28,7 @@
 								        \
 	role.word = __entry->role;					\
 									\
-	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s %spge"		\
+	trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s"		\
 			 " %snxe root %u %s%c",				\
 			 __entry->gfn, role.level,			\
 			 role.cr4_pae ? " pae" : "",			\
@@ -36,7 +36,6 @@
 			 role.direct ? " direct" : "",			\
 			 access_str[role.access],			\
 			 role.invalid ? " invalid" : "",		\
-			 role.cr4_pge ? "" : "!",			\
 			 role.nxe ? "" : "!",				\
 			 __entry->root_count,				\
 			 __entry->unsync ? "unsync" : "sync", 0);	\
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 638248c9699..cf37ac6644e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -488,7 +488,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
-	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
-- 
cgit v1.2.3-70-g09d2


From 51fb60d81b483ae75614d401e7d4271884894113 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 17:16:40 +0800
Subject: KVM: MMU: Move sync_page() first pte address calculation out of loop

Move first pte address calculation out of loop to save some cycles.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d9dea288e51..5910557b3f3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -572,12 +572,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	int i, offset, nr_present;
 	bool reset_host_protection;
+	gpa_t first_pte_gpa;
 
 	offset = nr_present = 0;
 
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
 
+	first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
+
 	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
 		unsigned pte_access;
 		pt_element_t gpte;
@@ -587,8 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
 
-		pte_gpa = gfn_to_gpa(sp->gfn);
-		pte_gpa += (i+offset) * sizeof(pt_element_t);
+		pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
 
 		if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
 					  sizeof(pt_element_t)))
-- 
cgit v1.2.3-70-g09d2


From 814a59d2077d630cffca7e2878c5b6f9b91ba725 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 17:18:01 +0800
Subject: KVM: MMU: Make use of is_large_pte() in walker

Make use of is_large_pte() instead of checking PT_PAGE_SIZE_MASK
bit directly.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5910557b3f3..d0cc07eb6ed 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -190,10 +190,10 @@ walk:
 
 		if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
 		    ((walker->level == PT_DIRECTORY_LEVEL) &&
-				(pte & PT_PAGE_SIZE_MASK)  &&
+				is_large_pte(pte) &&
 				(PTTYPE == 64 || is_pse(vcpu))) ||
 		    ((walker->level == PT_PDPE_LEVEL) &&
-				(pte & PT_PAGE_SIZE_MASK)  &&
+				is_large_pte(pte) &&
 				is_long_mode(vcpu))) {
 			int lvl = walker->level;
 
-- 
cgit v1.2.3-70-g09d2


From b2fc15a5ef6679a5f87fee012a836294532bc628 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 17:18:54 +0800
Subject: KVM: MMU: Remove unused varialbe in rmap_next()

Remove unused varialbe in rmap_next()

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0682a393ad9..7eff794457f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -647,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
-	struct kvm_rmap_desc *prev_desc;
 	u64 *prev_spte;
 	int i;
 
@@ -659,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 		return NULL;
 	}
 	desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
-	prev_desc = NULL;
 	prev_spte = NULL;
 	while (desc) {
 		for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) {
-- 
cgit v1.2.3-70-g09d2


From 2a059bf444dd7758ccf48f217cd981570132be85 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 17:19:48 +0800
Subject: KVM: Get rid of dead function gva_to_page()

Nobody use gva_to_page() anymore, get rid of it.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       | 14 --------------
 include/linux/kvm_host.h |  1 -
 2 files changed, 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7eff794457f..d2a110e870f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1618,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp)
 	}
 }
 
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
-{
-	struct page *page;
-
-	gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
-
-	if (gpa == UNMAPPED_GVA)
-		return NULL;
-
-	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
-
-	return page;
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1ed030bad59..ce027d51809 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -260,7 +260,6 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
 #define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
 #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB)
 static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
-struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva);
 
 extern struct page *bad_page;
 extern pfn_t bad_pfn;
-- 
cgit v1.2.3-70-g09d2


From 77a1a715707d0f60ce0cfbe44070527a0a561f01 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Fri, 16 Apr 2010 16:21:42 +0800
Subject: KVM: MMU: cleanup for function unaccount_shadowed()

Since gfn is not changed in the for loop, we do not need to call
gfn_to_memslot_unaliased() under the loop, and it is safe to move
it out.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d2a110e870f..ddfa8658fb6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -431,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	int i;
 
 	gfn = unalias_gfn(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
-		slot          = gfn_to_memslot_unaliased(kvm, gfn);
 		write_count   = slot_largepage_idx(gfn, slot, i);
 		*write_count -= 1;
 		WARN_ON(*write_count < 0);
-- 
cgit v1.2.3-70-g09d2


From cdbecfc398a904ce9f5c126638b09a2429fb86ed Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sat, 17 Apr 2010 16:41:47 +0800
Subject: KVM: VMX: free vpid when fail to create vcpu

Fix bug of the exception path, free allocated vpid when fail
to create vcpu.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d0a10b5612e..54c0035a63f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2333,6 +2333,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
+static void free_vpid(struct vcpu_vmx *vmx)
+{
+	if (!enable_vpid)
+		return;
+	spin_lock(&vmx_vpid_lock);
+	if (vmx->vpid != 0)
+		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
+	spin_unlock(&vmx_vpid_lock);
+}
+
 static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
 	int f = sizeof(unsigned long);
@@ -3916,10 +3926,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	spin_lock(&vmx_vpid_lock);
-	if (vmx->vpid != 0)
-		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
-	spin_unlock(&vmx_vpid_lock);
+	free_vpid(vmx);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
@@ -3981,6 +3988,7 @@ free_msrs:
 uninit_vcpu:
 	kvm_vcpu_uninit(&vmx->vcpu);
 free_vcpu:
+	free_vpid(vmx);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 	return ERR_PTR(err);
 }
-- 
cgit v1.2.3-70-g09d2


From 924584ccb08c338ebd2f40936ff2321c1cce6a6d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:07 +0200
Subject: KVM: SVM: Fix nested nmi handling

The patch introducing nested nmi handling had a bug. The
check does not belong to enable_nmi_window but must be in
nmi_allowed. This patch fixes this.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ab78eb8ba89..ec205847be6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2771,8 +2771,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
-	return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+	int ret;
+	ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+	      !(svm->vcpu.arch.hflags & HF_NMI_MASK);
+	ret = ret && gif_set(svm) && nested_svm_nmi(svm);
+
+	return ret;
 }
 
 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2841,11 +2845,9 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	 * Something prevents NMI from been injected. Single step over possible
 	 * problem (IRET or exception injection or interrupt shadow)
 	 */
-	if (gif_set(svm) && nested_svm_nmi(svm)) {
-		svm->nmi_singlestep = true;
-		svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
-		update_db_intercept(vcpu);
-	}
+	svm->nmi_singlestep = true;
+	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+	update_db_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
-- 
cgit v1.2.3-70-g09d2


From 2041a06a50a2ef4062c8454482aa06e25f6cccde Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:08 +0200
Subject: KVM: SVM: Make sure rip is synced to vmcb before nested vmexit

This patch fixes a bug where a nested guest always went over
the same instruction because the rip was not advanced on a
nested vmexit.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ec205847be6..c480d7f64a6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2960,6 +2960,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	u16 gs_selector;
 	u16 ldt_selector;
 
+	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
 	/*
 	 * A vmexit emulation is required before the vcpu can be executed
 	 * again.
@@ -2967,10 +2971,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 	if (unlikely(svm->nested.exit_required))
 		return;
 
-	svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
-	svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
-	svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
-
 	pre_svm_run(svm);
 
 	sync_lapic_to_cr8(vcpu);
-- 
cgit v1.2.3-70-g09d2


From 2be4fc7a02c368dcfda83a386a5101c211b9535e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:09 +0200
Subject: KVM: SVM: Sync cr0 and cr3 to kvm state before nested handling

This patch syncs cr0 and cr3 from the vmcb to the kvm state
before nested intercept handling is done. This allows to
simplify the vmexit path.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c480d7f64a6..5ad9d802bd1 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1799,10 +1799,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->save.gdtr   = vmcb->save.gdtr;
 	nested_vmcb->save.idtr   = vmcb->save.idtr;
 	nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
-	if (npt_enabled)
-		nested_vmcb->save.cr3    = vmcb->save.cr3;
-	else
-		nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
+	nested_vmcb->save.cr3    = svm->vcpu.arch.cr3;
 	nested_vmcb->save.cr2    = vmcb->save.cr2;
 	nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
 	nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2641,6 +2638,11 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	trace_kvm_exit(exit_code, vcpu);
 
+	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+		vcpu->arch.cr0 = svm->vmcb->save.cr0;
+	if (npt_enabled)
+		vcpu->arch.cr3 = svm->vmcb->save.cr3;
+
 	if (unlikely(svm->nested.exit_required)) {
 		nested_svm_vmexit(svm);
 		svm->nested.exit_required = false;
@@ -2668,11 +2670,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	svm_complete_interrupts(svm);
 
-	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
-		vcpu->arch.cr0 = svm->vmcb->save.cr0;
-	if (npt_enabled)
-		vcpu->arch.cr3 = svm->vmcb->save.cr3;
-
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
-- 
cgit v1.2.3-70-g09d2


From 228070b1b31abc16c331b57bf965101c6eb1d167 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:10 +0200
Subject: KVM: SVM: Propagate nested entry failure into guest hypervisor

This patch implements propagation of a failes guest vmrun
back into the guest instead of killing the whole guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5ad9d802bd1..b10d1630c20 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1715,6 +1715,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
 			vmexit = NESTED_EXIT_DONE;
 		break;
 	}
+	case SVM_EXIT_ERR: {
+		vmexit = NESTED_EXIT_DONE;
+		break;
+	}
 	default: {
 		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
 		if (svm->nested.intercept & exit_bits)
-- 
cgit v1.2.3-70-g09d2


From d4330ef2fb2236a1e3a176f0f68360f4c0a8661b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:11 +0200
Subject: KVM: x86: Add callback to let modules decide over some supported
 cpuid bits

This patch adds the get_supported_cpuid callback to
kvm_x86_ops. It will be used in do_cpuid_ent to delegate the
decission about some supported cpuid bits to the
architecture modules.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/svm.c              | 6 ++++++
 arch/x86/kvm/vmx.c              | 6 ++++++
 arch/x86/kvm/x86.c              | 3 +++
 4 files changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d47d087568f..357573af974 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -528,6 +528,8 @@ struct kvm_x86_ops {
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 
+	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index b10d1630c20..0fa2035bd8e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3152,6 +3152,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 {
 }
 
+static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
 static const struct trace_print_flags svm_exit_reasons_str[] = {
 	{ SVM_EXIT_READ_CR0,			"read_cr0" },
 	{ SVM_EXIT_READ_CR3,			"read_cr3" },
@@ -3297,6 +3301,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.cpuid_update = svm_cpuid_update,
 
 	.rdtscp_supported = svm_rdtscp_supported,
+
+	.set_supported_cpuid = svm_set_supported_cpuid,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 54c0035a63f..9f8532b1fa9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4119,6 +4119,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+{
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -4191,6 +4195,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.cpuid_update = vmx_cpuid_update,
 
 	.rdtscp_supported = vmx_rdtscp_supported,
+
+	.set_supported_cpuid = vmx_set_supported_cpuid,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 848c814e8c3..6e6434332f2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1960,6 +1960,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->ecx &= kvm_supported_word6_x86_features;
 		break;
 	}
+
+	kvm_x86_ops->set_supported_cpuid(function, entry);
+
 	put_cpu();
 }
 
-- 
cgit v1.2.3-70-g09d2


From c2c63a493924e09a1984d1374a0e60dfd54fc0b0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:12 +0200
Subject: KVM: SVM: Report emulated SVM features to userspace

This patch implements the reporting of the emulated SVM
features to userspace instead of the real hardware
capabilities. Every real hardware capability needs emulation
in nested svm so the old behavior was broken.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0fa2035bd8e..65fc11438b7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3154,6 +3154,16 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 {
+	switch (func) {
+	case 0x8000000A:
+		entry->eax = 1; /* SVM revision 1 */
+		entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
+				   ASID emulation to nested SVM */
+		entry->ecx = 0; /* Reserved */
+		entry->edx = 0; /* Do not support any additional features */
+
+		break;
+	}
 }
 
 static const struct trace_print_flags svm_exit_reasons_str[] = {
-- 
cgit v1.2.3-70-g09d2


From ce7ddec4bbbc08f0c2901cc103773aed864b09fd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:13 +0200
Subject: KVM: x86: Allow marking an exception as reinjected

This patch adds logic to kvm/x86 which allows to mark an
injected exception as reinjected. This allows to remove an
ugly hack from svm_complete_interrupts that prevented
exceptions from being reinjected at all in the nested case.
The hack was necessary because an reinjected exception into
the nested guest could cause a nested vmexit emulation. But
reinjected exceptions must not intercept. The downside of
the hack is that a exception that in injected could get
lost.
This patch fixes the problem and puts the code for it into
generic x86 files because. Nested-VMX will likely have the
same problem and could reuse the code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 +++++-
 arch/x86/kvm/svm.c              | 12 ++++++------
 arch/x86/kvm/vmx.c              |  3 ++-
 arch/x86/kvm/x86.c              | 23 +++++++++++++++++++----
 4 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 357573af974..3f0007b076d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -312,6 +312,7 @@ struct kvm_vcpu_arch {
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
+		bool reinject;
 		u8 nr;
 		u32 error_code;
 	} exception;
@@ -514,7 +515,8 @@ struct kvm_x86_ops {
 	void (*set_irq)(struct kvm_vcpu *vcpu);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code);
+				bool has_error_code, u32 error_code,
+				bool reinject);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
 	bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -617,6 +619,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
 			   u32 error_code);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 65fc11438b7..30e49fe7f8c 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -338,7 +338,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 
 static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
+				bool has_error_code, u32 error_code,
+				bool reinject)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -346,7 +347,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	 * If we are within a nested VM we'd better #VMEXIT and let the guest
 	 * handle the exception
 	 */
-	if (nested_svm_check_exception(svm, nr, has_error_code, error_code))
+	if (!reinject &&
+	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
 		return;
 
 	if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
@@ -2918,8 +2920,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		svm->vcpu.arch.nmi_injected = true;
 		break;
 	case SVM_EXITINTINFO_TYPE_EXEPT:
-		if (is_nested(svm))
-			break;
 		/*
 		 * In case of software exceptions, do not reinject the vector,
 		 * but re-execute the instruction instead. Rewind RIP first
@@ -2935,10 +2935,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 		}
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
-			kvm_queue_exception_e(&svm->vcpu, vector, err);
+			kvm_requeue_exception_e(&svm->vcpu, vector, err);
 
 		} else
-			kvm_queue_exception(&svm->vcpu, vector);
+			kvm_requeue_exception(&svm->vcpu, vector);
 		break;
 	case SVM_EXITINTINFO_TYPE_INTR:
 		kvm_queue_interrupt(&svm->vcpu, vector, false);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9f8532b1fa9..875b785228f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -919,7 +919,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
-				bool has_error_code, u32 error_code)
+				bool has_error_code, u32 error_code,
+				bool reinject)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6e6434332f2..6b2ce1d2d74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -265,7 +265,8 @@ static int exception_class(int vector)
 }
 
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code)
+		unsigned nr, bool has_error, u32 error_code,
+		bool reinject)
 {
 	u32 prev_nr;
 	int class1, class2;
@@ -276,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
+		vcpu->arch.exception.reinject = true;
 		return;
 	}
 
@@ -304,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
-	kvm_multiple_exception(vcpu, nr, false, 0);
+	kvm_multiple_exception(vcpu, nr, false, 0, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception);
 
+void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
+{
+	kvm_multiple_exception(vcpu, nr, false, 0, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception);
+
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
 			   u32 error_code)
 {
@@ -324,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
 
 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
 {
-	kvm_multiple_exception(vcpu, nr, true, error_code);
+	kvm_multiple_exception(vcpu, nr, true, error_code, false);
 }
 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
 
+void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
+{
+	kvm_multiple_exception(vcpu, nr, true, error_code, true);
+}
+EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
+
 /*
  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
  * a #GP and return false.
@@ -4408,7 +4422,8 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 					vcpu->arch.exception.error_code);
 		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
 					  vcpu->arch.exception.has_error_code,
-					  vcpu->arch.exception.error_code);
+					  vcpu->arch.exception.error_code,
+					  vcpu->arch.exception.reinject);
 		return;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From ff47a49b235e59e606e1cb267a08f7fbeb5719d1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 22 Apr 2010 12:33:14 +0200
Subject: KVM: SVM: Handle MCE intercepts always on host level

This patch prevents MCE intercepts from being propagated
into the L1 guest if they happened in an L2 guest.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 30e49fe7f8c..889f66022e5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1651,6 +1651,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
 	switch (exit_code) {
 	case SVM_EXIT_INTR:
 	case SVM_EXIT_NMI:
+	case SVM_EXIT_EXCP_BASE + MC_VECTOR:
 		return NESTED_EXIT_HOST;
 	case SVM_EXIT_NPF:
 		/* For now we are always handling NPFs when using them */
-- 
cgit v1.2.3-70-g09d2


From df2fb6e7106dd0b76e3576bfaecbeb6f34843709 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Thu, 22 Apr 2010 17:33:57 +0800
Subject: KVM: MMU: fix sp->unsync type error in trace event definition

sp->unsync is bool now, so update trace event declaration.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmutrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index bc4f7f0be2b..40a1786f36c 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -11,7 +11,7 @@
 	__field(__u64, gfn) \
 	__field(__u32, role) \
 	__field(__u32, root_count) \
-	__field(__u32, unsync)
+	__field(bool, unsync)
 
 #define KVM_MMU_PAGE_ASSIGN(sp)			     \
 	__entry->gfn = sp->gfn;			     \
-- 
cgit v1.2.3-70-g09d2


From 9a58a3333923c7fef4ba6ac9afd817429e63a1fe Mon Sep 17 00:00:00 2001
From: Sreedhara DS <sreedhara.ds@intel.com>
Date: Mon, 26 Apr 2010 18:13:05 +0100
Subject: IPC driver for Intel Mobile Internet Device (MID) platforms

The IPC (inter processor communications) is used to provide the
communications between kernel and system control units on some embedded
Intel x86 platforms.

(Various bits of clean up and restructuring by Alan Cox)

Signed-off-by: Sreedhara DS <sreedhara.ds@intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
---
 arch/x86/include/asm/intel_scu_ipc.h |  55 +++
 drivers/platform/x86/Kconfig         |   8 +
 drivers/platform/x86/Makefile        |   1 +
 drivers/platform/x86/intel_scu_ipc.c | 829 +++++++++++++++++++++++++++++++++++
 4 files changed, 893 insertions(+)
 create mode 100644 arch/x86/include/asm/intel_scu_ipc.h
 create mode 100644 drivers/platform/x86/intel_scu_ipc.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h
new file mode 100644
index 00000000000..4470c9ad4a3
--- /dev/null
+++ b/arch/x86/include/asm/intel_scu_ipc.h
@@ -0,0 +1,55 @@
+#ifndef _ASM_X86_INTEL_SCU_IPC_H_
+#define  _ASM_X86_INTEL_SCU_IPC_H_
+
+/* Read single register */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data);
+
+/* Read two sequential registers */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data);
+
+/* Read four sequential registers */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data);
+
+/* Read a vector */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len);
+
+/* Write single register */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data);
+
+/* Write two sequential registers */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data);
+
+/* Write four sequential registers */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data);
+
+/* Write a vector */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len);
+
+/* Update single register based on the mask */
+int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask);
+
+/*
+ * Indirect register read
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_read(u32 addr, u32 *data);
+
+/*
+ * Indirect register write
+ * Can be used when SCCB(System Controller Configuration Block) register
+ * HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ */
+int intel_scu_ipc_register_write(u32 addr, u32 data);
+
+/* Issue commands to the SCU with or without data */
+int intel_scu_ipc_simple_command(int cmd, int sub);
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+							u32 *out, int outlen);
+/* I2C control api */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data);
+
+/* Update FW version */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length);
+
+#endif
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 6c3320d7505..619134151ca 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -527,4 +527,12 @@ config ACPI_CMPC
 	  keys as input device, backlight device, tablet and accelerometer
 	  devices.
 
+config INTEL_SCU_IPC
+	bool "Intel SCU IPC Support"
+	depends on X86_MRST
+	default y
+	---help---
+	  IPC is used to bridge the communications between kernel and SCU on
+	  some embedded Intel x86 platforms.
+
 endif # X86_PLATFORM_DEVICES
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index a906490e353..8770bfe7143 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_ACPI_ASUS)		+= asus_acpi.o
 obj-$(CONFIG_TOPSTAR_LAPTOP)	+= topstar-laptop.o
 obj-$(CONFIG_ACPI_TOSHIBA)	+= toshiba_acpi.o
 obj-$(CONFIG_TOSHIBA_BT_RFKILL)	+= toshiba_bluetooth.o
+obj-$(CONFIG_INTEL_SCU_IPC)	+= intel_scu_ipc.o
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
new file mode 100644
index 00000000000..576c3ed9243
--- /dev/null
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -0,0 +1,829 @@
+/*
+ * intel_scu_ipc.c: Driver for the Intel SCU IPC mechanism
+ *
+ * (C) Copyright 2008-2010 Intel Corporation
+ * Author: Sreedhara DS (sreedhara.ds@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * SCU runing in ARC processor communicates with other entity running in IA
+ * core through IPC mechanism which in turn messaging between IA core ad SCU.
+ * SCU has two IPC mechanism IPC-1 and IPC-2. IPC-1 is used between IA32 and
+ * SCU where IPC-2 is used between P-Unit and SCU. This driver delas with
+ * IPC-1 Driver provides an API for power control unit registers (e.g. MSIC)
+ * along with other APIs.
+ */
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/sysdev.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <asm/setup.h>
+#include <asm/intel_scu_ipc.h>
+
+/* IPC defines the following message types */
+#define IPCMSG_WATCHDOG_TIMER 0xF8 /* Set Kernel Watchdog Threshold */
+#define IPCMSG_BATTERY        0xEF /* Coulomb Counter Accumulator */
+#define IPCMSG_FW_UPDATE      0xFE /* Firmware update */
+#define IPCMSG_PCNTRL         0xFF /* Power controller unit read/write */
+#define IPCMSG_FW_REVISION    0xF4 /* Get firmware revision */
+
+/* Command id associated with message IPCMSG_PCNTRL */
+#define IPC_CMD_PCNTRL_W      0 /* Register write */
+#define IPC_CMD_PCNTRL_R      1 /* Register read */
+#define IPC_CMD_PCNTRL_M      2 /* Register read-modify-write */
+
+/* Miscelaneous Command ids */
+#define IPC_CMD_INDIRECT_RD   2 /* 32bit indirect read */
+#define IPC_CMD_INDIRECT_WR   5 /* 32bit indirect write */
+
+/*
+ * IPC register summary
+ *
+ * IPC register blocks are memory mapped at fixed address of 0xFF11C000
+ * To read or write information to the SCU, driver writes to IPC-1 memory
+ * mapped registers (base address 0xFF11C000). The following is the IPC
+ * mechanism
+ *
+ * 1. IA core cDMI interface claims this transaction and converts it to a
+ *    Transaction Layer Packet (TLP) message which is sent across the cDMI.
+ *
+ * 2. South Complex cDMI block receives this message and writes it to
+ *    the IPC-1 register block, causing an interrupt to the SCU
+ *
+ * 3. SCU firmware decodes this interrupt and IPC message and the appropriate
+ *    message handler is called within firmware.
+ */
+
+#define IPC_BASE_ADDR     0xFF11C000	/* IPC1 base register address */
+#define IPC_MAX_ADDR      0x100		/* Maximum IPC regisers */
+#define IPC_WWBUF_SIZE    16		/* IPC Write buffer Size */
+#define IPC_RWBUF_SIZE    16		/* IPC Read buffer Size */
+#define IPC_I2C_BASE      0xFF12B000	/* I2C control register base address */
+#define IPC_I2C_MAX_ADDR  0x10		/* Maximum I2C regisers */
+
+static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id);
+static void ipc_remove(struct pci_dev *pdev);
+
+struct intel_scu_ipc_dev {
+	struct pci_dev *pdev;
+	void __iomem *ipc_base;
+	void __iomem *i2c_base;
+};
+
+static struct intel_scu_ipc_dev  ipcdev; /* Only one for now */
+
+static int platform = 1;
+module_param(platform, int, 0);
+MODULE_PARM_DESC(platform, "1 for moorestown platform");
+
+
+
+
+/*
+ * IPC Read Buffer (Read Only):
+ * 16 byte buffer for receiving data from SCU, if IPC command
+ * processing results in response data
+ */
+#define IPC_READ_BUFFER		0x90
+
+#define IPC_I2C_CNTRL_ADDR	0
+#define I2C_DATA_ADDR		0x04
+
+static DEFINE_MUTEX(ipclock); /* lock used to prevent multiple call to SCU */
+
+/*
+ * Command Register (Write Only):
+ * A write to this register results in an interrupt to the SCU core processor
+ * Format:
+ * |rfu2(8) | size(8) | command id(4) | rfu1(3) | ioc(1) | command(8)|
+ */
+static inline void ipc_command(u32 cmd) /* Send ipc command */
+{
+	writel(cmd, ipcdev.ipc_base);
+}
+
+/*
+ * IPC Write Buffer (Write Only):
+ * 16-byte buffer for sending data associated with IPC command to
+ * SCU. Size of the data is specified in the IPC_COMMAND_REG register
+ */
+static inline void ipc_data_writel(u32 data, u32 offset) /* Write ipc data */
+{
+	writel(data, ipcdev.ipc_base + 0x80 + offset);
+}
+
+/*
+ * IPC destination Pointer (Write Only):
+ * Use content as pointer for destination write
+ */
+static inline void ipc_write_dptr(u32 data) /* Write dptr data */
+{
+	writel(data, ipcdev.ipc_base + 0x0C);
+}
+
+/*
+ * IPC Source Pointer (Write Only):
+ * Use content as pointer for read location
+*/
+static inline void ipc_write_sptr(u32 data) /* Write dptr data */
+{
+	writel(data, ipcdev.ipc_base + 0x08);
+}
+
+/*
+ * Status Register (Read Only):
+ * Driver will read this register to get the ready/busy status of the IPC
+ * block and error status of the IPC command that was just processed by SCU
+ * Format:
+ * |rfu3(8)|error code(8)|initiator id(8)|cmd id(4)|rfu1(2)|error(1)|busy(1)|
+ */
+
+static inline u8 ipc_read_status(void)
+{
+	return __raw_readl(ipcdev.ipc_base + 0x04);
+}
+
+static inline u8 ipc_data_readb(u32 offset) /* Read ipc byte data */
+{
+	return readb(ipcdev.ipc_base + IPC_READ_BUFFER + offset);
+}
+
+static inline u8 ipc_data_readl(u32 offset) /* Read ipc u32 data */
+{
+	return readl(ipcdev.ipc_base + IPC_READ_BUFFER + offset);
+}
+
+static inline int busy_loop(void) /* Wait till scu status is busy */
+{
+	u32 status = 0;
+	u32 loop_count = 0;
+
+	status = ipc_read_status();
+	while (status & 1) {
+		udelay(1); /* scu processing time is in few u secods */
+		status = ipc_read_status();
+		loop_count++;
+		/* break if scu doesn't reset busy bit after huge retry */
+		if (loop_count > 100000) {
+			dev_err(&ipcdev.pdev->dev, "IPC timed out");
+			return -ETIMEDOUT;
+		}
+	}
+	return (status >> 1) & 1;
+}
+
+/* Read/Write power control(PMIC in Langwell, MSIC in PenWell) registers */
+static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id)
+{
+	int nc;
+	u32 offset = 0;
+	u32 err = 0;
+	u8 cbuf[IPC_WWBUF_SIZE] = { '\0' };
+	u32 *wbuf = (u32 *)&cbuf;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+
+	if (platform == 1) {
+		/* Entry is 4 bytes for read/write, 5 bytes for read modify */
+		for (nc = 0; nc < count; nc++) {
+			cbuf[offset] = addr[nc];
+			cbuf[offset + 1] = addr[nc] >> 8;
+			if (id != IPC_CMD_PCNTRL_R)
+				cbuf[offset + 2] = data[nc];
+			if (id == IPC_CMD_PCNTRL_M) {
+				cbuf[offset + 3] = data[nc + 1];
+				offset += 1;
+			}
+			offset += 3;
+		}
+		for (nc = 0, offset = 0; nc < count; nc++, offset += 4)
+			ipc_data_writel(wbuf[nc], offset); /* Write wbuff */
+
+	} else {
+		for (nc = 0, offset = 0; nc < count; nc++, offset += 2)
+			ipc_data_writel(addr[nc], offset); /* Write addresses */
+		if (id != IPC_CMD_PCNTRL_R) {
+			for (nc = 0; nc < count; nc++, offset++)
+				ipc_data_writel(data[nc], offset); /* Write data */
+			if (id == IPC_CMD_PCNTRL_M)
+				ipc_data_writel(data[nc + 1], offset); /* Mask value*/
+		}
+	}
+
+	if (id != IPC_CMD_PCNTRL_M)
+		ipc_command((count * 3) << 16 |  id << 12 | 0 << 8 | op);
+	else
+		ipc_command((count * 4) << 16 |  id << 12 | 0 << 8 | op);
+
+	err = busy_loop();
+
+	if (id == IPC_CMD_PCNTRL_R) { /* Read rbuf */
+		/* Workaround: values are read as 0 without memcpy_fromio */
+		memcpy_fromio(cbuf, ipcdev.ipc_base + IPC_READ_BUFFER, 16);
+		if (platform == 1) {
+			for (nc = 0, offset = 2; nc < count; nc++, offset += 3)
+				data[nc] = ipc_data_readb(offset);
+		} else {
+			for (nc = 0; nc < count; nc++)
+				data[nc] = ipc_data_readb(nc);
+		}
+	}
+	mutex_unlock(&ipclock);
+	return err;
+}
+
+/**
+ *	intel_scu_ipc_ioread8		-	read a word via the SCU
+ *	@addr: register on SCU
+ *	@data: return pointer for read byte
+ *
+ *	Read a single register. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_ioread8(u16 addr, u8 *data)
+{
+	return pwr_reg_rdwr(&addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_ioread8);
+
+/**
+ *	intel_scu_ipc_ioread16		-	read a word via the SCU
+ *	@addr: register on SCU
+ *	@data: return pointer for read word
+ *
+ *	Read a register pair. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_ioread16(u16 addr, u16 *data)
+{
+	u16 x[2] = {addr, addr + 1 };
+	return pwr_reg_rdwr(x, (u8 *)data, 2, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_ioread16);
+
+/**
+ *	intel_scu_ipc_ioread32		-	read a dword via the SCU
+ *	@addr: register on SCU
+ *	@data: return pointer for read dword
+ *
+ *	Read four registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_ioread32(u16 addr, u32 *data)
+{
+	u16 x[4] = {addr, addr + 1, addr + 2, addr + 3};
+	return pwr_reg_rdwr(x, (u8 *)data, 4, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_ioread32);
+
+/**
+ *	intel_scu_ipc_iowrite8		-	write a byte via the SCU
+ *	@addr: register on SCU
+ *	@data: byte to write
+ *
+ *	Write a single register. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_iowrite8(u16 addr, u8 data)
+{
+	return pwr_reg_rdwr(&addr, &data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_iowrite8);
+
+/**
+ *	intel_scu_ipc_iowrite16		-	write a word via the SCU
+ *	@addr: register on SCU
+ *	@data: word to write
+ *
+ *	Write two registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_iowrite16(u16 addr, u16 data)
+{
+	u16 x[2] = {addr, addr + 1 };
+	return pwr_reg_rdwr(x, (u8 *)&data, 2, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_iowrite16);
+
+/**
+ *	intel_scu_ipc_iowrite32		-	write a dword via the SCU
+ *	@addr: register on SCU
+ *	@data: dword to write
+ *
+ *	Write four registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_iowrite32(u16 addr, u32 data)
+{
+	u16 x[4] = {addr, addr + 1, addr + 2, addr + 3};
+	return pwr_reg_rdwr(x, (u8 *)&data, 4, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_iowrite32);
+
+/**
+ *	intel_scu_ipc_readvv		-	read a set of registers
+ *	@addr: register list
+ *	@data: bytes to return
+ *	@len: length of array
+ *
+ *	Read registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	The largest array length permitted by the hardware is 5 items.
+ *
+ *	This function may sleep.
+ */
+int intel_scu_ipc_readv(u16 *addr, u8 *data, int len)
+{
+	return pwr_reg_rdwr(addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R);
+}
+EXPORT_SYMBOL(intel_scu_ipc_readv);
+
+/**
+ *	intel_scu_ipc_writev		-	write a set of registers
+ *	@addr: register list
+ *	@data: bytes to write
+ *	@len: length of array
+ *
+ *	Write registers. Returns 0 on success or an error code. All
+ *	locking between SCU accesses is handled for the caller.
+ *
+ *	The largest array length permitted by the hardware is 5 items.
+ *
+ *	This function may sleep.
+ *
+ */
+int intel_scu_ipc_writev(u16 *addr, u8 *data, int len)
+{
+	return pwr_reg_rdwr(addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W);
+}
+EXPORT_SYMBOL(intel_scu_ipc_writev);
+
+
+/**
+ *	intel_scu_ipc_update_register	-	r/m/w a register
+ *	@addr: register address
+ *	@bits: bits to update
+ *	@mask: mask of bits to update
+ *
+ *	Read-modify-write power control unit register. The first data argument
+ *	must be register value and second is mask value
+ *	mask is a bitmap that indicates which bits to update.
+ *	0 = masked. Don't modify this bit, 1 = modify this bit.
+ *	returns 0 on success or an error code.
+ *
+ *	This function may sleep. Locking between SCU accesses is handled
+ *	for the caller.
+ */
+int intel_scu_ipc_update_register(u16 addr, u8 bits, u8 mask)
+{
+	u8 data[2] = { bits, mask };
+	return pwr_reg_rdwr(&addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_M);
+}
+EXPORT_SYMBOL(intel_scu_ipc_update_register);
+
+/**
+ *	intel_scu_ipc_register_read	-	32bit indirect read
+ *	@addr: register address
+ *	@value: 32bit value return
+ *
+ *	Performs IA 32 bit indirect read, returns 0 on success, or an
+ *	error code.
+ *
+ *	Can be used when SCCB(System Controller Configuration Block) register
+ *	HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ *
+ *	This function may sleep. Locking for SCU accesses is handled for
+ *	the caller.
+ */
+int intel_scu_ipc_register_read(u32 addr, u32 *value)
+{
+	u32 err = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+	ipc_write_sptr(addr);
+	ipc_command(4 << 16 | IPC_CMD_INDIRECT_RD);
+	err = busy_loop();
+	*value = ipc_data_readl(0);
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_register_read);
+
+/**
+ *	intel_scu_ipc_register_write	-	32bit indirect write
+ *	@addr: register address
+ *	@value: 32bit value to write
+ *
+ *	Performs IA 32 bit indirect write, returns 0 on success, or an
+ *	error code.
+ *
+ *	Can be used when SCCB(System Controller Configuration Block) register
+ *	HRIM(Honor Restricted IPC Messages) is set (bit 23)
+ *
+ *	This function may sleep. Locking for SCU accesses is handled for
+ *	the caller.
+ */
+int intel_scu_ipc_register_write(u32 addr, u32 value)
+{
+	u32 err = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+	ipc_write_dptr(addr);
+	ipc_data_writel(value, 0);
+	ipc_command(4 << 16 | IPC_CMD_INDIRECT_WR);
+	err = busy_loop();
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_register_write);
+
+/**
+ *	intel_scu_ipc_simple_command	-	send a simple command
+ *	@cmd: command
+ *	@sub: sub type
+ *
+ *	Issue a simple command to the SCU. Do not use this interface if
+ *	you must then access data as any data values may be overwritten
+ *	by another SCU access by the time this function returns.
+ *
+ *	This function may sleep. Locking for SCU accesses is handled for
+ *	the caller.
+ */
+int intel_scu_ipc_simple_command(int cmd, int sub)
+{
+	u32 err = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+	ipc_command(cmd << 12 | sub);
+	err = busy_loop();
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_simple_command);
+
+/**
+ *	intel_scu_ipc_command	-	command with data
+ *	@cmd: command
+ *	@sub: sub type
+ *	@in: input data
+ *	@inlen: input length
+ *	@out: output data
+ *	@outlein: output length
+ *
+ *	Issue a command to the SCU which involves data transfers. Do the
+ *	data copies under the lock but leave it for the caller to interpret
+ */
+
+int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
+							u32 *out, int outlen)
+{
+	u32 err = 0;
+	int i = 0;
+
+	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
+
+	for (i = 0; i < inlen; i++)
+		ipc_data_writel(*in++, 4 * i);
+
+	ipc_command(cmd << 12 | sub);
+	err = busy_loop();
+
+	for (i = 0; i < outlen; i++)
+		*out++ = ipc_data_readl(4 * i);
+
+	mutex_unlock(&ipclock);
+	return err;
+}
+EXPORT_SYMBOL(intel_scu_ipc_command);
+
+/*I2C commands */
+#define IPC_I2C_WRITE 1 /* I2C Write command */
+#define IPC_I2C_READ  2 /* I2C Read command */
+
+/**
+ *	intel_scu_ipc_i2c_cntrl		-	I2C read/write operations
+ *	@addr: I2C address + command bits
+ *	@data: data to read/write
+ *
+ *	Perform an an I2C read/write operation via the SCU. All locking is
+ *	handled for the caller. This function may sleep.
+ *
+ *	Returns an error code or 0 on success.
+ *
+ *	This has to be in the IPC driver for the locking.
+ */
+int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data)
+{
+	u32 cmd = 0;
+
+	mutex_lock(&ipclock);
+	cmd = (addr >> 24) & 0xFF;
+	if (cmd == IPC_I2C_READ) {
+		writel(addr, ipcdev.i2c_base + IPC_I2C_CNTRL_ADDR);
+		/* Write not getting updated without delay */
+		mdelay(1);
+		*data = readl(ipcdev.i2c_base + I2C_DATA_ADDR);
+	} else if (cmd == IPC_I2C_WRITE) {
+		writel(addr, ipcdev.i2c_base + I2C_DATA_ADDR);
+		mdelay(1);
+		writel(addr, ipcdev.i2c_base + IPC_I2C_CNTRL_ADDR);
+	} else {
+		dev_err(&ipcdev.pdev->dev,
+			"intel_scu_ipc: I2C INVALID_CMD = 0x%x\n", cmd);
+
+		mutex_unlock(&ipclock);
+		return -1;
+	}
+	mutex_unlock(&ipclock);
+	return 0;
+}
+EXPORT_SYMBOL(intel_scu_ipc_i2c_cntrl);
+
+#define IPC_FW_LOAD_ADDR 0xFFFC0000 /* Storage location for FW image */
+#define IPC_FW_UPDATE_MBOX_ADDR 0xFFFFDFF4 /* Mailbox between ipc and scu */
+#define IPC_MAX_FW_SIZE 262144 /* 256K storage size for loading the FW image */
+#define IPC_FW_MIP_HEADER_SIZE 2048 /* Firmware MIP header size */
+/* IPC inform SCU to get ready for update process */
+#define IPC_CMD_FW_UPDATE_READY  0x10FE
+/* IPC inform SCU to go for update process */
+#define IPC_CMD_FW_UPDATE_GO     0x20FE
+/* Status code for fw update */
+#define IPC_FW_UPDATE_SUCCESS	0x444f4e45 /* Status code 'DONE' */
+#define IPC_FW_UPDATE_BADN	0x4241444E /* Status code 'BADN' */
+#define IPC_FW_TXHIGH		0x54784849 /* Status code 'IPC_FW_TXHIGH' */
+#define IPC_FW_TXLOW		0x54784c4f /* Status code 'IPC_FW_TXLOW' */
+
+struct fw_update_mailbox {
+	u32    status;
+	u32    scu_flag;
+	u32    driver_flag;
+};
+
+
+/**
+ *	intel_scu_ipc_fw_update	-	 Firmware update utility
+ *	@buffer: firmware buffer
+ *	@length: size of firmware buffer
+ *
+ *	This function provides an interface to load the firmware into
+ *	the SCU. Returns 0 on success or -1 on failure
+ */
+int intel_scu_ipc_fw_update(u8 *buffer, u32 length)
+{
+	void __iomem *fw_update_base;
+	struct fw_update_mailbox __iomem *mailbox = NULL;
+	int retry_cnt = 0;
+	u32 status;
+
+	mutex_lock(&ipclock);
+	fw_update_base = ioremap_nocache(IPC_FW_LOAD_ADDR, (128*1024));
+	if (fw_update_base == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENOMEM;
+	}
+	mailbox = ioremap_nocache(IPC_FW_UPDATE_MBOX_ADDR,
+					sizeof(struct fw_update_mailbox));
+	if (mailbox == NULL) {
+		iounmap(fw_update_base);
+		mutex_unlock(&ipclock);
+		return -ENOMEM;
+	}
+
+	ipc_command(IPC_CMD_FW_UPDATE_READY);
+
+	/* Intitialize mailbox */
+	writel(0, &mailbox->status);
+	writel(0, &mailbox->scu_flag);
+	writel(0, &mailbox->driver_flag);
+
+	/* Driver copies the 2KB MIP header to SRAM at 0xFFFC0000*/
+	memcpy_toio(fw_update_base, buffer, 0x800);
+
+	/* Driver sends "FW Update" IPC command (CMD_ID 0xFE; MSG_ID 0x02).
+	* Upon receiving this command, SCU will write the 2K MIP header
+	* from 0xFFFC0000 into NAND.
+	* SCU will write a status code into the Mailbox, and then set scu_flag.
+	*/
+
+	ipc_command(IPC_CMD_FW_UPDATE_GO);
+
+	/*Driver stalls until scu_flag is set */
+	while (readl(&mailbox->scu_flag) != 1) {
+		rmb();
+		mdelay(1);
+	}
+
+	/* Driver checks Mailbox status.
+	 * If the status is 'BADN', then abort (bad NAND).
+	 * If the status is 'IPC_FW_TXLOW', then continue.
+	 */
+	while (readl(&mailbox->status) != IPC_FW_TXLOW) {
+		rmb();
+		mdelay(10);
+	}
+	mdelay(10);
+
+update_retry:
+	if (retry_cnt > 5)
+		goto update_end;
+
+	if (readl(&mailbox->status) != IPC_FW_TXLOW)
+		goto update_end;
+	buffer = buffer + 0x800;
+	memcpy_toio(fw_update_base, buffer, 0x20000);
+	writel(1, &mailbox->driver_flag);
+	while (readl(&mailbox->scu_flag) == 1) {
+		rmb();
+		mdelay(1);
+	}
+
+	/* check for 'BADN' */
+	if (readl(&mailbox->status) == IPC_FW_UPDATE_BADN)
+		goto update_end;
+
+	while (readl(&mailbox->status) != IPC_FW_TXHIGH) {
+		rmb();
+		mdelay(10);
+	}
+	mdelay(10);
+
+	if (readl(&mailbox->status) != IPC_FW_TXHIGH)
+		goto update_end;
+
+	buffer = buffer + 0x20000;
+	memcpy_toio(fw_update_base, buffer, 0x20000);
+	writel(0, &mailbox->driver_flag);
+
+	while (mailbox->scu_flag == 0) {
+		rmb();
+		mdelay(1);
+	}
+
+	/* check for 'BADN' */
+	if (readl(&mailbox->status) == IPC_FW_UPDATE_BADN)
+		goto update_end;
+
+	if (readl(&mailbox->status) == IPC_FW_TXLOW) {
+		++retry_cnt;
+		goto update_retry;
+	}
+
+update_end:
+	status = readl(&mailbox->status);
+
+	iounmap(fw_update_base);
+	iounmap(mailbox);
+	mutex_unlock(&ipclock);
+
+	if (status == IPC_FW_UPDATE_SUCCESS)
+		return 0;
+	return -1;
+}
+EXPORT_SYMBOL(intel_scu_ipc_fw_update);
+
+/*
+ * Interrupt handler gets called when ioc bit of IPC_COMMAND_REG set to 1
+ * When ioc bit is set to 1, caller api must wait for interrupt handler called
+ * which in turn unlocks the caller api. Currently this is not used
+ *
+ * This is edge triggered so we need take no action to clear anything
+ */
+static irqreturn_t ioc(int irq, void *dev_id)
+{
+	return IRQ_HANDLED;
+}
+
+/**
+ *	ipc_probe	-	probe an Intel SCU IPC
+ *	@dev: the PCI device matching
+ *	@id: entry in the match table
+ *
+ *	Enable and install an intel SCU IPC. This appears in the PCI space
+ *	but uses some hard coded addresses as well.
+ */
+static int ipc_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	int err;
+	resource_size_t pci_resource;
+
+	if (ipcdev.pdev)		/* We support only one SCU */
+		return -EBUSY;
+
+	ipcdev.pdev = pci_dev_get(dev);
+
+	err = pci_enable_device(dev);
+	if (err)
+		return err;
+
+	err = pci_request_regions(dev, "intel_scu_ipc");
+	if (err)
+		return err;
+
+	pci_resource = pci_resource_start(dev, 0);
+	if (!pci_resource)
+		return -ENOMEM;
+
+	if (request_irq(dev->irq, ioc, 0, "intel_scu_ipc", &ipcdev))
+		return -EBUSY;
+
+	ipcdev.ipc_base = ioremap_nocache(IPC_BASE_ADDR, IPC_MAX_ADDR);
+	if (!ipcdev.ipc_base)
+		return -ENOMEM;
+
+	ipcdev.i2c_base = ioremap_nocache(IPC_I2C_BASE, IPC_I2C_MAX_ADDR);
+	if (!ipcdev.i2c_base) {
+		iounmap(ipcdev.ipc_base);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/**
+ *	ipc_remove	-	remove a bound IPC device
+ *	@pdev: PCI device
+ *
+ *	In practice the SCU is not removable but this function is also
+ *	called for each device on a module unload or cleanup which is the
+ *	path that will get used.
+ *
+ *	Free up the mappings and release the PCI resources
+ */
+static void ipc_remove(struct pci_dev *pdev)
+{
+	free_irq(pdev->irq, &ipcdev);
+	pci_release_regions(pdev);
+	pci_dev_put(ipcdev.pdev);
+	iounmap(ipcdev.ipc_base);
+	iounmap(ipcdev.i2c_base);
+	ipcdev.pdev = NULL;
+}
+
+static const struct pci_device_id pci_ids[] = {
+	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x080e)},
+	{ 0,}
+};
+MODULE_DEVICE_TABLE(pci, pci_ids);
+
+static struct pci_driver ipc_driver = {
+	.name = "intel_scu_ipc",
+	.id_table = pci_ids,
+	.probe = ipc_probe,
+	.remove = ipc_remove,
+};
+
+
+static int __init intel_scu_ipc_init(void)
+{
+	return  pci_register_driver(&ipc_driver);
+}
+
+static void __exit intel_scu_ipc_exit(void)
+{
+	pci_unregister_driver(&ipc_driver);
+}
+
+MODULE_AUTHOR("Sreedhara DS <sreedhara.ds@intel.com>");
+MODULE_DESCRIPTION("Intel SCU IPC driver");
+MODULE_LICENSE("GPL");
+
+module_init(intel_scu_ipc_init);
+module_exit(intel_scu_ipc_exit);
-- 
cgit v1.2.3-70-g09d2


From 0db1a7bc00216a981d0b7056627ad8682f4f0636 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 17 May 2010 16:13:04 +0800
Subject: perf, x86: P4 PMU -- handle unflagged events

It might happen that an event can overflow without
the proper overflow flag set. Check the sign bit in
the raw counter value to solve this problem.

Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: fweisbec@gmail.com
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1274083984.6540.15.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 424fc8de68e..02f07283023 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -465,15 +465,21 @@ out:
 	return rc;
 }
 
-static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 {
-	unsigned long dummy;
+	int overflow = 0;
+	u32 low, high;
 
-	rdmsrl(hwc->config_base + hwc->idx, dummy);
-	if (dummy & P4_CCCR_OVF) {
+	rdmsr(hwc->config_base + hwc->idx, low, high);
+
+	/* we need to check high bit for unflagged overflows */
+	if ((low & P4_CCCR_OVF) || (high & (1 << 31))) {
+		overflow = 1;
 		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
-			((u64)dummy) & ~P4_CCCR_OVF);
+			((u64)low) & ~P4_CCCR_OVF);
 	}
+
+	return overflow;
 }
 
 static inline void p4_pmu_disable_event(struct perf_event *event)
@@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
 
 		WARN_ON_ONCE(hwc->idx != idx);
 
-		/*
-		 * FIXME: Redundant call, actually not needed
-		 * but just to check if we're screwed
-		 */
-		p4_pmu_clear_cccr_ovf(hwc);
+		/* it might be unflagged overflow */
+		handled = p4_pmu_clear_cccr_ovf(hwc);
 
 		val = x86_perf_event_update(event);
-		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
+		if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
 			continue;
 
-		/*
-		 * event overflow
-		 */
-		handled		= 1;
-		data.period	= event->hw.last_period;
+		/* event overflow for sure */
+		data.period = event->hw.last_period;
 
 		if (!x86_perf_event_set_period(event))
 			continue;
-- 
cgit v1.2.3-70-g09d2


From ef4f30f54e265c2f6f9ac9eda4db158a4e16050b Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue, 18 May 2010 17:29:14 +0800
Subject: perf, x86: P4 PMU -- fix typo in unflagged NMI handling

Tested-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <1274174954.22793.17.camel@minggr.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 02f07283023..87e1803e67a 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -473,7 +473,7 @@ static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
 	rdmsr(hwc->config_base + hwc->idx, low, high);
 
 	/* we need to check high bit for unflagged overflows */
-	if ((low & P4_CCCR_OVF) || (high & (1 << 31))) {
+	if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
 		overflow = 1;
 		(void)checking_wrmsrl(hwc->config_base + hwc->idx,
 			((u64)low) & ~P4_CCCR_OVF);
-- 
cgit v1.2.3-70-g09d2


From a02ce953a14d6a8aab721b129b3c8ff4981a76e6 Mon Sep 17 00:00:00 2001
From: Feng Tang <feng.tang@intel.com>
Date: Wed, 5 May 2010 17:08:49 +0800
Subject: x86/PCI: make ACPI MCFG reserved error messages ACPI specific

Both ACPI and SFI firmwares will have MCFG space, but the error message
isn't valid on SFI, so don't print the message in that case.

Signed-off-by: Feng Tang <feng.tang@intel.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/mmconfig-shared.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 39b9ebe8f88..a918553ebc7 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -483,16 +483,17 @@ static void __init pci_mmcfg_reject_broken(int early)
 	list_for_each_entry(cfg, &pci_mmcfg_list, list) {
 		int valid = 0;
 
-		if (!early && !acpi_disabled)
+		if (!early && !acpi_disabled) {
 			valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
 
-		if (valid)
-			continue;
-
-		if (!early)
-			printk(KERN_ERR FW_BUG PREFIX
-			       "MMCONFIG at %pR not reserved in "
-			       "ACPI motherboard resources\n", &cfg->res);
+			if (valid)
+				continue;
+			else
+				printk(KERN_ERR FW_BUG PREFIX
+				       "MMCONFIG at %pR not reserved in "
+				       "ACPI motherboard resources\n",
+				       &cfg->res);
+		}
 
 		/* Don't try to do this check unless configuration
 		   type 1 is available. how about type 2 ?*/
-- 
cgit v1.2.3-70-g09d2


From 623aab896ee1a532cb540bcf0d5ae8a88275afd5 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 19 May 2010 01:19:17 +0400
Subject: perf, x86: P4 PMU -- do a real check for ESCR address being in hash

To prevent from clashes in future code modifications
do a real check for ESCR address being in hash. At
moment the callers are known to pass sane values but
better to be on a safe side.

And comment fix.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100518212439.004503600@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 87e1803e67a..5f8e36d6279 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
 
 /*
  * ESCR address hashing is tricky, ESCRs are not sequential
- * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
  * the metric between any ESCRs is laid in range [0xa0,0xe1]
  *
  * so we make ~70% filled hashtable
@@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr)
 {
 	unsigned int idx = P4_ESCR_MSR_IDX(addr);
 
-	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
-			!p4_escr_table[idx])) {
+	if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE	||
+			!p4_escr_table[idx]		||
+			p4_escr_table[idx] != addr)) {
 		WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
 		return -1;
 	}
-- 
cgit v1.2.3-70-g09d2


From 9d36dfcf219e2ba1f1d169a7f92dcf2cbd4e05f0 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 19 May 2010 01:19:18 +0400
Subject: perf, x86: P4_pmu_schedule_events -- use smp_processor_id instead of
 raw_

This snippet somehow escaped the commit:

 | commit 137351e0feeb9f25d99488ee1afc1c79f5499a9a
 | Author: Cyrill Gorcunov <gorcunov@openvz.org>
 | Date:   Sat May 8 15:25:52 2010 +0400
 |
 |    x86, perf: P4 PMU -- protect sensible procedures from preemption

so bring it eventually back. It helps to catch
preemption issue (if there will be, rule of thumb --
don't use raw_ if you can).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100518212439.167259349@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event_p4.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 5f8e36d6279..ae85d69644d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -763,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign
 {
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
-	int cpu = raw_smp_processor_id();
+	int cpu = smp_processor_id();
 	struct hw_perf_event *hwc;
 	struct p4_event_bind *bind;
 	unsigned int i, thread, num;
-- 
cgit v1.2.3-70-g09d2


From ce7f15452cc1dc1eca795542367871a07f37aa79 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Wed, 19 May 2010 01:19:19 +0400
Subject: perf, x86: P4 PMU -- add missing bit in CCCR mask

Should be there for the sake of RAW events.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Lin Ming <ming.m.lin@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20100518212439.354345151@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_event_p4.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index b05400a542f..64a8ebff06f 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -89,7 +89,8 @@
 	P4_CCCR_ENABLE)
 
 /* HT mask */
-#define P4_CCCR_MASK_HT	(P4_CCCR_MASK | P4_CCCR_THREAD_ANY)
+#define P4_CCCR_MASK_HT				\
+	(P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
 
 #define P4_GEN_ESCR_EMASK(class, name, bit)	\
 	class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
-- 
cgit v1.2.3-70-g09d2


From 5a7388c2d2faa2cc70c2d4717c8d7836d55459e0 Mon Sep 17 00:00:00 2001
From: Eric Northup <digitaleric@google.com>
Date: Mon, 26 Apr 2010 17:00:05 -0700
Subject: KVM: MMU: fix hashing for TDP and non-paging modes

For TDP mode, avoid creating multiple page table roots for the single
guest-to-host physical address map by fixing the inputs used for the
shadow page table hash in mmu_alloc_roots().

Signed-off-by: Eric Northup <digitaleric@google.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ddfa8658fb6..9696d654b01 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2059,10 +2059,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		hpa_t root = vcpu->arch.mmu.root_hpa;
 
 		ASSERT(!VALID_PAGE(root));
-		if (tdp_enabled)
-			direct = 1;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
+		if (tdp_enabled) {
+			direct = 1;
+			root_gfn = 0;
+		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2072,8 +2074,6 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 	direct = !is_paging(vcpu);
-	if (tdp_enabled)
-		direct = 1;
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -2089,6 +2089,10 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		if (mmu_check_root(vcpu, root_gfn))
 			return 1;
+		if (tdp_enabled) {
+			direct = 1;
+			root_gfn = i << 30;
+		}
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
-- 
cgit v1.2.3-70-g09d2


From d35b8dd9355805f17225fdbfee4bc704d7bf7547 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue, 27 Apr 2010 10:39:49 +0800
Subject: KVM: Fix mmu shrinker error

kvm_mmu_remove_one_alloc_mmu_page() assumes kvm_mmu_zap_page() only reclaims
only one sp, but that's not the case. This will cause mmu shrinker returns
a wrong number. This patch fix the counting error.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9696d654b01..18d2f584945 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2902,13 +2902,13 @@ restart:
 	kvm_flush_remote_tlbs(kvm);
 }
 
-static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	kvm_mmu_zap_page(kvm, page);
+	return kvm_mmu_zap_page(kvm, page) + 1;
 }
 
 static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
@@ -2920,7 +2920,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 	spin_lock(&kvm_lock);
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
-		int npages, idx;
+		int npages, idx, freed_pages;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
@@ -2928,8 +2928,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
 			 kvm->arch.n_free_mmu_pages;
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-			kvm_mmu_remove_one_alloc_mmu_page(kvm);
-			cache_count--;
+			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
-- 
cgit v1.2.3-70-g09d2


From 22c9b2d166a88573a933e8d355833d36579d83be Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 28 Apr 2010 11:54:44 +0800
Subject: KVM: MMU: fix for calculating gpa in invlpg code

If the guest is 32-bit, we should use 'quadrant' to adjust gpa
offset

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d0cc07eb6ed..3464fdbdb98 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -478,8 +478,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
 		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
 			struct kvm_mmu_page *sp = page_header(__pa(sptep));
+			int offset, shift;
 
-			pte_gpa = (sp->gfn << PAGE_SHIFT);
+			shift = PAGE_SHIFT -
+				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
+			offset = sp->role.quadrant << shift;
+
+			pte_gpa = (sp->gfn << PAGE_SHIFT) + offset;
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
-- 
cgit v1.2.3-70-g09d2


From 85f2067c3192212bce3bd2e1d6ad10153d6f2a4e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 28 Apr 2010 11:54:55 +0800
Subject: KVM: MMU: convert mmu tracepoints

Convert mmu tracepoints by using DECLARE_EVENT_CLASS

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmutrace.h | 69 +++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 40a1786f36c..42f07b1bfbc 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -92,15 +92,15 @@ TRACE_EVENT(
 	TP_printk("pte %llx level %u", __entry->pte, __entry->level)
 );
 
-/* We set a pte accessed bit */
-TRACE_EVENT(
-	kvm_mmu_set_accessed_bit,
+DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class,
+
 	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
 	TP_ARGS(table_gfn, index, size),
 
 	TP_STRUCT__entry(
 		__field(__u64, gpa)
-		),
+	),
 
 	TP_fast_assign(
 		__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
@@ -110,22 +110,20 @@ TRACE_EVENT(
 	TP_printk("gpa %llx", __entry->gpa)
 );
 
-/* We set a pte dirty bit */
-TRACE_EVENT(
-	kvm_mmu_set_dirty_bit,
+/* We set a pte accessed bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit,
+
 	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
-	TP_ARGS(table_gfn, index, size),
 
-	TP_STRUCT__entry(
-		__field(__u64, gpa)
-		),
+	TP_ARGS(table_gfn, index, size)
+);
 
-	TP_fast_assign(
-		__entry->gpa = ((u64)table_gfn << PAGE_SHIFT)
-				+ index * size;
-		),
+/* We set a pte dirty bit */
+DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit,
 
-	TP_printk("gpa %llx", __entry->gpa)
+	TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size),
+
+	TP_ARGS(table_gfn, index, size)
 );
 
 TRACE_EVENT(
@@ -164,54 +162,39 @@ TRACE_EVENT(
 		  __entry->created ? "new" : "existing")
 );
 
-TRACE_EVENT(
-	kvm_mmu_sync_page,
+DECLARE_EVENT_CLASS(kvm_mmu_page_class,
+
 	TP_PROTO(struct kvm_mmu_page *sp),
 	TP_ARGS(sp),
 
 	TP_STRUCT__entry(
 		KVM_MMU_PAGE_FIELDS
-		),
+	),
 
 	TP_fast_assign(
 		KVM_MMU_PAGE_ASSIGN(sp)
-		),
+	),
 
 	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
 );
 
-TRACE_EVENT(
-	kvm_mmu_unsync_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
-	TP_ARGS(sp),
-
-	TP_STRUCT__entry(
-		KVM_MMU_PAGE_FIELDS
-		),
-
-	TP_fast_assign(
-		KVM_MMU_PAGE_ASSIGN(sp)
-		),
 
-	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+	TP_ARGS(sp)
 );
 
-TRACE_EVENT(
-	kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
-	TP_ARGS(sp),
 
-	TP_STRUCT__entry(
-		KVM_MMU_PAGE_FIELDS
-		),
+	TP_ARGS(sp)
+);
 
-	TP_fast_assign(
-		KVM_MMU_PAGE_ASSIGN(sp)
-		),
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+	TP_PROTO(struct kvm_mmu_page *sp),
 
-	TP_printk("%s", KVM_MMU_PAGE_PRINTK())
+	TP_ARGS(sp)
 );
-
 #endif /* _TRACE_KVMMMU_H */
 
 #undef TRACE_INCLUDE_PATH
-- 
cgit v1.2.3-70-g09d2


From 5e1b3ddbf220d2256a6a0d676a219ed76b8048d0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 28 Apr 2010 11:55:06 +0800
Subject: KVM: MMU: move unsync/sync tracpoints to proper place

Move unsync/sync tracepoints to the proper place, it's good
for us to obtain unsync page live time

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 18d2f584945..51eb6d6abd8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1189,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
+	trace_kvm_mmu_sync_page(sp);
 	sp->unsync = 0;
 	--kvm->stat.mmu_unsync;
 }
@@ -1202,7 +1203,6 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		return 1;
 	}
 
-	trace_kvm_mmu_sync_page(sp);
 	if (rmap_write_protect(vcpu->kvm, sp->gfn))
 		kvm_flush_remote_tlbs(vcpu->kvm);
 	kvm_unlink_unsync_page(vcpu->kvm, sp);
@@ -1730,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
 
-	trace_kvm_mmu_unsync_page(sp);
 	index = kvm_page_table_hashfn(sp->gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	/* don't unsync if pagetable is shadowed with multiple roles */
@@ -1740,6 +1739,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
+	trace_kvm_mmu_unsync_page(sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-- 
cgit v1.2.3-70-g09d2


From 884a0ff0b68b3ece5987507de168215e14ef7849 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 28 Apr 2010 11:55:15 +0800
Subject: KVM: MMU: cleanup invlpg code

Using is_last_spte() to cleanup invlpg code

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3464fdbdb98..89d66ca4d87 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -474,9 +474,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		level = iterator.level;
 		sptep = iterator.sptep;
 
-		if (level == PT_PAGE_TABLE_LEVEL  ||
-		    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
-		    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+		if (is_last_spte(*sptep, level)) {
 			struct kvm_mmu_page *sp = page_header(__pa(sptep));
 			int offset, shift;
 
-- 
cgit v1.2.3-70-g09d2


From 0ee75bead83da4791e5cbf659806c54d8ee40f12 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 28 Apr 2010 15:39:01 +0300
Subject: KVM: Let vcpu structure alignment be determined at runtime

vmx and svm vcpus have different contents and therefore may have different
alignmment requirements.  Let each specify its required alignment.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/vmm.c       | 2 +-
 arch/powerpc/kvm/44x.c    | 2 +-
 arch/powerpc/kvm/book3s.c | 3 ++-
 arch/powerpc/kvm/e500.c   | 2 +-
 arch/s390/kvm/kvm-s390.c  | 2 +-
 arch/x86/kvm/svm.c        | 2 +-
 arch/x86/kvm/vmx.c        | 3 ++-
 include/linux/kvm_host.h  | 2 +-
 virt/kvm/kvm_main.c       | 7 ++++---
 9 files changed, 14 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 7a62f75778c..f0b9cac8241 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -51,7 +51,7 @@ static int __init  kvm_vmm_init(void)
 	vmm_fpswa_interface = fpswa_interface;
 
 	/*Register vmm data to kvm side*/
-	return kvm_init(&vmm_info, 1024, THIS_MODULE);
+	return kvm_init(&vmm_info, 1024, 0, THIS_MODULE);
 }
 
 static void __exit kvm_vmm_exit(void)
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 689a57c2ac8..73c0a3f64ed 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -147,7 +147,7 @@ static int __init kvmppc_44x_init(void)
 	if (r)
 		return r;
 
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
 }
 
 static void __exit kvmppc_44x_exit(void)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 28e785fb2ca..11f226ff446 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1385,7 +1385,8 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 static int kvmppc_book3s_init(void)
 {
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), THIS_MODULE);
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+			THIS_MODULE);
 }
 
 static void kvmppc_book3s_exit(void)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 669a5c5fc7d..bc2b4004eb2 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -161,7 +161,7 @@ static int __init kvmppc_e500_init(void)
 	flush_icache_range(kvmppc_booke_handlers,
 			kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
 
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
 }
 
 static void __init kvmppc_e500_exit(void)
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ee7c713686c..8093e6f47f4 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -752,7 +752,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 static int __init kvm_s390_init(void)
 {
 	int ret;
-	ret = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+	ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
 	if (ret)
 		return ret;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 889f66022e5..2511664ff67 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3319,7 +3319,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 static int __init svm_init(void)
 {
 	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
-			      THIS_MODULE);
+			__alignof__(struct vcpu_svm), THIS_MODULE);
 }
 
 static void __exit svm_exit(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 875b785228f..2e872967860 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4245,7 +4245,8 @@ static int __init vmx_init(void)
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
+	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+		     __alignof__(struct vcpu_vmx), THIS_MODULE);
 	if (r)
 		goto out3;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ce027d51809..7cb116afa1c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -243,7 +243,7 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu);
 void vcpu_load(struct kvm_vcpu *vcpu);
 void vcpu_put(struct kvm_vcpu *vcpu);
 
-int kvm_init(void *opaque, unsigned int vcpu_size,
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		  struct module *module);
 void kvm_exit(void);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9ab1a77941e..f032806a212 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2178,7 +2178,7 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 	kvm_arch_vcpu_put(vcpu);
 }
 
-int kvm_init(void *opaque, unsigned int vcpu_size,
+int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 		  struct module *module)
 {
 	int r;
@@ -2228,8 +2228,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 		goto out_free_4;
 
 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
-	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
-					   __alignof__(struct kvm_vcpu),
+	if (!vcpu_align)
+		vcpu_align = __alignof__(struct kvm_vcpu);
+	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
 					   0, NULL);
 	if (!kvm_vcpu_cache) {
 		r = -ENOMEM;
-- 
cgit v1.2.3-70-g09d2


From 19b95dba0324e55505682a18ff9a437fbf4a2592 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 28 Apr 2010 15:40:31 +0300
Subject: KVM: VMX: Add definition for msr autoload entry

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index fb9a080740e..44973189076 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -25,6 +25,8 @@
  *
  */
 
+#include <linux/types.h>
+
 /*
  * Definitions of Primary Processor-Based VM-Execution Controls.
  */
@@ -394,6 +396,10 @@ enum vmcs_field {
 #define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
 #define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
 
-
+struct vmx_msr_entry {
+	u32 index;
+	u32 reserved;
+	u64 value;
+} __aligned(16);
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From 5dfa3d170e17cbf9e4816a5ba2f5913c31c03e93 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 28 Apr 2010 15:41:03 +0300
Subject: KVM: VMX: Add definitions for guest and host EFER autoswitch vmcs
 entries

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 44973189076..9e6779f7cf2 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -122,6 +122,8 @@ enum vmcs_field {
 	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
 	GUEST_IA32_PAT			= 0x00002804,
 	GUEST_IA32_PAT_HIGH		= 0x00002805,
+	GUEST_IA32_EFER			= 0x00002806,
+	GUEST_IA32_EFER_HIGH		= 0x00002807,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -132,6 +134,8 @@ enum vmcs_field {
 	GUEST_PDPTR3_HIGH               = 0x00002811,
 	HOST_IA32_PAT			= 0x00002c00,
 	HOST_IA32_PAT_HIGH		= 0x00002c01,
+	HOST_IA32_EFER			= 0x00002c02,
+	HOST_IA32_EFER_HIGH		= 0x00002c03,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
-- 
cgit v1.2.3-70-g09d2


From 61d2ef2ce3e0161bedf5d2867f546a8df77fa9bc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 28 Apr 2010 16:40:38 +0300
Subject: KVM: VMX: Add facility to atomically switch MSRs on guest entry/exit

Some guest msr values cannot be used on the host (for example. EFER.NX=0),
so we need to switch them atomically during guest entry or exit.

Add a facility to program the vmx msr autoload registers accordingly.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2e872967860..ae22dcf1721 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -98,6 +98,8 @@ module_param(ple_gap, int, S_IRUGO);
 static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
 module_param(ple_window, int, S_IRUGO);
 
+#define NR_AUTOLOAD_MSRS 1
+
 struct vmcs {
 	u32 revision_id;
 	u32 abort;
@@ -125,6 +127,11 @@ struct vcpu_vmx {
 	u64 		      msr_guest_kernel_gs_base;
 #endif
 	struct vmcs          *vmcs;
+	struct msr_autoload {
+		unsigned nr;
+		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
+		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
+	} msr_autoload;
 	struct {
 		int           loaded;
 		u16           fs_sel, gs_sel, ldt_sel;
@@ -595,6 +602,46 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
+static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
+{
+	unsigned i;
+	struct msr_autoload *m = &vmx->msr_autoload;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			break;
+
+	if (i == m->nr)
+		return;
+	--m->nr;
+	m->guest[i] = m->guest[m->nr];
+	m->host[i] = m->host[m->nr];
+	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+}
+
+static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
+				  u64 guest_val, u64 host_val)
+{
+	unsigned i;
+	struct msr_autoload *m = &vmx->msr_autoload;
+
+	for (i = 0; i < m->nr; ++i)
+		if (m->guest[i].index == msr)
+			break;
+
+	if (i == m->nr) {
+		++m->nr;
+		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
+		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
+	}
+
+	m->guest[i].index = msr;
+	m->guest[i].value = guest_val;
+	m->host[i].index = msr;
+	m->host[i].value = host_val;
+}
+
 static void reload_tss(void)
 {
 	/*
@@ -2470,7 +2517,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */
 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
+	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
+	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
 
 	rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
 	vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
-- 
cgit v1.2.3-70-g09d2


From 84ad33ef5dbc12665ad42ee07a2daed473d3ec54 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 28 Apr 2010 16:42:29 +0300
Subject: KVM: VMX: Atomically switch efer if EPT && !EFER.NX

When EPT is enabled, we cannot emulate EFER.NX=0 through the shadow page
tables.  This causes accesses through ptes with bit 63 set to succeed instead
of failing a reserved bit check.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ae22dcf1721..c4f3955c64e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -678,6 +678,17 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
 	guest_efer |= host_efer & ignore_bits;
 	vmx->guest_msrs[efer_offset].data = guest_efer;
 	vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
+
+	clear_atomic_switch_msr(vmx, MSR_EFER);
+	/* On ept, can't emulate nx, and must switch nx atomically */
+	if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+		guest_efer = vmx->vcpu.arch.efer;
+		if (!(guest_efer & EFER_LMA))
+			guest_efer &= ~EFER_LME;
+		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+		return false;
+	}
+
 	return true;
 }
 
@@ -1734,6 +1745,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 	vmcs_write32(VM_ENTRY_CONTROLS,
 		     vmcs_read32(VM_ENTRY_CONTROLS)
 		     & ~VM_ENTRY_IA32E_MODE);
+	vmx_set_efer(vcpu, vcpu->arch.efer);
 }
 
 #endif
-- 
cgit v1.2.3-70-g09d2


From f1d86e469b60f9e1afed5c17a6e723c2c9c55ceb Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 3 May 2010 23:04:27 -0300
Subject: KVM: x86: properly update ready_for_interrupt_injection

The recent changes to emulate string instructions without entering guest
mode exposed a bug where pending interrupts are not properly reflected
in ready_for_interrupt_injection.

The result is that userspace overwrites a previously queued interrupt,
when irqchip's are emulated in userspace.

Fix by always updating state before returning to userspace.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b2ce1d2d74..dff08e527ec 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4653,7 +4653,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 	}
 
 	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-	post_kvm_run_save(vcpu);
 
 	vapic_exit(vcpu);
 
@@ -4703,6 +4702,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	r = __vcpu_run(vcpu);
 
 out:
+	post_kvm_run_save(vcpu);
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-- 
cgit v1.2.3-70-g09d2


From cafd66595d92591e4bd25c3904e004fc6f897e2d Mon Sep 17 00:00:00 2001
From: Shane Wang <shane.wang@intel.com>
Date: Thu, 29 Apr 2010 12:09:01 -0400
Subject: KVM: VMX: enable VMXON check with SMX enabled (Intel TXT)

Per document, for feature control MSR:

  Bit 1 enables VMXON in SMX operation. If the bit is clear, execution
        of VMXON in SMX operation causes a general-protection exception.
  Bit 2 enables VMXON outside SMX operation. If the bit is clear, execution
        of VMXON outside SMX operation causes a general-protection exception.

This patch is to enable this kind of check with SMX for VMXON in KVM.

Signed-off-by: Shane Wang <shane.wang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/msr-index.h |  5 +++--
 arch/x86/kernel/tboot.c          |  1 +
 arch/x86/kvm/vmx.c               | 32 +++++++++++++++++++++-----------
 include/linux/tboot.h            |  1 +
 4 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index bc473acfa7f..f9324851eba 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -202,8 +202,9 @@
 #define MSR_IA32_EBL_CR_POWERON		0x0000002a
 #define MSR_IA32_FEATURE_CONTROL        0x0000003a
 
-#define FEATURE_CONTROL_LOCKED		(1<<0)
-#define FEATURE_CONTROL_VMXON_ENABLED	(1<<2)
+#define FEATURE_CONTROL_LOCKED				(1<<0)
+#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX	(1<<1)
+#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX	(1<<2)
 
 #define MSR_IA32_APICBASE		0x0000001b
 #define MSR_IA32_APICBASE_BSP		(1<<8)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48a..46b827778d1 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
 
 /* Global pointer to shared data; NULL means no measured launch. */
 struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
 
 /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
 #define AP_WAIT_TIMEOUT		1
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c4f3955c64e..d2a47aefdee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -27,6 +27,7 @@
 #include <linux/moduleparam.h>
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
+#include <linux/tboot.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -1272,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void)
 	u64 msr;
 
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
-	return (msr & (FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED))
-	    == FEATURE_CONTROL_LOCKED;
+	if (msr & FEATURE_CONTROL_LOCKED) {
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
+			&& tboot_enabled())
+			return 1;
+		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
+			&& !tboot_enabled())
+			return 1;
+	}
+
+	return 0;
 	/* locked but not enabled */
 }
 
@@ -1282,21 +1290,23 @@ static int hardware_enable(void *garbage)
 {
 	int cpu = raw_smp_processor_id();
 	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
-	u64 old;
+	u64 old, test_bits;
 
 	if (read_cr4() & X86_CR4_VMXE)
 		return -EBUSY;
 
 	INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
 	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
-	if ((old & (FEATURE_CONTROL_LOCKED |
-		    FEATURE_CONTROL_VMXON_ENABLED))
-	    != (FEATURE_CONTROL_LOCKED |
-		FEATURE_CONTROL_VMXON_ENABLED))
+
+	test_bits = FEATURE_CONTROL_LOCKED;
+	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+	if (tboot_enabled())
+		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
+
+	if ((old & test_bits) != test_bits) {
 		/* enable and lock */
-		wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
-		       FEATURE_CONTROL_LOCKED |
-		       FEATURE_CONTROL_VMXON_ENABLED);
+		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
+	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	asm volatile (ASM_VMX_VMXON_RAX
 		      : : "a"(&phys_addr), "m"(phys_addr)
diff --git a/include/linux/tboot.h b/include/linux/tboot.h
index bf2a0c74887..1dba6ee5520 100644
--- a/include/linux/tboot.h
+++ b/include/linux/tboot.h
@@ -150,6 +150,7 @@ extern int tboot_force_iommu(void);
 
 #else
 
+#define tboot_enabled()			0
 #define tboot_probe()			do { } while (0)
 #define tboot_shutdown(shutdown_type)	do { } while (0)
 #define tboot_sleep(sleep_state, pm1a_control, pm1b_control)	\
-- 
cgit v1.2.3-70-g09d2


From 8facbbff071ff2b19268d3732e31badc60471e21 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 4 May 2010 12:58:32 +0300
Subject: KVM: MMU: Don't read pdptrs with mmu spinlock held in mmu_alloc_roots

On svm, kvm_read_pdptr() may require reading guest memory, which can sleep.

Push the spinlock into mmu_alloc_roots(), and only take it after we've read
the pdptr.

Tested-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 51eb6d6abd8..de996380ec2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2065,11 +2065,13 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			direct = 1;
 			root_gfn = 0;
 		}
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		vcpu->arch.mmu.root_hpa = root;
 		return 0;
 	}
@@ -2093,11 +2095,14 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			direct = 1;
 			root_gfn = i << 30;
 		}
+		spin_lock(&vcpu->kvm->mmu_lock);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
+		spin_unlock(&vcpu->kvm->mmu_lock);
+
 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
 	}
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
@@ -2466,7 +2471,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 		goto out;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
 	r = mmu_alloc_roots(vcpu);
+	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (r)
-- 
cgit v1.2.3-70-g09d2


From 9ed3c444ab8987c7b219173a2f7807e3f71e234e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 4 May 2010 15:00:37 +0300
Subject: KVM: Fix wallclock version writing race

Wallclock writing uses an unprotected global variable to hold the version;
this can cause one guest to interfere with another if both write their
wallclock at the same time.

Acked-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dff08e527ec..54f73b6a006 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -754,14 +754,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
 
 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
 {
-	static int version;
+	int version;
+	int r;
 	struct pvclock_wall_clock wc;
 	struct timespec boot;
 
 	if (!wall_clock)
 		return;
 
-	version++;
+	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
+	if (r)
+		return;
+
+	if (version & 1)
+		++version;  /* first time write, random junk */
+
+	++version;
 
 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
 
-- 
cgit v1.2.3-70-g09d2


From 3f0fd2927b737c0ac2e04af7858b60d1e927d4b1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 5 May 2010 16:04:41 +0200
Subject: KVM: x86: Fix exception reinjection forced to true

The patch merged recently which allowed to mark an exception
as reinjected has a bug as it always marks the exception as
reinjected. This breaks nested-svm shadow-on-shadow
implementation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54f73b6a006..161ede2b5f9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -277,7 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 		vcpu->arch.exception.has_error_code = has_error;
 		vcpu->arch.exception.nr = nr;
 		vcpu->arch.exception.error_code = error_code;
-		vcpu->arch.exception.reinject = true;
+		vcpu->arch.exception.reinject = reinject;
 		return;
 	}
 
-- 
cgit v1.2.3-70-g09d2


From 0d945bd9351199744c1e89d57a70615b6ee9f394 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 5 May 2010 16:04:45 +0200
Subject: KVM: SVM: Don't allow nested guest to VMMCALL into host

This patch disables the possibility for a l2-guest to do a
VMMCALL directly into the host. This would happen if the
l1-hypervisor doesn't intercept VMMCALL and the l2-guest
executes this instruction.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2511664ff67..4a66ffe1dc8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2036,6 +2036,9 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 	}
 
+	/* We don't want to see VMMCALLs from a nested guest */
+	svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
+
 	/*
 	 * We don't want a nested guest to be more powerful than the guest, so
 	 * all intercepts are ORed
-- 
cgit v1.2.3-70-g09d2


From b69e8caef5b190af48c525f6d715e7b7728a77f6 Mon Sep 17 00:00:00 2001
From: "Roedel, Joerg" <Joerg.Roedel@amd.com>
Date: Thu, 6 May 2010 11:38:43 +0200
Subject: KVM: x86: Inject #GP with the right rip on efer writes

This patch fixes a bug in the KVM efer-msr write path. If a
guest writes to a reserved efer bit the set_efer function
injects the #GP directly. The architecture dependent wrmsr
function does not see this, assumes success and advances the
rip. This results in a #GP in the guest with the wrong rip.
This patch fixes this by reporting efer write errors back to
the architectural wrmsr function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 161ede2b5f9..fe6d126633d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -683,37 +683,29 @@ static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 };
 
-static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
+static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
-	if (efer & efer_reserved_bits) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (efer & efer_reserved_bits)
+		return 1;
 
 	if (is_paging(vcpu)
-	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
+		return 1;
 
 	if (efer & EFER_FFXSR) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
+			return 1;
 	}
 
 	if (efer & EFER_SVME) {
 		struct kvm_cpuid_entry2 *feat;
 
 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
+			return 1;
 	}
 
 	kvm_x86_ops->set_efer(vcpu, efer);
@@ -725,6 +717,8 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
+
+	return 0;
 }
 
 void kvm_enable_efer_bits(u64 mask)
@@ -1153,8 +1147,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
 	switch (msr) {
 	case MSR_EFER:
-		set_efer(vcpu, data);
-		break;
+		return set_efer(vcpu, data);
 	case MSR_K7_HWCR:
 		data &= ~(u64)0x40;	/* ignore flush filter disable */
 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
-- 
cgit v1.2.3-70-g09d2


From 424c32f1aa3112632a657d45698c8e7666668f78 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:39 -0400
Subject: x86, paravirt: Enable pvclock flags in vcpu_time_info structure

This patch removes one padding byte and transform it into a flags
field. New versions of guests using pvclock will query these flags
upon each read.

Flags, however, will only be interpreted when the guest decides to.
It uses the pvclock_valid_flags function to signal that a specific
set of flags should be taken into consideration. Which flags are valid
are usually devised via HV negotiation.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/pvclock-abi.h | 3 ++-
 arch/x86/include/asm/pvclock.h     | 1 +
 arch/x86/kernel/pvclock.c          | 9 +++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index 6d93508f262..ec5c41ac4be 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info {
 	u64   system_time;
 	u32   tsc_to_system_mul;
 	s8    tsc_shift;
-	u8    pad[3];
+	u8    flags;
+	u8    pad[2];
 } __attribute__((__packed__)); /* 32 bytes */
 
 struct pvclock_wall_clock {
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 53235fd5f8c..cd02f324aa6 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+void pvclock_set_flags(u8 flags);
 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
 			    struct pvclock_vcpu_time_info *vcpu,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761..f7fdd56bc0a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
 	u32 tsc_to_nsec_mul;
 	int tsc_shift;
 	u32 version;
+	u8  flags;
 };
 
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+	valid_flags = flags;
+}
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 		dst->system_timestamp  = src->system_time;
 		dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
 		dst->tsc_shift         = src->tsc_shift;
+		dst->flags             = src->flags;
 		rmb();		/* test version after fetching data */
 	} while ((src->version & 1) || (dst->version != src->version));
 
-- 
cgit v1.2.3-70-g09d2


From 489fb490dbf8dab0249ad82b56688ae3842a79e8 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:40 -0400
Subject: x86, paravirt: Add a global synchronization point for pvclock

In recent stress tests, it was found that pvclock-based systems
could seriously warp in smp systems. Using ingo's time-warp-test.c,
I could trigger a scenario as bad as 1.5mi warps a minute in some systems.
(to be fair, it wasn't that bad in most of them). Investigating further, I
found out that such warps were caused by the very offset-based calculation
pvclock is based on.

This happens even on some machines that report constant_tsc in its tsc flags,
specially on multi-socket ones.

Two reads of the same kernel timestamp at approx the same time, will likely
have tsc timestamped in different occasions too. This means the delta we
calculate is unpredictable at best, and can probably be smaller in a cpu
that is legitimately reading clock in a forward ocasion.

Some adjustments on the host could make this window less likely to happen,
but still, it pretty much poses as an intrinsic problem of the mechanism.

A while ago, I though about using a shared variable anyway, to hold clock
last state, but gave up due to the high contention locking was likely
to introduce, possibly rendering the thing useless on big machines. I argue,
however, that locking is not necessary.

We do a read-and-return sequence in pvclock, and between read and return,
the global value can have changed. However, it can only have changed
by means of an addition of a positive value. So if we detected that our
clock timestamp is less than the current global, we know that we need to
return a higher one, even though it is not exactly the one we compared to.

OTOH, if we detect we're greater than the current time source, we atomically
replace the value with our new readings. This do causes contention on big
boxes (but big here means *BIG*), but it seems like a good trade off, since
it provide us with a time source guaranteed to be stable wrt time warps.

After this patch is applied, I don't see a single warp in time during 5 days
of execution, in any of the machines I saw them before.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
CC: Jeremy Fitzhardinge <jeremy@goop.org>
CC: Avi Kivity <avi@redhat.com>
CC: Marcelo Tosatti <mtosatti@redhat.com>
CC: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kernel/pvclock.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index f7fdd56bc0a..f5bc40e1697 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -118,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
 	return pv_tsc_khz;
 }
 
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
 	unsigned version;
 	cycle_t ret, offset;
+	u64 last;
 
 	do {
 		version = pvclock_get_time_values(&shadow, src);
@@ -132,6 +135,27 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	/*
+	 * Assumption here is that last_value, a global accumulator, always goes
+	 * forward. If we are less than that, we should not be much smaller.
+	 * We assume there is an error marging we're inside, and then the correction
+	 * does not sacrifice accuracy.
+	 *
+	 * For reads: global may have changed between test and return,
+	 * but this means someone else updated poked the clock at a later time.
+	 * We just need to make sure we are not seeing a backwards event.
+	 *
+	 * For updates: last_value = ret is not enough, since two vcpus could be
+	 * updating at the same time, and one of them could be slightly behind,
+	 * making the assumption that last_value always go forward fail to hold.
+	 */
+	last = atomic64_read(&last_value);
+	do {
+		if (ret < last)
+			return last;
+		last = atomic64_cmpxchg(&last_value, last, ret);
+	} while (unlikely(last != ret));
+
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 11c6bffa42b85e703c21a1d2372dce7262daca8e Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:41 -0400
Subject: KVM: x86: change msr numbers for kvmclock

Avi pointed out a while ago that those MSRs falls into the pentium
PMU range. So the idea here is to add new ones, and after a while,
deprecate the old ones.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h | 4 ++++
 arch/x86/kvm/x86.c              | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index ffae1420e7d..97348082782 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,10 @@
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
+/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
+#define MSR_KVM_WALL_CLOCK_NEW  0x4b564d00
+#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+
 #define KVM_MAX_MMU_OP_BATCH           32
 
 /* Operations for KVM_HC_MMU_OP */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fe6d126633d..73d342c69ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -664,9 +664,10 @@ static inline u32 bit(int bitno)
  * kvm-specific. Those are put in the beginning of the list.
  */
 
-#define KVM_SAVE_MSRS_BEGIN	5
+#define KVM_SAVE_MSRS_BEGIN	7
 static u32 msrs_to_save[] = {
 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_APIC_ASSIST_PAGE,
 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
@@ -1193,10 +1194,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	case MSR_IA32_MISC_ENABLE:
 		vcpu->arch.ia32_misc_enable_msr = data;
 		break;
+	case MSR_KVM_WALL_CLOCK_NEW:
 	case MSR_KVM_WALL_CLOCK:
 		vcpu->kvm->arch.wall_clock = data;
 		kvm_write_wall_clock(vcpu->kvm, data);
 		break;
+	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
 		if (vcpu->arch.time_page) {
 			kvm_release_page_dirty(vcpu->arch.time_page);
@@ -1468,9 +1471,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = vcpu->arch.efer;
 		break;
 	case MSR_KVM_WALL_CLOCK:
+	case MSR_KVM_WALL_CLOCK_NEW:
 		data = vcpu->kvm->arch.wall_clock;
 		break;
 	case MSR_KVM_SYSTEM_TIME:
+	case MSR_KVM_SYSTEM_TIME_NEW:
 		data = vcpu->arch.time;
 		break;
 	case MSR_IA32_P5_MC_ADDR:
-- 
cgit v1.2.3-70-g09d2


From 0e6ac58acbcddbc9d1687214f0d43d8657cc036c Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:42 -0400
Subject: KVM: x86: add new KVMCLOCK cpuid feature

This cpuid, KVM_CPUID_CLOCKSOURCE2, will indicate to the guest
that kvmclock is available through a new set of MSRs. The old ones
are deprecated.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 97348082782..f019f8cb182 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -16,6 +16,10 @@
 #define KVM_FEATURE_CLOCKSOURCE		0
 #define KVM_FEATURE_NOP_IO_DELAY	1
 #define KVM_FEATURE_MMU_OP		2
+/* This indicates that the new set of kvmclock msrs
+ * are available. The use of 0x11 and 0x12 is deprecated
+ */
+#define KVM_FEATURE_CLOCKSOURCE2        3
 
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
-- 
cgit v1.2.3-70-g09d2


From 84478c829d0f474a1d6749207c53daacc305d4e1 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:43 -0400
Subject: KVM: x86: export paravirtual cpuid flags in KVM_GET_SUPPORTED_CPUID

Right now, we were using individual KVM_CAP entities to communicate
userspace about which cpuids we support. This is suboptimal, since it
generates a delay between the feature arriving in the host, and
being available at the guest.

A much better mechanism is to list para features in KVM_GET_SUPPORTED_CPUID.
This makes userspace automatically aware of what we provide. And if we
ever add a new cpuid bit in the future, we have to do that again,
which create some complexity and delay in feature adoption.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 73d342c69ed..419c4512e27 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1972,6 +1972,23 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case KVM_CPUID_SIGNATURE: {
+		char signature[12] = "KVMKVMKVM\0\0";
+		u32 *sigptr = (u32 *)signature;
+		entry->eax = 0;
+		entry->ebx = sigptr[0];
+		entry->ecx = sigptr[1];
+		entry->edx = sigptr[2];
+		break;
+	}
+	case KVM_CPUID_FEATURES:
+		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
+			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE2);
+		entry->ebx = 0;
+		entry->ecx = 0;
+		entry->edx = 0;
+		break;
 	case 0x80000000:
 		entry->eax = min(entry->eax, 0x8000001a);
 		break;
@@ -2018,6 +2035,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
 			     &nent, cpuid->nent);
+
+
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
+		     cpuid->nent);
+
+	r = -E2BIG;
+	if (nent >= cpuid->nent)
+		goto out_free;
+
+	do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent,
+		     cpuid->nent);
+
 	r = -E2BIG;
 	if (nent >= cpuid->nent)
 		goto out_free;
-- 
cgit v1.2.3-70-g09d2


From 838815a78785022f6611e5c48386567aea7b818b Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:44 -0400
Subject: x86: KVM guest: Try using new kvm clock msrs

We now added a new set of clock-related msrs in replacement of the old
ones. In theory, we could just try to use them and get a return value
indicating they do not exist, due to our use of kvm_write_msr_save.

However, kvm clock registration happens very early, and if we ever
try to write to a non-existant MSR, we raise a lethal #GP, since our
idt handlers are not in place yet.

So this patch tests for a cpuid feature exported by the host to
decide which set of msrs are supported.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 53 ++++++++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4..59c740fcb9b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
 #define KVM_SCALE 22
 
 static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 
 static int parse_no_kvmclock(char *arg)
 {
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
 
 	low = (int)__pa_symbol(&wall_clock);
 	high = ((u64)__pa_symbol(&wall_clock) >> 32);
-	native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+
+	native_write_msr(msr_kvm_wall_clock, low, high);
 
 	vcpu_time = &get_cpu_var(hv_clock);
 	pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
 	high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
 	printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
 	       cpu, high, low, txt);
-	return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+
+	return native_write_msr_safe(msr_kvm_system_time, low, high);
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
 #ifdef CONFIG_KEXEC
 static void kvm_crash_shutdown(struct pt_regs *regs)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_crash_shutdown(regs);
 }
 #endif
 
 static void kvm_shutdown(void)
 {
-	native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+	native_write_msr(msr_kvm_system_time, 0, 0);
 	native_machine_shutdown();
 }
 
@@ -181,27 +185,34 @@ void __init kvmclock_init(void)
 	if (!kvm_para_available())
 		return;
 
-	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
-		if (kvm_register_clock("boot clock"))
-			return;
-		pv_time_ops.sched_clock = kvm_clock_read;
-		x86_platform.calibrate_tsc = kvm_get_tsc_khz;
-		x86_platform.get_wallclock = kvm_get_wallclock;
-		x86_platform.set_wallclock = kvm_set_wallclock;
+	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+		msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+		msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+	} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+		return;
+
+	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+		msr_kvm_system_time, msr_kvm_wall_clock);
+
+	if (kvm_register_clock("boot clock"))
+		return;
+	pv_time_ops.sched_clock = kvm_clock_read;
+	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+	x86_platform.get_wallclock = kvm_get_wallclock;
+	x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		x86_cpuinit.setup_percpu_clockev =
-			kvm_setup_secondary_clock;
+	x86_cpuinit.setup_percpu_clockev =
+		kvm_setup_secondary_clock;
 #endif
 #ifdef CONFIG_SMP
-		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
 #endif
-		machine_ops.shutdown  = kvm_shutdown;
+	machine_ops.shutdown  = kvm_shutdown;
 #ifdef CONFIG_KEXEC
-		machine_ops.crash_shutdown  = kvm_crash_shutdown;
+	machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
-		kvm_get_preset_lpj();
-		clocksource_register(&kvm_clock);
-		pv_info.paravirt_enabled = 1;
-		pv_info.name = "KVM";
-	}
+	kvm_get_preset_lpj();
+	clocksource_register(&kvm_clock);
+	pv_info.paravirt_enabled = 1;
+	pv_info.name = "KVM";
 }
-- 
cgit v1.2.3-70-g09d2


From 3a0d7256a6fb8c13f9fac6cd63250f97a8f0d8de Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:45 -0400
Subject: x86, paravirt: don't compute pvclock adjustments if we trust the tsc

If the HV told us we can fully trust the TSC, skip any
correction

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_para.h    | 5 +++++
 arch/x86/include/asm/pvclock-abi.h | 1 +
 arch/x86/kernel/kvmclock.c         | 3 +++
 arch/x86/kernel/pvclock.c          | 4 ++++
 4 files changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index f019f8cb182..05eba5e9a8e 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -21,6 +21,11 @@
  */
 #define KVM_FEATURE_CLOCKSOURCE2        3
 
+/* The last 8 bits are used to indicate how to interpret the flags field
+ * in pvclock structure. If no bits are set, all flags are ignored.
+ */
+#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT	24
+
 #define MSR_KVM_WALL_CLOCK  0x11
 #define MSR_KVM_SYSTEM_TIME 0x12
 
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
index ec5c41ac4be..35f2d1948ad 100644
--- a/arch/x86/include/asm/pvclock-abi.h
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -39,5 +39,6 @@ struct pvclock_wall_clock {
 	u32   nsec;
 } __attribute__((__packed__));
 
+#define PVCLOCK_TSC_STABLE_BIT	(1 << 0)
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_X86_PVCLOCK_ABI_H */
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 59c740fcb9b..eb9b76c716c 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -215,4 +215,7 @@ void __init kvmclock_init(void)
 	clocksource_register(&kvm_clock);
 	pv_info.paravirt_enabled = 1;
 	pv_info.name = "KVM";
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 }
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index f5bc40e1697..239427ca02a 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -135,6 +135,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 		barrier();
 	} while (version != src->version);
 
+	if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+		(shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+		return ret;
+
 	/*
 	 * Assumption here is that last_value, a global accumulator, always goes
 	 * forward. If we are less than that, we should not be much smaller.
-- 
cgit v1.2.3-70-g09d2


From 371bcf646d170ee1325abaf4f3e73485b4fd4d2d Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 11 May 2010 12:17:46 -0400
Subject: KVM: x86: Tell the guest we'll warn it about tsc stability

This patch puts up the flag that tells the guest that we'll warn it
about the tsc being trustworthy or not. By now, we also say
it is not.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Acked-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 419c4512e27..474a27fc42d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -857,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	vcpu->hv_clock.system_time = ts.tv_nsec +
 				     (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
 
+	vcpu->hv_clock.flags = 0;
+
 	/*
 	 * The interface expects us to write an even number signaling that the
 	 * update is finished. Since the guest won't see the intermediate
@@ -1984,7 +1986,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	case KVM_CPUID_FEATURES:
 		entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
 			     (1 << KVM_FEATURE_NOP_IO_DELAY) |
-			     (1 << KVM_FEATURE_CLOCKSOURCE2);
+			     (1 << KVM_FEATURE_CLOCKSOURCE2) |
+			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
 		entry->ebx = 0;
 		entry->ecx = 0;
 		entry->edx = 0;
-- 
cgit v1.2.3-70-g09d2


From f78e917688edbf1f14c318d2e50dc8e7dad20445 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 12 May 2010 00:28:44 +0300
Subject: KVM: Don't allow lmsw to clear cr0.pe

The current lmsw implementation allows the guest to clear cr0.pe, contrary
to the manual, which breaks EMM386.EXE.

Fix by ORing the old cr0.pe with lmsw's operand.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 474a27fc42d..fa1c5192559 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -470,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
+	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-- 
cgit v1.2.3-70-g09d2


From a3d204e28579427609c3d15d2310127ebaa47d94 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 12 May 2010 16:40:40 +0800
Subject: KVM: x86: Check LMA bit before set_efer

kvm_x86_ops->set_efer() would execute vcpu->arch.efer = efer, so the
checking of LMA bit didn't work.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa1c5192559..f0846d2aa95 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -709,11 +709,11 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 			return 1;
 	}
 
-	kvm_x86_ops->set_efer(vcpu, efer);
-
 	efer &= ~EFER_LMA;
 	efer |= vcpu->arch.efer & EFER_LMA;
 
+	kvm_x86_ops->set_efer(vcpu, efer);
+
 	vcpu->arch.efer = efer;
 
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
-- 
cgit v1.2.3-70-g09d2


From 3dbe141595faa48a067add3e47bba3205b79d33c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 12 May 2010 11:48:18 +0300
Subject: KVM: MMU: Segregate shadow pages with different cr0.wp

When cr0.wp=0, we may shadow a gpte having u/s=1 and r/w=0 with an spte
having u/s=0 and r/w=1.  This allows excessive access if the guest sets
cr0.wp=1 and accesses through this spte.

Fix by making cr0.wp part of the base role; we'll have different sptes for
the two cases and the problem disappears.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/mmu.txt       | 2 ++
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/mmu.c              | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index 0cc28fb84f4..aaed6ab9d7a 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -163,6 +163,8 @@ Shadow pages contain the following information:
     32-bit or 64-bit gptes are in use).
   role.cr4_nxe:
     Contains the value of efer.nxe for which the page is valid.
+  role.cr0_wp:
+    Contains the value of cr0.wp for which the page is valid.
   gfn:
     Either the guest page table containing the translations shadowed by this
     page, or the base page frame for linear translations.  See role.direct.
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3f0007b076d..76f5483cffe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -179,6 +179,7 @@ union kvm_mmu_page_role {
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned nxe:1;
+		unsigned cr0_wp:1;
 	};
 };
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index de996380ec2..81563e76e28 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static int is_write_protection(struct kvm_vcpu *vcpu)
+static bool is_write_protection(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
@@ -2432,6 +2432,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
 		r = paging32_init_context(vcpu);
 
 	vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
+	vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
 
 	return r;
 }
-- 
cgit v1.2.3-70-g09d2


From 8fbf065d625617bbbf6b72d5f78f84ad13c8b547 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 13 May 2010 11:50:19 +0300
Subject: KVM: x86: Add missing locking to arch specific vcpu ioctls

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f0846d2aa95..39f49580205 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1833,6 +1833,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
 	int r;
 
+	vcpu_load(vcpu);
 	r = -E2BIG;
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		goto out;
@@ -1844,6 +1845,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 out:
 	cpuid->nent = vcpu->arch.cpuid_nent;
+	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2134,6 +2136,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	int r;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 
+	vcpu_load(vcpu);
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
@@ -2148,6 +2151,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
+	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2456,7 +2460,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mce, argp, sizeof mce))
 			goto out;
+		vcpu_load(vcpu);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
+		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_VCPU_EVENTS: {
-- 
cgit v1.2.3-70-g09d2


From d334a49113a4a33109fd24e46073280ecd1bea0d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 18 May 2010 14:35:20 +0800
Subject: ACPI, APEI, Generic Hardware Error Source memory error support

Generic Hardware Error Source provides a way to report platform
hardware errors (such as that from chipset). It works in so called
"Firmware First" mode, that is, hardware errors are reported to
firmware firstly, then reported to Linux by firmware. This way, some
non-standard hardware error registers or non-standard hardware link
can be checked by firmware to produce more valuable hardware error
information for Linux.

Now, only SCI notification type and memory errors are supported. More
notification type and hardware error type will be added later. These
memory errors are reported to user space through /dev/mcelog via
faking a corrected Machine Check, so that the error memory page can be
offlined by /sbin/mcelog if the error count for one page is beyond the
threshold.

On some machines, Machine Check can not report physical address for
some corrected memory errors, but GHES can do that. So this simplified
GHES is implemented firstly.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/asm/mce.h            |   8 +
 arch/x86/kernel/cpu/mcheck/Makefile   |   2 +
 arch/x86/kernel/cpu/mcheck/mce-apei.c |  52 +++++
 drivers/acpi/apei/Kconfig             |  14 ++
 drivers/acpi/apei/Makefile            |   1 +
 drivers/acpi/apei/ghes.c              | 427 ++++++++++++++++++++++++++++++++++
 6 files changed, 504 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/mcheck/mce-apei.c
 create mode 100644 drivers/acpi/apei/ghes.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 6c3fdd631ed..f32a4301c4d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void);
 static inline void mcheck_intel_therm_init(void) { }
 #endif
 
+/*
+ * Used by APEI to report memory error via /dev/mcelog
+ */
+
+struct cper_sec_mem_err;
+extern void apei_mce_report_mem_error(int corrected,
+				      struct cper_sec_mem_err *mem_err);
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11..bb34b03af25 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
 obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
 
 obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI)		+= mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 00000000000..4eccd1fadb1
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,52 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+{
+	struct mce m;
+
+	/* Only corrected MC is reported */
+	if (!corrected)
+		return;
+
+	mce_setup(&m);
+	m.bank = 1;
+	/* Fake a memory read corrected error with unknown channel */
+	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+	m.addr = mem_err->physical_addr;
+	mce_log(&m);
+	mce_notify_irq();
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 5f0a41c2bc6..f8c668f27b5 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -7,6 +7,20 @@ config ACPI_APEI
 	  especially. In addition it supports error serialization and
 	  error injection.
 
+config ACPI_APEI_GHES
+	tristate "APEI Generic Hardware Error Source"
+	depends on ACPI_APEI && X86
+	select ACPI_HED
+	help
+	  Generic Hardware Error Source provides a way to report
+	  platform hardware errors (such as that from chipset). It
+	  works in so called "Firmware First" mode, that is, hardware
+	  errors are reported to firmware firstly, then reported to
+	  Linux by firmware. This way, some non-standard hardware
+	  error registers or non-standard hardware link can be checked
+	  by firmware to produce more valuable hardware error
+	  information for Linux.
+
 config ACPI_APEI_EINJ
 	tristate "APEI Error INJection (EINJ)"
 	depends on ACPI_APEI && DEBUG_FS
diff --git a/drivers/acpi/apei/Makefile b/drivers/acpi/apei/Makefile
index fef963ec536..41c61db4c51 100644
--- a/drivers/acpi/apei/Makefile
+++ b/drivers/acpi/apei/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_ACPI_APEI)		+= apei.o
+obj-$(CONFIG_ACPI_APEI_GHES)	+= ghes.o
 obj-$(CONFIG_ACPI_APEI_EINJ)	+= einj.o
 
 apei-y := apei-base.o hest.o cper.o
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
new file mode 100644
index 00000000000..fd0cc016a09
--- /dev/null
+++ b/drivers/acpi/apei/ghes.c
@@ -0,0 +1,427 @@
+/*
+ * APEI Generic Hardware Error Source support
+ *
+ * Generic Hardware Error Source provides a way to report platform
+ * hardware errors (such as that from chipset). It works in so called
+ * "Firmware First" mode, that is, hardware errors are reported to
+ * firmware firstly, then reported to Linux by firmware. This way,
+ * some non-standard hardware error registers or non-standard hardware
+ * link can be checked by firmware to produce more hardware error
+ * information for Linux.
+ *
+ * For more information about Generic Hardware Error Source, please
+ * refer to ACPI Specification version 4.0, section 17.3.2.6
+ *
+ * Now, only SCI notification type and memory errors are
+ * supported. More notification type and hardware error type will be
+ * added later.
+ *
+ * Copyright 2010 Intel Corp.
+ *   Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation;
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/cper.h>
+#include <linux/kdebug.h>
+#include <acpi/apei.h>
+#include <acpi/atomicio.h>
+#include <acpi/hed.h>
+#include <asm/mce.h>
+
+#include "apei-internal.h"
+
+#define GHES_PFX	"GHES: "
+
+#define GHES_ESTATUS_MAX_SIZE		65536
+
+/*
+ * One struct ghes is created for each generic hardware error
+ * source.
+ *
+ * It provides the context for APEI hardware error timer/IRQ/SCI/NMI
+ * handler. Handler for one generic hardware error source is only
+ * triggered after the previous one is done. So handler can uses
+ * struct ghes without locking.
+ *
+ * estatus: memory buffer for error status block, allocated during
+ * HEST parsing.
+ */
+#define GHES_TO_CLEAR		0x0001
+
+struct ghes {
+	struct acpi_hest_generic *generic;
+	struct acpi_hest_generic_status *estatus;
+	struct list_head list;
+	u64 buffer_paddr;
+	unsigned long flags;
+};
+
+/*
+ * Error source lists, one list for each notification method. The
+ * members in lists are struct ghes.
+ *
+ * The list members are only added in HEST parsing and deleted during
+ * module_exit, that is, single-threaded. So no lock is needed for
+ * that.
+ *
+ * But the mutual exclusion is needed between members adding/deleting
+ * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is
+ * used for that.
+ */
+static LIST_HEAD(ghes_sci);
+
+static struct ghes *ghes_new(struct acpi_hest_generic *generic)
+{
+	struct ghes *ghes;
+	unsigned int error_block_length;
+	int rc;
+
+	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
+	if (!ghes)
+		return ERR_PTR(-ENOMEM);
+	ghes->generic = generic;
+	INIT_LIST_HEAD(&ghes->list);
+	rc = acpi_pre_map_gar(&generic->error_status_address);
+	if (rc)
+		goto err_free;
+	error_block_length = generic->error_block_length;
+	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
+		pr_warning(FW_WARN GHES_PFX
+			   "Error status block length is too long: %u for "
+			   "generic hardware error source: %d.\n",
+			   error_block_length, generic->header.source_id);
+		error_block_length = GHES_ESTATUS_MAX_SIZE;
+	}
+	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
+	if (!ghes->estatus) {
+		rc = -ENOMEM;
+		goto err_unmap;
+	}
+
+	return ghes;
+
+err_unmap:
+	acpi_post_unmap_gar(&generic->error_status_address);
+err_free:
+	kfree(ghes);
+	return ERR_PTR(rc);
+}
+
+static void ghes_fini(struct ghes *ghes)
+{
+	kfree(ghes->estatus);
+	acpi_post_unmap_gar(&ghes->generic->error_status_address);
+}
+
+enum {
+	GHES_SER_NO = 0x0,
+	GHES_SER_CORRECTED = 0x1,
+	GHES_SER_RECOVERABLE = 0x2,
+	GHES_SER_PANIC = 0x3,
+};
+
+static inline int ghes_severity(int severity)
+{
+	switch (severity) {
+	case CPER_SER_INFORMATIONAL:
+		return GHES_SER_NO;
+	case CPER_SER_CORRECTED:
+		return GHES_SER_CORRECTED;
+	case CPER_SER_RECOVERABLE:
+		return GHES_SER_RECOVERABLE;
+	case CPER_SER_FATAL:
+		return GHES_SER_PANIC;
+	default:
+		/* Unkown, go panic */
+		return GHES_SER_PANIC;
+	}
+}
+
+/* SCI handler run in work queue, so ioremap can be used here */
+static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
+				 int from_phys)
+{
+	void *vaddr;
+
+	vaddr = ioremap_cache(paddr, len);
+	if (!vaddr)
+		return -ENOMEM;
+	if (from_phys)
+		memcpy(buffer, vaddr, len);
+	else
+		memcpy(vaddr, buffer, len);
+	iounmap(vaddr);
+
+	return 0;
+}
+
+static int ghes_read_estatus(struct ghes *ghes, int silent)
+{
+	struct acpi_hest_generic *g = ghes->generic;
+	u64 buf_paddr;
+	u32 len;
+	int rc;
+
+	rc = acpi_atomic_read(&buf_paddr, &g->error_status_address);
+	if (rc) {
+		if (!silent && printk_ratelimit())
+			pr_warning(FW_WARN GHES_PFX
+"Failed to read error status block address for hardware error source: %d.\n",
+				   g->header.source_id);
+		return -EIO;
+	}
+	if (!buf_paddr)
+		return -ENOENT;
+
+	rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
+				   sizeof(*ghes->estatus), 1);
+	if (rc)
+		return rc;
+	if (!ghes->estatus->block_status)
+		return -ENOENT;
+
+	ghes->buffer_paddr = buf_paddr;
+	ghes->flags |= GHES_TO_CLEAR;
+
+	rc = -EIO;
+	len = apei_estatus_len(ghes->estatus);
+	if (len < sizeof(*ghes->estatus))
+		goto err_read_block;
+	if (len > ghes->generic->error_block_length)
+		goto err_read_block;
+	if (apei_estatus_check_header(ghes->estatus))
+		goto err_read_block;
+	rc = ghes_copy_tofrom_phys(ghes->estatus + 1,
+				   buf_paddr + sizeof(*ghes->estatus),
+				   len - sizeof(*ghes->estatus), 1);
+	if (rc)
+		return rc;
+	if (apei_estatus_check(ghes->estatus))
+		goto err_read_block;
+	rc = 0;
+
+err_read_block:
+	if (rc && !silent)
+		pr_warning(FW_WARN GHES_PFX
+			   "Failed to read error status block!\n");
+	return rc;
+}
+
+static void ghes_clear_estatus(struct ghes *ghes)
+{
+	ghes->estatus->block_status = 0;
+	if (!(ghes->flags & GHES_TO_CLEAR))
+		return;
+	ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
+			      sizeof(ghes->estatus->block_status), 0);
+	ghes->flags &= ~GHES_TO_CLEAR;
+}
+
+static void ghes_do_proc(struct ghes *ghes)
+{
+	int ser, processed = 0;
+	struct acpi_hest_generic_data *gdata;
+
+	ser = ghes_severity(ghes->estatus->error_severity);
+	apei_estatus_for_each_section(ghes->estatus, gdata) {
+#ifdef CONFIG_X86_MCE
+		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
+				 CPER_SEC_PLATFORM_MEM)) {
+			apei_mce_report_mem_error(
+				ser == GHES_SER_CORRECTED,
+				(struct cper_sec_mem_err *)(gdata+1));
+			processed = 1;
+		}
+#endif
+	}
+
+	if (!processed && printk_ratelimit())
+		pr_warning(GHES_PFX
+		"Unknown error record from generic hardware error source: %d\n",
+			   ghes->generic->header.source_id);
+}
+
+static int ghes_proc(struct ghes *ghes)
+{
+	int rc;
+
+	rc = ghes_read_estatus(ghes, 0);
+	if (rc)
+		goto out;
+	ghes_do_proc(ghes);
+
+out:
+	ghes_clear_estatus(ghes);
+	return 0;
+}
+
+static int ghes_notify_sci(struct notifier_block *this,
+				  unsigned long event, void *data)
+{
+	struct ghes *ghes;
+	int ret = NOTIFY_DONE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, &ghes_sci, list) {
+		if (!ghes_proc(ghes))
+			ret = NOTIFY_OK;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static struct notifier_block ghes_notifier_sci = {
+	.notifier_call = ghes_notify_sci,
+};
+
+static int hest_ghes_parse(struct acpi_hest_header *hest_hdr, void *data)
+{
+	struct acpi_hest_generic *generic;
+	struct ghes *ghes = NULL;
+	int rc = 0;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
+		return 0;
+
+	generic = (struct acpi_hest_generic *)hest_hdr;
+	if (!generic->enabled)
+		return 0;
+
+	if (generic->error_block_length <
+	    sizeof(struct acpi_hest_generic_status)) {
+		pr_warning(FW_BUG GHES_PFX
+"Invalid error block length: %u for generic hardware error source: %d\n",
+			   generic->error_block_length,
+			   generic->header.source_id);
+		goto err;
+	}
+	if (generic->records_to_preallocate == 0) {
+		pr_warning(FW_BUG GHES_PFX
+"Invalid records to preallocate: %u for generic hardware error source: %d\n",
+			   generic->records_to_preallocate,
+			   generic->header.source_id);
+		goto err;
+	}
+	ghes = ghes_new(generic);
+	if (IS_ERR(ghes)) {
+		rc = PTR_ERR(ghes);
+		ghes = NULL;
+		goto err;
+	}
+	switch (generic->notify.type) {
+	case ACPI_HEST_NOTIFY_POLLED:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via POLL is not supported!\n",
+			   generic->header.source_id);
+		break;
+	case ACPI_HEST_NOTIFY_EXTERNAL:
+	case ACPI_HEST_NOTIFY_LOCAL:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via IRQ is not supported!\n",
+			   generic->header.source_id);
+		break;
+	case ACPI_HEST_NOTIFY_SCI:
+		if (list_empty(&ghes_sci))
+			register_acpi_hed_notifier(&ghes_notifier_sci);
+		list_add_rcu(&ghes->list, &ghes_sci);
+		break;
+	case ACPI_HEST_NOTIFY_NMI:
+		pr_warning(GHES_PFX
+"Generic hardware error source: %d notified via NMI is not supported!\n",
+			   generic->header.source_id);
+		break;
+	default:
+		pr_warning(FW_WARN GHES_PFX
+	"Unknown notification type: %u for generic hardware error source: %d\n",
+			   generic->notify.type, generic->header.source_id);
+		break;
+	}
+
+	return 0;
+err:
+	if (ghes)
+		ghes_fini(ghes);
+	return rc;
+}
+
+static void ghes_cleanup(void)
+{
+	struct ghes *ghes, *nghes;
+
+	if (!list_empty(&ghes_sci))
+		unregister_acpi_hed_notifier(&ghes_notifier_sci);
+
+	synchronize_rcu();
+
+	list_for_each_entry_safe(ghes, nghes, &ghes_sci, list) {
+		list_del(&ghes->list);
+		ghes_fini(ghes);
+		kfree(ghes);
+	}
+}
+
+static int __init ghes_init(void)
+{
+	int rc;
+
+	if (acpi_disabled)
+		return -ENODEV;
+
+	if (hest_disable) {
+		pr_info(GHES_PFX "HEST is not enabled!\n");
+		return -EINVAL;
+	}
+
+	rc = apei_hest_parse(hest_ghes_parse, NULL);
+	if (rc) {
+		pr_err(GHES_PFX
+		"Error during parsing HEST generic hardware error sources.\n");
+		goto err_cleanup;
+	}
+
+	if (list_empty(&ghes_sci)) {
+		pr_info(GHES_PFX
+			"No functional generic hardware error sources.\n");
+		rc = -ENODEV;
+		goto err_cleanup;
+	}
+
+	pr_info(GHES_PFX
+		"Generic Hardware Error Source support is initialized.\n");
+
+	return 0;
+err_cleanup:
+	ghes_cleanup();
+	return rc;
+}
+
+static void __exit ghes_exit(void)
+{
+	ghes_cleanup();
+}
+
+module_init(ghes_init);
+module_exit(ghes_exit);
+
+MODULE_AUTHOR("Huang Ying");
+MODULE_DESCRIPTION("APEI Generic Hardware Error Source support");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3-70-g09d2


From 482908b49ebfa453dd0455910c951c750567c05d Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 18 May 2010 14:35:22 +0800
Subject: ACPI, APEI, Use ERST for persistent storage of MCE

Traditionally, fatal MCE will cause Linux print error log to console
then reboot. Because MCE registers will preserve their content after
warm reboot, the hardware error can be logged to disk or network after
reboot. But system may fail to warm reboot, then you may lose the
hardware error log. ERST can help here. Through saving the hardware
error log into flash via ERST before go panic, the hardware error log
can be gotten from the flash after system boot successful again.

The fatal MCE processing procedure with ERST involved is as follow:

- Hardware detect error, MCE raised
- MCE read MCE registers, check error severity (fatal), prepare error record
- Write MCE error record into flash via ERST
- Go panic, then trigger system reboot
- System reboot, /sbin/mcelog run, it reads /dev/mcelog to check flash
  for error record of previous boot via ERST, and output and clear
  them if available
- /sbin/mcelog logs error records into disk or network

ERST only accepts CPER record format, but there is no pre-defined CPER
section can accommodate all information in struct mce, so a customized
section type is defined to hold struct mce inside a CPER record as an
error section.

Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/kernel/cpu/mcheck/mce-apei.c     | 86 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce-internal.h | 23 +++++++++
 arch/x86/kernel/cpu/mcheck/mce.c          | 79 ++++++++++++++++++++++++----
 3 files changed, 177 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 4eccd1fadb1..745b54f9be8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -8,6 +8,9 @@
  * the error memory page can be offlined by /sbin/mcelog if the error
  * count for one page is beyond the threshold.
  *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
  * Copyright 2010 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
@@ -50,3 +53,86 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
 	mce_notify_irq();
 }
 EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE						\
+	UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c,	\
+		0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE						\
+	UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96,	\
+		0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+	struct cper_record_header hdr;
+	struct cper_section_descriptor sec_hdr;
+	struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+	struct cper_mce_record rcd;
+
+	memset(&rcd, 0, sizeof(rcd));
+	memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+	rcd.hdr.revision = CPER_RECORD_REV;
+	rcd.hdr.signature_end = CPER_SIG_END;
+	rcd.hdr.section_count = 1;
+	rcd.hdr.error_severity = CPER_SER_FATAL;
+	/* timestamp, platform_id, partition_id are all invalid */
+	rcd.hdr.validation_bits = 0;
+	rcd.hdr.record_length = sizeof(rcd);
+	rcd.hdr.creator_id = CPER_CREATOR_MCE;
+	rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+	rcd.hdr.record_id = cper_next_record_id();
+	rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+	rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+	rcd.sec_hdr.section_length = sizeof(rcd.mce);
+	rcd.sec_hdr.revision = CPER_SEC_REV;
+	/* fru_id and fru_text is invalid */
+	rcd.sec_hdr.validation_bits = 0;
+	rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+	rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+	rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+
+	memcpy(&rcd.mce, m, sizeof(*m));
+
+	return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	struct cper_mce_record rcd;
+	ssize_t len;
+
+	len = erst_read_next(&rcd.hdr, sizeof(rcd));
+	if (len <= 0)
+		return len;
+	/* Can not skip other records in storage via ERST unless clear them */
+	else if (len != sizeof(rcd) ||
+		 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
+		if (printk_ratelimit())
+			pr_warning(
+			"MCE-APEI: Can not skip the unknown record in ERST");
+		return -EIO;
+	}
+
+	memcpy(m, &rcd.mce, sizeof(*m));
+	*record_id = rcd.hdr.record_id;
+
+	return sizeof(*m);
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+	return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+	return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab6..fefcc69ee8b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
 
 extern struct mce_bank *mce_banks;
 
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+	return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+	return 0;
+}
+static inline int apei_check_mce(void)
+{
+	return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+	return -EINVAL;
+}
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767..09535ca9b9d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -264,7 +264,7 @@ static void wait_for_panic(void)
 
 static void mce_panic(char *msg, struct mce *final, char *exp)
 {
-	int i;
+	int i, apei_err = 0;
 
 	if (!fake_panic) {
 		/*
@@ -287,8 +287,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 		struct mce *m = &mcelog.entry[i];
 		if (!(m->status & MCI_STATUS_VAL))
 			continue;
-		if (!(m->status & MCI_STATUS_UC))
+		if (!(m->status & MCI_STATUS_UC)) {
 			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
 	}
 	/* Now print uncorrected but with the final one last */
 	for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,11 +300,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
 			continue;
 		if (!(m->status & MCI_STATUS_UC))
 			continue;
-		if (!final || memcmp(m, final, sizeof(struct mce)))
+		if (!final || memcmp(m, final, sizeof(struct mce))) {
 			print_mce(m);
+			if (!apei_err)
+				apei_err = apei_write_mce(m);
+		}
 	}
-	if (final)
+	if (final) {
 		print_mce(final);
+		if (!apei_err)
+			apei_err = apei_write_mce(final);
+	}
 	if (cpu_missing)
 		printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
 	print_mce_tail();
@@ -1493,6 +1502,43 @@ static void collect_tscs(void *data)
 	rdtscll(cpu_tsc[smp_processor_id()]);
 }
 
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+	int rc;
+	u64 record_id;
+	struct mce m;
+
+	if (usize < sizeof(struct mce))
+		return -EINVAL;
+
+	rc = apei_read_mce(&m, &record_id);
+	/* Error or no more MCE record */
+	if (rc <= 0) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	rc = -EFAULT;
+	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+		return rc;
+	/*
+	 * In fact, we should have cleared the record after that has
+	 * been flushed to the disk or sent to network in
+	 * /sbin/mcelog, but we have no interface to support that now,
+	 * so just clear it to avoid duplication.
+	 */
+	rc = apei_clear_mce(record_id);
+	if (rc) {
+		mce_apei_read_done = 1;
+		return rc;
+	}
+	*ubuf += sizeof(struct mce);
+
+	return 0;
+}
+
 static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 			loff_t *off)
 {
@@ -1506,15 +1552,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 		return -ENOMEM;
 
 	mutex_lock(&mce_read_mutex);
+
+	if (!mce_apei_read_done) {
+		err = __mce_read_apei(&buf, usize);
+		if (err || buf != ubuf)
+			goto out;
+	}
+
 	next = rcu_dereference_check_mce(mcelog.next);
 
 	/* Only supports full reads right now */
-	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
-		mutex_unlock(&mce_read_mutex);
-		kfree(cpu_tsc);
-
-		return -EINVAL;
-	}
+	err = -EINVAL;
+	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+		goto out;
 
 	err = 0;
 	prev = 0;
@@ -1562,10 +1612,15 @@ timeout:
 			memset(&mcelog.entry[i], 0, sizeof(struct mce));
 		}
 	}
+
+	if (err)
+		err = -EFAULT;
+
+out:
 	mutex_unlock(&mce_read_mutex);
 	kfree(cpu_tsc);
 
-	return err ? -EFAULT : buf - ubuf;
+	return err ? err : buf - ubuf;
 }
 
 static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1628,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
 	poll_wait(file, &mce_wait, wait);
 	if (rcu_dereference_check_mce(mcelog.next))
 		return POLLIN | POLLRDNORM;
+	if (!mce_apei_read_done && apei_check_mce())
+		return POLLIN | POLLRDNORM;
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 3f6ea84a3035cc0ef7488f8e93bc76766799e082 Mon Sep 17 00:00:00 2001
From: "Ira W. Snyder" <iws@ovro.caltech.edu>
Date: Thu, 1 Apr 2010 11:43:30 -0700
Subject: PCI: read memory ranges out of Broadcom CNB20LE host bridge

Read the memory ranges behind the Broadcom CNB20LE host bridge out of the
hardware. This allows PCI hotplugging to work, since we know which memory
range to allocate PCI BAR's from.

The x86 PCI code automatically prefers the ACPI _CRS information when it is
available. In that case, this information is not used.

Signed-off-by: Ira W. Snyder <iws@ovro.caltech.edu>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/Kconfig            |   8 ++++
 arch/x86/pci/Makefile       |   2 +
 arch/x86/pci/broadcom_bus.c | 101 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 arch/x86/pci/broadcom_bus.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9458685902b..677b87d60a3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1931,6 +1931,14 @@ config PCI_MMCONFIG
 	bool "Support mmconfig PCI config space access"
 	depends on X86_64 && PCI && ACPI
 
+config PCI_CNB20LE_QUIRK
+	bool "Read CNB20LE Host Bridge Windows"
+	depends on PCI
+	help
+	  Read the PCI windows out of the CNB20LE host bridge. This allows
+	  PCI hotplug to work on systems with the CNB20LE chipset which do
+	  not have ACPI.
+
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
 	depends on PCI_MSI && ACPI && EXPERIMENTAL
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index b110d97fb92..a0207a7fdf3 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,6 +18,8 @@ obj-$(CONFIG_X86_MRST)		+= mrst.o
 obj-y				+= common.o early.o
 obj-y				+= amd_bus.o bus_numa.o
 
+obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
+
 ifeq ($(CONFIG_PCI_DEBUG),y)
 EXTRA_CFLAGS += -DDEBUG
 endif
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
new file mode 100644
index 00000000000..0846a5bbbfb
--- /dev/null
+++ b/arch/x86/pci/broadcom_bus.c
@@ -0,0 +1,101 @@
+/*
+ * Read address ranges from a Broadcom CNB20LE Host Bridge
+ *
+ * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ */
+
+#include <linux/delay.h>
+#include <linux/dmi.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <asm/pci_x86.h>
+
+#include "bus_numa.h"
+
+static void __devinit cnb20le_res(struct pci_dev *dev)
+{
+	struct pci_root_info *info;
+	struct resource res;
+	u16 word1, word2;
+	u8 fbus, lbus;
+	int i;
+
+	/*
+	 * The x86_pci_root_bus_res_quirks() function already refuses to use
+	 * this information if ACPI _CRS was used. Therefore, we don't bother
+	 * checking if ACPI is enabled, and just generate the information
+	 * for both the ACPI _CRS and no ACPI cases.
+	 */
+
+	info = &pci_root_info[pci_root_num];
+	pci_root_num++;
+
+	/* read the PCI bus numbers */
+	pci_read_config_byte(dev, 0x44, &fbus);
+	pci_read_config_byte(dev, 0x45, &lbus);
+	info->bus_min = fbus;
+	info->bus_max = lbus;
+
+	/*
+	 * Add the legacy IDE ports on bus 0
+	 *
+	 * These do not exist anywhere in the bridge registers, AFAICT. I do
+	 * not have the datasheet, so this is the best I can do.
+	 */
+	if (fbus == 0) {
+		update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0);
+		update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0);
+		update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0);
+		update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0);
+		update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0);
+	}
+
+	/* read the non-prefetchable memory window */
+	pci_read_config_word(dev, 0xc0, &word1);
+	pci_read_config_word(dev, 0xc2, &word2);
+	if (word1 != word2) {
+		res.start = (word1 << 16) | 0x0000;
+		res.end   = (word2 << 16) | 0xffff;
+		res.flags = IORESOURCE_MEM;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* read the prefetchable memory window */
+	pci_read_config_word(dev, 0xc4, &word1);
+	pci_read_config_word(dev, 0xc6, &word2);
+	if (word1 != word2) {
+		res.start = (word1 << 16) | 0x0000;
+		res.end   = (word2 << 16) | 0xffff;
+		res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* read the IO port window */
+	pci_read_config_word(dev, 0xd0, &word1);
+	pci_read_config_word(dev, 0xd2, &word2);
+	if (word1 != word2) {
+		res.start = word1;
+		res.end   = word2;
+		res.flags = IORESOURCE_IO;
+		update_res(info, res.start, res.end, res.flags, 0);
+	}
+
+	/* print information about this host bridge */
+	res.start = fbus;
+	res.end   = lbus;
+	res.flags = IORESOURCE_BUS;
+	dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n",
+			    pci_domain_nr(dev->bus), &res);
+
+	for (i = 0; i < info->res_num; i++)
+		dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]);
+}
+
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE,
+			cnb20le_res);
+
-- 
cgit v1.2.3-70-g09d2


From 841bca1393d315d79077f272c2918423e36dc364 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Tue, 18 May 2010 08:48:30 +0900
Subject: x86/mmiotrace: Remove redundant instruction prefix checks

Get rid of the duplicated entries in prefix_codes[]
to eliminate redundant checks by skip_prefix().

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Acked-by: Pekka Paalanen <pq@iki.fi>
LKML-Reference: <1274140110-5841-1-git-send-email-akinobu.mita@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/pf_in.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index df3d5c861cd..308e32570d8 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -34,7 +34,7 @@
 /* IA32 Manual 3, 2-1 */
 static unsigned char prefix_codes[] = {
 	0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64,
-	0x65, 0x2E, 0x3E, 0x66, 0x67
+	0x65, 0x66, 0x67
 };
 /* IA32 Manual 3, 3-432*/
 static unsigned int reg_rop[] = {
-- 
cgit v1.2.3-70-g09d2


From 8c3ba8d049247dc06b6dcee1711a11b26647aa44 Mon Sep 17 00:00:00 2001
From: Kerstin Jonsson <kerstin.jonsson@ericsson.com>
Date: Mon, 24 May 2010 12:13:15 -0700
Subject: x86, apic: ack all pending irqs when crashed/on kexec

When the SMP kernel decides to crash_kexec() the local APICs may have
pending interrupts in their vector tables.

The setup routine for the local APIC has a deficient mechanism for
clearing these interrupts, it only handles interrupts that has already
been dispatched to the local core for servicing (the ISR register) safely,
it doesn't consider lower prioritized queued interrupts stored in the IRR
register.

If you have more than one pending interrupt within the same 32 bit word in
the LAPIC vector table registers you may find yourself entering the IO
APIC setup with pending interrupts left in the LAPIC.  This is a situation
for wich the IO APIC setup is not prepared.  Depending of what/which
interrupt vector/vectors are stuck in the APIC tables your system may show
various degrees of malfunctioning.  That was the reason why the
check_timer() failed in our system, the timer interrupts was blocked by
pending interrupts from the old kernel when routed trough the IO APIC.

Additional comment from Jiri Bohac:
==============
If this should go into stable release,
I'd add some kind of limit on the number of iterations, just to be safe from
hard to debug lock-ups:

+if (loops++  > MAX_LOOPS) {
+        printk("LAPIC pending clean-up")
+        break;
+}
 while (queued);

with MAX_LOOPS something like 1E9 this would leave plenty of time for the
pending IRQs to be cleared and would and still cause at most a second of delay
if the loop were to lock-up for whatever reason.

[trenn@suse.de:

V2: Use tsc if avail to bail out after 1 sec due to possible virtual
    apic_read calls which may take rather long (suggested by: Avi Kivity
    <avi@redhat.com>) If no tsc is available bail out quickly after
    cpu_khz, if we broke out too early and still have irqs pending (which
    should never happen?) we still get a WARN_ON...

V3: - Fixed indentation -> checkpatch clean
    - max_loops must be signed

V4: - Fix typo, mixed up tsc and ntsc in first rdtscll() call

V5: Adjust WARN_ON() condition to also catch error in cpu_has_tsc case]

Cc: <jbohac@novell.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Kerstin Jonsson <kerstin.jonsson@ericsson.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Tested-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Thomas Renninger <trenn@suse.de>
LKML-Reference: <201005241913.o4OJDGWM010865@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/apic/apic.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e5a4a1e0161..c02cc692985 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
 #include <asm/smp.h>
 #include <asm/mce.h>
 #include <asm/kvm_para.h>
+#include <asm/tsc.h>
 
 unsigned int num_processors;
 
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
  */
 void __cpuinit setup_local_APIC(void)
 {
-	unsigned int value;
-	int i, j;
+	unsigned int value, queued;
+	int i, j, acked = 0;
+	unsigned long long tsc = 0, ntsc;
+	long long max_loops = cpu_khz;
+
+	if (cpu_has_tsc)
+		rdtscll(tsc);
 
 	if (disable_apic) {
 		arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
 	 * the interrupt. Hence a vector might get locked. It was noticed
 	 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
 	 */
-	for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-		value = apic_read(APIC_ISR + i*0x10);
-		for (j = 31; j >= 0; j--) {
-			if (value & (1<<j))
-				ack_APIC_irq();
+	do {
+		queued = 0;
+		for (i = APIC_ISR_NR - 1; i >= 0; i--)
+			queued |= apic_read(APIC_IRR + i*0x10);
+
+		for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+			value = apic_read(APIC_ISR + i*0x10);
+			for (j = 31; j >= 0; j--) {
+				if (value & (1<<j)) {
+					ack_APIC_irq();
+					acked++;
+				}
+			}
 		}
-	}
+		if (acked > 256) {
+			printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+			       acked);
+			break;
+		}
+		if (cpu_has_tsc) {
+			rdtscll(ntsc);
+			max_loops = (cpu_khz << 10) - (ntsc - tsc);
+		} else
+			max_loops--;
+	} while (queued && max_loops > 0);
+	WARN_ON(max_loops <= 0);
 
 	/*
 	 * Now that we are all set up, enable the APIC
-- 
cgit v1.2.3-70-g09d2


From b46fc5f235be04a7f77fb2af1d8cb809889c25c1 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Mon, 24 May 2010 12:13:16 -0700
Subject: arch/x86/pci: use kasprintf

kasprintf combines kmalloc and sprintf, and takes care of the size
calculation itself.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression a,flag;
expression list args;
statement S;
@@

  a =
-  \(kmalloc\|kzalloc\)(...,flag)
+  kasprintf(flag,args)
  <... when != a
  if (a == NULL || ...) S
  ...>
- sprintf(a,args);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
LKML-Reference: <201005241913.o4OJDG3R010871@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/pci/acpi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 31930fd30ea..7c0ad634694 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -207,10 +207,9 @@ get_current_resources(struct acpi_device *device, int busnum,
 	if (!info.res)
 		goto res_alloc_fail;
 
-	info.name = kmalloc(16, GFP_KERNEL);
+	info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum);
 	if (!info.name)
 		goto name_alloc_fail;
-	sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum);
 
 	info.res_num = 0;
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
-- 
cgit v1.2.3-70-g09d2


From 5f2eb55026c91f8400ab4469aff88b2e201b5616 Mon Sep 17 00:00:00 2001
From: Jan Beulich <JBeulich@novell.com>
Date: Mon, 24 May 2010 12:13:17 -0700
Subject: x86: "nosmp" command line option should force the system into UP mode

Bits set in cpu_possible_mask prior to the execution of
prefill_possible_map() (i.e.  when parsing ACPI or MPS tables) would
prevent the SMP alternatives logic from switching to UP mode, plus
unnecessary setup of per-CPU data for CPUs that can never come online.

Additionally, without CONFIG_HOTPLUG_CPU disabled CPUs can never come
online, and hence setting cpu_possible_mask bits for them is again a
simple waste of resources.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <201005241913.o4OJDH3Z010874@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/smpboot.c | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a..37462f1ddba 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void)
 	if (!num_processors)
 		num_processors = 1;
 
-	if (setup_possible_cpus == -1)
-		possible = num_processors + disabled_cpus;
-	else
+	i = setup_max_cpus ?: 1;
+	if (setup_possible_cpus == -1) {
+		possible = num_processors;
+#ifdef CONFIG_HOTPLUG_CPU
+		if (setup_max_cpus)
+			possible += disabled_cpus;
+#else
+		if (possible > i)
+			possible = i;
+#endif
+	} else
 		possible = setup_possible_cpus;
 
 	total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void)
 		possible = nr_cpu_ids;
 	}
 
+#ifdef CONFIG_HOTPLUG_CPU
+	if (!setup_max_cpus)
+#endif
+	if (possible > i) {
+		printk(KERN_WARNING
+			"%d Processors exceeds max_cpus limit of %u\n",
+			possible, setup_max_cpus);
+		possible = i;
+	}
+
 	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
 		possible, max_t(int, possible - num_processors, 0));
 
 	for (i = 0; i < possible; i++)
 		set_cpu_possible(i, true);
+	for (; i < NR_CPUS; i++)
+		set_cpu_possible(i, false);
 
 	nr_cpu_ids = possible;
 }
-- 
cgit v1.2.3-70-g09d2


From 3d6e77a3ddb8e4156b89f4273ff8c7d37abaf781 Mon Sep 17 00:00:00 2001
From: Gabor Gombas <gombasg@digikabel.hu>
Date: Mon, 24 May 2010 12:13:18 -0700
Subject: x86, setup: Phoenix BIOS fixup is needed on Dell Inspiron Mini 1012

The low-memory corruption checker triggers during suspend/resume, so we
need to reserve the low 64k.  Don't be fooled that the BIOS identifies
itself as "Dell Inc.", it's still Phoenix BIOS.

[ hpa: I think we blacklist almost every BIOS in existence.  We should
either change this to a whitelist or just make it unconditional. ]

Signed-off-by: Gabor Gombas <gombasg@digikabel.hu>
LKML-Reference: <201005241913.o4OJDIMM010877@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/setup.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e8029896309..b4ae4acbd03 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
 		},
 	},
+	/*
+	 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
+	 * match on the product name.
+	 */
+	{
+		.callback = dmi_low_memory_corruption,
+		.ident = "Phoenix BIOS",
+		.matches = {
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
+		},
+	},
 #endif
 	{}
 };
-- 
cgit v1.2.3-70-g09d2


From 48691ff86d91db1090551ec2a5ae0d80ef59105f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 24 May 2010 12:13:19 -0700
Subject: x86: remove last traces of quicklist usage

We still have a stray quicklist header included even though we axed
quicklist usage quite a while back.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <201005241913.o4OJDJe9010881@imap1.linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/pgtable_32.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 792854003ed..cac71849925 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -9,7 +9,6 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 #include <linux/module.h>
-#include <linux/quicklist.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
-- 
cgit v1.2.3-70-g09d2


From 87f44bbc246c5244c76a701f8eefba7788bce64a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 25 May 2010 11:02:55 +0200
Subject: perf, trace: Fix !x86 build bug

Patch b7e2ecef92 (perf, trace: Optimize tracepoints by removing
IRQ-disable from perf/tracepoint interaction) made the
unfortunate mistake of assuming the world is x86 only, correct
this.

The problem was that perf_fetch_caller_regs() did
local_save_flags() into regs->flags, and I re-used that to
remove another local_save_flags(), forgetting !x86 doesn't have
regs->flags.

Do the reverse, remove the local_save_flags() from
perf_fetch_caller_regs() and let the ftrace site do the
local_save_flags() instead.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: acme@redhat.com
Cc: efault@gmx.de
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
LKML-Reference: <1274778175.5882.623.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 6 +++++-
 kernel/trace/trace_event_perf.c  | 4 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index fd4db0db370..c77586061bc 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1717,7 +1717,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
 	 */
 	regs->bp = rewind_frame_pointer(skip + 1);
 	regs->cs = __KERNEL_CS;
-	local_save_flags(regs->flags);
+	/*
+	 * We abuse bit 3 to pass exact information, see perf_misc_flags
+	 * and the comment with PERF_EFLAGS_EXACT.
+	 */
+	regs->flags = 0;
 }
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 26b8607a0ab..cb6f365016e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -157,6 +157,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 				       struct pt_regs *regs, int *rctxp)
 {
 	struct trace_entry *entry;
+	unsigned long flags;
 	char *raw_data;
 	int pc;
 
@@ -174,7 +175,8 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
 	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
 
 	entry = (struct trace_entry *)raw_data;
-	tracing_generic_entry_update(entry, regs->flags, pc);
+	local_save_flags(flags);
+	tracing_generic_entry_update(entry, flags, pc);
 	entry->type = type;
 
 	return raw_data;
-- 
cgit v1.2.3-70-g09d2


From b3b77c8caef1750ebeea1054e39e358550ea9f55 Mon Sep 17 00:00:00 2001
From: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Date: Mon, 24 May 2010 14:33:01 -0700
Subject: endian: #define __BYTE_ORDER

Linux does not define __BYTE_ORDER in its endian header files which makes
some header files bend backwards to get at the current endian.  Lets
#define __BYTE_ORDER in big_endian.h/litte_endian.h to make it easier for
header files that are used in user space too.

In userspace the convention is that

  1. _both_ __LITTLE_ENDIAN and __BIG_ENDIAN are defined,
  2. you have to test for e.g. __BYTE_ORDER == __BIG_ENDIAN.

Signed-off-by: Joakim Tjernlund <Joakim.Tjernlund@transmode.se>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/math-emu/sfp-util.h          | 5 -----
 arch/powerpc/include/asm/sfp-machine.h  | 6 ------
 arch/s390/include/asm/sfp-util.h        | 2 --
 arch/sh/math-emu/sfp-util.h             | 4 ----
 arch/sparc/math-emu/sfp-util_32.h       | 6 ------
 arch/sparc/math-emu/sfp-util_64.h       | 6 ------
 arch/x86/boot/compressed/relocs.c       | 4 ++--
 include/linux/byteorder/big_endian.h    | 3 +++
 include/linux/byteorder/little_endian.h | 3 +++
 9 files changed, 8 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/math-emu/sfp-util.h b/arch/alpha/math-emu/sfp-util.h
index f53707f7745..d4c6ae7fee4 100644
--- a/arch/alpha/math-emu/sfp-util.h
+++ b/arch/alpha/math-emu/sfp-util.h
@@ -28,8 +28,3 @@ extern unsigned long __udiv_qrnnd (unsigned long *, unsigned long,
 #define UDIV_NEEDS_NORMALIZATION 1  
 
 #define abort()			goto bad_insn
-
-#ifndef __LITTLE_ENDIAN
-#define __LITTLE_ENDIAN -1
-#endif
-#define __BYTE_ORDER __LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
index 3a7a67a0d00..8b8fab91ad1 100644
--- a/arch/powerpc/include/asm/sfp-machine.h
+++ b/arch/powerpc/include/asm/sfp-machine.h
@@ -353,12 +353,6 @@
 #define abort()								\
 	return 0
 
-#ifdef __BIG_ENDIAN
-#define __BYTE_ORDER __BIG_ENDIAN
-#else
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
-
 /* Exception flags. */
 #define EFLAG_INVALID		(1 << (31 - 2))
 #define EFLAG_OVERFLOW		(1 << (31 - 3))
diff --git a/arch/s390/include/asm/sfp-util.h b/arch/s390/include/asm/sfp-util.h
index 0addc6466d9..7d43fee17e3 100644
--- a/arch/s390/include/asm/sfp-util.h
+++ b/arch/s390/include/asm/sfp-util.h
@@ -73,5 +73,3 @@ extern unsigned long __udiv_qrnnd (unsigned int *, unsigned int,
 #define UDIV_NEEDS_NORMALIZATION 0
 
 #define abort() return 0
-
-#define __BYTE_ORDER __BIG_ENDIAN
diff --git a/arch/sh/math-emu/sfp-util.h b/arch/sh/math-emu/sfp-util.h
index 8ae1bd310ad..e8526021892 100644
--- a/arch/sh/math-emu/sfp-util.h
+++ b/arch/sh/math-emu/sfp-util.h
@@ -66,7 +66,3 @@
   } while (0)
 
 #define abort()	return 0
-
-#define __BYTE_ORDER __LITTLE_ENDIAN
-
-
diff --git a/arch/sparc/math-emu/sfp-util_32.h b/arch/sparc/math-emu/sfp-util_32.h
index d1b2aff3c25..0ea35afbb91 100644
--- a/arch/sparc/math-emu/sfp-util_32.h
+++ b/arch/sparc/math-emu/sfp-util_32.h
@@ -107,9 +107,3 @@
 
 #define abort()								\
 	return 0
-
-#ifdef __BIG_ENDIAN
-#define __BYTE_ORDER __BIG_ENDIAN
-#else
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
diff --git a/arch/sparc/math-emu/sfp-util_64.h b/arch/sparc/math-emu/sfp-util_64.h
index 425d3cf01af..d17c9bc7218 100644
--- a/arch/sparc/math-emu/sfp-util_64.h
+++ b/arch/sparc/math-emu/sfp-util_64.h
@@ -112,9 +112,3 @@
 
 #define abort() \
 	return 0
-
-#ifdef __BIG_ENDIAN
-#define __BYTE_ORDER __BIG_ENDIAN
-#else
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 89bbf4e4d05..7b1aaa20c7b 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -195,11 +195,11 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
 
 
-#if BYTE_ORDER == LITTLE_ENDIAN
+#if __BYTE_ORDER == __LITTLE_ENDIAN
 #define le16_to_cpu(val) (val)
 #define le32_to_cpu(val) (val)
 #endif
-#if BYTE_ORDER == BIG_ENDIAN
+#if __BYTE_ORDER == __BIG_ENDIAN
 #define le16_to_cpu(val) bswap_16(val)
 #define le32_to_cpu(val) bswap_32(val)
 #endif
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index 3c80fd7e8b5..d53a67dff01 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -7,6 +7,9 @@
 #ifndef __BIG_ENDIAN_BITFIELD
 #define __BIG_ENDIAN_BITFIELD
 #endif
+#ifndef __BYTE_ORDER
+#define __BYTE_ORDER __BIG_ENDIAN
+#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index 83195fb8296..f7f8ad13adb 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -7,6 +7,9 @@
 #ifndef __LITTLE_ENDIAN_BITFIELD
 #define __LITTLE_ENDIAN_BITFIELD
 #endif
+#ifndef __BYTE_ORDER
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
-- 
cgit v1.2.3-70-g09d2


From a321cedb12904114e2ba5041a3673ca24deb09c9 Mon Sep 17 00:00:00 2001
From: Carsten Emde <C.Emde@osadl.org>
Date: Mon, 24 May 2010 14:33:41 -0700
Subject: drivers/hwmon/coretemp.c: get TjMax value from MSR

The MSR IA32_TEMPERATURE_TARGET contains the TjMax value in the newer
Intel processors.

Signed-off-by: Huaxu Wan <huaxu.wan@linux.intel.com>
Signed-off-by: Carsten Emde <C.Emde@osadl.org>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Henrique de Moraes Holschuh <hmh@hmh.eng.br>
Cc: Yong Wang <yong.y.wang@linux.intel.com>
Cc: Rudolf Marek <r.marek@assembler.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/msr-index.h |  2 ++
 drivers/hwmon/coretemp.c         | 61 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index f9324851eba..b49d8ca228f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,8 @@
 
 #define MSR_IA32_MISC_ENABLE		0x000001a0
 
+#define MSR_IA32_TEMPERATURE_TARGET	0x000001a2
+
 /* MISC_ENABLE bits: architectural */
 #define MSR_IA32_MISC_ENABLE_FAST_STRING	(1ULL << 0)
 #define MSR_IA32_MISC_ENABLE_TCC		(1ULL << 1)
diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 9fae7cbc5d7..2988da150ed 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -241,6 +241,55 @@ static int __devinit adjust_tjmax(struct cpuinfo_x86 *c, u32 id, struct device *
 	return tjmax;
 }
 
+static int __devinit get_tjmax(struct cpuinfo_x86 *c, u32 id,
+			       struct device *dev)
+{
+	/* The 100C is default for both mobile and non mobile CPUs */
+	int err;
+	u32 eax, edx;
+	u32 val;
+
+	/* A new feature of current Intel(R) processors, the
+	   IA32_TEMPERATURE_TARGET contains the TjMax value */
+	err = rdmsr_safe_on_cpu(id, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx);
+	if (err) {
+		dev_warn(dev, "Unable to read TjMax from CPU.\n");
+	} else {
+		val = (eax >> 16) & 0xff;
+		/*
+		 * If the TjMax is not plausible, an assumption
+		 * will be used
+		 */
+		if ((val > 80) && (val < 120)) {
+			dev_info(dev, "TjMax is %d C.\n", val);
+			return val * 1000;
+		}
+	}
+
+	/*
+	 * An assumption is made for early CPUs and unreadable MSR.
+	 * NOTE: the given value may not be correct.
+	 */
+
+	switch (c->x86_model) {
+	case 0xe:
+	case 0xf:
+	case 0x16:
+	case 0x1a:
+		dev_warn(dev, "TjMax is assumed as 100 C!\n");
+		return 100000;
+		break;
+	case 0x17:
+	case 0x1c:		/* Atom CPUs */
+		return adjust_tjmax(c, id, dev);
+		break;
+	default:
+		dev_warn(dev, "CPU (model=0x%x) is not supported yet,"
+			" using default TjMax of 100C.\n", c->x86_model);
+		return 100000;
+	}
+}
+
 static int __devinit coretemp_probe(struct platform_device *pdev)
 {
 	struct coretemp_data *data;
@@ -283,14 +332,18 @@ static int __devinit coretemp_probe(struct platform_device *pdev)
 		}
 	}
 
-	data->tjmax = adjust_tjmax(c, data->id, &pdev->dev);
+	data->tjmax = get_tjmax(c, data->id, &pdev->dev);
 	platform_set_drvdata(pdev, data);
 
-	/* read the still undocumented IA32_TEMPERATURE_TARGET it exists
-	   on older CPUs but not in this register, Atoms don't have it either */
+	/*
+	 * read the still undocumented IA32_TEMPERATURE_TARGET. It exists
+	 * on older CPUs but not in this register,
+	 * Atoms don't have it either.
+	 */
 
 	if ((c->x86_model > 0xe) && (c->x86_model != 0x1c)) {
-		err = rdmsr_safe_on_cpu(data->id, 0x1a2, &eax, &edx);
+		err = rdmsr_safe_on_cpu(data->id, MSR_IA32_TEMPERATURE_TARGET,
+		    &eax, &edx);
 		if (err) {
 			dev_warn(&pdev->dev, "Unable to read"
 					" IA32_TEMPERATURE_TARGET MSR\n");
-- 
cgit v1.2.3-70-g09d2


From 578454ff7eab61d13a26b568f99a89a2c9edc881 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 20 May 2010 18:07:20 +0200
Subject: driver core: add devname module aliases to allow module on-demand
 auto-loading

This adds:
  alias: devname:<name>
to some common kernel modules, which will allow the on-demand loading
of the kernel module when the device node is accessed.

Ideally all these modules would be compiled-in, but distros seems too
much in love with their modularization that we need to cover the common
cases with this new facility. It will allow us to remove a bunch of pretty
useless init scripts and modprobes from init scripts.

The static device node aliases will be carried in the module itself. The
program depmod will extract this information to a file in the module directory:
  $ cat /lib/modules/2.6.34-00650-g537b60d-dirty/modules.devname
  # Device nodes to trigger on-demand module loading.
  microcode cpu/microcode c10:184
  fuse fuse c10:229
  ppp_generic ppp c108:0
  tun net/tun c10:200
  dm_mod mapper/control c10:235

Udev will pick up the depmod created file on startup and create all the
static device nodes which the kernel modules specify, so that these modules
get automatically loaded when the device node is accessed:
  $ /sbin/udevd --debug
  ...
  static_dev_create_from_modules: mknod '/dev/cpu/microcode' c10:184
  static_dev_create_from_modules: mknod '/dev/fuse' c10:229
  static_dev_create_from_modules: mknod '/dev/ppp' c108:0
  static_dev_create_from_modules: mknod '/dev/net/tun' c10:200
  static_dev_create_from_modules: mknod '/dev/mapper/control' c10:235
  udev_rules_apply_static_dev_perms: chmod '/dev/net/tun' 0666
  udev_rules_apply_static_dev_perms: chmod '/dev/fuse' 0666

A few device nodes are switched to statically allocated numbers, to allow
the static nodes to work. This might also useful for systems which still run
a plain static /dev, which is completely unsafe to use with any dynamic minor
numbers.

Note:
The devname aliases must be limited to the *common* and *single*instance*
device nodes, like the misc devices, and never be used for conceptually limited
systems like the loop devices, which should rather get fixed properly and get a
control node for losetup to talk to, instead of creating a random number of
device nodes in advance, regardless if they are ever used.

This facility is to hide the mess distros are creating with too modualized
kernels, and just to hide that these modules are not compiled-in, and not to
paper-over broken concepts. Thanks! :)

Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: David S. Miller <davem@davemloft.net>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Cc: Ian Kent <raven@themaw.net>
Signed-Off-By: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/devices.txt        | 2 ++
 arch/x86/kernel/microcode_core.c | 1 +
 drivers/net/ppp_generic.c        | 4 ++--
 drivers/net/tun.c                | 1 +
 fs/autofs4/dev-ioctl.c           | 5 ++++-
 fs/btrfs/super.c                 | 5 ++++-
 fs/fuse/dev.c                    | 1 +
 include/linux/miscdevice.h       | 2 ++
 8 files changed, 17 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/devices.txt b/Documentation/devices.txt
index 53d64d38234..1d83d124056 100644
--- a/Documentation/devices.txt
+++ b/Documentation/devices.txt
@@ -443,6 +443,8 @@ Your cooperation is appreciated.
 		231 = /dev/snapshot	System memory snapshot device
 		232 = /dev/kvm		Kernel-based virtual machine (hardware virtualization extensions)
 		233 = /dev/kmview	View-OS A process with a view
+		234 = /dev/btrfs-control	Btrfs control device
+		235 = /dev/autofs	Autofs control device
 		240-254			Reserved for local use
 		255			Reserved for MISC_DYNAMIC_MINOR
 
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 2cd8c544e41..fa6551d36c1 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
 }
 
 MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
 #else
 #define microcode_dev_init()	0
 #define microcode_dev_exit()	do { } while (0)
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 5441688daba..c5f8eb102bf 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -2926,5 +2926,5 @@ EXPORT_SYMBOL(ppp_output_wakeup);
 EXPORT_SYMBOL(ppp_register_compressor);
 EXPORT_SYMBOL(ppp_unregister_compressor);
 MODULE_LICENSE("GPL");
-MODULE_ALIAS_CHARDEV_MAJOR(PPP_MAJOR);
-MODULE_ALIAS("/dev/ppp");
+MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0);
+MODULE_ALIAS("devname:ppp");
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 97b25533e5f..005cad68957 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1649,3 +1649,4 @@ MODULE_DESCRIPTION(DRV_DESCRIPTION);
 MODULE_AUTHOR(DRV_COPYRIGHT);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(TUN_MINOR);
+MODULE_ALIAS("devname:net/tun");
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df86..d832062869f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -736,11 +736,14 @@ static const struct file_operations _dev_ioctl_fops = {
 };
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
-	.minor 		= MISC_DYNAMIC_MINOR,
+	.minor		= AUTOFS_MINOR,
 	.name  		= AUTOFS_DEVICE_NAME,
 	.fops  		= &_dev_ioctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
+MODULE_ALIAS("devname:autofs");
+
 /* Register/deregister misc character device */
 int autofs_dev_ioctl_init(void)
 {
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538..2909a03e523 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -832,11 +832,14 @@ static const struct file_operations btrfs_ctl_fops = {
 };
 
 static struct miscdevice btrfs_misc = {
-	.minor		= MISC_DYNAMIC_MINOR,
+	.minor		= BTRFS_MINOR,
 	.name		= "btrfs-control",
 	.fops		= &btrfs_ctl_fops
 };
 
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
+
 static int btrfs_interface_init(void)
 {
 	return misc_register(&btrfs_misc);
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691..e53df5ebb2b 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
 
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 8b5f7cc0fba..b631c46cffd 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -31,6 +31,8 @@
 #define FUSE_MINOR		229
 #define KVM_MINOR		232
 #define VHOST_NET_MINOR		233
+#define BTRFS_MINOR		234
+#define AUTOFS_MINOR		235
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;
-- 
cgit v1.2.3-70-g09d2


From fe501f1e89cd460793152f500bf25d81d463515b Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 25 May 2010 15:28:58 +0000
Subject: x86, k8: Fix section mismatch for powernowk8_exit()

Fix the following warning:

"WARNING: arch/x86/kernel/built-in.o(.exit.text+0x72):
Section mismatch in reference from the function powernowk8_exit() to the variable .cpuinit.data:cpb_nb

The function __exit powernowk8_exit() references a variable
__cpuinitdata cpb_nb. This is often seen when error handling in the exit
function uses functionality in the init path. The fix is often to remove
the __cpuinitdata annotation of cpb_nb so it may be used outside an init
section."

Cc: <stable@kernel.org>
Reported-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
LKML-Reference: <20100525152858.GA24836@aftab>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 6f3dc8fbbfd..7ec2123838e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
  * simply keep the boost-disable flag in sync with the current global
  * state.
  */
-static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
-				void *hcpu)
+static int cpb_notify(struct notifier_block *nb, unsigned long action,
+		      void *hcpu)
 {
 	unsigned cpu = (long)hcpu;
 	u32 lo, hi;
@@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata cpb_nb = {
+static struct notifier_block cpb_nb = {
 	.notifier_call		= cpb_notify,
 };
 
-- 
cgit v1.2.3-70-g09d2


From 13da9e200fe4740b02cd51e07ab454627e228920 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 26 May 2010 08:30:15 -0700
Subject: Revert "endian: #define __BYTE_ORDER"

This reverts commit b3b77c8caef1750ebeea1054e39e358550ea9f55, which was
also totally broken (see commit 0d2daf5cc858 that reverted the crc32
version of it).  As reported by Stephen Rothwell, it causes problems on
big-endian machines:

> In file included from fs/jfs/jfs_types.h:33,
>                  from fs/jfs/jfs_incore.h:26,
>                  from fs/jfs/file.c:22:
> fs/jfs/endian24.h:36:101: warning: "__LITTLE_ENDIAN" is not defined

The kernel has never had that crazy "__BYTE_ORDER == __LITTLE_ENDIAN"
model.  It's not how we do things, and it isn't how we _should_ do
things.  So don't go there.

Requested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/math-emu/sfp-util.h          | 5 +++++
 arch/powerpc/include/asm/sfp-machine.h  | 6 ++++++
 arch/s390/include/asm/sfp-util.h        | 2 ++
 arch/sh/math-emu/sfp-util.h             | 4 ++++
 arch/sparc/math-emu/sfp-util_32.h       | 6 ++++++
 arch/sparc/math-emu/sfp-util_64.h       | 6 ++++++
 arch/x86/boot/compressed/relocs.c       | 4 ++--
 include/linux/byteorder/big_endian.h    | 3 ---
 include/linux/byteorder/little_endian.h | 3 ---
 9 files changed, 31 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/math-emu/sfp-util.h b/arch/alpha/math-emu/sfp-util.h
index d4c6ae7fee4..f53707f7745 100644
--- a/arch/alpha/math-emu/sfp-util.h
+++ b/arch/alpha/math-emu/sfp-util.h
@@ -28,3 +28,8 @@ extern unsigned long __udiv_qrnnd (unsigned long *, unsigned long,
 #define UDIV_NEEDS_NORMALIZATION 1  
 
 #define abort()			goto bad_insn
+
+#ifndef __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN -1
+#endif
+#define __BYTE_ORDER __LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/sfp-machine.h b/arch/powerpc/include/asm/sfp-machine.h
index 8b8fab91ad1..3a7a67a0d00 100644
--- a/arch/powerpc/include/asm/sfp-machine.h
+++ b/arch/powerpc/include/asm/sfp-machine.h
@@ -353,6 +353,12 @@
 #define abort()								\
 	return 0
 
+#ifdef __BIG_ENDIAN
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
+
 /* Exception flags. */
 #define EFLAG_INVALID		(1 << (31 - 2))
 #define EFLAG_OVERFLOW		(1 << (31 - 3))
diff --git a/arch/s390/include/asm/sfp-util.h b/arch/s390/include/asm/sfp-util.h
index 7d43fee17e3..0addc6466d9 100644
--- a/arch/s390/include/asm/sfp-util.h
+++ b/arch/s390/include/asm/sfp-util.h
@@ -73,3 +73,5 @@ extern unsigned long __udiv_qrnnd (unsigned int *, unsigned int,
 #define UDIV_NEEDS_NORMALIZATION 0
 
 #define abort() return 0
+
+#define __BYTE_ORDER __BIG_ENDIAN
diff --git a/arch/sh/math-emu/sfp-util.h b/arch/sh/math-emu/sfp-util.h
index e8526021892..8ae1bd310ad 100644
--- a/arch/sh/math-emu/sfp-util.h
+++ b/arch/sh/math-emu/sfp-util.h
@@ -66,3 +66,7 @@
   } while (0)
 
 #define abort()	return 0
+
+#define __BYTE_ORDER __LITTLE_ENDIAN
+
+
diff --git a/arch/sparc/math-emu/sfp-util_32.h b/arch/sparc/math-emu/sfp-util_32.h
index 0ea35afbb91..d1b2aff3c25 100644
--- a/arch/sparc/math-emu/sfp-util_32.h
+++ b/arch/sparc/math-emu/sfp-util_32.h
@@ -107,3 +107,9 @@
 
 #define abort()								\
 	return 0
+
+#ifdef __BIG_ENDIAN
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
diff --git a/arch/sparc/math-emu/sfp-util_64.h b/arch/sparc/math-emu/sfp-util_64.h
index d17c9bc7218..425d3cf01af 100644
--- a/arch/sparc/math-emu/sfp-util_64.h
+++ b/arch/sparc/math-emu/sfp-util_64.h
@@ -112,3 +112,9 @@
 
 #define abort() \
 	return 0
+
+#ifdef __BIG_ENDIAN
+#define __BYTE_ORDER __BIG_ENDIAN
+#else
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#endif
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index 7b1aaa20c7b..89bbf4e4d05 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -195,11 +195,11 @@ static const char *sym_name(const char *sym_strtab, Elf32_Sym *sym)
 
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
+#if BYTE_ORDER == LITTLE_ENDIAN
 #define le16_to_cpu(val) (val)
 #define le32_to_cpu(val) (val)
 #endif
-#if __BYTE_ORDER == __BIG_ENDIAN
+#if BYTE_ORDER == BIG_ENDIAN
 #define le16_to_cpu(val) bswap_16(val)
 #define le32_to_cpu(val) bswap_32(val)
 #endif
diff --git a/include/linux/byteorder/big_endian.h b/include/linux/byteorder/big_endian.h
index d53a67dff01..3c80fd7e8b5 100644
--- a/include/linux/byteorder/big_endian.h
+++ b/include/linux/byteorder/big_endian.h
@@ -7,9 +7,6 @@
 #ifndef __BIG_ENDIAN_BITFIELD
 #define __BIG_ENDIAN_BITFIELD
 #endif
-#ifndef __BYTE_ORDER
-#define __BYTE_ORDER __BIG_ENDIAN
-#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
diff --git a/include/linux/byteorder/little_endian.h b/include/linux/byteorder/little_endian.h
index f7f8ad13adb..83195fb8296 100644
--- a/include/linux/byteorder/little_endian.h
+++ b/include/linux/byteorder/little_endian.h
@@ -7,9 +7,6 @@
 #ifndef __LITTLE_ENDIAN_BITFIELD
 #define __LITTLE_ENDIAN_BITFIELD
 #endif
-#ifndef __BYTE_ORDER
-#define __BYTE_ORDER __LITTLE_ENDIAN
-#endif
 
 #include <linux/types.h>
 #include <linux/swab.h>
-- 
cgit v1.2.3-70-g09d2


From 20413f27163fb1b8b806c0c219dc95eae67c633a Mon Sep 17 00:00:00 2001
From: Xiaotian Feng <dfeng@redhat.com>
Date: Wed, 26 May 2010 09:51:10 +0800
Subject: x86, pat: Fix memory leak in free_memtype

Reserve_memtype will allocate memory for new memtype, but
in free_memtype, after the memtype erased from rbtree, the
memory is not freed.

Changes since V1:
	make rbt_memtype_erase return erased memtype so that
	it can be freed in free_memtype.

[ hpa: not for -stable: 2.6.34 and earlier not affected ]

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
LKML-Reference: <1274838670-8731-1-git-send-email-dfeng@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Jack Steiner <steiner@sgi.com>
Acked-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/mm/pat.c          | 10 +++++++---
 arch/x86/mm/pat_internal.h |  6 +++---
 arch/x86/mm/pat_rbtree.c   |  7 ++++---
 3 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index bbe5502ee1c..acc15b23b74 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -336,6 +336,7 @@ int free_memtype(u64 start, u64 end)
 {
 	int err = -EINVAL;
 	int is_range_ram;
+	struct memtype *entry;
 
 	if (!pat_enabled)
 		return 0;
@@ -355,17 +356,20 @@ int free_memtype(u64 start, u64 end)
 	}
 
 	spin_lock(&memtype_lock);
-	err = rbt_memtype_erase(start, end);
+	entry = rbt_memtype_erase(start, end);
 	spin_unlock(&memtype_lock);
 
-	if (err) {
+	if (!entry) {
 		printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
 			current->comm, current->pid, start, end);
+		return -EINVAL;
 	}
 
+	kfree(entry);
+
 	dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
 
-	return err;
+	return 0;
 }
 
 
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index 4f39eefa3e6..77e5ba153fa 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -28,15 +28,15 @@ static inline char *cattr_name(unsigned long flags)
 #ifdef CONFIG_X86_PAT
 extern int rbt_memtype_check_insert(struct memtype *new,
 					unsigned long *new_type);
-extern int rbt_memtype_erase(u64 start, u64 end);
+extern struct memtype *rbt_memtype_erase(u64 start, u64 end);
 extern struct memtype *rbt_memtype_lookup(u64 addr);
 extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos);
 #else
 static inline int rbt_memtype_check_insert(struct memtype *new,
 					unsigned long *new_type)
 { return 0; }
-static inline int rbt_memtype_erase(u64 start, u64 end)
-{ return 0; }
+static inline struct memtype *rbt_memtype_erase(u64 start, u64 end)
+{ return NULL; }
 static inline struct memtype *rbt_memtype_lookup(u64 addr)
 { return NULL; }
 static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos)
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 07de4cb8cc3..f537087bb74 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -231,16 +231,17 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type)
 	return err;
 }
 
-int rbt_memtype_erase(u64 start, u64 end)
+struct memtype *rbt_memtype_erase(u64 start, u64 end)
 {
 	struct memtype *data;
 
 	data = memtype_rb_exact_match(&memtype_rbroot, start, end);
 	if (!data)
-		return -EINVAL;
+		goto out;
 
 	rb_erase(&data->rb, &memtype_rbroot);
-	return 0;
+out:
+	return data;
 }
 
 struct memtype *rbt_memtype_lookup(u64 addr)
-- 
cgit v1.2.3-70-g09d2


From 84fe6c19e4a598e8071e3bd1b2c923454eae1268 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Thu, 27 May 2010 12:31:51 +0200
Subject: arch/x86/kernel: Add missing spin_unlock

Add a spin_unlock missing on the error path.  The locks and unlocks are
balanced in other functions, so it seems that the same should be the case
here.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression E1;
@@

* spin_lock(E1,...);
  <+... when != E1
  if (...) {
    ... when != E1
*   return ...;
  }
  ...+>
* spin_unlock(E1,...);
// </smpl>

Cc: stable@kernel.org
Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index fa5a1474cd1..8a9aaa8412c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1487,6 +1487,7 @@ static int __attach_device(struct device *dev,
 			   struct protection_domain *domain)
 {
 	struct iommu_dev_data *dev_data, *alias_data;
+	int ret;
 
 	dev_data   = get_dev_data(dev);
 	alias_data = get_dev_data(dev_data->alias);
@@ -1498,13 +1499,14 @@ static int __attach_device(struct device *dev,
 	spin_lock(&domain->lock);
 
 	/* Some sanity checks */
+	ret = -EBUSY;
 	if (alias_data->domain != NULL &&
 	    alias_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	if (dev_data->domain != NULL &&
 	    dev_data->domain != domain)
-		return -EBUSY;
+		goto out_unlock;
 
 	/* Do real assignment */
 	if (dev_data->alias != dev) {
@@ -1520,10 +1522,14 @@ static int __attach_device(struct device *dev,
 
 	atomic_inc(&dev_data->bind);
 
+	ret = 0;
+
+out_unlock:
+
 	/* ready */
 	spin_unlock(&domain->lock);
 
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 26 May 2010 14:42:51 -0700
Subject: cpusets: randomize node rotor used in cpuset_mem_spread_node()

Some workloads that create a large number of small files tend to assign
too many pages to node 0 (multi-node systems).  Part of the reason is that
the rotor (in cpuset_mem_spread_node()) used to assign nodes starts at
node 0 for newly created tasks.

This patch changes the rotor to be initialized to a random node number of
the cpuset.

[akpm@linux-foundation.org: fix layout]
[Lee.Schermerhorn@hp.com: Define stub numa_random() for !NUMA configuration]
Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Paul Menage <menage@google.com>
Cc: Jack Steiner <steiner@sgi.com>
Cc: Robin Holt <holt@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa.c       | 17 +++++++++++++++++
 include/linux/bitmap.h   |  1 +
 include/linux/nodemask.h |  8 ++++++++
 kernel/fork.c            |  4 ++++
 lib/bitmap.c             |  2 +-
 5 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 550df481acc..10c27bb1e95 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,7 @@
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <linux/bootmem.h>
+#include <linux/random.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 # define DBG(x...) printk(KERN_DEBUG x)
@@ -65,3 +66,19 @@ const struct cpumask *cpumask_of_node(int node)
 }
 EXPORT_SYMBOL(cpumask_of_node);
 #endif
+
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ *   (returns -1 if nodemask is empty)
+ */
+int __node_random(const nodemask_t *maskp)
+{
+	int w, bit = -1;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
+}
+EXPORT_SYMBOL(__node_random);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index daf8c480c78..6fb2720882f 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -141,6 +141,7 @@ extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
+extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
 
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index dba35e41337..8a8f1d09c13 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -66,6 +66,8 @@
  * int num_online_nodes()		Number of online Nodes
  * int num_possible_nodes()		Number of all possible Nodes
  *
+ * int node_random(mask)		Random node with set bit in mask
+ *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
  *
@@ -430,6 +432,10 @@ static inline void node_set_offline(int nid)
 	node_clear_state(nid, N_ONLINE);
 	nr_online_nodes = num_node_state(N_ONLINE);
 }
+
+#define node_random(mask) __node_random(&(mask))
+extern int __node_random(const nodemask_t *maskp);
+
 #else
 
 static inline int node_state(int node, enum node_states state)
@@ -460,6 +466,8 @@ static inline int num_node_state(enum node_states state)
 
 #define node_set_online(node)	   node_set_state((node), N_ONLINE)
 #define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
+
+static inline int node_random(const nodemask_t mask) { return 0; }
 #endif
 
 #define node_online_map 	node_states[N_ONLINE]
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d57d9e3a6e..2e9cc3139ec 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1079,6 +1079,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  	}
 	mpol_fix_fork_child_flag(p);
 #endif
+#ifdef CONFIG_CPUSETS
+	p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
+	p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
+#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
diff --git a/lib/bitmap.c b/lib/bitmap.c
index ffb78c916cc..d7137e7e06e 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -672,7 +672,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits)
  *
  * The bit positions 0 through @bits are valid positions in @buf.
  */
-static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
 {
 	int pos = 0;
 
-- 
cgit v1.2.3-70-g09d2


From a94247e7fb99170590dc9592792045c6fa49c7f5 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 26 May 2010 14:43:30 -0700
Subject: x86: convert cpu notifier to return encapsulate errno value

By the previous modification, the cpu notifier can return encapsulate
errno value.  This converts the cpu notifiers for msr, cpuid, and
therm_throt.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +-
 arch/x86/kernel/cpuid.c                  | 2 +-
 arch/x86/kernel/msr.c                    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb2..e1a0a3bf971 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
 		mutex_unlock(&therm_cpu_lock);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900f..1b7b31ab7d8 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
 		cpuid_device_destroy(cpu);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47..7bf2dc4c8f7 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
 		msr_device_destroy(cpu);
 		break;
 	}
-	return err ? NOTIFY_BAD : NOTIFY_OK;
+	return notifier_from_errno(err);
 }
 
 static struct notifier_block __refdata msr_class_cpu_notifier = {
-- 
cgit v1.2.3-70-g09d2


From de006a071cbb08fff6663d98f5b9bac7ffb47559 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 26 May 2010 14:44:16 -0700
Subject: x86: remove unnecessary sync_single_range_* in swiotlb_dma_ops

sync_single_range_for_cpu and sync_single_range_for_device hooks in
swiotlb_dma_ops are unnecessary because sync_single_for_cpu and
sync_single_for_device are used there.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/pci-swiotlb.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20..a5bc528d432 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
 	.free_coherent = swiotlb_free_coherent,
 	.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
 	.sync_single_for_device = swiotlb_sync_single_for_device,
-	.sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
-	.sync_single_range_for_device = swiotlb_sync_single_range_for_device,
 	.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
 	.sync_sg_for_device = swiotlb_sync_sg_for_device,
 	.map_sg = swiotlb_map_sg_attrs,
-- 
cgit v1.2.3-70-g09d2


From 18e98307de0d746cb0845ebf66535ce2184c25a2 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 26 May 2010 14:44:32 -0700
Subject: asm-generic: add NEED_SG_DMA_LENGTH to define sg_dma_len()

There are only two ways to define sg_dma_len(); use sg->dma_length or
sg->length.  This patch introduces NEED_SG_DMA_LENGTH that enables
architectures to choose sg->dma_length or sg->length.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Matt Turner <mattst88@gmail.com>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/ia64/Kconfig                    |  3 +++
 arch/sh/Kconfig                      |  3 +++
 arch/sparc/Kconfig                   |  3 +++
 arch/sparc/include/asm/scatterlist.h |  2 --
 arch/x86/Kconfig                     |  3 +++
 include/asm-generic/scatterlist.h    | 13 +++++--------
 6 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 9676100b83e..f8afdd271f3 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -56,6 +56,9 @@ config MMU
 config NEED_DMA_MAP_STATE
 	def_bool y
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 config SWIOTLB
        bool
 
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 0e318c905ee..c5ee4ce60b5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -186,6 +186,9 @@ config DMA_NONCOHERENT
 config NEED_DMA_MAP_STATE
 	def_bool DMA_NONCOHERENT
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index d6781ce687e..6f1470baa31 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -133,6 +133,9 @@ config ZONE_DMA
 config NEED_DMA_MAP_STATE
 	def_bool y
 
+config NEED_SG_DMA_LENGTH
+	def_bool y
+
 config GENERIC_ISA_DMA
 	bool
 	default y if SPARC32
diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h
index 0fa0d6da210..69d21bb052f 100644
--- a/arch/sparc/include/asm/scatterlist.h
+++ b/arch/sparc/include/asm/scatterlist.h
@@ -1,8 +1,6 @@
 #ifndef _SPARC_SCATTERLIST_H
 #define _SPARC_SCATTERLIST_H
 
-#define sg_dma_len(sg)     	((sg)->dma_length)
-
 #define ISA_DMA_THRESHOLD	(~0UL)
 
 #include <asm-generic/scatterlist.h>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e0c619c55b4..5bdc143b122 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -109,6 +109,9 @@ config SBUS
 config NEED_DMA_MAP_STATE
        def_bool (X86_64 || DMAR || DMA_API_DEBUG)
 
+config NEED_SG_DMA_LENGTH
+	def_bool X86_64
+
 config GENERIC_ISA_DMA
 	def_bool y
 
diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h
index 51a7a43ab0c..5e087944a65 100644
--- a/include/asm-generic/scatterlist.h
+++ b/include/asm-generic/scatterlist.h
@@ -11,7 +11,9 @@ struct scatterlist {
 	unsigned int	offset;
 	unsigned int	length;
 	dma_addr_t	dma_address;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
 	unsigned int	dma_length;
+#endif
 };
 
 /*
@@ -22,17 +24,12 @@ struct scatterlist {
  * is 0.
  */
 #define sg_dma_address(sg)	((sg)->dma_address)
-#ifndef sg_dma_len
-/*
- * Normally, you have an iommu on 64 bit machines, but not on 32 bit
- * machines. Architectures that are differnt should override this.
- */
-#if __BITS_PER_LONG == 64
+
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
 #define sg_dma_len(sg)		((sg)->dma_length)
 #else
 #define sg_dma_len(sg)		((sg)->length)
-#endif /* 64 bit */
-#endif /* sg_dma_len */
+#endif
 
 #define ARCH_HAS_SG_CHAIN
 
-- 
cgit v1.2.3-70-g09d2


From 4a14d84ea2adc6c02dde4ae2d4552c15e014a475 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 26 May 2010 14:44:33 -0700
Subject: x86_32: use asm-generic/scatterlist.h

Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5bdc143b122..5dc54219e73 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -110,7 +110,7 @@ config NEED_DMA_MAP_STATE
        def_bool (X86_64 || DMAR || DMA_API_DEBUG)
 
 config NEED_SG_DMA_LENGTH
-	def_bool X86_64
+	def_bool y
 
 config GENERIC_ISA_DMA
 	def_bool y
-- 
cgit v1.2.3-70-g09d2


From 1ef04370d823a811d2cca9f237097559a6b99b12 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 26 May 2010 14:44:34 -0700
Subject: asm-generic: remove ARCH_HAS_SG_CHAIN in scatterlist.h

There are more architectures that don't support ARCH_HAS_SG_CHAIN than
those that support it.  This removes removes ARCH_HAS_SG_CHAIN in
asm-generic/scatterlist.h and lets arhictectures to define it.

It's clearer than defining ARCH_HAS_SG_CHAIN asm-generic/scatterlist.h and
undefing it in arhictectures that don't support it.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/scatterlist.h      | 4 +---
 arch/arm/include/asm/scatterlist.h        | 3 ---
 arch/ia64/include/asm/scatterlist.h       | 4 ++--
 arch/microblaze/include/asm/scatterlist.h | 4 ++--
 arch/powerpc/include/asm/scatterlist.h    | 1 +
 arch/sparc/include/asm/scatterlist.h      | 5 +++--
 arch/x86/include/asm/scatterlist.h        | 5 +++--
 include/asm-generic/scatterlist.h         | 2 --
 8 files changed, 12 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/include/asm/scatterlist.h b/arch/alpha/include/asm/scatterlist.h
index 85a0ef25516..5728c52a741 100644
--- a/arch/alpha/include/asm/scatterlist.h
+++ b/arch/alpha/include/asm/scatterlist.h
@@ -1,10 +1,8 @@
 #ifndef _ALPHA_SCATTERLIST_H
 #define _ALPHA_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD (~0UL)
-
 #include <asm-generic/scatterlist.h>
 
-#undef ARCH_HAS_SG_CHAIN
+#define ISA_DMA_THRESHOLD (~0UL)
 
 #endif /* !(_ALPHA_SCATTERLIST_H) */
diff --git a/arch/arm/include/asm/scatterlist.h b/arch/arm/include/asm/scatterlist.h
index bcda59f3994..2f87870d934 100644
--- a/arch/arm/include/asm/scatterlist.h
+++ b/arch/arm/include/asm/scatterlist.h
@@ -3,9 +3,6 @@
 
 #include <asm/memory.h>
 #include <asm/types.h>
-
 #include <asm-generic/scatterlist.h>
 
-#undef ARCH_HAS_SG_CHAIN
-
 #endif /* _ASMARM_SCATTERLIST_H */
diff --git a/arch/ia64/include/asm/scatterlist.h b/arch/ia64/include/asm/scatterlist.h
index d8e98961dec..f299a4fb25c 100644
--- a/arch/ia64/include/asm/scatterlist.h
+++ b/arch/ia64/include/asm/scatterlist.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_IA64_SCATTERLIST_H
 #define _ASM_IA64_SCATTERLIST_H
 
+#include <asm-generic/scatterlist.h>
 /*
  * It used to be that ISA_DMA_THRESHOLD had something to do with the
  * DMA-limits of ISA-devices.  Nowadays, its only remaining use (apart
@@ -10,7 +11,6 @@
  * that's 4GB - 1.
  */
 #define ISA_DMA_THRESHOLD	0xffffffff
-
-#include <asm-generic/scatterlist.h>
+#define ARCH_HAS_SG_CHAIN
 
 #endif /* _ASM_IA64_SCATTERLIST_H */
diff --git a/arch/microblaze/include/asm/scatterlist.h b/arch/microblaze/include/asm/scatterlist.h
index be44d94cba5..dc4a8900cc8 100644
--- a/arch/microblaze/include/asm/scatterlist.h
+++ b/arch/microblaze/include/asm/scatterlist.h
@@ -1,3 +1,3 @@
-#define ISA_DMA_THRESHOLD	(~0UL)
-
 #include <asm-generic/scatterlist.h>
+
+#define ISA_DMA_THRESHOLD	(~0UL)
diff --git a/arch/powerpc/include/asm/scatterlist.h b/arch/powerpc/include/asm/scatterlist.h
index 4ae35da4975..34cc78fd0ef 100644
--- a/arch/powerpc/include/asm/scatterlist.h
+++ b/arch/powerpc/include/asm/scatterlist.h
@@ -15,5 +15,6 @@
 #ifdef __powerpc64__
 #define ISA_DMA_THRESHOLD	(~0UL)
 #endif
+#define ARCH_HAS_SG_CHAIN
 
 #endif /* _ASM_POWERPC_SCATTERLIST_H */
diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h
index 69d21bb052f..433e45f05fd 100644
--- a/arch/sparc/include/asm/scatterlist.h
+++ b/arch/sparc/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
 #ifndef _SPARC_SCATTERLIST_H
 #define _SPARC_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD	(~0UL)
-
 #include <asm-generic/scatterlist.h>
 
+#define ISA_DMA_THRESHOLD	(~0UL)
+#define ARCH_HAS_SG_CHAIN
+
 #endif /* !(_SPARC_SCATTERLIST_H) */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
index 75af592677e..fb0b1874396 100644
--- a/arch/x86/include/asm/scatterlist.h
+++ b/arch/x86/include/asm/scatterlist.h
@@ -1,8 +1,9 @@
 #ifndef _ASM_X86_SCATTERLIST_H
 #define _ASM_X86_SCATTERLIST_H
 
-#define ISA_DMA_THRESHOLD (0x00ffffff)
-
 #include <asm-generic/scatterlist.h>
 
+#define ISA_DMA_THRESHOLD (0x00ffffff)
+#define ARCH_HAS_SG_CHAIN
+
 #endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/include/asm-generic/scatterlist.h b/include/asm-generic/scatterlist.h
index 5e087944a65..5de07355fad 100644
--- a/include/asm-generic/scatterlist.h
+++ b/include/asm-generic/scatterlist.h
@@ -31,6 +31,4 @@ struct scatterlist {
 #define sg_dma_len(sg)		((sg)->length)
 #endif
 
-#define ARCH_HAS_SG_CHAIN
-
 #endif /* __ASM_GENERIC_SCATTERLIST_H */
-- 
cgit v1.2.3-70-g09d2


From 7281201922a0063fa60804ce39c277fc98142a47 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Wed, 26 May 2010 14:44:56 -0700
Subject: numa: add generic percpu var numa_node_id() implementation

Rework the generic version of the numa_node_id() function to use the new
generic percpu variable infrastructure.

Guard the new implementation with a new config option:

        CONFIG_USE_PERCPU_NUMA_NODE_ID.

Archs which support this new implemention will default this option to 'y'
when NUMA is configured.  This config option could be removed if/when all
archs switch over to the generic percpu implementation of numa_node_id().
Arch support involves:

  1) converting any existing per cpu variable implementations to use
     this implementation.  x86_64 is an instance of such an arch.
  2) archs that don't use a per cpu variable for numa_node_id() will
     need to initialize the new per cpu variable "numa_node" as cpus
     are brought on-line.  ia64 is an example.
  3) Defining USE_PERCPU_NUMA_NODE_ID in arch dependent Kconfig--e.g.,
     when NUMA is configured.  This is required because I have
     retained the old implementation by default to allow archs to
     be modified incrementally, as desired.

Subsequent patches will convert x86_64 and ia64 to use this implemenation.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/topology.h |  4 ++++
 include/linux/topology.h        | 51 +++++++++++++++++++++++++++++++++++++----
 mm/page_alloc.c                 |  5 ++++
 3 files changed, 55 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index c5087d79658..a2e629476b0 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -170,6 +170,10 @@ static inline int numa_node_id(void)
 {
 	return 0;
 }
+/*
+ * indicate override:
+ */
+#define numa_node_id numa_node_id
 
 static inline int early_cpu_to_node(int cpu)
 {
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 5b81156780b..2e5518f4657 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -31,6 +31,7 @@
 #include <linux/bitops.h>
 #include <linux/mmzone.h>
 #include <linux/smp.h>
+#include <linux/percpu.h>
 #include <asm/topology.h>
 
 #ifndef node_has_online_mem
@@ -203,8 +204,53 @@ int arch_update_cpu_topology(void);
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
 #endif
+
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DECLARE_PER_CPU(int, numa_node);
+
+#ifndef numa_node_id
+/* Returns the number of the current Node. */
+static inline int numa_node_id(void)
+{
+	return __this_cpu_read(numa_node);
+}
+#endif
+
+#ifndef cpu_to_node
+static inline int cpu_to_node(int cpu)
+{
+	return per_cpu(numa_node, cpu);
+}
+#endif
+
+#ifndef set_numa_node
+static inline void set_numa_node(int node)
+{
+	percpu_write(numa_node, node);
+}
+#endif
+
+#ifndef set_cpu_numa_node
+static inline void set_cpu_numa_node(int cpu, int node)
+{
+	per_cpu(numa_node, cpu) = node;
+}
+#endif
+
+#else	/* !CONFIG_USE_PERCPU_NUMA_NODE_ID */
+
+/* Returns the number of the current Node. */
+#ifndef numa_node_id
+static inline int numa_node_id(void)
+{
+	return cpu_to_node(raw_smp_processor_id());
+}
+#endif
+
+#endif	/* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */
+
 #ifndef topology_physical_package_id
 #define topology_physical_package_id(cpu)	((void)(cpu), -1)
 #endif
@@ -218,9 +264,4 @@ int arch_update_cpu_topology(void);
 #define topology_core_cpumask(cpu)		cpumask_of(cpu)
 #endif
 
-/* Returns the number of the current Node. */
-#ifndef numa_node_id
-#define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
-#endif
-
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 08b349931eb..6fe1b65ee1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,11 @@
 #include <asm/div64.h>
 #include "internal.h"
 
+#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
+DEFINE_PER_CPU(int, numa_node);
+EXPORT_PER_CPU_SYMBOL(numa_node);
+#endif
+
 /*
  * Array of node states.
  */
-- 
cgit v1.2.3-70-g09d2


From e534c7c5f8d6e9fc46f57fab067c7e48d8ceb172 Mon Sep 17 00:00:00 2001
From: Lee Schermerhorn <lee.schermerhorn@hp.com>
Date: Wed, 26 May 2010 14:44:58 -0700
Subject: numa: x86_64: use generic percpu var numa_node_id() implementation

x86 arch specific changes to use generic numa_node_id() based on generic
percpu variable infrastructure.  Back out x86's custom version of
numa_node_id()

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                |  4 ++++
 arch/x86/include/asm/topology.h | 22 +++++++++-------------
 arch/x86/kernel/cpu/common.c    |  6 +++---
 arch/x86/kernel/setup_percpu.c  |  4 ++--
 arch/x86/mm/numa_64.c           |  9 +++------
 5 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5dc54219e73..dcb0593b4a6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1706,6 +1706,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool X86_64
 	depends on NUMA
 
+config USE_PERCPU_NUMA_NODE_ID
+	def_bool X86_64
+	depends on NUMA
+
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index a2e629476b0..21899cc31e5 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -53,33 +53,29 @@
 extern int cpu_to_node_map[];
 
 /* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
+static inline int __cpu_to_node(int cpu)
 {
 	return cpu_to_node_map[cpu];
 }
-#define early_cpu_to_node(cpu)	cpu_to_node(cpu)
+#define early_cpu_to_node __cpu_to_node
+#define cpu_to_node __cpu_to_node
 
 #else /* CONFIG_X86_64 */
 
 /* Mappings between logical cpu number and node number */
 DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
 
-/* Returns the number of the current Node. */
-DECLARE_PER_CPU(int, node_number);
-#define numa_node_id()		percpu_read(node_number)
-
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
-extern int cpu_to_node(int cpu);
+/*
+ * override generic percpu implementation of cpu_to_node
+ */
+extern int __cpu_to_node(int cpu);
+#define cpu_to_node __cpu_to_node
+
 extern int early_cpu_to_node(int cpu);
 
 #else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 
-/* Returns the number of the node containing CPU 'cpu' */
-static inline int cpu_to_node(int cpu)
-{
-	return per_cpu(x86_cpu_to_node_map, cpu);
-}
-
 /* Same function but used if called before per_cpu areas are setup */
 static inline int early_cpu_to_node(int cpu)
 {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc83a002786..68e4a6f2211 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1121,9 +1121,9 @@ void __cpuinit cpu_init(void)
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
-	if (cpu != 0 && percpu_read(node_number) == 0 &&
-	    cpu_to_node(cpu) != NUMA_NO_NODE)
-		percpu_write(node_number, cpu_to_node(cpu));
+	if (cpu != 0 && percpu_read(numa_node) == 0 &&
+	    early_cpu_to_node(cpu) != NUMA_NO_NODE)
+		set_numa_node(early_cpu_to_node(cpu));
 #endif
 
 	me = current;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e7..a867940a6df 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -265,10 +265,10 @@ void __init setup_per_cpu_areas(void)
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
 	/*
-	 * make sure boot cpu node_number is right, when boot cpu is on the
+	 * make sure boot cpu numa_node is right, when boot cpu is on the
 	 * node that doesn't have mem installed
 	 */
-	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+	set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
 #endif
 
 	/* Setup node to cpumask map */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 8948f47fde0..a7bcc23ef96 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -33,9 +33,6 @@ int numa_off __initdata;
 static unsigned long __initdata nodemap_addr;
 static unsigned long __initdata nodemap_size;
 
-DEFINE_PER_CPU(int, node_number) = 0;
-EXPORT_PER_CPU_SYMBOL(node_number);
-
 /*
  * Map cpu index to node index
  */
@@ -809,7 +806,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 	per_cpu(x86_cpu_to_node_map, cpu) = node;
 
 	if (node != NUMA_NO_NODE)
-		per_cpu(node_number, cpu) = node;
+		set_cpu_numa_node(cpu, node);
 }
 
 void __cpuinit numa_clear_node(int cpu)
@@ -867,7 +864,7 @@ void __cpuinit numa_remove_cpu(int cpu)
 	numa_set_cpumask(cpu, 0);
 }
 
-int cpu_to_node(int cpu)
+int __cpu_to_node(int cpu)
 {
 	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
 		printk(KERN_WARNING
@@ -877,7 +874,7 @@ int cpu_to_node(int cpu)
 	}
 	return per_cpu(x86_cpu_to_node_map, cpu);
 }
-EXPORT_SYMBOL(cpu_to_node);
+EXPORT_SYMBOL(__cpu_to_node);
 
 /*
  * Same function as cpu_to_node() but used if called before the
-- 
cgit v1.2.3-70-g09d2


From 1ba4f22c426ba04b00fd717318d50620c621a0e1 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Thu, 27 May 2010 12:02:00 -0700
Subject: x86, cpufeature: Unbreak compile with gcc 3.x

gcc 3 is too braindamaged to be able to compile static_cpu_has() --
apparently it can't tell that a constant passed to an inline function
is still a constant -- so if we're using gcc 3, just use the dynamic
test.  This is bad for performance, but if you care about performance,
don't use an ancient, known-to-optimize-poorly compiler.

Reported-and-tested-by: Eric Dumazet <eric.dumazet@gmail.com>
LKML-Reference: <4BF2FF82.7090005@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/cpufeature.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index dca9c545f44..46814591438 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -332,6 +332,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 #endif
 }
 
+#if __GNUC__ >= 4
 #define static_cpu_has(bit)					\
 (								\
 	__builtin_constant_p(boot_cpu_has(bit)) ?		\
@@ -340,6 +341,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit)
 		__static_cpu_has(bit) :				\
 		boot_cpu_has(bit)				\
 )
+#else
+/*
+ * gcc 3.x is too stupid to do the static test; fall back to dynamic.
+ */
+#define static_cpu_has(bit) boot_cpu_has(bit)
+#endif
 
 #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
 
-- 
cgit v1.2.3-70-g09d2


From f52e62dcc1f555a0f8d8ccc3112454c3ff571395 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian@openwrt.org>
Date: Sun, 21 Mar 2010 01:06:17 +0100
Subject: x86: remove rdc321x_defs.h

This file is replaced by a cleaner version with the adding of a MFD driver for
the southbridge.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Samuel Ortiz <sameo@linux.intel.com>
---
 arch/x86/include/asm/rdc321x_defs.h | 12 ------------
 1 file changed, 12 deletions(-)
 delete mode 100644 arch/x86/include/asm/rdc321x_defs.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h
deleted file mode 100644
index c8e9c8bed3d..00000000000
--- a/arch/x86/include/asm/rdc321x_defs.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#define PFX	"rdc321x: "
-
-/* General purpose configuration and data registers */
-#define RDC3210_CFGREG_ADDR     0x0CF8
-#define RDC3210_CFGREG_DATA     0x0CFC
-
-#define RDC321X_GPIO_CTRL_REG1	0x48
-#define RDC321X_GPIO_CTRL_REG2	0x84
-#define RDC321X_GPIO_DATA_REG1	0x4c
-#define RDC321X_GPIO_DATA_REG2	0x88
-
-#define RDC321X_MAX_GPIO	58
-- 
cgit v1.2.3-70-g09d2


From e45b7fa23097332508730123ac6d59227e7bd7f8 Mon Sep 17 00:00:00 2001
From: Len Brown <len.brown@intel.com>
Date: Mon, 24 May 2010 11:34:36 -0400
Subject: sched: clarify commment for TS_POLLING

TS_POLLING set tells the scheduler an idle_task will poll
need_resched() to look for work.

TS_POLLING clear tells resched_task() and wake_up_idle_cpu()
that the remote CPU's idle_task is now sleeping in idle,
and thus requires a reschedule interrupt notice work.

Update the description of TS_POLLING to reflect how it works.
"idle task polling need_resched, skip sending interrupt"

Wordsmithing-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/thread_info.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e0d28901e96..812186919bb 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -241,8 +241,8 @@ static inline struct thread_info *current_thread_info(void)
 #define TS_USEDFPU		0x0001	/* FPU was used by this task
 					   this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* true if in idle loop
-					   and not sleeping */
+#define TS_POLLING		0x0004	/* idle task polling need_resched,
+					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 #define TS_XSAVE		0x0010	/* Use xsave/xrstor */
 
-- 
cgit v1.2.3-70-g09d2


From 35926ff5fba8245bd1c6ac04155048f6f89232b1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 30 May 2010 09:00:03 -0700
Subject: Revert "cpusets: randomize node rotor used in
 cpuset_mem_spread_node()"

This reverts commit 0ac0c0d0f837c499afd02a802f9cf52d3027fa3b, which
caused cross-architecture build problems for all the wrong reasons.
IA64 already added its own version of __node_random(), but the fact is,
there is nothing architectural about the function, and the original
commit was just badly done. Revert it, since no fix is forthcoming.

Requested-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/numa.c       | 17 -----------------
 include/linux/bitmap.h   |  1 -
 include/linux/nodemask.h |  8 --------
 kernel/fork.c            |  4 ----
 lib/bitmap.c             |  2 +-
 5 files changed, 1 insertion(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 10c27bb1e95..550df481acc 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,7 +2,6 @@
 #include <linux/topology.h>
 #include <linux/module.h>
 #include <linux/bootmem.h>
-#include <linux/random.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 # define DBG(x...) printk(KERN_DEBUG x)
@@ -66,19 +65,3 @@ const struct cpumask *cpumask_of_node(int node)
 }
 EXPORT_SYMBOL(cpumask_of_node);
 #endif
-
-/*
- * Return the bit number of a random bit set in the nodemask.
- *   (returns -1 if nodemask is empty)
- */
-int __node_random(const nodemask_t *maskp)
-{
-	int w, bit = -1;
-
-	w = nodes_weight(*maskp);
-	if (w)
-		bit = bitmap_ord_to_pos(maskp->bits,
-			get_random_int() % w, MAX_NUMNODES);
-	return bit;
-}
-EXPORT_SYMBOL(__node_random);
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 6fb2720882f..daf8c480c78 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -141,7 +141,6 @@ extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
 extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
 extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits);
-extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
 
 #define BITMAP_LAST_WORD_MASK(nbits)					\
 (									\
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8a8f1d09c13..dba35e41337 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -66,8 +66,6 @@
  * int num_online_nodes()		Number of online Nodes
  * int num_possible_nodes()		Number of all possible Nodes
  *
- * int node_random(mask)		Random node with set bit in mask
- *
  * int node_online(node)		Is some node online?
  * int node_possible(node)		Is some node possible?
  *
@@ -432,10 +430,6 @@ static inline void node_set_offline(int nid)
 	node_clear_state(nid, N_ONLINE);
 	nr_online_nodes = num_node_state(N_ONLINE);
 }
-
-#define node_random(mask) __node_random(&(mask))
-extern int __node_random(const nodemask_t *maskp);
-
 #else
 
 static inline int node_state(int node, enum node_states state)
@@ -466,8 +460,6 @@ static inline int num_node_state(enum node_states state)
 
 #define node_set_online(node)	   node_set_state((node), N_ONLINE)
 #define node_set_offline(node)	   node_clear_state((node), N_ONLINE)
-
-static inline int node_random(const nodemask_t mask) { return 0; }
 #endif
 
 #define node_online_map 	node_states[N_ONLINE]
diff --git a/kernel/fork.c b/kernel/fork.c
index bf9fef6d1bf..b6cce14ba04 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1086,10 +1086,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  	}
 	mpol_fix_fork_child_flag(p);
 #endif
-#ifdef CONFIG_CPUSETS
-	p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
-	p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
-#endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
diff --git a/lib/bitmap.c b/lib/bitmap.c
index d7137e7e06e..ffb78c916cc 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -672,7 +672,7 @@ static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits)
  *
  * The bit positions 0 through @bits are valid positions in @buf.
  */
-int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
+static int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits)
 {
 	int pos = 0;
 
-- 
cgit v1.2.3-70-g09d2


From 90151c35b19633e0cab5a6c80f1ba4a51e7c913b Mon Sep 17 00:00:00 2001
From: Stephane Eranian <eranian@google.com>
Date: Tue, 25 May 2010 16:23:10 +0200
Subject: perf_events: Fix event scheduling issues introduced by transactional
 API

The transactional API patch between the generic and model-specific
code introduced several important bugs with event scheduling, at
least on X86. If you had pinned events, e.g., watchdog,  and were
over-committing the PMU, you would get bogus counts. The bug was
showing up on Intel CPU because events would move around more
often that on AMD. But the problem also existed on AMD, though
harder to expose.

The issues were:

 - group_sched_in() was missing a cancel_txn() in the error path

 - cpuc->n_added was not properly maintained, leading to missing
   actions in hw_perf_enable(), i.e., n_running being 0. You cannot
   update n_added until you know the transaction has succeeded. In
   case of failed transaction n_added was not adjusted back.

 - in case of failed transactions, event_sched_out() was called
   and eventually invoked x86_disable_event() to touch the HW reg.
   But with transactions, on X86, event_sched_in() does not touch
   HW registers, it simply collects events into a list. Thus, you
   could end up calling x86_disable_event() on a counter which
   did not correspond to the current event when idx != -1.

The patch modifies the generic and X86 code to avoid all those problems.

First, we keep track of the number of events added last. In case the
transaction fails, we substract them from n_added. This approach is
necessary (as opposed to delaying updates to n_added) because not all
event updates use the transaction API, e.g., single events.

Second, we encapsulate the event_sched_in() and event_sched_out() in
group_sched_in() inside the transaction. That makes the operations
symmetrical and you can also detect that you are inside a transaction
and skip the HW reg access by checking cpuc->group_flag.

With this patch, you can now overcommit the PMU even with pinned
system-wide events present and still get valid counts.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1274796225.5882.1389.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_event.c | 22 ++++++++++++++++++++++
 kernel/perf_event.c              | 11 +++++++----
 2 files changed, 29 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index c77586061bc..5db5b7d65a1 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -106,6 +106,7 @@ struct cpu_hw_events {
 
 	int			n_events;
 	int			n_added;
+	int			n_txn;
 	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
 	u64			tags[X86_PMC_IDX_MAX];
 	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */
@@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event)
 out:
 	cpuc->n_events = n;
 	cpuc->n_added += n - n0;
+	cpuc->n_txn += n - n0;
 
 	return 0;
 }
@@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 	int i;
 
+	/*
+	 * If we're called during a txn, we don't need to do anything.
+	 * The events never got scheduled and ->cancel_txn will truncate
+	 * the event_list.
+	 */
+	if (cpuc->group_flag & PERF_EVENT_TXN_STARTED)
+		return;
+
 	x86_pmu_stop(event);
 
 	for (i = 0; i < cpuc->n_events; i++) {
@@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag |= PERF_EVENT_TXN_STARTED;
+	cpuc->n_txn = 0;
 }
 
 /*
@@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
 	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
 
 	cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED;
+	/*
+	 * Truncate the collected events.
+	 */
+	cpuc->n_added -= cpuc->n_txn;
+	cpuc->n_events -= cpuc->n_txn;
 }
 
 /*
@@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
 	 */
 	memcpy(cpuc->assign, assign, n*sizeof(int));
 
+	/*
+	 * Clear out the txn count so that ->cancel_txn() which gets
+	 * run after ->commit_txn() doesn't undo things.
+	 */
+	cpuc->n_txn = 0;
+
 	return 0;
 }
 
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 10a1aee2309..42a0e9191af 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -687,8 +687,11 @@ group_sched_in(struct perf_event *group_event,
 	if (txn)
 		pmu->start_txn(pmu);
 
-	if (event_sched_in(group_event, cpuctx, ctx))
+	if (event_sched_in(group_event, cpuctx, ctx)) {
+		if (txn)
+			pmu->cancel_txn(pmu);
 		return -EAGAIN;
+	}
 
 	/*
 	 * Schedule in siblings as one group (if any):
@@ -710,9 +713,6 @@ group_sched_in(struct perf_event *group_event,
 	}
 
 group_error:
-	if (txn)
-		pmu->cancel_txn(pmu);
-
 	/*
 	 * Groups can be scheduled in as one unit only, so undo any
 	 * partial group before returning:
@@ -724,6 +724,9 @@ group_error:
 	}
 	event_sched_out(group_event, cpuctx, ctx);
 
+	if (txn)
+		pmu->cancel_txn(pmu);
+
 	return -EAGAIN;
 }
 
-- 
cgit v1.2.3-70-g09d2


From e565813ab95875af0d51a6bcd537068380bb06ea Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Mon, 24 May 2010 22:04:51 +0900
Subject: x86/mm: Remove unused DBG() macro

DBG() macro for CONFIG_DEBUG_PER_CPU_MAPS is unused.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
LKML-Reference: <1274706291-13554-1-git-send-email-akinobu.mita@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 6 ------
 arch/x86/mm/numa.c             | 6 ------
 2 files changed, 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a867940a6df..401ff1085a4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,12 +21,6 @@
 #include <asm/cpu.h>
 #include <asm/stackprotector.h>
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
-#else
-# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
-#endif
-
 DEFINE_PER_CPU(int, cpu_number);
 EXPORT_PER_CPU_SYMBOL(cpu_number);
 
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 550df481acc..787c52ca49c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -3,12 +3,6 @@
 #include <linux/module.h>
 #include <linux/bootmem.h>
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(x...) printk(KERN_DEBUG x)
-#else
-# define DBG(x...)
-#endif
-
 /*
  * Which logical CPUs are on which nodes
  */
-- 
cgit v1.2.3-70-g09d2


From e82752d8b5a7e0a5e4d607fd8713549e2a4e2741 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 28 May 2010 14:26:48 +0200
Subject: x86/amd-iommu: Fix crash when request_mem_region fails

When request_mem_region fails the error path tries to
disable the IOMMUs. This accesses the mmio-region which was
not allocated leading to a kernel crash. This patch fixes
the issue.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 3bacb4d0844..1405346c62b 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -287,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address)
 {
 	u8 *ret;
 
-	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+	if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
+		pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
+			address);
+		pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
 		return NULL;
+	}
 
 	ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
 	if (ret != NULL)
@@ -1314,7 +1318,7 @@ static int __init amd_iommu_init(void)
 		ret = amd_iommu_init_dma_ops();
 
 	if (ret)
-		goto free;
+		goto free_disable;
 
 	amd_iommu_init_api();
 
@@ -1332,9 +1336,10 @@ static int __init amd_iommu_init(void)
 out:
 	return ret;
 
-free:
+free_disable:
 	disable_iommus();
 
+free:
 	amd_iommu_uninit_devices();
 
 	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
-- 
cgit v1.2.3-70-g09d2


From d7f0776975334070a93370ae048fda0c31a91c38 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 31 May 2010 15:05:20 +0200
Subject: x86/amd-iommu: Fall back to GART if initialization fails

This patch implements a fallback to the GART IOMMU if this
is possible and the AMD IOMMU initialization failed.
Otherwise the fallback would be nommu which is very
problematic on machines with more than 4GB of memory or
swiotlb which hurts io-performance.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c      | 4 ----
 arch/x86/kernel/amd_iommu_init.c | 9 +++++++++
 2 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8a9aaa8412c..0d20286d78c 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2330,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void)
 
 	iommu_detected = 1;
 	swiotlb = 0;
-#ifdef CONFIG_GART_IOMMU
-	gart_iommu_aperture_disabled = 1;
-	gart_iommu_aperture = 0;
-#endif
 
 	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 1405346c62b..3cc63e2b8dd 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1358,6 +1358,15 @@ free:
 
 	free_unity_maps();
 
+#ifdef CONFIG_GART_IOMMU
+	/*
+	 * We failed to initialize the AMD IOMMU - try fallback to GART
+	 * if possible.
+	 */
+	gart_iommu_init();
+
+#endif
+
 	goto out;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 4adc8b71cc142f9a7b44b13b99aab38ba897c56f Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Tue, 1 Jun 2010 21:04:55 +0200
Subject: x86, smpboot: Fix cores per node printing on boot

Percpu initialization happens now after booting the cores on the
machine and this causes them all to be displayed as belonging to
node 0:

Jun  8 05:57:21 kepek kernel: [    0.106999] Booting Node   0,
Processors  #1 #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 #14 #15 #16 #17 #18 #19 #20 #21 #22 #23 Ok.

Use early_cpu_to_node() to get the correct node of each core
instead.

Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Cc: Mike Travis <travis@sgi.com>
LKML-Reference: <20100601190455.GA14237@aftab>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 37462f1ddba..c4f33b2e77d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 static void __cpuinit announce_cpu(int cpu, int apicid)
 {
 	static int current_node = -1;
-	int node = cpu_to_node(cpu);
+	int node = early_cpu_to_node(cpu);
 
 	if (system_state == SYSTEM_BOOTING) {
 		if (node != current_node) {
-- 
cgit v1.2.3-70-g09d2


From cd52e17ea8278f8449b6174a8e5ed439a2e44ffb Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Wed, 19 May 2010 16:19:25 +0100
Subject: xen: ensure timer tick is resumed even on CPU driving the resume

The core suspend/resume code is run from stop_machine on CPU0 but
parts of the suspend/resume machinery (including xen_arch_resume) are
run on whichever CPU happened to schedule the xenwatch kernel thread.

As part of the non-core resume code xen_arch_resume is called in order
to restart the timer tick on non-boot processors. The boot processor
itself is taken care of by core timekeeping code.

xen_arch_resume uses smp_call_function which does not call the given
function on the current processor. This means that we can end up with
one CPU not receiving timer ticks if the xenwatch thread happened to
be scheduled on CPU > 0.

Use on_each_cpu instead of smp_call_function to ensure the timer tick
is resumed everywhere.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Acked-by: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Stable Kernel <stable@kernel.org> # .32.x
---
 arch/x86/xen/suspend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 987267f79bf..a9c66110803 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -60,6 +60,6 @@ static void xen_vcpu_notify_restore(void *data)
 
 void xen_arch_resume(void)
 {
-	smp_call_function(xen_vcpu_notify_restore,
-			       (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
+	on_each_cpu(xen_vcpu_notify_restore,
+		    (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
 }
-- 
cgit v1.2.3-70-g09d2


From 85a0e7539781dad4bfcffd98e72fa9f130f4e40d Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@rainbow-software.org>
Date: Tue, 8 Jun 2010 00:32:49 +0200
Subject: PM / x86: Save/restore MISC_ENABLE register

Save/restore MISC_ENABLE register on suspend/resume.
This fixes OOPS (invalid opcode) on resume from STR on Asus P4P800-VM,
which wakes up with MWAIT disabled.

Fixes https://bugzilla.kernel.org/show_bug.cgi?id=15385

Signed-off-by: Ondrej Zary <linux@rainbow-software.org>
Tested-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/include/asm/suspend_32.h | 2 ++
 arch/x86/include/asm/suspend_64.h | 2 ++
 arch/x86/power/cpu.c              | 4 ++++
 3 files changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index 48dcfa62ea0..fd921c3a684 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -15,6 +15,8 @@ static inline int arch_prepare_suspend(void) { return 0; }
 struct saved_context {
 	u16 es, fs, gs, ss;
 	unsigned long cr0, cr2, cr3, cr4;
+	u64 misc_enable;
+	bool misc_enable_saved;
 	struct desc_ptr gdt;
 	struct desc_ptr idt;
 	u16 ldt;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 06284f42b75..8d942afae68 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -27,6 +27,8 @@ struct saved_context {
 	u16 ds, es, fs, gs, ss;
 	unsigned long gs_base, gs_kernel_base, fs_base;
 	unsigned long cr0, cr2, cr3, cr4, cr8;
+	u64 misc_enable;
+	bool misc_enable_saved;
 	unsigned long efer;
 	u16 gdt_pad;
 	u16 gdt_limit;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 0a979f3e5b8..1290ba54b35 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -105,6 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt)
 	ctxt->cr4 = read_cr4();
 	ctxt->cr8 = read_cr8();
 #endif
+	ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE,
+					       &ctxt->misc_enable);
 }
 
 /* Needed by apm.c */
@@ -152,6 +154,8 @@ static void fix_processor_context(void)
  */
 static void __restore_processor_state(struct saved_context *ctxt)
 {
+	if (ctxt->misc_enable_saved)
+		wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable);
 	/*
 	 * control registers
 	 */
-- 
cgit v1.2.3-70-g09d2


From fe5913e4e1700cbfc337f4b1da9ddb26f6a55586 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 17 May 2010 14:43:34 +0200
Subject: KVM: SVM: Handle MCEs early in the vmexit process

This patch moves handling of the MC vmexits to an earlier
point in the vmexit. The handle_exit function is too late
because the vcpu might alreadry have changed its physical
cpu.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 96dc232bfc5..5e1ed033cd7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1410,7 +1410,7 @@ static int nm_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
-static int mc_interception(struct vcpu_svm *svm)
+static void svm_handle_mce(struct vcpu_svm *svm)
 {
 	/*
 	 * On an #MC intercept the MCE handler is not called automatically in
@@ -1420,6 +1420,11 @@ static int mc_interception(struct vcpu_svm *svm)
 		"int $0x12\n");
 	/* not sure if we ever come back to this point */
 
+	return;
+}
+
+static int mc_interception(struct vcpu_svm *svm)
+{
 	return 1;
 }
 
@@ -3088,6 +3093,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 		vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
 		vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
 	}
+
+	/*
+	 * We need to handle MC intercepts here before the vcpu has a chance to
+	 * change the physical cpu
+	 */
+	if (unlikely(svm->vmcb->control.exit_code ==
+		     SVM_EXIT_EXCP_BASE + MC_VECTOR))
+		svm_handle_mce(svm);
 }
 
 #undef R
-- 
cgit v1.2.3-70-g09d2


From 67ec66077799f2fef84b21a643912b179c422281 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 17 May 2010 14:43:35 +0200
Subject: KVM: SVM: Implement workaround for Erratum 383

This patch implements a workaround for AMD erratum 383 into
KVM. Without this erratum fix it is possible for a guest to
kill the host machine. This patch implements the suggested
workaround for hypervisors which will be published by the
next revision guide update.

[jan: fix overflow warning on i386]
[xiao: fix unused variable warning]

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/msr-index.h |  1 +
 arch/x86/kvm/svm.c               | 81 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b49d8ca228f..8c7ae431862 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -110,6 +110,7 @@
 #define MSR_AMD64_PATCH_LOADER		0xc0010020
 #define MSR_AMD64_OSVW_ID_LENGTH	0xc0010140
 #define MSR_AMD64_OSVW_STATUS		0xc0010141
+#define MSR_AMD64_DC_CFG		0xc0011022
 #define MSR_AMD64_IBSFETCHCTL		0xc0011030
 #define MSR_AMD64_IBSFETCHLINAD		0xc0011031
 #define MSR_AMD64_IBSFETCHPHYSAD	0xc0011032
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5e1ed033cd7..ce438e0fdd2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 
+#include <asm/tlbflush.h>
 #include <asm/desc.h>
 
 #include <asm/virtext.h>
@@ -56,6 +57,8 @@ MODULE_LICENSE("GPL");
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
+static bool erratum_383_found __read_mostly;
+
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
 	MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -374,6 +377,31 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	svm->vmcb->control.event_inj_err = error_code;
 }
 
+static void svm_init_erratum_383(void)
+{
+	u32 low, high;
+	int err;
+	u64 val;
+
+	/* Only Fam10h is affected */
+	if (boot_cpu_data.x86 != 0x10)
+		return;
+
+	/* Use _safe variants to not break nested virtualization */
+	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
+	if (err)
+		return;
+
+	val |= (1ULL << 47);
+
+	low  = lower_32_bits(val);
+	high = upper_32_bits(val);
+
+	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
+
+	erratum_383_found = true;
+}
+
 static int has_svm(void)
 {
 	const char *msg;
@@ -429,6 +457,8 @@ static int svm_hardware_enable(void *garbage)
 
 	wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
 
+	svm_init_erratum_383();
+
 	return 0;
 }
 
@@ -1410,8 +1440,59 @@ static int nm_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static bool is_erratum_383(void)
+{
+	int err, i;
+	u64 value;
+
+	if (!erratum_383_found)
+		return false;
+
+	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
+	if (err)
+		return false;
+
+	/* Bit 62 may or may not be set for this mce */
+	value &= ~(1ULL << 62);
+
+	if (value != 0xb600000000010015ULL)
+		return false;
+
+	/* Clear MCi_STATUS registers */
+	for (i = 0; i < 6; ++i)
+		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
+
+	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
+	if (!err) {
+		u32 low, high;
+
+		value &= ~(1ULL << 2);
+		low    = lower_32_bits(value);
+		high   = upper_32_bits(value);
+
+		native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
+	}
+
+	/* Flush tlb to evict multi-match entries */
+	__flush_tlb_all();
+
+	return true;
+}
+
 static void svm_handle_mce(struct vcpu_svm *svm)
 {
+	if (is_erratum_383()) {
+		/*
+		 * Erratum 383 triggered. Guest state is corrupt so kill the
+		 * guest.
+		 */
+		pr_err("KVM: Guest triggered AMD Erratum 383\n");
+
+		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+
+		return;
+	}
+
 	/*
 	 * On an #MC intercept the MCE handler is not called automatically in
 	 * the host. So do it by hand here.
-- 
cgit v1.2.3-70-g09d2


From 3be2264be3c00865116f997dc53ebcc90fe7fc4b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Fri, 28 May 2010 09:44:59 -0300
Subject: KVM: MMU: invalidate and flush on spte small->large page size change

Always invalidate spte and flush TLBs when changing page size, to make
sure different sized translations for the same address are never cached
in a CPU's TLB.

Currently the only case where this occurs is when a non-leaf spte pointer is
overwritten by a leaf, large spte entry. This can happen after dirty
logging is disabled on a memslot, for example.

Noticed by Andrea.

KVM-Stable-Tag
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 81563e76e28..6fbcb48d5a9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1870,6 +1870,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, sptep);
+			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
-- 
cgit v1.2.3-70-g09d2


From 69325a122580d3a7b26589e8efdd6663001c3297 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 May 2010 14:35:58 +0300
Subject: KVM: MMU: Remove user access when allowing kernel access to gpte.w=0
 page

If cr0.wp=0, we have to allow the guest kernel access to a page with pte.w=0.
We do that by setting spte.w=1, since the host cr0.wp must remain set so the
host can write protect pages.  Once we allow write access, we must remove
user access otherwise we mistakenly allow the user to write the page.

Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6fbcb48d5a9..a6f695d7692 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1815,6 +1815,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 		spte |= PT_WRITABLE_MASK;
 
+		if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
+			spte &= ~PT_USER_MASK;
+
 		/*
 		 * Optimization: for pte sync, if spte was writable the hash
 		 * lookup is unnecessary (and expensive). Write protection
-- 
cgit v1.2.3-70-g09d2


From 837c4ef13c44296bb763a0ca0e84a076592474cf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai.lu@oracle.com>
Date: Thu, 3 Jun 2010 13:43:03 -0700
Subject: PCI: clear bridge resource range if BIOS assigned bad one

Yannick found that video does not work with 2.6.34.  The cause of this
bug was that the BIOS had assigned the wrong range to the PCI bridge
above the video device.  Before 2.6.34 the kernel would have shrunk
the size of the bridge window, but since
  d65245c PCI: don't shrink bridge resources
the kernel will avoid shrinking BIOS ranges.

So zero out the old range if we fail to claim it at boot time; this will
cause us to allocate a new range at startup, restoring the 2.6.34
behavior.

Fixes regression https://bugzilla.kernel.org/show_bug.cgi?id=16009.

Reported-by: Yannick <yannick.roehlly@free.fr>
Acked-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/microblaze/pci/pci-common.c        | 1 +
 arch/mn10300/unit-asb2305/pci-asb2305.c | 1 +
 arch/powerpc/kernel/pci-common.c        | 1 +
 arch/x86/pci/i386.c                     | 2 ++
 4 files changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 9cb782b8e03..23be25fec4d 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -1277,6 +1277,7 @@ void pcibios_allocate_bus_resources(struct pci_bus *bus)
 		printk(KERN_WARNING "PCI: Cannot allocate resource region "
 		       "%d of PCI bridge %d, will remap\n", i, bus->number);
 clear_resource:
+		res->start = res->end = 0;
 		res->flags = 0;
 	}
 
diff --git a/arch/mn10300/unit-asb2305/pci-asb2305.c b/arch/mn10300/unit-asb2305/pci-asb2305.c
index d6119b879a9..45b40ac6c46 100644
--- a/arch/mn10300/unit-asb2305/pci-asb2305.c
+++ b/arch/mn10300/unit-asb2305/pci-asb2305.c
@@ -117,6 +117,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
 					 * Invalidate the resource to prevent
 					 * child resource allocations in this
 					 * range. */
+					r->start = r->end = 0;
 					r->flags = 0;
 				}
 			}
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 6646005dffb..5b38f6ae2b2 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1309,6 +1309,7 @@ void pcibios_allocate_bus_resources(struct pci_bus *bus)
 		printk(KERN_WARNING "PCI: Cannot allocate resource region "
 		       "%d of PCI bridge %d, will remap\n", i, bus->number);
 clear_resource:
+		res->start = res->end = 0;
 		res->flags = 0;
 	}
 
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 97da2ba9344..6fdb3ec30c3 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -96,6 +96,7 @@ EXPORT_SYMBOL(pcibios_align_resource);
  *	  the fact the PCI specs explicitly allow address decoders to be
  *	  shared between expansion ROMs and other resource regions, it's
  *	  at least dangerous)
+ *	- bad resource sizes or overlaps with other regions
  *
  *  Our solution:
  *	(1) Allocate resources for all buses behind PCI-to-PCI bridges.
@@ -136,6 +137,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
 					 * child resource allocations in this
 					 * range.
 					 */
+					r->start = r->end = 0;
 					r->flags = 0;
 				}
 			}
-- 
cgit v1.2.3-70-g09d2