diff options
Diffstat (limited to 'arch/x86')
91 files changed, 4139 insertions, 2271 deletions
diff --git a/arch/x86/.gitignore b/arch/x86/.gitignore new file mode 100644 index 00000000000..028079065af --- /dev/null +++ b/arch/x86/.gitignore @@ -0,0 +1,3 @@ +boot/compressed/vmlinux +tools/test_get_len + diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a2d3a5fbeed..dcb0593b4a6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -109,6 +109,9 @@ config SBUS config NEED_DMA_MAP_STATE def_bool (X86_64 || DMAR || DMA_API_DEBUG) +config NEED_SG_DMA_LENGTH + def_bool y + config GENERIC_ISA_DMA def_bool y @@ -1703,6 +1706,10 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA +config USE_PERCPU_NUMA_NODE_ID + def_bool X86_64 + depends on NUMA + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER @@ -1923,6 +1930,14 @@ config PCI_MMCONFIG bool "Support mmconfig PCI config space access" depends on X86_64 && PCI && ACPI +config PCI_CNB20LE_QUIRK + bool "Read CNB20LE Host Bridge Windows" + depends on PCI + help + Read the PCI windows out of the CNB20LE host bridge. This allows + PCI hotplug to work on systems with the CNB20LE chipset which do + not have ACPI. + config DMAR bool "Support for DMA Remapping Devices (EXPERIMENTAL)" depends on PCI_MSI && ACPI && EXPERIMENTAL diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c index bcbd36c4143..5c228129d17 100644 --- a/arch/x86/boot/compressed/mkpiggy.c +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ - printf(".section \".rodata.compressed\",\"a\",@progbits\n"); + printf(".section \".rodata..compressed\",\"a\",@progbits\n"); printf(".globl z_input_len\n"); printf("z_input_len = %lu\n", ilen); printf(".globl z_output_len\n"); diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index a6f1a59a5b0..5ddabceee12 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -26,8 +26,8 @@ SECTIONS HEAD_TEXT _ehead = . ; } - .rodata.compressed : { - *(.rodata.compressed) + .rodata..compressed : { + *(.rodata..compressed) } .text : { _text = .; /* Text */ diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 20bb0e1ac68..ff16756a51c 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,9 @@ #define IN IN1 #define KEY %xmm2 #define IV %xmm3 +#define BSWAP_MASK %xmm10 +#define CTR %xmm11 +#define INC %xmm12 #define KEYP %rdi #define OUTP %rsi @@ -42,6 +45,7 @@ #define T1 %r10 #define TKEYP T1 #define T2 %r11 +#define TCTR_LOW T2 _key_expansion_128: _key_expansion_256a: @@ -724,3 +728,114 @@ ENTRY(aesni_cbc_dec) movups IV, (IVP) .Lcbc_dec_just_ret: ret + +.align 16 +.Lbswap_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * _aesni_inc_init: internal ABI + * setup registers used by _aesni_inc + * input: + * IV + * output: + * CTR: == IV, in little endian + * TCTR_LOW: == lower qword of CTR + * INC: == 1, in little endian + * BSWAP_MASK == endian swapping mask + */ +_aesni_inc_init: + movaps .Lbswap_mask, BSWAP_MASK + movaps IV, CTR + PSHUFB_XMM BSWAP_MASK CTR + mov $1, TCTR_LOW + MOVQ_R64_XMM TCTR_LOW INC + MOVQ_R64_XMM CTR TCTR_LOW + ret + +/* + * _aesni_inc: internal ABI + * Increase IV by 1, IV is in big endian + * input: + * IV + * CTR: == IV, in little endian + * TCTR_LOW: == lower qword of CTR + * INC: == 1, in little endian + * BSWAP_MASK == endian swapping mask + * output: + * IV: Increase by 1 + * changed: + * CTR: == output IV, in little endian + * TCTR_LOW: == lower qword of CTR + */ +_aesni_inc: + paddq INC, CTR + add $1, TCTR_LOW + jnc .Linc_low + pslldq $8, INC + paddq INC, CTR + psrldq $8, INC +.Linc_low: + movaps CTR, IV + PSHUFB_XMM BSWAP_MASK IV + ret + +/* + * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +ENTRY(aesni_ctr_enc) + cmp $16, LEN + jb .Lctr_enc_just_ret + mov 480(KEYP), KLEN + movups (IVP), IV + call _aesni_inc_init + cmp $64, LEN + jb .Lctr_enc_loop1 +.align 4 +.Lctr_enc_loop4: + movaps IV, STATE1 + call _aesni_inc + movups (INP), IN1 + movaps IV, STATE2 + call _aesni_inc + movups 0x10(INP), IN2 + movaps IV, STATE3 + call _aesni_inc + movups 0x20(INP), IN3 + movaps IV, STATE4 + call _aesni_inc + movups 0x30(INP), IN4 + call _aesni_enc4 + pxor IN1, STATE1 + movups STATE1, (OUTP) + pxor IN2, STATE2 + movups STATE2, 0x10(OUTP) + pxor IN3, STATE3 + movups STATE3, 0x20(OUTP) + pxor IN4, STATE4 + movups STATE4, 0x30(OUTP) + sub $64, LEN + add $64, INP + add $64, OUTP + cmp $64, LEN + jge .Lctr_enc_loop4 + cmp $16, LEN + jb .Lctr_enc_ret +.align 4 +.Lctr_enc_loop1: + movaps IV, STATE + call _aesni_inc + movups (INP), IN + call _aesni_enc1 + pxor IN, STATE + movups STATE, (OUTP) + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lctr_enc_loop1 +.Lctr_enc_ret: + movups IV, (IVP) +.Lctr_enc_just_ret: + ret diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 49c552c060e..2cb3dcc4490 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -18,6 +18,7 @@ #include <crypto/algapi.h> #include <crypto/aes.h> #include <crypto/cryptd.h> +#include <crypto/ctr.h> #include <asm/i387.h> #include <asm/aes.h> @@ -58,6 +59,8 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { @@ -321,6 +324,72 @@ static struct crypto_alg blk_cbc_alg = { }, }; +static void ctr_crypt_final(struct crypto_aes_ctx *ctx, + struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[AES_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + aesni_enc(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); +} + +static int ctr_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + if (walk.nbytes) { + ctr_crypt_final(ctx, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_fpu_end(); + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "__ctr-aes-aesni", + .cra_driver_name = "__driver-ctr-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aes_set_key, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, unsigned int key_len) { @@ -467,13 +536,11 @@ static struct crypto_alg ablk_cbc_alg = { }, }; -#ifdef HAS_CTR static int ablk_ctr_init(struct crypto_tfm *tfm) { struct cryptd_ablkcipher *cryptd_tfm; - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", - 0, 0); + cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ctr-aes-aesni", 0, 0); if (IS_ERR(cryptd_tfm)) return PTR_ERR(cryptd_tfm); ablk_init_common(tfm, cryptd_tfm); @@ -500,11 +567,50 @@ static struct crypto_alg ablk_ctr_alg = { .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, + .decrypt = ablk_encrypt, .geniv = "chainiv", }, }, }; + +#ifdef HAS_CTR +static int ablk_rfc3686_ctr_init(struct crypto_tfm *tfm) +{ + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher( + "rfc3686(__driver-ctr-aes-aesni)", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ablk_init_common(tfm, cryptd_tfm); + return 0; +} + +static struct crypto_alg ablk_rfc3686_ctr_alg = { + .cra_name = "rfc3686(ctr(aes))", + .cra_driver_name = "rfc3686-ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ablk_rfc3686_ctr_alg.cra_list), + .cra_init = ablk_rfc3686_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE+CTR_RFC3686_NONCE_SIZE, + .max_keysize = AES_MAX_KEY_SIZE+CTR_RFC3686_NONCE_SIZE, + .ivsize = CTR_RFC3686_IV_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + .geniv = "seqiv", + }, + }, +}; #endif #ifdef HAS_LRW @@ -640,13 +746,17 @@ static int __init aesni_init(void) goto blk_ecb_err; if ((err = crypto_register_alg(&blk_cbc_alg))) goto blk_cbc_err; + if ((err = crypto_register_alg(&blk_ctr_alg))) + goto blk_ctr_err; if ((err = crypto_register_alg(&ablk_ecb_alg))) goto ablk_ecb_err; if ((err = crypto_register_alg(&ablk_cbc_alg))) goto ablk_cbc_err; -#ifdef HAS_CTR if ((err = crypto_register_alg(&ablk_ctr_alg))) goto ablk_ctr_err; +#ifdef HAS_CTR + if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) + goto ablk_rfc3686_ctr_err; #endif #ifdef HAS_LRW if ((err = crypto_register_alg(&ablk_lrw_alg))) @@ -675,13 +785,17 @@ ablk_pcbc_err: ablk_lrw_err: #endif #ifdef HAS_CTR + crypto_unregister_alg(&ablk_rfc3686_ctr_alg); +ablk_rfc3686_ctr_err: +#endif crypto_unregister_alg(&ablk_ctr_alg); ablk_ctr_err: -#endif crypto_unregister_alg(&ablk_cbc_alg); ablk_cbc_err: crypto_unregister_alg(&ablk_ecb_alg); ablk_ecb_err: + crypto_unregister_alg(&blk_ctr_alg); +blk_ctr_err: crypto_unregister_alg(&blk_cbc_alg); blk_cbc_err: crypto_unregister_alg(&blk_ecb_alg); @@ -705,10 +819,12 @@ static void __exit aesni_exit(void) crypto_unregister_alg(&ablk_lrw_alg); #endif #ifdef HAS_CTR - crypto_unregister_alg(&ablk_ctr_alg); + crypto_unregister_alg(&ablk_rfc3686_ctr_alg); #endif + crypto_unregister_alg(&ablk_ctr_alg); crypto_unregister_alg(&ablk_cbc_alg); crypto_unregister_alg(&ablk_ecb_alg); + crypto_unregister_alg(&blk_ctr_alg); crypto_unregister_alg(&blk_cbc_alg); crypto_unregister_alg(&blk_ecb_alg); crypto_unregister_alg(&__aesni_alg); diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 56f462cf22d..aa2c39d968f 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -85,7 +85,6 @@ extern int acpi_ioapic; extern int acpi_noirq; extern int acpi_strict; extern int acpi_disabled; -extern int acpi_ht; extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; @@ -97,7 +96,6 @@ void acpi_pic_sci_set_trigger(unsigned int, u16); static inline void disable_acpi(void) { acpi_disabled = 1; - acpi_ht = 0; acpi_pci_disabled = 1; acpi_noirq = 1; } diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 2f9047cfaac..48f99f15452 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -7,7 +7,7 @@ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) -#define __read_mostly __attribute__((__section__(".data.read_mostly"))) +#define __read_mostly __attribute__((__section__(".data..read_mostly"))) #define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT #define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index dca9c545f44..46814591438 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -332,6 +332,7 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) #endif } +#if __GNUC__ >= 4 #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ @@ -340,6 +341,12 @@ static __always_inline __pure bool __static_cpu_has(u8 bit) __static_cpu_has(bit) : \ boot_cpu_has(bit) \ ) +#else +/* + * gcc 3.x is too stupid to do the static test; fall back to dynamic. + */ +#define static_cpu_has(bit) boot_cpu_has(bit) +#endif #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h index 14cf526091f..280bf7fb6ab 100644 --- a/arch/x86/include/asm/inst.h +++ b/arch/x86/include/asm/inst.h @@ -7,7 +7,66 @@ #ifdef __ASSEMBLY__ +#define REG_NUM_INVALID 100 + +#define REG_TYPE_R64 0 +#define REG_TYPE_XMM 1 +#define REG_TYPE_INVALID 100 + + .macro R64_NUM opd r64 + \opd = REG_NUM_INVALID + .ifc \r64,%rax + \opd = 0 + .endif + .ifc \r64,%rcx + \opd = 1 + .endif + .ifc \r64,%rdx + \opd = 2 + .endif + .ifc \r64,%rbx + \opd = 3 + .endif + .ifc \r64,%rsp + \opd = 4 + .endif + .ifc \r64,%rbp + \opd = 5 + .endif + .ifc \r64,%rsi + \opd = 6 + .endif + .ifc \r64,%rdi + \opd = 7 + .endif + .ifc \r64,%r8 + \opd = 8 + .endif + .ifc \r64,%r9 + \opd = 9 + .endif + .ifc \r64,%r10 + \opd = 10 + .endif + .ifc \r64,%r11 + \opd = 11 + .endif + .ifc \r64,%r12 + \opd = 12 + .endif + .ifc \r64,%r13 + \opd = 13 + .endif + .ifc \r64,%r14 + \opd = 14 + .endif + .ifc \r64,%r15 + \opd = 15 + .endif + .endm + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID .ifc \xmm,%xmm0 \opd = 0 .endif @@ -58,13 +117,25 @@ .endif .endm + .macro REG_TYPE type reg + R64_NUM reg_type_r64 \reg + XMM_NUM reg_type_xmm \reg + .if reg_type_r64 <> REG_NUM_INVALID + \type = REG_TYPE_R64 + .elseif reg_type_xmm <> REG_NUM_INVALID + \type = REG_TYPE_XMM + .else + \type = REG_TYPE_INVALID + .endif + .endm + .macro PFX_OPD_SIZE .byte 0x66 .endm - .macro PFX_REX opd1 opd2 - .if (\opd1 | \opd2) & 8 - .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) .endif .endm @@ -145,6 +216,25 @@ .byte 0x0f, 0x38, 0xdf MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 .endm + + .macro MOVQ_R64_XMM opd1 opd2 + REG_TYPE movq_r64_xmm_opd1_type \opd1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + XMM_NUM movq_r64_xmm_opd1 \opd1 + R64_NUM movq_r64_xmm_opd2 \opd2 + .else + R64_NUM movq_r64_xmm_opd1 \opd1 + XMM_NUM movq_r64_xmm_opd2 \opd2 + .endif + PFX_OPD_SIZE + PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + .byte 0x0f, 0x7e + .else + .byte 0x0f, 0x6e + .endif + MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 + .endm #endif #endif diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h new file mode 100644 index 00000000000..4470c9ad4a3 --- /dev/null +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -0,0 +1,55 @@ +#ifndef _ASM_X86_INTEL_SCU_IPC_H_ +#define _ASM_X86_INTEL_SCU_IPC_H_ + +/* Read single register */ +int intel_scu_ipc_ioread8(u16 addr, u8 *data); + +/* Read two sequential registers */ +int intel_scu_ipc_ioread16(u16 addr, u16 *data); + +/* Read four sequential registers */ +int intel_scu_ipc_ioread32(u16 addr, u32 *data); + +/* Read a vector */ +int intel_scu_ipc_readv(u16 *addr, u8 *data, int len); + +/* Write single register */ +int intel_scu_ipc_iowrite8(u16 addr, u8 data); + +/* Write two sequential registers */ +int intel_scu_ipc_iowrite16(u16 addr, u16 data); + +/* Write four sequential registers */ +int intel_scu_ipc_iowrite32(u16 addr, u32 data); + +/* Write a vector */ +int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); + +/* Update single register based on the mask */ +int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); + +/* + * Indirect register read + * Can be used when SCCB(System Controller Configuration Block) register + * HRIM(Honor Restricted IPC Messages) is set (bit 23) + */ +int intel_scu_ipc_register_read(u32 addr, u32 *data); + +/* + * Indirect register write + * Can be used when SCCB(System Controller Configuration Block) register + * HRIM(Honor Restricted IPC Messages) is set (bit 23) + */ +int intel_scu_ipc_register_write(u32 addr, u32 data); + +/* Issue commands to the SCU with or without data */ +int intel_scu_ipc_simple_command(int cmd, int sub); +int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, + u32 *out, int outlen); +/* I2C control api */ +int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data); + +/* Update FW version */ +int intel_scu_ipc_fw_update(u8 *buffer, u32 length); + +#endif diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index f46b79f6c16..ff90055c7f0 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -21,6 +21,7 @@ #define __KVM_HAVE_PIT_STATE2 #define __KVM_HAVE_XEN_HVM #define __KVM_HAVE_VCPU_EVENTS +#define __KVM_HAVE_DEBUGREGS /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -257,6 +258,11 @@ struct kvm_reinject_control { /* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ #define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 #define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 +#define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 + +/* Interrupt shadow states */ +#define KVM_X86_SHADOW_INT_MOV_SS 0x01 +#define KVM_X86_SHADOW_INT_STI 0x02 /* for KVM_GET/SET_VCPU_EVENTS */ struct kvm_vcpu_events { @@ -271,7 +277,7 @@ struct kvm_vcpu_events { __u8 injected; __u8 nr; __u8 soft; - __u8 pad; + __u8 shadow; } interrupt; struct { __u8 injected; @@ -284,4 +290,13 @@ struct kvm_vcpu_events { __u32 reserved[10]; }; +/* for KVM_GET/SET_DEBUGREGS */ +struct kvm_debugregs { + __u64 db[4]; + __u64 dr6; + __u64 dr7; + __u64 flags; + __u64 reserved[9]; +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7a6f54fa13b..0b2729bf207 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -11,6 +11,8 @@ #ifndef _ASM_X86_KVM_X86_EMULATE_H #define _ASM_X86_KVM_X86_EMULATE_H +#include <asm/desc_defs.h> + struct x86_emulate_ctxt; /* @@ -63,6 +65,15 @@ struct x86_emulate_ops { unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* + * write_std: Write bytes of standard (non-emulated/special) memory. + * Used for descriptor writing. + * @addr: [IN ] Linear address to which to write. + * @val: [OUT] Value write to memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_std)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + /* * fetch: Read bytes of standard (non-emulated/special) memory. * Used for instruction fetch. * @addr: [IN ] Linear address from which to read. @@ -109,6 +120,23 @@ struct x86_emulate_ops { unsigned int bytes, struct kvm_vcpu *vcpu); + int (*pio_in_emulated)(int size, unsigned short port, void *val, + unsigned int count, struct kvm_vcpu *vcpu); + + int (*pio_out_emulated)(int size, unsigned short port, const void *val, + unsigned int count, struct kvm_vcpu *vcpu); + + bool (*get_cached_descriptor)(struct desc_struct *desc, + int seg, struct kvm_vcpu *vcpu); + void (*set_cached_descriptor)(struct desc_struct *desc, + int seg, struct kvm_vcpu *vcpu); + u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); + void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); + void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); + ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); + void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); + int (*cpl)(struct kvm_vcpu *vcpu); + void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); }; /* Type, address-of, and value of an instruction's operand. */ @@ -124,6 +152,12 @@ struct fetch_cache { unsigned long end; }; +struct read_cache { + u8 data[1024]; + unsigned long pos; + unsigned long end; +}; + struct decode_cache { u8 twobyte; u8 b; @@ -139,7 +173,7 @@ struct decode_cache { u8 seg_override; unsigned int d; unsigned long regs[NR_VCPU_REGS]; - unsigned long eip, eip_orig; + unsigned long eip; /* modrm */ u8 modrm; u8 modrm_mod; @@ -151,16 +185,15 @@ struct decode_cache { void *modrm_ptr; unsigned long modrm_val; struct fetch_cache fetch; + struct read_cache io_read; }; -#define X86_SHADOW_INT_MOV_SS 1 -#define X86_SHADOW_INT_STI 2 - struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; unsigned long eflags; + unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; u32 cs_base; @@ -168,6 +201,7 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + bool restart; /* restart string instruction after writeback */ /* decode cache */ struct decode_cache decode; }; @@ -194,5 +228,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops); +int emulator_task_switch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 06d9e79ca37..76f5483cffe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -171,15 +171,15 @@ struct kvm_pte_chain { union kvm_mmu_page_role { unsigned word; struct { - unsigned glevels:4; unsigned level:4; + unsigned cr4_pae:1; unsigned quadrant:2; unsigned pad_for_nice_hex_output:6; unsigned direct:1; unsigned access:3; unsigned invalid:1; - unsigned cr4_pge:1; unsigned nxe:1; + unsigned cr0_wp:1; }; }; @@ -187,8 +187,6 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; - struct list_head oos_link; - /* * The following two entries are used to key the shadow page in the * hash table. @@ -204,9 +202,9 @@ struct kvm_mmu_page { * in this shadow page. */ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); - int multimapped; /* More than one parent_pte? */ - int root_count; /* Currently serving as active root */ + bool multimapped; /* More than one parent_pte? */ bool unsync; + int root_count; /* Currently serving as active root */ unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ @@ -224,14 +222,9 @@ struct kvm_pv_mmu_op_buffer { struct kvm_pio_request { unsigned long count; - int cur_count; - gva_t guest_gva; int in; int port; int size; - int string; - int down; - int rep; }; /* @@ -320,6 +313,7 @@ struct kvm_vcpu_arch { struct kvm_queued_exception { bool pending; bool has_error_code; + bool reinject; u8 nr; u32 error_code; } exception; @@ -362,8 +356,8 @@ struct kvm_vcpu_arch { u64 *mce_banks; /* used for guest single stepping over the given code position */ - u16 singlestep_cs; unsigned long singlestep_rip; + /* fields used by HYPER-V emulation */ u64 hv_vapic; }; @@ -389,6 +383,7 @@ struct kvm_arch { unsigned int n_free_mmu_pages; unsigned int n_requested_mmu_pages; unsigned int n_alloc_mmu_pages; + atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. @@ -461,11 +456,6 @@ struct kvm_vcpu_stat { u32 nmi_injections; }; -struct descriptor_table { - u16 limit; - unsigned long base; -} __attribute__((packed)); - struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -503,12 +493,11 @@ struct kvm_x86_ops { void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); - void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); - int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest); - int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value); + void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); + void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -527,7 +516,8 @@ struct kvm_x86_ops { void (*set_irq)(struct kvm_vcpu *vcpu); void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code); + bool has_error_code, u32 error_code, + bool reinject); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); @@ -541,6 +531,8 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + const struct trace_print_flags *exit_reasons_str; }; @@ -587,23 +579,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu, void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags); -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, - unsigned long *rflags); void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); struct x86_emulate_ctxt; -int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, - int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port); +int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); @@ -616,12 +599,15 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, + bool has_error_code, u32 error_code); void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); @@ -634,6 +620,8 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); +void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); @@ -649,8 +637,6 @@ int emulator_write_emulated(unsigned long addr, unsigned int bytes, struct kvm_vcpu *vcpu); -unsigned long segment_base(u16 selector); - void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, @@ -675,7 +661,6 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_enable_tdp(void); void kvm_disable_tdp(void); -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); bool kvm_check_iopl(struct kvm_vcpu *vcpu); @@ -724,23 +709,6 @@ static inline void kvm_load_ldt(u16 sel) asm("lldt %0" : : "rm"(sel)); } -static inline void kvm_get_idt(struct descriptor_table *table) -{ - asm("sidt %0" : "=m"(*table)); -} - -static inline void kvm_get_gdt(struct descriptor_table *table) -{ - asm("sgdt %0" : "=m"(*table)); -} - -static inline unsigned long kvm_read_tr_base(void) -{ - u16 tr; - asm("str %0" : "=g"(tr)); - return segment_base(tr); -} - #ifdef CONFIG_X86_64 static inline unsigned long read_msr(unsigned long msr) { @@ -826,4 +794,6 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_define_shared_msr(unsigned index, u32 msr); void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index ffae1420e7d..05eba5e9a8e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -16,10 +16,23 @@ #define KVM_FEATURE_CLOCKSOURCE 0 #define KVM_FEATURE_NOP_IO_DELAY 1 #define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. + */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 #define MSR_KVM_WALL_CLOCK 0x11 #define MSR_KVM_SYSTEM_TIME 0x12 +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 + #define KVM_MAX_MMU_OP_BATCH 32 /* Operations for KVM_HC_MMU_OP */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6c3fdd631ed..f32a4301c4d 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -225,5 +225,13 @@ extern void mcheck_intel_therm_init(void); static inline void mcheck_intel_therm_init(void) { } #endif +/* + * Used by APEI to report memory error via /dev/mcelog + */ + +struct cper_sec_mem_err; +extern void apei_mce_report_mem_error(int corrected, + struct cper_sec_mem_err *mem_err); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index bc473acfa7f..8c7ae431862 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -110,6 +110,7 @@ #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 +#define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 @@ -202,8 +203,9 @@ #define MSR_IA32_EBL_CR_POWERON 0x0000002a #define MSR_IA32_FEATURE_CONTROL 0x0000003a -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) +#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) @@ -235,6 +237,8 @@ #define MSR_IA32_MISC_ENABLE 0x000001a0 +#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 1a0422348d6..cd2a31dc5fb 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -53,6 +53,8 @@ extern int pcibios_last_bus; extern struct pci_bus *pci_root_bus; extern struct pci_ops pci_root_ops; +void pcibios_scan_specific_bus(int busn); + /* pci-irq.c */ struct irq_info { @@ -83,7 +85,7 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; -extern spinlock_t pci_config_lock; +extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index b05400a542f..64a8ebff06f 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -89,7 +89,8 @@ P4_CCCR_ENABLE) /* HT mask */ -#define P4_CCCR_MASK_HT (P4_CCCR_MASK | P4_CCCR_THREAD_ANY) +#define P4_CCCR_MASK_HT \ + (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY) #define P4_GEN_ESCR_EMASK(class, name, bit) \ class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h index 6d93508f262..35f2d1948ad 100644 --- a/arch/x86/include/asm/pvclock-abi.h +++ b/arch/x86/include/asm/pvclock-abi.h @@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info { u64 system_time; u32 tsc_to_system_mul; s8 tsc_shift; - u8 pad[3]; + u8 flags; + u8 pad[2]; } __attribute__((__packed__)); /* 32 bytes */ struct pvclock_wall_clock { @@ -38,5 +39,6 @@ struct pvclock_wall_clock { u32 nsec; } __attribute__((__packed__)); +#define PVCLOCK_TSC_STABLE_BIT (1 << 0) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 53235fd5f8c..cd02f324aa6 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -6,6 +6,7 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, diff --git a/arch/x86/include/asm/rdc321x_defs.h b/arch/x86/include/asm/rdc321x_defs.h deleted file mode 100644 index c8e9c8bed3d..00000000000 --- a/arch/x86/include/asm/rdc321x_defs.h +++ /dev/null @@ -1,12 +0,0 @@ -#define PFX "rdc321x: " - -/* General purpose configuration and data registers */ -#define RDC3210_CFGREG_ADDR 0x0CF8 -#define RDC3210_CFGREG_DATA 0x0CFC - -#define RDC321X_GPIO_CTRL_REG1 0x48 -#define RDC321X_GPIO_CTRL_REG2 0x84 -#define RDC321X_GPIO_DATA_REG1 0x4c -#define RDC321X_GPIO_DATA_REG2 0x88 - -#define RDC321X_MAX_GPIO 58 diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h index 75af592677e..fb0b1874396 100644 --- a/arch/x86/include/asm/scatterlist.h +++ b/arch/x86/include/asm/scatterlist.h @@ -1,8 +1,9 @@ #ifndef _ASM_X86_SCATTERLIST_H #define _ASM_X86_SCATTERLIST_H -#define ISA_DMA_THRESHOLD (0x00ffffff) - #include <asm-generic/scatterlist.h> +#define ISA_DMA_THRESHOLD (0x00ffffff) +#define ARCH_HAS_SG_CHAIN + #endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h index 48dcfa62ea0..fd921c3a684 100644 --- a/arch/x86/include/asm/suspend_32.h +++ b/arch/x86/include/asm/suspend_32.h @@ -15,6 +15,8 @@ static inline int arch_prepare_suspend(void) { return 0; } struct saved_context { u16 es, fs, gs, ss; unsigned long cr0, cr2, cr3, cr4; + u64 misc_enable; + bool misc_enable_saved; struct desc_ptr gdt; struct desc_ptr idt; u16 ldt; diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h index 06284f42b75..8d942afae68 100644 --- a/arch/x86/include/asm/suspend_64.h +++ b/arch/x86/include/asm/suspend_64.h @@ -27,6 +27,8 @@ struct saved_context { u16 ds, es, fs, gs, ss; unsigned long gs_base, gs_kernel_base, fs_base; unsigned long cr0, cr2, cr3, cr4, cr8; + u64 misc_enable; + bool misc_enable_saved; unsigned long efer; u16 gdt_pad; u16 gdt_limit; diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 38638cd2fa4..0e831059ac5 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -81,7 +81,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 event_inj_err; u64 nested_cr3; u64 lbr_ctl; - u8 reserved_5[832]; + u64 reserved_5; + u64 next_rip; + u8 reserved_6[816]; }; @@ -115,6 +117,10 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT) #define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT) +#define SVM_VM_CR_VALID_MASK 0x001fULL +#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL +#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL + struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; u16 attrib; @@ -238,6 +244,7 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 #define SVM_EXIT_READ_CR0 0x000 #define SVM_EXIT_READ_CR3 0x003 diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 62ba9400cc4..f0b6e5dbc5a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -239,8 +239,8 @@ static inline struct thread_info *current_thread_info(void) #define TS_USEDFPU 0x0001 /* FPU was used by this task this quantum (SMP) */ #define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ -#define TS_POLLING 0x0004 /* true if in idle loop - and not sleeping */ +#define TS_POLLING 0x0004 /* idle task polling need_resched, + skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ #define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c5087d79658..21899cc31e5 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -53,33 +53,29 @@ extern int cpu_to_node_map[]; /* Returns the number of the node containing CPU 'cpu' */ -static inline int cpu_to_node(int cpu) +static inline int __cpu_to_node(int cpu) { return cpu_to_node_map[cpu]; } -#define early_cpu_to_node(cpu) cpu_to_node(cpu) +#define early_cpu_to_node __cpu_to_node +#define cpu_to_node __cpu_to_node #else /* CONFIG_X86_64 */ /* Mappings between logical cpu number and node number */ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); -/* Returns the number of the current Node. */ -DECLARE_PER_CPU(int, node_number); -#define numa_node_id() percpu_read(node_number) - #ifdef CONFIG_DEBUG_PER_CPU_MAPS -extern int cpu_to_node(int cpu); +/* + * override generic percpu implementation of cpu_to_node + */ +extern int __cpu_to_node(int cpu); +#define cpu_to_node __cpu_to_node + extern int early_cpu_to_node(int cpu); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ -/* Returns the number of the node containing CPU 'cpu' */ -static inline int cpu_to_node(int cpu) -{ - return per_cpu(x86_cpu_to_node_map, cpu); -} - /* Same function but used if called before per_cpu areas are setup */ static inline int early_cpu_to_node(int cpu) { @@ -170,6 +166,10 @@ static inline int numa_node_id(void) { return 0; } +/* + * indicate override: + */ +#define numa_node_id numa_node_id static inline int early_cpu_to_node(int cpu) { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index fb9a080740e..9e6779f7cf2 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,8 @@ * */ +#include <linux/types.h> + /* * Definitions of Primary Processor-Based VM-Execution Controls. */ @@ -120,6 +122,8 @@ enum vmcs_field { GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, GUEST_IA32_PAT = 0x00002804, GUEST_IA32_PAT_HIGH = 0x00002805, + GUEST_IA32_EFER = 0x00002806, + GUEST_IA32_EFER_HIGH = 0x00002807, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -130,6 +134,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x00002811, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, + HOST_IA32_EFER = 0x00002c02, + HOST_IA32_EFER_HIGH = 0x00002c03, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -394,6 +400,10 @@ enum vmcs_field { #define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" - +struct vmx_msr_entry { + u32 index; + u32 reserved; + u64 value; +} __aligned(16); #endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 488be461a38..60cc4058ed5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled); int acpi_noirq; /* skip ACPI IRQ initialization */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); -int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; @@ -1501,9 +1500,8 @@ void __init acpi_boot_table_init(void) /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return; /* @@ -1534,9 +1532,8 @@ int __init early_acpi_boot_init(void) { /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return 1; /* @@ -1554,9 +1551,8 @@ int __init acpi_boot_init(void) /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return 1; acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1591,21 +1587,12 @@ static int __init parse_acpi(char *arg) /* acpi=force to over-ride black-list */ else if (strcmp(arg, "force") == 0) { acpi_force = 1; - acpi_ht = 1; acpi_disabled = 0; } /* acpi=strict disables out-of-spec workarounds */ else if (strcmp(arg, "strict") == 0) { acpi_strict = 1; } - /* Limit ACPI just to boot-time to enable HT */ - else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) { - printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); - disable_acpi(); - } - acpi_ht = 1; - } /* acpi=rsdt use RSDT instead of XSDT */ else if (strcmp(arg, "rsdt") == 0) { acpi_rsdt_forced = 1; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index f9961034e55..82e508677b9 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -162,8 +162,6 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); - if (strncmp(str, "sci_force_enable", 16) == 0) - acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 8ded418b059..13ab720573e 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,4 +1,4 @@ - .section .text.page_aligned + .section .text..page_aligned #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page_types.h> diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index fa5a1474cd1..0d20286d78c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1487,6 +1487,7 @@ static int __attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data, *alias_data; + int ret; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); @@ -1498,13 +1499,14 @@ static int __attach_device(struct device *dev, spin_lock(&domain->lock); /* Some sanity checks */ + ret = -EBUSY; if (alias_data->domain != NULL && alias_data->domain != domain) - return -EBUSY; + goto out_unlock; if (dev_data->domain != NULL && dev_data->domain != domain) - return -EBUSY; + goto out_unlock; /* Do real assignment */ if (dev_data->alias != dev) { @@ -1520,10 +1522,14 @@ static int __attach_device(struct device *dev, atomic_inc(&dev_data->bind); + ret = 0; + +out_unlock: + /* ready */ spin_unlock(&domain->lock); - return 0; + return ret; } /* @@ -2324,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void) iommu_detected = 1; swiotlb = 0; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 3bacb4d0844..3cc63e2b8dd 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -287,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; - if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) + if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) { + pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n", + address); + pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n"); return NULL; + } ret = ioremap_nocache(address, MMIO_REGION_LENGTH); if (ret != NULL) @@ -1314,7 +1318,7 @@ static int __init amd_iommu_init(void) ret = amd_iommu_init_dma_ops(); if (ret) - goto free; + goto free_disable; amd_iommu_init_api(); @@ -1332,9 +1336,10 @@ static int __init amd_iommu_init(void) out: return ret; -free: +free_disable: disable_iommus(); +free: amd_iommu_uninit_devices(); free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, @@ -1353,6 +1358,15 @@ free: free_unity_maps(); +#ifdef CONFIG_GART_IOMMU + /* + * We failed to initialize the AMD IOMMU - try fallback to GART + * if possible. + */ + gart_iommu_init(); + +#endif + goto out; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 192cd7ee35c..0bcc5aeda99 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -51,6 +51,7 @@ #include <asm/smp.h> #include <asm/mce.h> #include <asm/kvm_para.h> +#include <asm/tsc.h> unsigned int num_processors; @@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned int value; - int i, j; + unsigned int value, queued; + int i, j, acked = 0; + unsigned long long tsc = 0, ntsc; + long long max_loops = cpu_khz; + + if (cpu_has_tsc) + rdtscll(tsc); if (disable_apic) { arch_disable_smp_support(); @@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void) * the interrupt. Hence a vector might get locked. It was noticed * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) - ack_APIC_irq(); + do { + queued = 0; + for (i = APIC_ISR_NR - 1; i >= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1<<j)) { + ack_APIC_irq(); + acked++; + } + } } - } + if (acked > 256) { + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", + acked); + break; + } + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc83a002786..68e4a6f2211 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1121,9 +1121,9 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(node_number) == 0 && - cpu_to_node(cpu) != NUMA_NO_NODE) - percpu_write(node_number, cpu_to_node(cpu)); + if (cpu != 0 && percpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); #endif me = current; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6f3dc8fbbfd..7ec2123838e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1497,8 +1497,8 @@ static struct cpufreq_driver cpufreq_amd64_driver = { * simply keep the boost-disable flag in sync with the current global * state. */ -static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) +static int cpb_notify(struct notifier_block *nb, unsigned long action, + void *hcpu) { unsigned cpu = (long)hcpu; u32 lo, hi; @@ -1528,7 +1528,7 @@ static int __cpuinit cpb_notify(struct notifier_block *nb, unsigned long action, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata cpb_nb = { +static struct notifier_block cpb_nb = { .notifier_call = cpb_notify, }; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 4ac6d48fe11..bb34b03af25 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += mce-apei.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c new file mode 100644 index 00000000000..745b54f9be8 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -0,0 +1,138 @@ +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/cper.h> +#include <acpi/apei.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + /* Only corrected MC is reported */ + if (!corrected) + return; + + mce_setup(&m); + m.bank = 1; + /* Fake a memory read corrected error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.addr = mem_err->physical_addr; + mce_log(&m); + mce_notify_irq(); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SER_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SER_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + ssize_t len; + + len = erst_read_next(&rcd.hdr, sizeof(rcd)); + if (len <= 0) + return len; + /* Can not skip other records in storage via ERST unless clear them */ + else if (len != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { + if (printk_ratelimit()) + pr_warning( + "MCE-APEI: Can not skip the unknown record in ERST"); + return -EIO; + } + + memcpy(m, &rcd.mce, sizeof(*m)); + *record_id = rcd.hdr.record_id; + + return sizeof(*m); +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 32996f9fab6..fefcc69ee8b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,3 +28,26 @@ extern int mce_ser; extern struct mce_bank *mce_banks; +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7a355ddcc64..18cc4256225 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,6 +36,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/debugfs.h> +#include <linux/edac_mce.h> #include <asm/processor.h> #include <asm/hw_irq.h> @@ -169,6 +170,15 @@ void mce_log(struct mce *mce) entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* + * If edac_mce is enabled, it will check the error type + * and will process it, if it is a known error. + * Otherwise, the error will be sent through mcelog + * interface + */ + if (edac_mce_parse(mce)) + return; + + /* * When the buffer fills up discard new entries. * Assume that the earlier errors are the more * interesting ones: @@ -264,7 +274,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { - int i; + int i, apei_err = 0; if (!fake_panic) { /* @@ -287,8 +297,11 @@ static void mce_panic(char *msg, struct mce *final, char *exp) struct mce *m = &mcelog.entry[i]; if (!(m->status & MCI_STATUS_VAL)) continue; - if (!(m->status & MCI_STATUS_UC)) + if (!(m->status & MCI_STATUS_UC)) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -297,11 +310,17 @@ static void mce_panic(char *msg, struct mce *final, char *exp) continue; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) + if (!final || memcmp(m, final, sizeof(struct mce))) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } - if (final) + if (final) { print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } if (cpu_missing) printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); print_mce_tail(); @@ -1493,6 +1512,43 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { @@ -1506,15 +1562,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return -EINVAL; - } + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; err = 0; prev = 0; @@ -1562,10 +1622,15 @@ timeout: memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } + + if (err) + err = -EFAULT; + +out: mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; + return err ? err : buf - ubuf; } static unsigned int mce_poll(struct file *file, poll_table *wait) @@ -1573,6 +1638,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) poll_wait(file, &mce_wait, wait); if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 81c499eceb2..e1a0a3bf971 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -190,7 +190,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, mutex_unlock(&therm_cpu_lock); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fd4db0db370..5db5b7d65a1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -106,6 +106,7 @@ struct cpu_hw_events { int n_events; int n_added; + int n_txn; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ @@ -983,6 +984,7 @@ static int x86_pmu_enable(struct perf_event *event) out: cpuc->n_events = n; cpuc->n_added += n - n0; + cpuc->n_txn += n - n0; return 0; } @@ -1089,6 +1091,14 @@ static void x86_pmu_disable(struct perf_event *event) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; + /* + * If we're called during a txn, we don't need to do anything. + * The events never got scheduled and ->cancel_txn will truncate + * the event_list. + */ + if (cpuc->group_flag & PERF_EVENT_TXN_STARTED) + return; + x86_pmu_stop(event); for (i = 0; i < cpuc->n_events; i++) { @@ -1379,6 +1389,7 @@ static void x86_pmu_start_txn(const struct pmu *pmu) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); cpuc->group_flag |= PERF_EVENT_TXN_STARTED; + cpuc->n_txn = 0; } /* @@ -1391,6 +1402,11 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); cpuc->group_flag &= ~PERF_EVENT_TXN_STARTED; + /* + * Truncate the collected events. + */ + cpuc->n_added -= cpuc->n_txn; + cpuc->n_events -= cpuc->n_txn; } /* @@ -1419,6 +1435,12 @@ static int x86_pmu_commit_txn(const struct pmu *pmu) */ memcpy(cpuc->assign, assign, n*sizeof(int)); + /* + * Clear out the txn count so that ->cancel_txn() which gets + * run after ->commit_txn() doesn't undo things. + */ + cpuc->n_txn = 0; + return 0; } @@ -1717,7 +1739,11 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski */ regs->bp = rewind_frame_pointer(skip + 1); regs->cs = __KERNEL_CS; - local_save_flags(regs->flags); + /* + * We abuse bit 3 to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ + regs->flags = 0; } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 424fc8de68e..ae85d69644d 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -465,15 +465,21 @@ out: return rc; } -static inline void p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) { - unsigned long dummy; + int overflow = 0; + u32 low, high; - rdmsrl(hwc->config_base + hwc->idx, dummy); - if (dummy & P4_CCCR_OVF) { + rdmsr(hwc->config_base + hwc->idx, low, high); + + /* we need to check high bit for unflagged overflows */ + if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { + overflow = 1; (void)checking_wrmsrl(hwc->config_base + hwc->idx, - ((u64)dummy) & ~P4_CCCR_OVF); + ((u64)low) & ~P4_CCCR_OVF); } + + return overflow; } static inline void p4_pmu_disable_event(struct perf_event *event) @@ -584,21 +590,15 @@ static int p4_pmu_handle_irq(struct pt_regs *regs) WARN_ON_ONCE(hwc->idx != idx); - /* - * FIXME: Redundant call, actually not needed - * but just to check if we're screwed - */ - p4_pmu_clear_cccr_ovf(hwc); + /* it might be unflagged overflow */ + handled = p4_pmu_clear_cccr_ovf(hwc); val = x86_perf_event_update(event); - if (val & (1ULL << (x86_pmu.cntval_bits - 1))) + if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) continue; - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; + /* event overflow for sure */ + data.period = event->hw.last_period; if (!x86_perf_event_set_period(event)) continue; @@ -670,7 +670,7 @@ static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) /* * ESCR address hashing is tricky, ESCRs are not sequential - * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03e0) and + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and * the metric between any ESCRs is laid in range [0xa0,0xe1] * * so we make ~70% filled hashtable @@ -735,8 +735,9 @@ static int p4_get_escr_idx(unsigned int addr) { unsigned int idx = P4_ESCR_MSR_IDX(addr); - if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || - !p4_escr_table[idx])) { + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); return -1; } @@ -762,7 +763,7 @@ static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; - int cpu = raw_smp_processor_id(); + int cpu = smp_processor_id(); struct hw_perf_event *hwc; struct p4_event_bind *bind; unsigned int i, thread, num; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 8b862d5900f..1b7b31ab7d8 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, cpuid_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata cpuid_class_cpu_notifier = diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 3a54dcb9cd0..43e9ccf4494 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task); /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned + * so they are allowed to end up in the .data..cacheline_aligned * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index feaeb0d3aa4..eb9b76c716c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,6 +29,8 @@ #define KVM_SCALE 22 static int kvmclock = 1; +static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static int parse_no_kvmclock(char *arg) { @@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void) low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + + native_write_msr(msr_kvm_wall_clock, low, high); vcpu_time = &get_cpu_var(hv_clock); pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); @@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt) high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); + + return native_write_msr_safe(msr_kvm_system_time, low, high); } #ifdef CONFIG_X86_LOCAL_APIC @@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void) #ifdef CONFIG_KEXEC static void kvm_crash_shutdown(struct pt_regs *regs) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); native_machine_crash_shutdown(regs); } #endif static void kvm_shutdown(void) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); native_machine_shutdown(); } @@ -181,27 +185,37 @@ void __init kvmclock_init(void) if (!kvm_para_available()) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock("boot clock")) - return; - pv_time_ops.sched_clock = kvm_clock_read; - x86_platform.calibrate_tsc = kvm_get_tsc_khz; - x86_platform.get_wallclock = kvm_get_wallclock; - x86_platform.set_wallclock = kvm_set_wallclock; + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) + return; + + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + if (kvm_register_clock("boot clock")) + return; + pv_time_ops.sched_clock = kvm_clock_read; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = - kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; #endif - machine_ops.shutdown = kvm_shutdown; + machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC - machine_ops.crash_shutdown = kvm_crash_shutdown; + machine_ops.crash_shutdown = kvm_crash_shutdown; #endif - kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); - pv_info.paravirt_enabled = 1; - pv_info.name = "KVM"; - } + kvm_get_preset_lpj(); + clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 2cd8c544e41..fa6551d36c1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -260,6 +260,7 @@ static void microcode_dev_exit(void) } MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4d4468e9f47..7bf2dc4c8f7 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, msr_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata msr_class_cpu_notifier = { diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7d2829dde20..a5bc528d432 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = { .free_coherent = swiotlb_free_coherent, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, - .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 03801f2f761..239427ca02a 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -31,8 +31,16 @@ struct pvclock_shadow_time { u32 tsc_to_nsec_mul; int tsc_shift; u32 version; + u8 flags; }; +static u8 valid_flags __read_mostly = 0; + +void pvclock_set_flags(u8 flags) +{ + valid_flags = flags; +} + /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. @@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, dst->system_timestamp = src->system_time; dst->tsc_to_nsec_mul = src->tsc_to_system_mul; dst->tsc_shift = src->tsc_shift; + dst->flags = src->flags; rmb(); /* test version after fetching data */ } while ((src->version & 1) || (dst->version != src->version)); @@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) return pv_tsc_khz; } +static atomic64_t last_value = ATOMIC64_INIT(0); + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; unsigned version; cycle_t ret, offset; + u64 last; do { version = pvclock_get_time_values(&shadow, src); @@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) barrier(); } while (version != src->version); + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error marging we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = atomic64_read(&last_value); + do { + if (ret < last) + return last; + last = atomic64_cmpxchg(&last_value, last, ret); + } while (unlikely(last != ret)); + return ret; } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e8029896309..b4ae4acbd03 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -676,6 +676,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), }, }, + /* + * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so + * match on the product name. + */ + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), + }, + }, #endif {} }; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef6370b00e7..de3b63ae3da 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -21,12 +21,6 @@ #include <asm/cpu.h> #include <asm/stackprotector.h> -#ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) -#else -# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) -#endif - DEFINE_PER_CPU(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); @@ -247,7 +241,7 @@ void __init setup_per_cpu_areas(void) #endif #endif /* - * Up to this point, the boot CPU has been using .data.init + * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. */ if (cpu == boot_cpu_id) @@ -265,10 +259,10 @@ void __init setup_per_cpu_areas(void) #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) /* - * make sure boot cpu node_number is right, when boot cpu is on the + * make sure boot cpu numa_node is right, when boot cpu is on the * node that doesn't have mem installed */ - per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); + set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id)); #endif /* Setup node to cpumask map */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 763d815e27a..c4f33b2e77d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) static void __cpuinit announce_cpu(int cpu, int apicid) { static int current_node = -1; - int node = cpu_to_node(cpu); + int node = early_cpu_to_node(cpu); if (system_state == SYSTEM_BOOTING) { if (node != current_node) { @@ -1215,9 +1215,17 @@ __init void prefill_possible_map(void) if (!num_processors) num_processors = 1; - if (setup_possible_cpus == -1) - possible = num_processors + disabled_cpus; - else + i = setup_max_cpus ?: 1; + if (setup_possible_cpus == -1) { + possible = num_processors; +#ifdef CONFIG_HOTPLUG_CPU + if (setup_max_cpus) + possible += disabled_cpus; +#else + if (possible > i) + possible = i; +#endif + } else possible = setup_possible_cpus; total_cpus = max_t(int, possible, num_processors + disabled_cpus); @@ -1230,11 +1238,23 @@ __init void prefill_possible_map(void) possible = nr_cpu_ids; } +#ifdef CONFIG_HOTPLUG_CPU + if (!setup_max_cpus) +#endif + if (possible > i) { + printk(KERN_WARNING + "%d Processors exceeds max_cpus limit of %u\n", + possible, setup_max_cpus); + possible = i; + } + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) set_cpu_possible(i, true); + for (; i < NR_CPUS; i++) + set_cpu_possible(i, false); nr_cpu_ids = possible; } diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index cc2c60474fd..c2f1b26141e 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -46,6 +46,7 @@ /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; +EXPORT_SYMBOL(tboot); /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 2cc249718c4..d0bb52296fa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -97,7 +97,7 @@ SECTIONS HEAD_TEXT #ifdef CONFIG_X86_32 . = ALIGN(PAGE_SIZE); - *(.text.page_aligned) + *(.text..page_aligned) #endif . = ALIGN(8); _stext = .; @@ -305,7 +305,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; - *(.bss.page_aligned) + *(.bss..page_aligned) *(.bss) . = ALIGN(4); __bss_stop = .; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4dade6ac082..5ac0bb465ed 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -33,6 +33,7 @@ #include <asm/kvm_emulate.h> #include "x86.h" +#include "tss.h" /* * Opcode effective-address decode tables. @@ -50,6 +51,8 @@ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ #define DstAcc (4<<1) /* Destination Accumulator */ +#define DstDI (5<<1) /* Destination is in ES:(E)DI */ +#define DstMem64 (6<<1) /* 64bit memory operand */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ @@ -63,6 +66,7 @@ #define SrcOne (7<<4) /* Implied '1' */ #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ #define SrcImmU (9<<4) /* Immediate operand, unsigned */ +#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -85,6 +89,9 @@ #define Src2ImmByte (2<<29) #define Src2One (3<<29) #define Src2Imm16 (4<<29) +#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be + in memory and second argument is located + immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { @@ -147,8 +154,8 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, /* 0x68 - 0x6F */ SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ + SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, @@ -173,12 +180,12 @@ static u32 opcode_table[256] = { /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, + ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, + ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, + 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, + ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, + ByteOp | DstDI | String, DstDI | String, /* 0xB0 - 0xB7 */ ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, @@ -204,13 +211,13 @@ static u32 opcode_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ 0, 0, 0, 0, - ByteOp | SrcImmUByte, SrcImmUByte, - ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, + ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, + SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, @@ -343,7 +350,8 @@ static u32 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, + SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, @@ -353,14 +361,14 @@ static u32 group_table[] = { DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, [Group9*8] = - 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, + 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, 0, + SrcMem16 | ModRM | Mov | Priv, 0, [Group9*8] = 0, 0, 0, 0, 0, 0, 0, 0, }; @@ -562,7 +570,7 @@ static u32 group2_table[] = { #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ + if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ (_type)_x; \ @@ -638,40 +646,40 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) + unsigned long eip, u8 *dest) { struct fetch_cache *fc = &ctxt->decode.fetch; int rc; - int size; + int size, cur_size; - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); - if (rc) + if (eip == fc->end) { + cur_size = fc->end - fc->start; + size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, + size, ctxt->vcpu, NULL); + if (rc != X86EMUL_CONTINUE) return rc; - fc->start = linear; - fc->end = linear + size; + fc->end += size; } - *dest = fc->data[linear - fc->start]; - return 0; + *dest = fc->data[eip - fc->start]; + return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, void *dest, unsigned size) { - int rc = 0; + int rc; /* x86 instructions are limited to 15 bytes. */ - if (eip + size - ctxt->decode.eip_orig > 15) + if (eip + size - ctxt->eip > 15) return X86EMUL_UNHANDLEABLE; - eip += ctxt->cs_base; while (size--) { rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; } - return 0; + return X86EMUL_CONTINUE; } /* @@ -702,7 +710,7 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, ctxt->vcpu, NULL); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, ctxt->vcpu, NULL); @@ -782,7 +790,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; - int rc = 0; + int rc = X86EMUL_CONTINUE; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -895,7 +903,7 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; switch (c->ad_bytes) { case 2: @@ -916,14 +924,18 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, group; - /* Shadow copy of register state. Committed on successful emulation. */ + /* we cannot decode insn before we complete previous rep insn */ + WARN_ON(ctxt->restart); + + /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); + c->eip = ctxt->eip; + c->fetch.start = c->fetch.end = c->eip; ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); @@ -1015,11 +1027,6 @@ done_prefixes: } } - if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { - kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction"); - return -1; - } - if (c->d & Group) { group = c->d & GroupMask; c->modrm = insn_fetch(u8, 1, c->eip); @@ -1046,7 +1053,7 @@ done_prefixes: rc = decode_modrm(ctxt, ops); else if (c->d & MemAbs) rc = decode_abs(ctxt, ops); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; if (!c->has_seg_override) @@ -1057,6 +1064,10 @@ done_prefixes: if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; + + if (c->rip_relative) + c->modrm_ea += c->eip; + /* * Decode and fetch the source operand: register, memory * or immediate. @@ -1091,6 +1102,8 @@ done_prefixes: break; } c->src.type = OP_MEM; + c->src.ptr = (unsigned long *)c->modrm_ea; + c->src.val = 0; break; case SrcImm: case SrcImmU: @@ -1139,6 +1152,14 @@ done_prefixes: c->src.bytes = 1; c->src.val = 1; break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *) + register_address(c, seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]); + c->src.val = 0; + break; } /* @@ -1168,6 +1189,12 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = 1; break; + case Src2Mem16: + c->src2.type = OP_MEM; + c->src2.bytes = 2; + c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); + c->src2.val = 0; + break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1180,6 +1207,7 @@ done_prefixes: c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; case DstMem: + case DstMem64: if ((c->d & ModRM) && c->modrm_mod == 3) { c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.type = OP_REG; @@ -1188,12 +1216,24 @@ done_prefixes: break; } c->dst.type = OP_MEM; + c->dst.ptr = (unsigned long *)c->modrm_ea; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } break; case DstAcc: c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->op_bytes) { + switch (c->dst.bytes) { case 1: c->dst.val = *(u8 *)c->dst.ptr; break; @@ -1203,18 +1243,248 @@ done_prefixes: case 4: c->dst.val = *(u32 *)c->dst.ptr; break; + case 8: + c->dst.val = *(u64 *)c->dst.ptr; + break; } c->dst.orig_val = c->dst.val; break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *) + register_address(c, es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + c->dst.val = 0; + break; } - if (c->rip_relative) - c->modrm_ea += c->eip; - done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } +static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned int size, unsigned short port, + void *dest) +{ + struct read_cache *rc = &ctxt->decode.io_read; + + if (rc->pos == rc->end) { /* refill pio read ahead */ + struct decode_cache *c = &ctxt->decode; + unsigned int in_page, n; + unsigned int count = c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1; + in_page = (ctxt->eflags & EFLG_DF) ? + offset_in_page(c->regs[VCPU_REGS_RDI]) : + PAGE_SIZE - offset_in_page(c->regs[VCPU_REGS_RDI]); + n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, + count); + if (n == 0) + n = 1; + rc->pos = rc->end = 0; + if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + return 0; + rc->end = n * size; + } + + memcpy(dest, rc->data + rc->pos, size); + rc->pos += size; + return 1; +} + +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); + + return desc->g ? (limit << 12) | 0xfff : limit; +} + +static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_ptr *dt) +{ + if (selector & 1 << 2) { + struct desc_struct desc; + memset (dt, 0, sizeof *dt); + if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + return; + + dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ + dt->address = get_desc_base(&desc); + } else + ops->get_gdt(dt, ctxt->vcpu); +} + +/* allowed just for 8 bytes segments */ +static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + int ret; + u32 err; + ulong addr; + + get_descriptor_table_ptr(ctxt, ops, selector, &dt); + + if (dt.size < index * 8 + 7) { + kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + addr = dt.address + index * 8; + ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); + + return ret; +} + +/* allowed just for 8 bytes segments */ +static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, struct desc_struct *desc) +{ + struct desc_ptr dt; + u16 index = selector >> 3; + u32 err; + ulong addr; + int ret; + + get_descriptor_table_ptr(ctxt, ops, selector, &dt); + + if (dt.size < index * 8 + 7) { + kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + + addr = dt.address + index * 8; + ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(ctxt->vcpu, addr, err); + + return ret; +} + +static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 selector, int seg) +{ + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; + + memset(&seg_desc, 0, sizeof seg_desc); + + if ((seg <= VCPU_SREG_GS && ctxt->mode == X86EMUL_MODE_VM86) + || ctxt->mode == X86EMUL_MODE_REAL) { + /* set real mode segment descriptor */ + set_desc_base(&seg_desc, selector << 4); + set_desc_limit(&seg_desc, 0xffff); + seg_desc.type = 3; + seg_desc.p = 1; + seg_desc.s = 1; + goto load; + } + + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + if (null_selector) /* for NULL selector skip all following checks */ + goto load; + + ret = read_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !seg_desc.s) + goto exception; + + if (!seg_desc.p) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = seg_desc.dpl; + cpl = ops->cpl(ctxt->vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (seg_desc.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(seg_desc.type & 8)) + goto exception; + + if (seg_desc.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (seg_desc.s || seg_desc.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((seg_desc.type & 0xa) == 0x8 || + (((seg_desc.type & 0xc) != 0xc) && + (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + + if (seg_desc.s) { + /* mark segment as accessed */ + seg_desc.type |= 1; + ret = write_segment_descriptor(ctxt, ops, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } +load: + ops->set_segment_selector(selector, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; +} + static inline void emulate_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -1251,7 +1521,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + int cpl = ops->cpl(ctxt->vcpu); rc = emulate_pop(ctxt, ops, &val, len); if (rc != X86EMUL_CONTINUE) @@ -1306,10 +1576,10 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int rc; rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); + rc = load_segment_descriptor(ctxt, ops, (u16)selector, seg); return rc; } @@ -1332,7 +1602,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; int reg = VCPU_REGS_RDI; while (reg >= VCPU_REGS_RAX) { @@ -1343,7 +1613,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, } rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) break; --reg; } @@ -1354,12 +1624,8 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc; - rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); - if (rc != 0) - return rc; - return 0; + return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); } static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) @@ -1395,7 +1661,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; switch (c->modrm_reg) { case 0 ... 1: /* test */ @@ -1408,11 +1673,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, emulate_1op("neg", c->dst, ctxt->eflags); break; default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; + return 0; } - return rc; + return 1; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -1442,20 +1705,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, emulate_push(ctxt); break; } - return 0; + return X86EMUL_CONTINUE; } static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; + u64 old = c->dst.orig_val; if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { @@ -1463,17 +1720,13 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); ctxt->eflags &= ~EFLG_ZF; - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + c->dst.val = ((u64)c->regs[VCPU_REGS_RCX] << 32) | (u32) c->regs[VCPU_REGS_RBX]; - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - return rc; ctxt->eflags |= EFLG_ZF; } - return 0; + return X86EMUL_CONTINUE; } static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, @@ -1484,14 +1737,14 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, unsigned long cs; rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1544,7 +1797,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, default: break; } - return 0; + return X86EMUL_CONTINUE; } static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) @@ -1598,8 +1851,11 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt) u64 msr_data; /* syscall is not available in real mode */ - if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) - return X86EMUL_UNHANDLEABLE; + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; + } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1649,14 +1905,16 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt) /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return X86EMUL_UNHANDLEABLE; + return X86EMUL_PROPAGATE_FAULT; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return X86EMUL_UNHANDLEABLE; + if (ctxt->mode == X86EMUL_MODE_PROT64) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; + } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1711,7 +1969,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return X86EMUL_UNHANDLEABLE; + return X86EMUL_PROPAGATE_FAULT; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1756,7 +2014,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { int iopl; if (ctxt->mode == X86EMUL_MODE_REAL) @@ -1764,7 +2023,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; + return ops->cpl(ctxt->vcpu) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, @@ -1801,22 +2060,419 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { - if (emulator_bad_iopl(ctxt)) + if (emulator_bad_iopl(ctxt, ops)) if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) return false; return true; } +static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + int seg) +{ + struct desc_struct desc; + if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) + return get_desc_base(&desc); + else + return ~0; +} + +static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_16 *tss) +{ + struct decode_cache *c = &ctxt->decode; + + tss->ip = c->eip; + tss->flag = ctxt->eflags; + tss->ax = c->regs[VCPU_REGS_RAX]; + tss->cx = c->regs[VCPU_REGS_RCX]; + tss->dx = c->regs[VCPU_REGS_RDX]; + tss->bx = c->regs[VCPU_REGS_RBX]; + tss->sp = c->regs[VCPU_REGS_RSP]; + tss->bp = c->regs[VCPU_REGS_RBP]; + tss->si = c->regs[VCPU_REGS_RSI]; + tss->di = c->regs[VCPU_REGS_RDI]; + + tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); + tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); + tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); + tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); +} + +static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_16 *tss) +{ + struct decode_cache *c = &ctxt->decode; + int ret; + + c->eip = tss->ip; + ctxt->eflags = tss->flag | 2; + c->regs[VCPU_REGS_RAX] = tss->ax; + c->regs[VCPU_REGS_RCX] = tss->cx; + c->regs[VCPU_REGS_RDX] = tss->dx; + c->regs[VCPU_REGS_RBX] = tss->bx; + c->regs[VCPU_REGS_RSP] = tss->sp; + c->regs[VCPU_REGS_RBP] = tss->bp; + c->regs[VCPU_REGS_RSI] = tss->si; + c->regs[VCPU_REGS_RDI] = tss->di; + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); + ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); + ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = load_segment_descriptor(ctxt, ops, tss->ldt, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + + return X86EMUL_CONTINUE; +} + +static int task_switch_16(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, u16 old_tss_sel, + ulong old_tss_base, struct desc_struct *new_desc) +{ + struct tss_segment_16 tss_seg; + int ret; + u32 err, new_tss_base = get_desc_base(new_desc); + + ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + save_state_to_tss16(ctxt, ops, &tss_seg); + + ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + + if (old_tss_sel != 0xffff) { + tss_seg.prev_task_link = old_tss_sel; + + ret = ops->write_std(new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link, + ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + } + + return load_state_from_tss16(ctxt, ops, &tss_seg); +} + +static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_32 *tss) +{ + struct decode_cache *c = &ctxt->decode; + + tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->eip = c->eip; + tss->eflags = ctxt->eflags; + tss->eax = c->regs[VCPU_REGS_RAX]; + tss->ecx = c->regs[VCPU_REGS_RCX]; + tss->edx = c->regs[VCPU_REGS_RDX]; + tss->ebx = c->regs[VCPU_REGS_RBX]; + tss->esp = c->regs[VCPU_REGS_RSP]; + tss->ebp = c->regs[VCPU_REGS_RBP]; + tss->esi = c->regs[VCPU_REGS_RSI]; + tss->edi = c->regs[VCPU_REGS_RDI]; + + tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); + tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); + tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); + tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); + tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); + tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); +} + +static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + struct tss_segment_32 *tss) +{ + struct decode_cache *c = &ctxt->decode; + int ret; + + ops->set_cr(3, tss->cr3, ctxt->vcpu); + c->eip = tss->eip; + ctxt->eflags = tss->eflags | 2; + c->regs[VCPU_REGS_RAX] = tss->eax; + c->regs[VCPU_REGS_RCX] = tss->ecx; + c->regs[VCPU_REGS_RDX] = tss->edx; + c->regs[VCPU_REGS_RBX] = tss->ebx; + c->regs[VCPU_REGS_RSP] = tss->esp; + c->regs[VCPU_REGS_RBP] = tss->ebp; + c->regs[VCPU_REGS_RSI] = tss->esi; + c->regs[VCPU_REGS_RDI] = tss->edi; + + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); + ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); + ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); + ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = load_segment_descriptor(ctxt, ops, tss->ldt_selector, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->fs, VCPU_SREG_FS); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = load_segment_descriptor(ctxt, ops, tss->gs, VCPU_SREG_GS); + if (ret != X86EMUL_CONTINUE) + return ret; + + return X86EMUL_CONTINUE; +} + +static int task_switch_32(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, u16 old_tss_sel, + ulong old_tss_base, struct desc_struct *new_desc) +{ + struct tss_segment_32 tss_seg; + int ret; + u32 err, new_tss_base = get_desc_base(new_desc); + + ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + save_state_to_tss32(ctxt, ops, &tss_seg); + + ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); + return ret; + } + + ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + + if (old_tss_sel != 0xffff) { + tss_seg.prev_task_link = old_tss_sel; + + ret = ops->write_std(new_tss_base, + &tss_seg.prev_task_link, + sizeof tss_seg.prev_task_link, + ctxt->vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) { + /* FIXME: need to provide precise fault address */ + kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); + return ret; + } + } + + return load_state_from_tss32(ctxt, ops, &tss_seg); +} + +static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct desc_struct curr_tss_desc, next_tss_desc; + int ret; + u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + ulong old_tss_base = + get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); + u32 desc_limit; + + /* FIXME: old_tss_base == ~0 ? */ + + ret = read_segment_descriptor(ctxt, ops, tss_selector, &next_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + ret = read_segment_descriptor(ctxt, ops, old_tss_sel, &curr_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + /* FIXME: check that next_tss_desc is tss */ + + if (reason != TASK_SWITCH_IRET) { + if ((tss_selector & 3) > next_tss_desc.dpl || + ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + } + + desc_limit = desc_limit_scaled(&next_tss_desc); + if (!next_tss_desc.p || + ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || + desc_limit < 0x2b)) { + kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, + tss_selector & 0xfffc); + return X86EMUL_PROPAGATE_FAULT; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + curr_tss_desc.type &= ~(1 << 1); /* clear busy flag */ + write_segment_descriptor(ctxt, ops, old_tss_sel, + &curr_tss_desc); + } + + if (reason == TASK_SWITCH_IRET) + ctxt->eflags = ctxt->eflags & ~X86_EFLAGS_NT; + + /* set back link to prev task only if NT bit is set in eflags + note that old_tss_sel is not used afetr this point */ + if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) + old_tss_sel = 0xffff; + + if (next_tss_desc.type & 8) + ret = task_switch_32(ctxt, ops, tss_selector, old_tss_sel, + old_tss_base, &next_tss_desc); + else + ret = task_switch_16(ctxt, ops, tss_selector, old_tss_sel, + old_tss_base, &next_tss_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) + ctxt->eflags = ctxt->eflags | X86_EFLAGS_NT; + + if (reason != TASK_SWITCH_IRET) { + next_tss_desc.type |= (1 << 1); /* set busy flag */ + write_segment_descriptor(ctxt, ops, tss_selector, + &next_tss_desc); + } + + ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); + ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); + ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + + if (has_error_code) { + struct decode_cache *c = &ctxt->decode; + + c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; + c->lock_prefix = 0; + c->src.val = (unsigned long) error_code; + emulate_push(ctxt); + } + + return ret; +} + +int emulator_task_switch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 tss_selector, int reason, + bool has_error_code, u32 error_code) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + memset(c, 0, sizeof(struct decode_cache)); + c->eip = ctxt->eip; + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + c->dst.type = OP_NONE; + + rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, + has_error_code, error_code); + + if (rc == X86EMUL_CONTINUE) { + memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(ctxt->vcpu, c->eip); + rc = writeback(ctxt, ops); + } + + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, + int reg, struct operand *op) +{ + struct decode_cache *c = &ctxt->decode; + int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; + + register_address_increment(c, &c->regs[reg], df * op->bytes); + op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { - unsigned long memop = 0; u64 msr_data; - unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; - unsigned int port; - int io_dir_in; - int rc = 0; + int rc = X86EMUL_CONTINUE; + int saved_dst_type = c->dst.type; ctxt->interruptibility = 0; @@ -1826,26 +2482,30 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) */ memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; + + if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } /* LOCK prefix is allowed only with some instructions */ - if (c->lock_prefix && !(c->d & Lock)) { + if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { kvm_queue_exception(ctxt->vcpu, UD_VECTOR); goto done; } /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; - if (c->rep_prefix && (c->d & String)) { + ctxt->restart = true; /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { + string_done: + ctxt->restart = false; kvm_rip_write(ctxt->vcpu, c->eip); goto done; } @@ -1857,25 +2517,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) * - if REPNE/REPNZ and ZF = 1 then done */ if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { + (c->b == 0xae) || (c->b == 0xaf)) { if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } + ((ctxt->eflags & EFLG_ZF) == 0)) + goto string_done; if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) + goto string_done; } - c->regs[VCPU_REGS_RCX]--; - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = ctxt->eip; } if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; rc = ops->read_emulated((unsigned long)c->src.ptr, &c->src.val, c->src.bytes, @@ -1885,29 +2538,25 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) c->src.orig_val = c->src.val; } + if (c->src2.type == OP_MEM) { + rc = ops->read_emulated((unsigned long)c->src2.ptr, + &c->src2.val, + c->src2.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if ((c->d & DstMask) == ImplicitOps) goto special_insn; - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov)) { - /* optimisation - avoid slow emulated read */ - rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - } + if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { + /* optimisation - avoid slow emulated read if Mov */ + rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, + c->dst.bytes, ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; } c->dst.orig_val = c->dst.val; @@ -1926,7 +2575,7 @@ special_insn: break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x08 ... 0x0d: @@ -1945,7 +2594,7 @@ special_insn: break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x18 ... 0x1d: @@ -1957,7 +2606,7 @@ special_insn: break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x20 ... 0x25: @@ -1988,7 +2637,7 @@ special_insn: case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x60: /* pusha */ @@ -1996,7 +2645,7 @@ special_insn: break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x63: /* movsxd */ @@ -2010,47 +2659,29 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ + c->dst.bytes = min(c->dst.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - (c->d & ByteOp) ? 1 : c->op_bytes)) { + c->dst.bytes)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } - if (kvm_emulate_pio_string(ctxt->vcpu, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, es_base(ctxt), - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; + if (!pio_in_emulated(ctxt, ops, c->dst.bytes, + c->regs[VCPU_REGS_RDX], &c->dst.val)) + goto done; /* IO is needed, skip writeback */ + break; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ + c->src.bytes = min(c->src.bytes, 4u); if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - (c->d & ByteOp) ? 1 : c->op_bytes)) { + c->src.bytes)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } - if (kvm_emulate_pio_string(ctxt->vcpu, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; + ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], + &c->src.val, 1, ctxt->vcpu); + + c->dst.type = OP_NONE; /* nothing to writeback */ + break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); @@ -2107,12 +2738,11 @@ special_insn: case 0x8c: { /* mov r/m, sreg */ struct kvm_segment segreg; - if (c->modrm_reg <= 5) + if (c->modrm_reg <= VCPU_SREG_GS) kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); else { - printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } c->dst.val = segreg.selector; break; @@ -2132,16 +2762,16 @@ special_insn: } if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); - rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); + rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x90: /* nop / xchg r8,rax */ @@ -2175,89 +2805,16 @@ special_insn: c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; break; case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; + goto mov; case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]); - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; + goto cmp; case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); break; case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; + goto mov; case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; @@ -2277,7 +2834,7 @@ special_insn: break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ @@ -2290,14 +2847,10 @@ special_insn: break; case 0xe4: /* inb */ case 0xe5: /* in */ - port = c->src.val; - io_dir_in = 1; - goto do_io; + goto do_io_in; case 0xe6: /* outb */ case 0xe7: /* out */ - port = c->src.val; - io_dir_in = 0; - goto do_io; + goto do_io_out; case 0xe8: /* call (near) */ { long int rel = c->src.val; c->src.val = (unsigned long) c->eip; @@ -2308,8 +2861,9 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, - VCPU_SREG_CS)) + jump_far: + if (load_segment_descriptor(ctxt, ops, c->src2.val, + VCPU_SREG_CS)) goto done; c->eip = c->src.val; @@ -2321,25 +2875,29 @@ special_insn: break; case 0xec: /* in al,dx */ case 0xed: /* in (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 1; - goto do_io; + c->src.val = c->regs[VCPU_REGS_RDX]; + do_io_in: + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, + &c->dst.val)) + goto done; /* IO is needed */ + break; case 0xee: /* out al,dx */ case 0xef: /* out (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 0; - do_io: - if (!emulator_io_permited(ctxt, ops, port, - (c->d & ByteOp) ? 1 : c->op_bytes)) { + c->src.val = c->regs[VCPU_REGS_RDX]; + do_io_out: + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { kvm_inject_gp(ctxt->vcpu, 0); goto done; } - if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, - (c->d & ByteOp) ? 1 : c->op_bytes, - port) != 0) { - c->eip = saved_eip; - goto cannot_emulate; - } + ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, + ctxt->vcpu); + c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; @@ -2350,16 +2908,15 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; + if (!emulate_grp3(ctxt, ops)) + goto cannot_emulate; break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - if (emulator_bad_iopl(ctxt)) + if (emulator_bad_iopl(ctxt, ops)) kvm_inject_gp(ctxt->vcpu, 0); else { ctxt->eflags &= ~X86_EFLAGS_IF; @@ -2367,10 +2924,10 @@ special_insn: } break; case 0xfb: /* sti */ - if (emulator_bad_iopl(ctxt)) + if (emulator_bad_iopl(ctxt, ops)) kvm_inject_gp(ctxt->vcpu, 0); else { - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ } @@ -2383,28 +2940,55 @@ special_insn: ctxt->eflags |= EFLG_DF; c->dst.type = OP_NONE; /* Disable writeback. */ break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ + case 0xfe: /* Grp4 */ + grp45: rc = emulate_grp45(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; + case 0xff: /* Grp5 */ + if (c->modrm_reg == 5) + goto jump_far; + goto grp45; } writeback: rc = writeback(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; + /* + * restore dst type in case the decoding will be reused + * (happens for string instruction ) + */ + c->dst.type = saved_dst_type; + + if ((c->d & SrcMask) == SrcSI) + string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, + &c->src); + + if ((c->d & DstMask) == DstDI) + string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); + + if (c->rep_prefix && (c->d & String)) { + struct read_cache *rc = &ctxt->decode.io_read; + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + /* + * Re-enter guest when pio read ahead buffer is empty or, + * if it is not used, after each 1024 iteration. + */ + if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || + (rc->end != 0 && rc->end == rc->pos)) + ctxt->restart = false; + } + /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(ctxt->vcpu, c->eip); + ops->set_rflags(ctxt->vcpu, ctxt->eflags); done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; twobyte_insn: switch (c->b) { @@ -2418,18 +3002,18 @@ twobyte_insn: goto cannot_emulate; rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = ctxt->eip; /* Disable writeback. */ c->dst.type = OP_NONE; break; case 2: /* lgdt */ rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; realmode_lgdt(ctxt->vcpu, size, address); /* Disable writeback. */ @@ -2440,7 +3024,7 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; break; default: @@ -2450,7 +3034,7 @@ twobyte_insn: rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; realmode_lidt(ctxt->vcpu, size, address); } @@ -2459,15 +3043,18 @@ twobyte_insn: break; case 4: /* smsw */ c->dst.bytes = 2; - c->dst.val = realmode_get_cr(ctxt->vcpu, 0); + c->dst.val = ops->get_cr(0, ctxt->vcpu); break; case 6: /* lmsw */ - realmode_lmsw(ctxt->vcpu, (u16)c->src.val, - &ctxt->eflags); + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | + (c->src.val & 0x0f), ctxt->vcpu); c->dst.type = OP_NONE; break; + case 5: /* not defined */ + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); + emulate_invlpg(ctxt->vcpu, c->modrm_ea); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -2493,54 +3080,54 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); + switch (c->modrm_reg) { + case 1: + case 5 ... 7: + case 9 ... 15: + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); c->dst.type = OP_NONE; /* no writeback */ break; case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; + if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && + (c->modrm_reg == 4 || c->modrm_reg == 5)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); + ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; + if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && + (c->modrm_reg == 4 || c->modrm_reg == 5)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { + if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); + goto done; } rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { + if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); + goto done; } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; @@ -2577,7 +3164,7 @@ twobyte_insn: break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xa3: @@ -2596,7 +3183,7 @@ twobyte_insn: break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xab: @@ -2668,16 +3255,14 @@ twobyte_insn: (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) + rc = emulate_grp9(ctxt, ops); + if (rc != X86EMUL_CONTINUE) goto done; - c->dst.type = OP_NONE; break; } goto writeback; cannot_emulate: DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; return -1; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index a790fa128a9..93825ff3338 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -33,6 +33,29 @@ #include <linux/kvm_host.h> #include "trace.h" +static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) +{ + raw_spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) +{ + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu; + + s->wakeup_needed = false; + + raw_spin_unlock(&s->lock); + + if (wakeup) { + vcpu = s->kvm->bsp_vcpu; + if (vcpu) + kvm_vcpu_kick(vcpu); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -45,19 +68,19 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) * Other interrupt may be delivered to PIC while lock is dropped but * it should be safe since PIC state is already updated at this stage. */ - raw_spin_unlock(&s->pics_state->lock); + pic_unlock(s->pics_state); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); - raw_spin_lock(&s->pics_state->lock); + pic_lock(s->pics_state); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - raw_spin_lock(&s->lock); + pic_lock(s); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - raw_spin_unlock(&s->lock); + pic_unlock(s); } /* @@ -158,9 +181,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - raw_spin_lock(&s->lock); + pic_lock(s); pic_update_irq(s); - raw_spin_unlock(&s->lock); + pic_unlock(s); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -168,14 +191,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - raw_spin_lock(&s->lock); + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - raw_spin_unlock(&s->lock); + pic_unlock(s); return ret; } @@ -205,7 +228,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - raw_spin_lock(&s->lock); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -230,7 +253,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - raw_spin_unlock(&s->lock); + pic_unlock(s); return intno; } @@ -444,7 +467,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return 0; } - raw_spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -457,7 +480,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - raw_spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -474,7 +497,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return 0; } - raw_spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -488,7 +511,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - raw_spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -505,7 +528,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 34b15915754..cd1f362f413 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,6 +63,7 @@ struct kvm_kpic_state { struct kvm_pic { raw_spinlock_t lock; + bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 55c7524dda5..64bc6ea78d9 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -10,9 +10,7 @@ struct kvm_timer { }; struct kvm_timer_ops { - bool (*is_periodic)(struct kvm_timer *); + bool (*is_periodic)(struct kvm_timer *); }; - enum hrtimer_restart kvm_timer_fn(struct hrtimer *data); - diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 19a8906bcaa..a6f695d7692 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -148,7 +148,6 @@ module_param(oos_shadow, bool, 0644); #include <trace/events/kvm.h> -#undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS #include "mmutrace.h" @@ -174,12 +173,7 @@ struct kvm_shadow_walk_iterator { shadow_walk_okay(&(_walker)); \ shadow_walk_next(&(_walker))) - -struct kvm_unsync_walk { - int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); -}; - -typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); +typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; @@ -223,7 +217,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static int is_write_protection(struct kvm_vcpu *vcpu) +static bool is_write_protection(struct kvm_vcpu *vcpu) { return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } @@ -327,7 +321,6 @@ static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; - set_page_private(page, 0); cache->objects[cache->nobjs++] = page_address(page); } return 0; @@ -438,9 +431,9 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) int i; gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - slot = gfn_to_memslot_unaliased(kvm, gfn); write_count = slot_largepage_idx(gfn, slot, i); *write_count -= 1; WARN_ON(*write_count < 0); @@ -654,7 +647,6 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) { struct kvm_rmap_desc *desc; - struct kvm_rmap_desc *prev_desc; u64 *prev_spte; int i; @@ -666,7 +658,6 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - prev_desc = NULL; prev_spte = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->sptes[i]; ++i) { @@ -794,7 +785,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int retval = 0; struct kvm_memslots *slots; - slots = rcu_dereference(kvm->memslots); + slots = kvm_memslots(kvm); for (i = 0; i < slots->nmemslots; i++) { struct kvm_memory_slot *memslot = &slots->memslots[i]; @@ -925,7 +916,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - INIT_LIST_HEAD(&sp->oos_link); bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; @@ -1009,8 +999,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, } -static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - mmu_parent_walk_fn fn) +static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) { struct kvm_pte_chain *pte_chain; struct hlist_node *node; @@ -1019,8 +1008,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!sp->multimapped && sp->parent_pte) { parent_sp = page_header(__pa(sp->parent_pte)); - fn(vcpu, parent_sp); - mmu_parent_walk(vcpu, parent_sp, fn); + fn(parent_sp); + mmu_parent_walk(parent_sp, fn); return; } hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) @@ -1028,8 +1017,8 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!pte_chain->parent_ptes[i]) break; parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); - fn(vcpu, parent_sp); - mmu_parent_walk(vcpu, parent_sp, fn); + fn(parent_sp); + mmu_parent_walk(parent_sp, fn); } } @@ -1066,16 +1055,15 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) } } -static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +static int unsync_walk_fn(struct kvm_mmu_page *sp) { kvm_mmu_update_parents_unsync(sp); return 1; } -static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) +static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) { - mmu_parent_walk(vcpu, sp, unsync_walk_fn); + mmu_parent_walk(sp, unsync_walk_fn); kvm_mmu_update_parents_unsync(sp); } @@ -1201,6 +1189,7 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); + trace_kvm_mmu_sync_page(sp); sp->unsync = 0; --kvm->stat.mmu_unsync; } @@ -1209,12 +1198,11 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - if (sp->role.glevels != vcpu->arch.mmu.root_level) { + if (sp->role.cr4_pae != !!is_pae(vcpu)) { kvm_mmu_zap_page(vcpu->kvm, sp); return 1; } - trace_kvm_mmu_sync_page(sp); if (rmap_write_protect(vcpu->kvm, sp->gfn)) kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); @@ -1331,6 +1319,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role = vcpu->arch.mmu.base_role; role.level = level; role.direct = direct; + if (role.direct) + role.cr4_pae = 0; role.access = access; if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); @@ -1351,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, mmu_page_add_parent_pte(vcpu, sp, parent_pte); if (sp->unsync_children) { set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(sp); } trace_kvm_mmu_get_page(sp, false); return sp; @@ -1573,13 +1563,14 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) r = 0; index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; +restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) if (sp->gfn == gfn && !sp->role.direct) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); r = 1; if (kvm_mmu_zap_page(kvm, sp)) - n = bucket->first; + goto restart; } return r; } @@ -1593,13 +1584,14 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; +restart: hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); if (kvm_mmu_zap_page(kvm, sp)) - nn = bucket->first; + goto restart; } } } @@ -1626,20 +1618,6 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp) } } -struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) -{ - struct page *page; - - gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); - - if (gpa == UNMAPPED_GVA) - return NULL; - - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - - return page; -} - /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -1752,7 +1730,6 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) struct kvm_mmu_page *s; struct hlist_node *node, *n; - trace_kvm_mmu_unsync_page(sp); index = kvm_page_table_hashfn(sp->gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; /* don't unsync if pagetable is shadowed with multiple roles */ @@ -1762,10 +1739,11 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } + trace_kvm_mmu_unsync_page(sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; - kvm_mmu_mark_parents_unsync(vcpu, sp); + kvm_mmu_mark_parents_unsync(sp); mmu_convert_notrap(sp); return 0; @@ -1837,6 +1815,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_WRITABLE_MASK; + if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) + spte &= ~PT_USER_MASK; + /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection @@ -1892,6 +1873,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); @@ -2081,21 +2064,23 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (tdp_enabled) - direct = 1; if (mmu_check_root(vcpu, root_gfn)) return 1; + if (tdp_enabled) { + direct = 1; + root_gfn = 0; + } + spin_lock(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } direct = !is_paging(vcpu); - if (tdp_enabled) - direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2111,11 +2096,18 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) return 1; + if (tdp_enabled) { + direct = 1; + root_gfn = i << 30; + } + spin_lock(&vcpu->kvm->mmu_lock); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); @@ -2299,13 +2291,19 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; context->rsvd_bits_mask[0][0] = 0; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + + if (!is_pse(vcpu)) { + context->rsvd_bits_mask[1][1] = 0; + break; + } + if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; break; case PT32E_ROOT_LEVEL: context->rsvd_bits_mask[0][2] = @@ -2318,7 +2316,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: context->rsvd_bits_mask[0][3] = exb_bit_rsvd | @@ -2336,7 +2334,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) context->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[1][0]; + context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; break; } } @@ -2438,7 +2436,8 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) else r = paging32_init_context(vcpu); - vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); return r; } @@ -2478,7 +2477,9 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) goto out; spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); r = mmu_alloc_roots(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); if (r) @@ -2527,7 +2528,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - if (sp->role.glevels == PT32_ROOT_LEVEL) + if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); else paging64_update_pte(vcpu, sp, spte, new); @@ -2562,36 +2563,11 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) } static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + u64 gpte) { gfn_t gfn; - int r; - u64 gpte = 0; pfn_t pfn; - if (bytes != 4 && bytes != 8) - return; - - /* - * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). - */ - if (is_pae(vcpu)) { - /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ - if ((bytes == 4) && (gpa % 4 == 0)) { - r = kvm_read_guest(vcpu->kvm, gpa & ~(u64)7, &gpte, 8); - if (r) - return; - memcpy((void *)&gpte + (gpa % 8), new, 4); - } else if ((bytes == 8) && (gpa % 8 == 0)) { - memcpy((void *)&gpte, new, 8); - } - } else { - if ((bytes == 4) && (gpa % 4 == 0)) - memcpy((void *)&gpte, new, 4); - } if (!is_present_gpte(gpte)) return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -2640,10 +2616,46 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int flooded = 0; int npte; int r; + int invlpg_counter; pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); - mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); + + invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter); + + /* + * Assume that the pte write on a page table of the same type + * as the current vcpu paging mode. This is nearly always true + * (might be false while changing modes). Note it is verified later + * by update_pte(). + */ + if ((is_pae(vcpu) && bytes == 4) || !new) { + /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ + if (is_pae(vcpu)) { + gpa &= ~(gpa_t)7; + bytes = 8; + } + r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8)); + if (r) + gentry = 0; + new = (const u8 *)&gentry; + } + + switch (bytes) { + case 4: + gentry = *(const u32 *)new; + break; + case 8: + gentry = *(const u64 *)new; + break; + default: + gentry = 0; + break; + } + + mmu_guess_page_from_pte_write(vcpu, gpa, gentry); spin_lock(&vcpu->kvm->mmu_lock); + if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) + gentry = 0; kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; @@ -2662,10 +2674,12 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + +restart: hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) continue; - pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; + pte_size = sp->role.cr4_pae ? 8 : 4; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); misaligned |= bytes < 4; if (misaligned || flooded) { @@ -2682,14 +2696,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); if (kvm_mmu_zap_page(vcpu->kvm, sp)) - n = bucket->first; + goto restart; ++vcpu->kvm->stat.mmu_flooded; continue; } page_offset = offset; level = sp->role.level; npte = 1; - if (sp->role.glevels == PT32_ROOT_LEVEL) { + if (!sp->role.cr4_pae) { page_offset <<= 1; /* 32->64 */ /* * A 32-bit pde maps 4MB while the shadow pdes map @@ -2707,20 +2721,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, continue; } spte = &sp->spt[page_offset / sizeof(*spte)]; - if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { - gentry = 0; - r = kvm_read_guest_atomic(vcpu->kvm, - gpa & ~(u64)(pte_size - 1), - &gentry, pte_size); - new = (const void *)&gentry; - if (r < 0) - new = NULL; - } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - if (new) - mmu_pte_write_new_pte(vcpu, sp, spte, new); + if (gentry) + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } @@ -2900,22 +2905,23 @@ void kvm_mmu_zap_all(struct kvm *kvm) struct kvm_mmu_page *sp, *node; spin_lock(&kvm->mmu_lock); +restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) if (kvm_mmu_zap_page(kvm, sp)) - node = container_of(kvm->arch.active_mmu_pages.next, - struct kvm_mmu_page, link); + goto restart; + spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) +static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - kvm_mmu_zap_page(kvm, page); + return kvm_mmu_zap_page(kvm, page) + 1; } static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) @@ -2927,7 +2933,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages, idx; + int npages, idx, freed_pages; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -2935,8 +2941,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) kvm->arch.n_free_mmu_pages; cache_count += npages; if (!kvm_freed && nr_to_scan > 0 && npages > 0) { - kvm_mmu_remove_one_alloc_mmu_page(kvm); - cache_count--; + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); + cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; @@ -3011,7 +3017,8 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; - slots = rcu_dereference(kvm->memslots); + slots = kvm_memslots(kvm); + for (i = 0; i < slots->nmemslots; i++) nr_pages += slots->memslots[i].npages; @@ -3174,8 +3181,7 @@ static gva_t canonicalize(gva_t gva) } -typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp, - u64 *sptep); +typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, inspect_spte_fn fn) @@ -3191,7 +3197,7 @@ static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, child = page_header(ent & PT64_BASE_ADDR_MASK); __mmu_spte_walk(kvm, child, fn); } else - fn(kvm, sp, &sp->spt[i]); + fn(kvm, &sp->spt[i]); } } } @@ -3282,11 +3288,13 @@ static void audit_mappings(struct kvm_vcpu *vcpu) static int count_rmaps(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; + struct kvm_memslots *slots; int nmaps = 0; int i, j, k, idx; idx = srcu_read_lock(&kvm->srcu); - slots = rcu_dereference(kvm->memslots); + slots = kvm_memslots(kvm); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; @@ -3315,7 +3323,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu) return nmaps; } -void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) +void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { unsigned long *rmapp; struct kvm_mmu_page *rev_sp; @@ -3331,14 +3339,14 @@ void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 *sptep) printk(KERN_ERR "%s: no memslot for gfn %ld\n", audit_msg, gfn); printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", - audit_msg, sptep - rev_sp->spt, + audit_msg, (long int)(sptep - rev_sp->spt), rev_sp->gfn); dump_stack(); return; } rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], - is_large_pte(*sptep)); + rev_sp->role.level); if (!*rmapp) { if (!printk_ratelimit()) return; @@ -3373,7 +3381,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) continue; if (!(ent & PT_WRITABLE_MASK)) continue; - inspect_spte_has_rmap(vcpu->kvm, sp, &pt[i]); + inspect_spte_has_rmap(vcpu->kvm, &pt[i]); } } return; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3e4a5c6ca2a..42f07b1bfbc 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -6,14 +6,12 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE mmutrace #define KVM_MMU_PAGE_FIELDS \ __field(__u64, gfn) \ __field(__u32, role) \ __field(__u32, root_count) \ - __field(__u32, unsync) + __field(bool, unsync) #define KVM_MMU_PAGE_ASSIGN(sp) \ __entry->gfn = sp->gfn; \ @@ -30,14 +28,14 @@ \ role.word = __entry->role; \ \ - trace_seq_printf(p, "sp gfn %llx %u/%u q%u%s %s%s %spge" \ + trace_seq_printf(p, "sp gfn %llx %u%s q%u%s %s%s" \ " %snxe root %u %s%c", \ - __entry->gfn, role.level, role.glevels, \ + __entry->gfn, role.level, \ + role.cr4_pae ? " pae" : "", \ role.quadrant, \ role.direct ? " direct" : "", \ access_str[role.access], \ role.invalid ? " invalid" : "", \ - role.cr4_pge ? "" : "!", \ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ @@ -94,15 +92,15 @@ TRACE_EVENT( TP_printk("pte %llx level %u", __entry->pte, __entry->level) ); -/* We set a pte accessed bit */ -TRACE_EVENT( - kvm_mmu_set_accessed_bit, +DECLARE_EVENT_CLASS(kvm_mmu_set_bit_class, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + TP_ARGS(table_gfn, index, size), TP_STRUCT__entry( __field(__u64, gpa) - ), + ), TP_fast_assign( __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) @@ -112,22 +110,20 @@ TRACE_EVENT( TP_printk("gpa %llx", __entry->gpa) ); -/* We set a pte dirty bit */ -TRACE_EVENT( - kvm_mmu_set_dirty_bit, +/* We set a pte accessed bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_accessed_bit, + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), - TP_ARGS(table_gfn, index, size), - TP_STRUCT__entry( - __field(__u64, gpa) - ), + TP_ARGS(table_gfn, index, size) +); - TP_fast_assign( - __entry->gpa = ((u64)table_gfn << PAGE_SHIFT) - + index * size; - ), +/* We set a pte dirty bit */ +DEFINE_EVENT(kvm_mmu_set_bit_class, kvm_mmu_set_dirty_bit, - TP_printk("gpa %llx", __entry->gpa) + TP_PROTO(unsigned long table_gfn, unsigned index, unsigned size), + + TP_ARGS(table_gfn, index, size) ); TRACE_EVENT( @@ -166,55 +162,45 @@ TRACE_EVENT( __entry->created ? "new" : "existing") ); -TRACE_EVENT( - kvm_mmu_sync_page, +DECLARE_EVENT_CLASS(kvm_mmu_page_class, + TP_PROTO(struct kvm_mmu_page *sp), TP_ARGS(sp), TP_STRUCT__entry( KVM_MMU_PAGE_FIELDS - ), + ), TP_fast_assign( KVM_MMU_PAGE_ASSIGN(sp) - ), + ), TP_printk("%s", KVM_MMU_PAGE_PRINTK()) ); -TRACE_EVENT( - kvm_mmu_unsync_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_sync_page, TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), - - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) + TP_ARGS(sp) ); -TRACE_EVENT( - kvm_mmu_zap_page, +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page, TP_PROTO(struct kvm_mmu_page *sp), - TP_ARGS(sp), - TP_STRUCT__entry( - KVM_MMU_PAGE_FIELDS - ), + TP_ARGS(sp) +); - TP_fast_assign( - KVM_MMU_PAGE_ASSIGN(sp) - ), +DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page, + TP_PROTO(struct kvm_mmu_page *sp), - TP_printk("%s", KVM_MMU_PAGE_PRINTK()) + TP_ARGS(sp) ); - #endif /* _TRACE_KVMMMU_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE mmutrace + /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 81eab9a50e6..89d66ca4d87 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -170,7 +170,7 @@ walk: goto access_error; #if PTTYPE == 64 - if (fetch_fault && is_nx(vcpu) && (pte & PT64_NX_MASK)) + if (fetch_fault && (pte & PT64_NX_MASK)) goto access_error; #endif @@ -190,10 +190,10 @@ walk: if ((walker->level == PT_PAGE_TABLE_LEVEL) || ((walker->level == PT_DIRECTORY_LEVEL) && - (pte & PT_PAGE_SIZE_MASK) && + is_large_pte(pte) && (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && - (pte & PT_PAGE_SIZE_MASK) && + is_large_pte(pte) && is_long_mode(vcpu))) { int lvl = walker->level; @@ -258,11 +258,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; + u64 new_spte; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!is_present_gpte(gpte)) - __set_spte(spte, shadow_notrap_nonpresent_pte); + if (!is_present_gpte(gpte)) { + if (page->unsync) + new_spte = shadow_trap_nonpresent_pte; + else + new_spte = shadow_notrap_nonpresent_pte; + __set_spte(spte, new_spte); + } return; } pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); @@ -457,6 +463,7 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; + gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -467,9 +474,16 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) level = iterator.level; sptep = iterator.sptep; - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || - ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { + if (is_last_spte(*sptep, level)) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + int offset, shift; + + shift = PAGE_SHIFT - + (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; + offset = sp->role.quadrant << shift; + + pte_gpa = (sp->gfn << PAGE_SHIFT) + offset; + pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -487,7 +501,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); + + atomic_inc(&vcpu->kvm->arch.invlpg_counter); + spin_unlock(&vcpu->kvm->mmu_lock); + + if (pte_gpa == -1) + return; + + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0); } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, @@ -551,12 +575,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { int i, offset, nr_present; bool reset_host_protection; + gpa_t first_pte_gpa; offset = nr_present = 0; if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; + first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { unsigned pte_access; pt_element_t gpte; @@ -566,8 +593,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (!is_shadow_present_pte(sp->spt[i])) continue; - pte_gpa = gfn_to_gpa(sp->gfn); - pte_gpa += (i+offset) * sizeof(pt_element_t); + pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, sizeof(pt_element_t))) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 737361fcd50..ce438e0fdd2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -28,6 +28,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> +#include <asm/tlbflush.h> #include <asm/desc.h> #include <asm/virtext.h> @@ -44,10 +45,11 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) -#define SVM_FEATURE_PAUSE_FILTER (1 << 10) +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ @@ -55,6 +57,8 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +static bool erratum_383_found __read_mostly; + static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -70,6 +74,7 @@ struct kvm_vcpu; struct nested_state { struct vmcb *hsave; u64 hsave_msr; + u64 vm_cr_msr; u64 vmcb; /* These are the merged vectors */ @@ -77,6 +82,7 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + u64 vmcb_iopm; /* A VMEXIT is required but not yet emulated */ bool exit_required; @@ -91,6 +97,9 @@ struct nested_state { }; +#define MSRPM_OFFSETS 16 +static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -110,13 +119,39 @@ struct vcpu_svm { struct nested_state nested; bool nmi_singlestep; + + unsigned int3_injected; + unsigned long int3_rip; +}; + +#define MSR_INVALID 0xffffffffU + +static struct svm_direct_access_msrs { + u32 index; /* Index of the MSR */ + bool always; /* True if intercept is always on */ +} direct_access_msrs[] = { + { .index = MSR_K6_STAR, .always = true }, + { .index = MSR_IA32_SYSENTER_CS, .always = true }, +#ifdef CONFIG_X86_64 + { .index = MSR_GS_BASE, .always = true }, + { .index = MSR_FS_BASE, .always = true }, + { .index = MSR_KERNEL_GS_BASE, .always = true }, + { .index = MSR_LSTAR, .always = true }, + { .index = MSR_CSTAR, .always = true }, + { .index = MSR_SYSCALL_MASK, .always = true }, +#endif + { .index = MSR_IA32_LASTBRANCHFROMIP, .always = false }, + { .index = MSR_IA32_LASTBRANCHTOIP, .always = false }, + { .index = MSR_IA32_LASTINTFROMIP, .always = false }, + { .index = MSR_IA32_LASTINTTOIP, .always = false }, + { .index = MSR_INVALID, .always = false }, }; /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; #else -static bool npt_enabled = false; +static bool npt_enabled; #endif static int npt = 1; @@ -129,6 +164,7 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm); +static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -163,8 +199,8 @@ static unsigned long iopm_base; struct kvm_ldttss_desc { u16 limit0; u16 base0; - unsigned base1 : 8, type : 5, dpl : 2, p : 1; - unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8; + unsigned base1:8, type:5, dpl:2, p:1; + unsigned limit1:4, zero0:3, g:1, base2:8; u32 base3; u32 zero1; } __attribute__((packed)); @@ -194,6 +230,27 @@ static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; #define MSRS_RANGE_SIZE 2048 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) +static u32 svm_msrpm_offset(u32 msr) +{ + u32 offset; + int i; + + for (i = 0; i < NUM_MSR_MAPS; i++) { + if (msr < msrpm_ranges[i] || + msr >= msrpm_ranges[i] + MSRS_IN_RANGE) + continue; + + offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */ + offset += (i * MSRS_RANGE_SIZE); /* add range offset */ + + /* Now we have the u8 offset - but need the u32 offset */ + return offset / 4; + } + + /* MSR not in any range */ + return MSR_INVALID; +} + #define MAX_INST_SIZE 15 static inline u32 svm_has(u32 feat) @@ -213,7 +270,7 @@ static inline void stgi(void) static inline void invlpga(unsigned long addr, u32 asid) { - asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); + asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); } static inline void force_new_asid(struct kvm_vcpu *vcpu) @@ -235,23 +292,6 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.efer = efer; } -static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - /* If we are within a nested VM we'd better #VMEXIT and let the - guest handle the exception */ - if (nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - - svm->vmcb->control.event_inj = nr - | SVM_EVTINJ_VALID - | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) - | SVM_EVTINJ_TYPE_EXEPT; - svm->vmcb->control.event_inj_err = error_code; -} - static int is_external_interrupt(u32 info) { info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID; @@ -264,7 +304,7 @@ static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS; + ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -283,6 +323,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->vmcb->control.next_rip != 0) + svm->next_rip = svm->vmcb->control.next_rip; + if (!svm->next_rip) { if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != EMULATE_DONE) @@ -297,6 +340,68 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) svm_set_interrupt_shadow(vcpu, 0); } +static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, + bool has_error_code, u32 error_code, + bool reinject) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + /* + * If we are within a nested VM we'd better #VMEXIT and let the guest + * handle the exception + */ + if (!reinject && + nested_svm_check_exception(svm, nr, has_error_code, error_code)) + return; + + if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { + unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); + + /* + * For guest debugging where we have to reinject #BP if some + * INT3 is guest-owned: + * Emulate nRIP by moving RIP forward. Will fail if injection + * raises a fault that is not intercepted. Still better than + * failing in all cases. + */ + skip_emulated_instruction(&svm->vcpu); + rip = kvm_rip_read(&svm->vcpu); + svm->int3_rip = rip + svm->vmcb->save.cs.base; + svm->int3_injected = rip - old_rip; + } + + svm->vmcb->control.event_inj = nr + | SVM_EVTINJ_VALID + | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0) + | SVM_EVTINJ_TYPE_EXEPT; + svm->vmcb->control.event_inj_err = error_code; +} + +static void svm_init_erratum_383(void) +{ + u32 low, high; + int err; + u64 val; + + /* Only Fam10h is affected */ + if (boot_cpu_data.x86 != 0x10) + return; + + /* Use _safe variants to not break nested virtualization */ + val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); + if (err) + return; + + val |= (1ULL << 47); + + low = lower_32_bits(val); + high = upper_32_bits(val); + + native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + + erratum_383_found = true; +} + static int has_svm(void) { const char *msg; @@ -319,7 +424,7 @@ static int svm_hardware_enable(void *garbage) struct svm_cpu_data *sd; uint64_t efer; - struct descriptor_table gdt_descr; + struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -344,14 +449,16 @@ static int svm_hardware_enable(void *garbage) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - kvm_get_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.base; + native_store_gdt(&gdt_descr); + gdt = (struct desc_struct *)gdt_descr.address; sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + svm_init_erratum_383(); + return 0; } @@ -391,42 +498,98 @@ err_1: } +static bool valid_msr_intercept(u32 index) +{ + int i; + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) + if (direct_access_msrs[i].index == index) + return true; + + return false; +} + static void set_msr_interception(u32 *msrpm, unsigned msr, int read, int write) { + u8 bit_read, bit_write; + unsigned long tmp; + u32 offset; + + /* + * If this warning triggers extend the direct_access_msrs list at the + * beginning of the file + */ + WARN_ON(!valid_msr_intercept(msr)); + + offset = svm_msrpm_offset(msr); + bit_read = 2 * (msr & 0x0f); + bit_write = 2 * (msr & 0x0f) + 1; + tmp = msrpm[offset]; + + BUG_ON(offset == MSR_INVALID); + + read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp); + write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp); + + msrpm[offset] = tmp; +} + +static void svm_vcpu_init_msrpm(u32 *msrpm) +{ int i; - for (i = 0; i < NUM_MSR_MAPS; i++) { - if (msr >= msrpm_ranges[i] && - msr < msrpm_ranges[i] + MSRS_IN_RANGE) { - u32 msr_offset = (i * MSRS_IN_RANGE + msr - - msrpm_ranges[i]) * 2; - - u32 *base = msrpm + (msr_offset / 32); - u32 msr_shift = msr_offset % 32; - u32 mask = ((write) ? 0 : 2) | ((read) ? 0 : 1); - *base = (*base & ~(0x3 << msr_shift)) | - (mask << msr_shift); + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + if (!direct_access_msrs[i].always) + continue; + + set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1); + } +} + +static void add_msr_offset(u32 offset) +{ + int i; + + for (i = 0; i < MSRPM_OFFSETS; ++i) { + + /* Offset already in list? */ + if (msrpm_offsets[i] == offset) return; - } + + /* Slot used by another offset? */ + if (msrpm_offsets[i] != MSR_INVALID) + continue; + + /* Add offset to list */ + msrpm_offsets[i] = offset; + + return; } + + /* + * If this BUG triggers the msrpm_offsets table has an overflow. Just + * increase MSRPM_OFFSETS in this case. + */ BUG(); } -static void svm_vcpu_init_msrpm(u32 *msrpm) +static void init_msrpm_offsets(void) { - memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + int i; -#ifdef CONFIG_X86_64 - set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); - set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); - set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); - set_msr_interception(msrpm, MSR_LSTAR, 1, 1); - set_msr_interception(msrpm, MSR_CSTAR, 1, 1); - set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); -#endif - set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); - set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); + memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets)); + + for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) { + u32 offset; + + offset = svm_msrpm_offset(direct_access_msrs[i].index); + BUG_ON(offset == MSR_INVALID); + + add_msr_offset(offset); + } } static void svm_enable_lbrv(struct vcpu_svm *svm) @@ -467,6 +630,8 @@ static __init int svm_hardware_setup(void) memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER)); iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + init_msrpm_offsets(); + if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -523,7 +688,7 @@ static void init_seg(struct vmcb_seg *seg) { seg->selector = 0; seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK | - SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ + SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */ seg->limit = 0xffff; seg->base = 0; } @@ -543,16 +708,16 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vcpu.fpu_active = 1; - control->intercept_cr_read = INTERCEPT_CR0_MASK | + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; - control->intercept_cr_write = INTERCEPT_CR0_MASK | + control->intercept_cr_write = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK | INTERCEPT_CR8_MASK; - control->intercept_dr_read = INTERCEPT_DR0_MASK | + control->intercept_dr_read = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | @@ -561,7 +726,7 @@ static void init_vmcb(struct vcpu_svm *svm) INTERCEPT_DR6_MASK | INTERCEPT_DR7_MASK; - control->intercept_dr_write = INTERCEPT_DR0_MASK | + control->intercept_dr_write = INTERCEPT_DR0_MASK | INTERCEPT_DR1_MASK | INTERCEPT_DR2_MASK | INTERCEPT_DR3_MASK | @@ -575,7 +740,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1 << MC_VECTOR); - control->intercept = (1ULL << INTERCEPT_INTR) | + control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | (1ULL << INTERCEPT_SELECTIVE_CR0) | @@ -636,7 +801,8 @@ static void init_vmcb(struct vcpu_svm *svm) save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* This is the guest-visible cr0 value. + /* + * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; @@ -729,6 +895,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm_vcpu_init_msrpm(svm->msrpm); svm->nested.msrpm = page_address(nested_msrpm_pages); + svm_vcpu_init_msrpm(svm->nested.msrpm); svm->vmcb = page_address(page); clear_page(svm->vmcb); @@ -882,7 +1049,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; - /* AMD's VMCB does not have an explicit unusable field, so emulate it + /* + * AMD's VMCB does not have an explicit unusable field, so emulate it * for cross vendor migration purposes by "not present" */ var->unusable = !var->present || (var->type == 0); @@ -918,7 +1086,8 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->type |= 0x1; break; case VCPU_SREG_SS: - /* On AMD CPUs sometimes the DB bit in the segment + /* + * On AMD CPUs sometimes the DB bit in the segment * descriptor is left as 1, although the whole segment has * been made unusable. Clear it here to pass an Intel VMX * entry check when cross vendor migrating. @@ -936,36 +1105,36 @@ static int svm_get_cpl(struct kvm_vcpu *vcpu) return save->cpl; } -static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->limit = svm->vmcb->save.idtr.limit; - dt->base = svm->vmcb->save.idtr.base; + dt->size = svm->vmcb->save.idtr.limit; + dt->address = svm->vmcb->save.idtr.base; } -static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.idtr.limit = dt->limit; - svm->vmcb->save.idtr.base = dt->base ; + svm->vmcb->save.idtr.limit = dt->size; + svm->vmcb->save.idtr.base = dt->address ; } -static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - dt->limit = svm->vmcb->save.gdtr.limit; - dt->base = svm->vmcb->save.gdtr.base; + dt->size = svm->vmcb->save.gdtr.limit; + dt->address = svm->vmcb->save.gdtr.base; } -static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.gdtr.limit = dt->limit; - svm->vmcb->save.gdtr.base = dt->base ; + svm->vmcb->save.gdtr.limit = dt->size; + svm->vmcb->save.gdtr.base = dt->address ; } static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -978,6 +1147,7 @@ static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void update_cr0_intercept(struct vcpu_svm *svm) { + struct vmcb *vmcb = svm->vmcb; ulong gcr0 = svm->vcpu.arch.cr0; u64 *hcr0 = &svm->vmcb->save.cr0; @@ -989,11 +1159,25 @@ static void update_cr0_intercept(struct vcpu_svm *svm) if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { - svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; - svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + if (is_nested(svm)) { + struct vmcb *hsave = svm->nested.hsave; + + hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; + hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; + vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read; + vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write; + } } else { svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + if (is_nested(svm)) { + struct vmcb *hsave = svm->nested.hsave; + + hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK; + hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK; + } } } @@ -1001,6 +1185,27 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm)) { + /* + * We are here because we run in nested mode, the host kvm + * intercepts cr0 writes but the l1 hypervisor does not. + * But the L1 hypervisor may intercept selective cr0 writes. + * This needs to be checked here. + */ + unsigned long old, new; + + /* Remove bits that would trigger a real cr0 write intercept */ + old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; + new = cr0 & SVM_CR0_SELECTIVE_MASK; + + if (old == new) { + /* cr0 write with ts and mp unchanged */ + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + return; + } + } + #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -1134,70 +1339,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->vmcb->control.asid = sd->next_asid++; } -static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest) +static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); - switch (dr) { - case 0 ... 3: - *dest = vcpu->arch.db[dr]; - break; - case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return EMULATE_FAIL; /* will re-inject UD */ - /* fall through */ - case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *dest = vcpu->arch.dr6; - else - *dest = svm->vmcb->save.dr6; - break; - case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return EMULATE_FAIL; /* will re-inject UD */ - /* fall through */ - case 7: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *dest = vcpu->arch.dr7; - else - *dest = svm->vmcb->save.dr7; - break; - } - - return EMULATE_DONE; -} - -static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - switch (dr) { - case 0 ... 3: - vcpu->arch.db[dr] = value; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = value; - break; - case 4: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return EMULATE_FAIL; /* will re-inject UD */ - /* fall through */ - case 6: - vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; - break; - case 5: - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) - return EMULATE_FAIL; /* will re-inject UD */ - /* fall through */ - case 7: - vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - svm->vmcb->save.dr7 = vcpu->arch.dr7; - vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); - } - break; - } - - return EMULATE_DONE; + svm->vmcb->save.dr7 = value; } static int pf_interception(struct vcpu_svm *svm) @@ -1234,7 +1380,7 @@ static int db_interception(struct vcpu_svm *svm) } if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){ + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; @@ -1268,7 +1414,22 @@ static int ud_interception(struct vcpu_svm *svm) static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); + u32 excp; + + if (is_nested(svm)) { + u32 h_excp, n_excp; + + h_excp = svm->nested.hsave->control.intercept_exceptions; + n_excp = svm->nested.intercept_exceptions; + h_excp &= ~(1 << NM_VECTOR); + excp = h_excp | n_excp; + } else { + excp = svm->vmcb->control.intercept_exceptions; + excp &= ~(1 << NM_VECTOR); + } + + svm->vmcb->control.intercept_exceptions = excp; + svm->vcpu.fpu_active = 1; update_cr0_intercept(svm); } @@ -1279,8 +1440,59 @@ static int nm_interception(struct vcpu_svm *svm) return 1; } -static int mc_interception(struct vcpu_svm *svm) +static bool is_erratum_383(void) { + int err, i; + u64 value; + + if (!erratum_383_found) + return false; + + value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); + if (err) + return false; + + /* Bit 62 may or may not be set for this mce */ + value &= ~(1ULL << 62); + + if (value != 0xb600000000010015ULL) + return false; + + /* Clear MCi_STATUS registers */ + for (i = 0; i < 6; ++i) + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); + + value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); + if (!err) { + u32 low, high; + + value &= ~(1ULL << 2); + low = lower_32_bits(value); + high = upper_32_bits(value); + + native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + } + + /* Flush tlb to evict multi-match entries */ + __flush_tlb_all(); + + return true; +} + +static void svm_handle_mce(struct vcpu_svm *svm) +{ + if (is_erratum_383()) { + /* + * Erratum 383 triggered. Guest state is corrupt so kill the + * guest. + */ + pr_err("KVM: Guest triggered AMD Erratum 383\n"); + + set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + + return; + } + /* * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. @@ -1289,6 +1501,11 @@ static int mc_interception(struct vcpu_svm *svm) "int $0x12\n"); /* not sure if we ever come back to this point */ + return; +} + +static int mc_interception(struct vcpu_svm *svm) +{ return 1; } @@ -1309,29 +1526,23 @@ static int shutdown_interception(struct vcpu_svm *svm) static int io_interception(struct vcpu_svm *svm) { + struct kvm_vcpu *vcpu = &svm->vcpu; u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; unsigned port; ++svm->vcpu.stat.io_exits; - - svm->next_rip = svm->vmcb->control.exit_info_2; - string = (io_info & SVM_IOIO_STR_MASK) != 0; - - if (string) { - if (emulate_instruction(&svm->vcpu, - 0, 0, 0) == EMULATE_DO_MMIO) - return 0; - return 1; - } - in = (io_info & SVM_IOIO_TYPE_MASK) != 0; + if (string || in) + return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + port = io_info >> 16; size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; - + svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_pio(&svm->vcpu, in, size, port); + + return kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -1384,6 +1595,8 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { + int vmexit; + if (!is_nested(svm)) return 0; @@ -1392,21 +1605,28 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm->vmcb->control.exit_info_1 = error_code; svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; - return nested_svm_exit_handled(svm); + vmexit = nested_svm_intercept(svm); + if (vmexit == NESTED_EXIT_DONE) + svm->nested.exit_required = true; + + return vmexit; } -static inline int nested_svm_intr(struct vcpu_svm *svm) +/* This function returns true if it is save to enable the irq window */ +static inline bool nested_svm_intr(struct vcpu_svm *svm) { if (!is_nested(svm)) - return 0; + return true; if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return 0; + return true; if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return 0; + return false; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; if (svm->nested.intercept & 1ULL) { /* @@ -1417,21 +1637,40 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) */ svm->nested.exit_required = true; trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - return 1; + return false; } - return 0; + return true; } -static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) +/* This function returns true if it is save to enable the nmi window */ +static inline bool nested_svm_nmi(struct vcpu_svm *svm) +{ + if (!is_nested(svm)) + return true; + + if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) + return true; + + svm->vmcb->control.exit_code = SVM_EXIT_NMI; + svm->nested.exit_required = true; + + return false; +} + +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) { struct page *page; + might_sleep(); + page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; - return kmap_atomic(page, idx); + *_page = page; + + return kmap(page); error: kvm_release_page_clean(page); @@ -1440,61 +1679,55 @@ error: return NULL; } -static void nested_svm_unmap(void *addr, enum km_type idx) +static void nested_svm_unmap(struct page *page) { - struct page *page; + kunmap(page); + kvm_release_page_dirty(page); +} - if (!addr) - return; +static int nested_svm_intercept_ioio(struct vcpu_svm *svm) +{ + unsigned port; + u8 val, bit; + u64 gpa; - page = kmap_atomic_to_page(addr); + if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) + return NESTED_EXIT_HOST; - kunmap_atomic(addr, idx); - kvm_release_page_dirty(page); + port = svm->vmcb->control.exit_info_1 >> 16; + gpa = svm->nested.vmcb_iopm + (port / 8); + bit = port % 8; + val = 0; + + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) + val &= (1 << bit); + + return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } -static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - u32 param = svm->vmcb->control.exit_info_1 & 1; - u32 msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - bool ret = false; - u32 t0, t1; - u8 *msrpm; + u32 offset, msr, value; + int write, mask; if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) - return false; + return NESTED_EXIT_HOST; - msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); + msr = svm->vcpu.arch.regs[VCPU_REGS_RCX]; + offset = svm_msrpm_offset(msr); + write = svm->vmcb->control.exit_info_1 & 1; + mask = 1 << ((2 * (msr & 0xf)) + write); - if (!msrpm) - goto out; + if (offset == MSR_INVALID) + return NESTED_EXIT_DONE; - switch (msr) { - case 0 ... 0x1fff: - t0 = (msr * 2) % 8; - t1 = msr / 8; - break; - case 0xc0000000 ... 0xc0001fff: - t0 = (8192 + msr - 0xc0000000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - case 0xc0010000 ... 0xc0011fff: - t0 = (16384 + msr - 0xc0010000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - default: - ret = true; - goto out; - } - - ret = msrpm[t1] & ((1 << param) << t0); + /* Offset is in 32 bit units but need in 8 bit units */ + offset *= 4; -out: - nested_svm_unmap(msrpm, KM_USER0); + if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + return NESTED_EXIT_DONE; - return ret; + return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } static int nested_svm_exit_special(struct vcpu_svm *svm) @@ -1504,17 +1737,21 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: + case SVM_EXIT_EXCP_BASE + MC_VECTOR: return NESTED_EXIT_HOST; - /* For now we are always handling NPFs when using them */ case SVM_EXIT_NPF: + /* For now we are always handling NPFs when using them */ if (npt_enabled) return NESTED_EXIT_HOST; break; - /* When we're shadowing, trap PFs */ case SVM_EXIT_EXCP_BASE + PF_VECTOR: + /* When we're shadowing, trap PFs */ if (!npt_enabled) return NESTED_EXIT_HOST; break; + case SVM_EXIT_EXCP_BASE + NM_VECTOR: + nm_interception(svm); + break; default: break; } @@ -1525,7 +1762,7 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) /* * If this function returns true, this #vmexit was already handled */ -static int nested_svm_exit_handled(struct vcpu_svm *svm) +static int nested_svm_intercept(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; int vmexit = NESTED_EXIT_HOST; @@ -1534,6 +1771,9 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); break; + case SVM_EXIT_IOIO: + vmexit = nested_svm_intercept_ioio(svm); + break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); if (svm->nested.intercept_cr_read & cr_bits) @@ -1564,6 +1804,10 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) vmexit = NESTED_EXIT_DONE; break; } + case SVM_EXIT_ERR: { + vmexit = NESTED_EXIT_DONE; + break; + } default: { u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); if (svm->nested.intercept & exit_bits) @@ -1571,9 +1815,17 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm) } } - if (vmexit == NESTED_EXIT_DONE) { + return vmexit; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + int vmexit; + + vmexit = nested_svm_intercept(svm); + + if (vmexit == NESTED_EXIT_DONE) nested_svm_vmexit(svm); - } return vmexit; } @@ -1615,6 +1867,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + struct page *page; trace_kvm_nested_vmexit_inject(vmcb->control.exit_code, vmcb->control.exit_info_1, @@ -1622,10 +1875,13 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_int_info, vmcb->control.exit_int_info_err); - nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); if (!nested_vmcb) return 1; + /* Exit nested SVM mode */ + svm->nested.vmcb = 0; + /* Give the current vmcb to the guest */ disable_gif(svm); @@ -1635,9 +1891,10 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; - if (npt_enabled) - nested_vmcb->save.cr3 = vmcb->save.cr3; + nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); + nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; + nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; nested_vmcb->save.rflags = vmcb->save.rflags; nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; @@ -1709,10 +1966,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - /* Exit nested SVM mode */ - svm->nested.vmcb = 0; - - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -1722,19 +1976,33 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { - u32 *nested_msrpm; + /* + * This function merges the msr permission bitmaps of kvm and the + * nested vmcb. It is omptimized in that it only merges the parts where + * the kvm msr permission bitmap may contain zero bits + */ int i; - nested_msrpm = nested_svm_map(svm, svm->nested.vmcb_msrpm, KM_USER0); - if (!nested_msrpm) - return false; + if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT))) + return true; - for (i=0; i< PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER) / 4; i++) - svm->nested.msrpm[i] = svm->msrpm[i] | nested_msrpm[i]; + for (i = 0; i < MSRPM_OFFSETS; i++) { + u32 value, p; + u64 offset; - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); + if (msrpm_offsets[i] == 0xffffffff) + break; + + p = msrpm_offsets[i]; + offset = svm->nested.vmcb_msrpm + (p * 4); - nested_svm_unmap(nested_msrpm, KM_USER0); + if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + return false; + + svm->nested.msrpm[p] = svm->msrpm[p] | value; + } + + svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); return true; } @@ -1744,26 +2012,34 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) struct vmcb *nested_vmcb; struct vmcb *hsave = svm->nested.hsave; struct vmcb *vmcb = svm->vmcb; + struct page *page; + u64 vmcb_gpa; + + vmcb_gpa = svm->vmcb->save.rax; - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return false; - /* nested_vmcb is our indicator if nested SVM is activated */ - svm->nested.vmcb = svm->vmcb->save.rax; - - trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb, + trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, nested_vmcb->control.nested_ctl); + trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, + nested_vmcb->control.intercept_cr_write, + nested_vmcb->control.intercept_exceptions, + nested_vmcb->control.intercept); + /* Clear internal status */ kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); - /* Save the old vmcb, so we don't need to pick what we save, but - can restore everything when a VMEXIT occurs */ + /* + * Save the old vmcb, so we don't need to pick what we save, but can + * restore everything when a VMEXIT occurs + */ hsave->save.es = vmcb->save.es; hsave->save.cs = vmcb->save.cs; hsave->save.ss = vmcb->save.ss; @@ -1803,14 +2079,17 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (npt_enabled) { svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; - } else { + } else kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - kvm_mmu_reset_context(&svm->vcpu); - } + + /* Guest paging mode is active - reset mmu */ + kvm_mmu_reset_context(&svm->vcpu); + svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip); + /* In case we don't even reach vcpu_run, the fields are not updated */ svm->vmcb->save.rax = nested_vmcb->save.rax; svm->vmcb->save.rsp = nested_vmcb->save.rsp; @@ -1819,22 +2098,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.dr6 = nested_vmcb->save.dr6; svm->vmcb->save.cpl = nested_vmcb->save.cpl; - /* We don't want a nested guest to be more powerful than the guest, - so all intercepts are ORed */ - svm->vmcb->control.intercept_cr_read |= - nested_vmcb->control.intercept_cr_read; - svm->vmcb->control.intercept_cr_write |= - nested_vmcb->control.intercept_cr_write; - svm->vmcb->control.intercept_dr_read |= - nested_vmcb->control.intercept_dr_read; - svm->vmcb->control.intercept_dr_write |= - nested_vmcb->control.intercept_dr_write; - svm->vmcb->control.intercept_exceptions |= - nested_vmcb->control.intercept_exceptions; - - svm->vmcb->control.intercept |= nested_vmcb->control.intercept; - - svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa; + svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; + svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; /* cache intercepts */ svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; @@ -1851,13 +2116,43 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of the guest */ + svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; + svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + } + + /* We don't want to see VMMCALLs from a nested guest */ + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); + + /* + * We don't want a nested guest to be more powerful than the guest, so + * all intercepts are ORed + */ + svm->vmcb->control.intercept_cr_read |= + nested_vmcb->control.intercept_cr_read; + svm->vmcb->control.intercept_cr_write |= + nested_vmcb->control.intercept_cr_write; + svm->vmcb->control.intercept_dr_read |= + nested_vmcb->control.intercept_dr_read; + svm->vmcb->control.intercept_dr_write |= + nested_vmcb->control.intercept_dr_write; + svm->vmcb->control.intercept_exceptions |= + nested_vmcb->control.intercept_exceptions; + + svm->vmcb->control.intercept |= nested_vmcb->control.intercept; + + svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; svm->vmcb->control.int_state = nested_vmcb->control.int_state; svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); + + /* nested_vmcb is our indicator if nested SVM is activated */ + svm->nested.vmcb = vmcb_gpa; enable_gif(svm); @@ -1883,6 +2178,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; + struct page *page; if (nested_svm_check_permissions(svm)) return 1; @@ -1890,12 +2186,12 @@ static int vmload_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; nested_svm_vmloadsave(nested_vmcb, svm->vmcb); - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); return 1; } @@ -1903,6 +2199,7 @@ static int vmload_interception(struct vcpu_svm *svm) static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; + struct page *page; if (nested_svm_check_permissions(svm)) return 1; @@ -1910,12 +2207,12 @@ static int vmsave_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, KM_USER0); + nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; nested_svm_vmloadsave(svm->vmcb, nested_vmcb); - nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(page); return 1; } @@ -2018,6 +2315,8 @@ static int task_switch_interception(struct vcpu_svm *svm) svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; uint32_t idt_v = svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; + bool has_error_code = false; + u32 error_code = 0; tss_selector = (u16)svm->vmcb->control.exit_info_1; @@ -2038,6 +2337,12 @@ static int task_switch_interception(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = false; break; case SVM_EXITINTINFO_TYPE_EXEPT: + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { + has_error_code = true; + error_code = + (u32)svm->vmcb->control.exit_info_2; + } kvm_clear_exception_queue(&svm->vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: @@ -2054,7 +2359,14 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - return kvm_task_switch(&svm->vcpu, tss_selector, reason); + if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + has_error_code, error_code) == EMULATE_FAIL) { + svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + svm->vcpu.run->internal.ndata = 0; + return 0; + } + return 1; } static int cpuid_interception(struct vcpu_svm *svm) @@ -2145,9 +2457,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_SYSENTER_ESP: *data = svm->sysenter_esp; break; - /* Nobody will change the following 5 values in the VMCB so - we can safely return them on rdmsr. They will always be 0 - until LBRV is implemented. */ + /* + * Nobody will change the following 5 values in the VMCB so we can + * safely return them on rdmsr. They will always be 0 until LBRV is + * implemented. + */ case MSR_IA32_DEBUGCTLMSR: *data = svm->vmcb->save.dbgctl; break; @@ -2167,7 +2481,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) *data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = 0; + *data = svm->nested.vm_cr_msr; break; case MSR_IA32_UCODE_REV: *data = 0x01000065; @@ -2197,6 +2511,31 @@ static int rdmsr_interception(struct vcpu_svm *svm) return 1; } +static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int svm_dis, chg_mask; + + if (data & ~SVM_VM_CR_VALID_MASK) + return 1; + + chg_mask = SVM_VM_CR_VALID_MASK; + + if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) + chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); + + svm->nested.vm_cr_msr &= ~chg_mask; + svm->nested.vm_cr_msr |= (data & chg_mask); + + svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; + + /* check for svm_disable while efer.svme is set */ + if (svm_dis && (vcpu->arch.efer & EFER_SVME)) + return 1; + + return 0; +} + static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2263,6 +2602,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->nested.hsave_msr = data; break; case MSR_VM_CR: + return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; @@ -2326,16 +2666,16 @@ static int pause_interception(struct vcpu_svm *svm) } static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { - [SVM_EXIT_READ_CR0] = emulate_on_interception, - [SVM_EXIT_READ_CR3] = emulate_on_interception, - [SVM_EXIT_READ_CR4] = emulate_on_interception, - [SVM_EXIT_READ_CR8] = emulate_on_interception, + [SVM_EXIT_READ_CR0] = emulate_on_interception, + [SVM_EXIT_READ_CR3] = emulate_on_interception, + [SVM_EXIT_READ_CR4] = emulate_on_interception, + [SVM_EXIT_READ_CR8] = emulate_on_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, - [SVM_EXIT_WRITE_CR3] = emulate_on_interception, - [SVM_EXIT_WRITE_CR4] = emulate_on_interception, - [SVM_EXIT_WRITE_CR8] = cr8_write_interception, - [SVM_EXIT_READ_DR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR3] = emulate_on_interception, + [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_WRITE_CR8] = cr8_write_interception, + [SVM_EXIT_READ_DR0] = emulate_on_interception, [SVM_EXIT_READ_DR1] = emulate_on_interception, [SVM_EXIT_READ_DR2] = emulate_on_interception, [SVM_EXIT_READ_DR3] = emulate_on_interception, @@ -2354,15 +2694,14 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, - [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, - [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, - [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, - [SVM_EXIT_INTR] = intr_interception, + [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, + [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, + [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, - /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, @@ -2370,7 +2709,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, - [SVM_EXIT_IOIO] = io_interception, + [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, @@ -2393,7 +2732,12 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, svm->vmcb->save.rip); + trace_kvm_exit(exit_code, vcpu); + + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) + vcpu->arch.cr0 = svm->vmcb->save.cr0; + if (npt_enabled) + vcpu->arch.cr3 = svm->vmcb->save.cr3; if (unlikely(svm->nested.exit_required)) { nested_svm_vmexit(svm); @@ -2422,11 +2766,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); - if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) - vcpu->arch.cr0 = svm->vmcb->save.cr0; - if (npt_enabled) - vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2511,6 +2850,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + if (irr == -1) return; @@ -2522,8 +2864,12 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && - !(svm->vcpu.arch.hflags & HF_NMI_MASK); + int ret; + ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && + !(svm->vcpu.arch.hflags & HF_NMI_MASK); + ret = ret && gif_set(svm) && nested_svm_nmi(svm); + + return ret; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -2568,13 +2914,13 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - nested_svm_intr(svm); - - /* In case GIF=0 we can't rely on the CPU to tell us when - * GIF becomes 1, because that's a separate STGI/VMRUN intercept. - * The next time we get that intercept, this function will be - * called again though and we'll get the vintr intercept. */ - if (gif_set(svm)) { + /* + * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes + * 1, because that's a separate STGI/VMRUN intercept. The next time we + * get that intercept, this function will be called again though and + * we'll get the vintr intercept. + */ + if (gif_set(svm) && nested_svm_intr(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } @@ -2588,9 +2934,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) == HF_NMI_MASK) return; /* IRET will cause a vm exit */ - /* Something prevents NMI from been injected. Single step over - possible problem (IRET or exception injection or interrupt - shadow) */ + /* + * Something prevents NMI from been injected. Single step over possible + * problem (IRET or exception injection or interrupt shadow) + */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); update_db_intercept(vcpu); @@ -2614,6 +2961,9 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; kvm_set_cr8(vcpu, cr8); @@ -2625,6 +2975,9 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; + if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) + return; + cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; @@ -2635,6 +2988,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) u8 vector; int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; + unsigned int3_injected = svm->int3_injected; + + svm->int3_injected = 0; if (svm->vcpu.arch.hflags & HF_IRET_MASK) svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); @@ -2654,18 +3010,25 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->vcpu.arch.nmi_injected = true; break; case SVM_EXITINTINFO_TYPE_EXEPT: - /* In case of software exception do not reinject an exception - vector, but re-execute and instruction instead */ - if (is_nested(svm)) - break; - if (kvm_exception_is_soft(vector)) + /* + * In case of software exceptions, do not reinject the vector, + * but re-execute the instruction instead. Rewind RIP first + * if we emulated INT3 before. + */ + if (kvm_exception_is_soft(vector)) { + if (vector == BP_VECTOR && int3_injected && + kvm_is_linear_rip(&svm->vcpu, svm->int3_rip)) + kvm_rip_write(&svm->vcpu, + kvm_rip_read(&svm->vcpu) - + int3_injected); break; + } if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { u32 err = svm->vmcb->control.exit_int_info_err; - kvm_queue_exception_e(&svm->vcpu, vector, err); + kvm_requeue_exception_e(&svm->vcpu, vector, err); } else - kvm_queue_exception(&svm->vcpu, vector); + kvm_requeue_exception(&svm->vcpu, vector); break; case SVM_EXITINTINFO_TYPE_INTR: kvm_queue_interrupt(&svm->vcpu, vector, false); @@ -2688,6 +3051,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) u16 gs_selector; u16 ldt_selector; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + /* * A vmexit emulation is required before the vcpu can be executed * again. @@ -2695,10 +3062,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->nested.exit_required)) return; - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; - pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -2811,6 +3174,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); } + + /* + * We need to handle MC intercepts here before the vcpu has a chance to + * change the physical cpu + */ + if (unlikely(svm->vmcb->control.exit_code == + SVM_EXIT_EXCP_BASE + MC_VECTOR)) + svm_handle_mce(svm); } #undef R @@ -2879,25 +3250,39 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) { } +static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ + switch (func) { + case 0x8000000A: + entry->eax = 1; /* SVM revision 1 */ + entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper + ASID emulation to nested SVM */ + entry->ecx = 0; /* Reserved */ + entry->edx = 0; /* Do not support any additional features */ + + break; + } +} + static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, @@ -2946,8 +3331,10 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - update_cr0_intercept(svm); svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; + if (is_nested(svm)) + svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR; + update_cr0_intercept(svm); } static struct kvm_x86_ops svm_x86_ops = { @@ -2986,8 +3373,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .get_dr = svm_get_dr, - .set_dr = svm_set_dr, + .set_dr7 = svm_set_dr7, .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, @@ -3023,12 +3409,14 @@ static struct kvm_x86_ops svm_x86_ops = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, + + .set_supported_cpuid = svm_set_supported_cpuid, }; static int __init svm_init(void) { return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - THIS_MODULE); + __alignof__(struct vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index eea40439066..4ddadb1a5ff 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -12,7 +12,8 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) /* * There is a race window between reading and incrementing, but we do * not care about potentially loosing timer events in the !reinject - * case anyway. + * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked + * in vcpu_enter_guest. */ if (ktimer->reinject || !atomic_read(&ktimer->pending)) { atomic_inc(&ktimer->pending); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6ad30a29f04..a6544b8e7c0 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -5,8 +5,6 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm -#define TRACE_INCLUDE_PATH arch/x86/kvm -#define TRACE_INCLUDE_FILE trace /* * Tracepoint for guest mode entry. @@ -184,8 +182,8 @@ TRACE_EVENT(kvm_apic, * Tracepoint for kvm guest exit: */ TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), - TP_ARGS(exit_reason, guest_rip), + TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), + TP_ARGS(exit_reason, vcpu), TP_STRUCT__entry( __field( unsigned int, exit_reason ) @@ -194,7 +192,7 @@ TRACE_EVENT(kvm_exit, TP_fast_assign( __entry->exit_reason = exit_reason; - __entry->guest_rip = guest_rip; + __entry->guest_rip = kvm_rip_read(vcpu); ), TP_printk("reason %s rip 0x%lx", @@ -221,6 +219,38 @@ TRACE_EVENT(kvm_inj_virq, TP_printk("irq %u", __entry->irq) ); +#define EXS(x) { x##_VECTOR, "#" #x } + +#define kvm_trace_sym_exc \ + EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ + EXS(MF), EXS(MC) + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_exception, + TP_PROTO(unsigned exception, bool has_error, unsigned error_code), + TP_ARGS(exception, has_error, error_code), + + TP_STRUCT__entry( + __field( u8, exception ) + __field( u8, has_error ) + __field( u32, error_code ) + ), + + TP_fast_assign( + __entry->exception = exception; + __entry->has_error = has_error; + __entry->error_code = error_code; + ), + + TP_printk("%s (0x%x)", + __print_symbolic(__entry->exception, kvm_trace_sym_exc), + /* FIXME: don't print error_code if not present */ + __entry->has_error ? __entry->error_code : 0) +); + /* * Tracepoint for page fault. */ @@ -413,12 +443,34 @@ TRACE_EVENT(kvm_nested_vmrun, ), TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x " - "event_inj: 0x%08x npt: %s\n", + "event_inj: 0x%08x npt: %s", __entry->rip, __entry->vmcb, __entry->nested_rip, __entry->int_ctl, __entry->event_inj, __entry->npt ? "on" : "off") ); +TRACE_EVENT(kvm_nested_intercepts, + TP_PROTO(__u16 cr_read, __u16 cr_write, __u32 exceptions, __u64 intercept), + TP_ARGS(cr_read, cr_write, exceptions, intercept), + + TP_STRUCT__entry( + __field( __u16, cr_read ) + __field( __u16, cr_write ) + __field( __u32, exceptions ) + __field( __u64, intercept ) + ), + + TP_fast_assign( + __entry->cr_read = cr_read; + __entry->cr_write = cr_write; + __entry->exceptions = exceptions; + __entry->intercept = intercept; + ), + + TP_printk("cr_read: %04x cr_write: %04x excp: %08x intercept: %016llx", + __entry->cr_read, __entry->cr_write, __entry->exceptions, + __entry->intercept) +); /* * Tracepoint for #VMEXIT while nested */ @@ -447,7 +499,7 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info_err = exit_int_info_err; ), TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, ftrace_print_symbols_seq(p, __entry->exit_code, kvm_x86_ops->exit_reasons_str), @@ -482,7 +534,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, ), TP_printk("reason: %s ext_inf1: 0x%016llx " - "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n", + "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", ftrace_print_symbols_seq(p, __entry->exit_code, kvm_x86_ops->exit_reasons_str), __entry->exit_info1, __entry->exit_info2, @@ -504,7 +556,7 @@ TRACE_EVENT(kvm_nested_intr_vmexit, __entry->rip = rip ), - TP_printk("rip: 0x%016llx\n", __entry->rip) + TP_printk("rip: 0x%016llx", __entry->rip) ); /* @@ -526,7 +578,7 @@ TRACE_EVENT(kvm_invlpga, __entry->address = address; ), - TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n", + TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx", __entry->rip, __entry->asid, __entry->address) ); @@ -547,11 +599,102 @@ TRACE_EVENT(kvm_skinit, __entry->slb = slb; ), - TP_printk("rip: 0x%016llx slb: 0x%08x\n", + TP_printk("rip: 0x%016llx slb: 0x%08x", __entry->rip, __entry->slb) ); +#define __print_insn(insn, ilen) ({ \ + int i; \ + const char *ret = p->buffer + p->len; \ + \ + for (i = 0; i < ilen; ++i) \ + trace_seq_printf(p, " %02x", insn[i]); \ + trace_seq_printf(p, "%c", 0); \ + ret; \ + }) + +#define KVM_EMUL_INSN_F_CR0_PE (1 << 0) +#define KVM_EMUL_INSN_F_EFL_VM (1 << 1) +#define KVM_EMUL_INSN_F_CS_D (1 << 2) +#define KVM_EMUL_INSN_F_CS_L (1 << 3) + +#define kvm_trace_symbol_emul_flags \ + { 0, "real" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_EFL_VM, "vm16" }, \ + { KVM_EMUL_INSN_F_CR0_PE, "prot16" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_D, "prot32" }, \ + { KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_L, "prot64" } + +#define kei_decode_mode(mode) ({ \ + u8 flags = 0xff; \ + switch (mode) { \ + case X86EMUL_MODE_REAL: \ + flags = 0; \ + break; \ + case X86EMUL_MODE_VM86: \ + flags = KVM_EMUL_INSN_F_EFL_VM; \ + break; \ + case X86EMUL_MODE_PROT16: \ + flags = KVM_EMUL_INSN_F_CR0_PE; \ + break; \ + case X86EMUL_MODE_PROT32: \ + flags = KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_D; \ + break; \ + case X86EMUL_MODE_PROT64: \ + flags = KVM_EMUL_INSN_F_CR0_PE \ + | KVM_EMUL_INSN_F_CS_L; \ + break; \ + } \ + flags; \ + }) + +TRACE_EVENT(kvm_emulate_insn, + TP_PROTO(struct kvm_vcpu *vcpu, __u8 failed), + TP_ARGS(vcpu, failed), + + TP_STRUCT__entry( + __field( __u64, rip ) + __field( __u32, csbase ) + __field( __u8, len ) + __array( __u8, insn, 15 ) + __field( __u8, flags ) + __field( __u8, failed ) + ), + + TP_fast_assign( + __entry->rip = vcpu->arch.emulate_ctxt.decode.fetch.start; + __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); + __entry->len = vcpu->arch.emulate_ctxt.decode.eip + - vcpu->arch.emulate_ctxt.decode.fetch.start; + memcpy(__entry->insn, + vcpu->arch.emulate_ctxt.decode.fetch.data, + 15); + __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); + __entry->failed = failed; + ), + + TP_printk("%x:%llx:%s (%s)%s", + __entry->csbase, __entry->rip, + __print_insn(__entry->insn, __entry->len), + __print_symbolic(__entry->flags, + kvm_trace_symbol_emul_flags), + __entry->failed ? " failed" : "" + ) + ); + +#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0) +#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1) + #endif /* _TRACE_KVM_H */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH arch/x86/kvm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + /* This part must be outside protection */ #include <trace/define_trace.h> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index edca080407a..859a01a07db 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include <linux/moduleparam.h> #include <linux/ftrace_event.h> #include <linux/slab.h> +#include <linux/tboot.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -98,6 +99,8 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +#define NR_AUTOLOAD_MSRS 1 + struct vmcs { u32 revision_id; u32 abort; @@ -125,6 +128,11 @@ struct vcpu_vmx { u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; + struct msr_autoload { + unsigned nr; + struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + } msr_autoload; struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; @@ -234,56 +242,56 @@ static const u32 vmx_msr_index[] = { }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static inline int is_page_fault(u32 intr_info) +static inline bool is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_no_device(u32 intr_info) +static inline bool is_no_device(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_invalid_opcode(u32 intr_info) +static inline bool is_invalid_opcode(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK); } -static inline int is_external_interrupt(u32 intr_info) +static inline bool is_external_interrupt(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } -static inline int is_machine_check(u32 intr_info) +static inline bool is_machine_check(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | INTR_INFO_VALID_MASK)) == (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK); } -static inline int cpu_has_vmx_msr_bitmap(void) +static inline bool cpu_has_vmx_msr_bitmap(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS; } -static inline int cpu_has_vmx_tpr_shadow(void) +static inline bool cpu_has_vmx_tpr_shadow(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW; } -static inline int vm_need_tpr_shadow(struct kvm *kvm) +static inline bool vm_need_tpr_shadow(struct kvm *kvm) { return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)); } -static inline int cpu_has_secondary_exec_ctrls(void) +static inline bool cpu_has_secondary_exec_ctrls(void) { return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -303,80 +311,80 @@ static inline bool cpu_has_vmx_flexpriority(void) static inline bool cpu_has_vmx_ept_execute_only(void) { - return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); + return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT; } static inline bool cpu_has_vmx_eptp_uncacheable(void) { - return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); + return vmx_capability.ept & VMX_EPTP_UC_BIT; } static inline bool cpu_has_vmx_eptp_writeback(void) { - return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); + return vmx_capability.ept & VMX_EPTP_WB_BIT; } static inline bool cpu_has_vmx_ept_2m_page(void) { - return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); + return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT; } static inline bool cpu_has_vmx_ept_1g_page(void) { - return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); + return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; } -static inline int cpu_has_vmx_invept_individual_addr(void) +static inline bool cpu_has_vmx_invept_individual_addr(void) { - return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); + return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; } -static inline int cpu_has_vmx_invept_context(void) +static inline bool cpu_has_vmx_invept_context(void) { - return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT); + return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; } -static inline int cpu_has_vmx_invept_global(void) +static inline bool cpu_has_vmx_invept_global(void) { - return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT); + return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; } -static inline int cpu_has_vmx_ept(void) +static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_EPT; } -static inline int cpu_has_vmx_unrestricted_guest(void) +static inline bool cpu_has_vmx_unrestricted_guest(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_UNRESTRICTED_GUEST; } -static inline int cpu_has_vmx_ple(void) +static inline bool cpu_has_vmx_ple(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PAUSE_LOOP_EXITING; } -static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) +static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && irqchip_in_kernel(kvm); } -static inline int cpu_has_vmx_vpid(void) +static inline bool cpu_has_vmx_vpid(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_VPID; } -static inline int cpu_has_vmx_rdtscp(void) +static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_RDTSCP; } -static inline int cpu_has_virtual_nmis(void) +static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; } @@ -595,16 +603,56 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) vmcs_write32(EXCEPTION_BITMAP, eb); } +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) + return; + --m->nr; + m->guest[i] = m->guest[m->nr]; + m->host[i] = m->host[m->nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) { + ++m->nr; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + } + + m->guest[i].index = msr; + m->guest[i].value = guest_val; + m->host[i].index = msr; + m->host[i].value = host_val; +} + static void reload_tss(void) { /* * VT restores TR but not its size. Useless. */ - struct descriptor_table gdt; + struct desc_ptr gdt; struct desc_struct *descs; - kvm_get_gdt(&gdt); - descs = (void *)gdt.base; + native_store_gdt(&gdt); + descs = (void *)gdt.address; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -631,9 +679,57 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) guest_efer |= host_efer & ignore_bits; vmx->guest_msrs[efer_offset].data = guest_efer; vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + + clear_atomic_switch_msr(vmx, MSR_EFER); + /* On ept, can't emulate nx, and must switch nx atomically */ + if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) { + guest_efer = vmx->vcpu.arch.efer; + if (!(guest_efer & EFER_LMA)) + guest_efer &= ~EFER_LME; + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer); + return false; + } + return true; } +static unsigned long segment_base(u16 selector) +{ + struct desc_ptr gdt; + struct desc_struct *d; + unsigned long table_base; + unsigned long v; + + if (!(selector & ~3)) + return 0; + + native_store_gdt(&gdt); + table_base = gdt.address; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector = kvm_read_ldt(); + + if (!(ldt_selector & ~3)) + return 0; + + table_base = segment_base(ldt_selector); + } + d = (struct desc_struct *)(table_base + (selector & ~7)); + v = get_desc_base(d); +#ifdef CONFIG_X86_64 + if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; +#endif + return v; +} + +static inline unsigned long kvm_read_tr_base(void) +{ + u16 tr; + asm("str %0" : "=g"(tr)); + return segment_base(tr); +} + static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -758,7 +854,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vcpu->cpu != cpu) { - struct descriptor_table dt; + struct desc_ptr dt; unsigned long sysenter_esp; vcpu->cpu = cpu; @@ -767,8 +863,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * processors. */ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - kvm_get_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ + native_store_gdt(&dt); + vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ @@ -846,9 +942,9 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) int ret = 0; if (interruptibility & GUEST_INTR_STATE_STI) - ret |= X86_SHADOW_INT_STI; + ret |= KVM_X86_SHADOW_INT_STI; if (interruptibility & GUEST_INTR_STATE_MOV_SS) - ret |= X86_SHADOW_INT_MOV_SS; + ret |= KVM_X86_SHADOW_INT_MOV_SS; return ret & mask; } @@ -860,9 +956,9 @@ static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); - if (mask & X86_SHADOW_INT_MOV_SS) + if (mask & KVM_X86_SHADOW_INT_MOV_SS) interruptibility |= GUEST_INTR_STATE_MOV_SS; - if (mask & X86_SHADOW_INT_STI) + else if (mask & KVM_X86_SHADOW_INT_STI) interruptibility |= GUEST_INTR_STATE_STI; if ((interruptibility != interruptibility_old)) @@ -882,7 +978,8 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) } static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, - bool has_error_code, u32 error_code) + bool has_error_code, u32 error_code, + bool reinject) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info = nr | INTR_INFO_VALID_MASK; @@ -1176,9 +1273,16 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - == FEATURE_CONTROL_LOCKED; + if (msr & FEATURE_CONTROL_LOCKED) { + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && tboot_enabled()) + return 1; + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; + } + + return 0; /* locked but not enabled */ } @@ -1186,21 +1290,23 @@ static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old; + u64 old, test_bits; if (read_cr4() & X86_CR4_VMXE) return -EBUSY; INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - != (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) + + test_bits = FEATURE_CONTROL_LOCKED; + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + if (tboot_enabled()) + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; + + if ((old & test_bits) != test_bits) { /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED); + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); + } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) @@ -1521,7 +1627,7 @@ static gva_t rmode_tss_base(struct kvm *kvm) struct kvm_memslots *slots; gfn_t base_gfn; - slots = rcu_dereference(kvm->memslots); + slots = kvm_memslots(kvm); base_gfn = kvm->memslots->memslots[0].base_gfn + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; @@ -1649,6 +1755,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu) vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) & ~VM_ENTRY_IA32E_MODE); + vmx_set_efer(vcpu, vcpu->arch.efer); } #endif @@ -1934,28 +2041,28 @@ static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) *l = (ar >> 13) & 1; } -static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->limit = vmcs_read32(GUEST_IDTR_LIMIT); - dt->base = vmcs_readl(GUEST_IDTR_BASE); + dt->size = vmcs_read32(GUEST_IDTR_LIMIT); + dt->address = vmcs_readl(GUEST_IDTR_BASE); } -static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_IDTR_LIMIT, dt->limit); - vmcs_writel(GUEST_IDTR_BASE, dt->base); + vmcs_write32(GUEST_IDTR_LIMIT, dt->size); + vmcs_writel(GUEST_IDTR_BASE, dt->address); } -static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - dt->limit = vmcs_read32(GUEST_GDTR_LIMIT); - dt->base = vmcs_readl(GUEST_GDTR_BASE); + dt->size = vmcs_read32(GUEST_GDTR_LIMIT); + dt->address = vmcs_readl(GUEST_GDTR_BASE); } -static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) +static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) { - vmcs_write32(GUEST_GDTR_LIMIT, dt->limit); - vmcs_writel(GUEST_GDTR_BASE, dt->base); + vmcs_write32(GUEST_GDTR_LIMIT, dt->size); + vmcs_writel(GUEST_GDTR_BASE, dt->address); } static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) @@ -2296,6 +2403,16 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } +static void free_vpid(struct vcpu_vmx *vmx) +{ + if (!enable_vpid) + return; + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); +} + static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); @@ -2334,7 +2451,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) u32 junk; u64 host_pat, tsc_this, tsc_base; unsigned long a; - struct descriptor_table dt; + struct desc_ptr dt; int i; unsigned long kvm_vmx_return; u32 exec_control; @@ -2415,14 +2532,16 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - kvm_get_idt(&dt); - vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ + native_store_idt(&dt); + vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -2947,22 +3066,20 @@ static int handle_io(struct kvm_vcpu *vcpu) int size, in, string; unsigned port; - ++vcpu->stat.io_exits; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); string = (exit_qualification & 16) != 0; + in = (exit_qualification & 8) != 0; - if (string) { - if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO) - return 0; - return 1; - } + ++vcpu->stat.io_exits; - size = (exit_qualification & 7) + 1; - in = (exit_qualification & 8) != 0; - port = exit_qualification >> 16; + if (string || in) + return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); + port = exit_qualification >> 16; + size = (exit_qualification & 7) + 1; skip_emulated_instruction(vcpu); - return kvm_emulate_pio(vcpu, in, size, port); + + return kvm_fast_pio_out(vcpu, size, port); } static void @@ -3053,19 +3170,9 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 0; } -static int check_dr_alias(struct kvm_vcpu *vcpu) -{ - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return -1; - } - return 0; -} - static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - unsigned long val; int dr, reg; /* Do not handle if the CPL > 0, will trigger GP on re-entry */ @@ -3100,67 +3207,20 @@ static int handle_dr(struct kvm_vcpu *vcpu) dr = exit_qualification & DEBUG_REG_ACCESS_NUM; reg = DEBUG_REG_ACCESS_REG(exit_qualification); if (exit_qualification & TYPE_MOV_FROM_DR) { - switch (dr) { - case 0 ... 3: - val = vcpu->arch.db[dr]; - break; - case 4: - if (check_dr_alias(vcpu) < 0) - return 1; - /* fall through */ - case 6: - val = vcpu->arch.dr6; - break; - case 5: - if (check_dr_alias(vcpu) < 0) - return 1; - /* fall through */ - default: /* 7 */ - val = vcpu->arch.dr7; - break; - } - kvm_register_write(vcpu, reg, val); - } else { - val = vcpu->arch.regs[reg]; - switch (dr) { - case 0 ... 3: - vcpu->arch.db[dr] = val; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - vcpu->arch.eff_db[dr] = val; - break; - case 4: - if (check_dr_alias(vcpu) < 0) - return 1; - /* fall through */ - case 6: - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; - break; - case 5: - if (check_dr_alias(vcpu) < 0) - return 1; - /* fall through */ - default: /* 7 */ - if (val & 0xffffffff00000000ULL) { - kvm_inject_gp(vcpu, 0); - return 1; - } - vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { - vmcs_writel(GUEST_DR7, vcpu->arch.dr7); - vcpu->arch.switch_db_regs = - (val & DR7_BP_EN_MASK); - } - break; - } - } + unsigned long val; + if (!kvm_get_dr(vcpu, dr, &val)) + kvm_register_write(vcpu, reg, val); + } else + kvm_set_dr(vcpu, dr, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; } +static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) +{ + vmcs_writel(GUEST_DR7, val); +} + static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); @@ -3292,6 +3352,8 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; + bool has_error_code = false; + u32 error_code = 0; u16 tss_selector; int reason, type, idt_v; @@ -3314,6 +3376,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: + if (vmx->idt_vectoring_info & + VECTORING_INFO_DELIVER_CODE_MASK) { + has_error_code = true; + error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + } + /* fall through */ case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; @@ -3328,8 +3397,13 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (!kvm_task_switch(vcpu, tss_selector, reason)) + if (kvm_task_switch(vcpu, tss_selector, reason, + has_error_code, error_code) == EMULATE_FAIL) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; + } /* clear all local breakpoint enable flags */ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~55); @@ -3574,7 +3648,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); + trace_kvm_exit(exit_reason, vcpu); /* If guest state is invalid, start emulating */ if (vmx->emulation_required && emulate_invalid_guest_state) @@ -3923,10 +3997,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); + free_vpid(vmx); vmx_free_vmcs(vcpu); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); @@ -3988,6 +4059,7 @@ free_msrs: uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: + free_vpid(vmx); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -4118,6 +4190,10 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } +static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +{ +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4154,6 +4230,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr7 = vmx_set_dr7, .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, @@ -4189,6 +4266,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, + + .set_supported_cpuid = vmx_set_supported_cpuid, }; static int __init vmx_init(void) @@ -4236,7 +4315,8 @@ static int __init vmx_init(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) goto out3; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd9bc8fb81a..05d571f6f19 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,7 +42,7 @@ #include <linux/slab.h> #include <linux/perf_event.h> #include <trace/events/kvm.h> -#undef TRACE_INCLUDE_FILE + #define CREATE_TRACE_POINTS #include "trace.h" @@ -224,34 +224,6 @@ static void drop_user_return_notifiers(void *ignore) kvm_on_user_return(&smsr->urn); } -unsigned long segment_base(u16 selector) -{ - struct descriptor_table gdt; - struct desc_struct *d; - unsigned long table_base; - unsigned long v; - - if (selector == 0) - return 0; - - kvm_get_gdt(&gdt); - table_base = gdt.base; - - if (selector & 4) { /* from ldt */ - u16 ldt_selector = kvm_read_ldt(); - - table_base = segment_base(ldt_selector); - } - d = (struct desc_struct *)(table_base + (selector & ~7)); - v = get_desc_base(d); -#ifdef CONFIG_X86_64 - if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; -#endif - return v; -} -EXPORT_SYMBOL_GPL(segment_base); - u64 kvm_get_apic_base(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) @@ -293,7 +265,8 @@ static int exception_class(int vector) } static void kvm_multiple_exception(struct kvm_vcpu *vcpu, - unsigned nr, bool has_error, u32 error_code) + unsigned nr, bool has_error, u32 error_code, + bool reinject) { u32 prev_nr; int class1, class2; @@ -304,6 +277,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, vcpu->arch.exception.has_error_code = has_error; vcpu->arch.exception.nr = nr; vcpu->arch.exception.error_code = error_code; + vcpu->arch.exception.reinject = reinject; return; } @@ -332,10 +306,16 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - kvm_multiple_exception(vcpu, nr, false, 0); + kvm_multiple_exception(vcpu, nr, false, 0, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception); +void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) +{ + kvm_multiple_exception(vcpu, nr, false, 0, true); +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception); + void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { @@ -352,10 +332,16 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - kvm_multiple_exception(vcpu, nr, true, error_code); + kvm_multiple_exception(vcpu, nr, true, error_code, false); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); +void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) +{ + kvm_multiple_exception(vcpu, nr, true, error_code, true); +} +EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); + /* * Checks if cpl <= required_cpl; if true, return true. Otherwise queue * a #GP and return false. @@ -476,7 +462,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } kvm_x86_ops->set_cr0(vcpu, cr0); - vcpu->arch.cr0 = cr0; kvm_mmu_reset_context(vcpu); return; @@ -485,7 +470,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -517,7 +502,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; - vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -592,6 +576,80 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_cr8); +int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) +{ + switch (dr) { + case 0 ... 3: + vcpu->arch.db[dr] = val; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + vcpu->arch.eff_db[dr] = val; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + case 6: + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + default: /* 7 */ + if (val & 0xffffffff00000000ULL) { + kvm_inject_gp(vcpu, 0); + return 1; + } + vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { + kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); + vcpu->arch.switch_db_regs = (val & DR7_BP_EN_MASK); + } + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_dr); + +int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) +{ + switch (dr) { + case 0 ... 3: + *val = vcpu->arch.db[dr]; + break; + case 4: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + case 6: + *val = vcpu->arch.dr6; + break; + case 5: + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; + } + /* fall through */ + default: /* 7 */ + *val = vcpu->arch.dr7; + break; + } + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_dr); + static inline u32 bit(int bitno) { return 1 << (bitno & 31); @@ -606,9 +664,10 @@ static inline u32 bit(int bitno) * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 5 +#define KVM_SAVE_MSRS_BEGIN 7 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, @@ -625,48 +684,42 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, }; -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) { - kvm_inject_gp(vcpu, 0); - return; - } + if (efer & efer_reserved_bits) + return 1; if (is_paging(vcpu) - && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) { - kvm_inject_gp(vcpu, 0); - return; - } + && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) + return 1; if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) + return 1; } if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) + return 1; } - kvm_x86_ops->set_efer(vcpu, efer); - efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; + kvm_x86_ops->set_efer(vcpu, efer); + vcpu->arch.efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + + return 0; } void kvm_enable_efer_bits(u64 mask) @@ -696,14 +749,22 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { - static int version; + int version; + int r; struct pvclock_wall_clock wc; struct timespec boot; if (!wall_clock) return; - version++; + r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); + if (r) + return; + + if (version & 1) + ++version; /* first time write, random junk */ + + ++version; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); @@ -796,6 +857,8 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) vcpu->hv_clock.system_time = ts.tv_nsec + (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + vcpu->hv_clock.flags = 0; + /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -1087,10 +1150,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { case MSR_EFER: - set_efer(vcpu, data); - break; + return set_efer(vcpu, data); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ + data &= ~(u64)0x100; /* ignore ignne emulation enable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -1133,10 +1196,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data); break; + case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { if (vcpu->arch.time_page) { kvm_release_page_dirty(vcpu->arch.time_page); @@ -1408,9 +1473,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: + case MSR_KVM_WALL_CLOCK_NEW: data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; case MSR_IA32_P5_MC_ADDR: @@ -1549,6 +1616,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_HYPERV_VAPIC: case KVM_CAP_HYPERV_SPIN: case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_DEBUGREGS: case KVM_CAP_X86_ROBUST_SINGLESTEP: r = 1; break; @@ -1769,6 +1837,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, { int r; + vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1780,6 +1849,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, out: cpuid->nent = vcpu->arch.cpuid_nent; + vcpu_put(vcpu); return r; } @@ -1910,6 +1980,24 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case KVM_CPUID_SIGNATURE: { + char signature[12] = "KVMKVMKVM\0\0"; + u32 *sigptr = (u32 *)signature; + entry->eax = 0; + entry->ebx = sigptr[0]; + entry->ecx = sigptr[1]; + entry->edx = sigptr[2]; + break; + } + case KVM_CPUID_FEATURES: + entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 0x80000000: entry->eax = min(entry->eax, 0x8000001a); break; @@ -1918,6 +2006,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; break; } + + kvm_x86_ops->set_supported_cpuid(function, entry); + put_cpu(); } @@ -1953,6 +2044,23 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + + + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, + cpuid->nent); + r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -2032,6 +2140,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, int r; unsigned bank_num = mcg_cap & 0xff, bank; + vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -2046,6 +2155,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: + vcpu_put(vcpu); return r; } @@ -2105,14 +2215,20 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, { vcpu_load(vcpu); - events->exception.injected = vcpu->arch.exception.pending; + events->exception.injected = + vcpu->arch.exception.pending && + !kvm_exception_is_soft(vcpu->arch.exception.nr); events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; events->exception.error_code = vcpu->arch.exception.error_code; - events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.injected = + vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; - events->interrupt.soft = vcpu->arch.interrupt.soft; + events->interrupt.soft = 0; + events->interrupt.shadow = + kvm_x86_ops->get_interrupt_shadow(vcpu, + KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending; @@ -2121,7 +2237,8 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = vcpu->arch.sipi_vector; events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR); + | KVM_VCPUEVENT_VALID_SIPI_VECTOR + | KVM_VCPUEVENT_VALID_SHADOW); vcpu_put(vcpu); } @@ -2130,7 +2247,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) + | KVM_VCPUEVENT_VALID_SIPI_VECTOR + | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; vcpu_load(vcpu); @@ -2145,6 +2263,9 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.soft = events->interrupt.soft; if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) kvm_pic_clear_isr_ack(vcpu->kvm); + if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) + kvm_x86_ops->set_interrupt_shadow(vcpu, + events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) @@ -2159,6 +2280,36 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return 0; } +static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + vcpu_load(vcpu); + + memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); + dbgregs->dr6 = vcpu->arch.dr6; + dbgregs->dr7 = vcpu->arch.dr7; + dbgregs->flags = 0; + + vcpu_put(vcpu); +} + +static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, + struct kvm_debugregs *dbgregs) +{ + if (dbgregs->flags) + return -EINVAL; + + vcpu_load(vcpu); + + memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); + vcpu->arch.dr6 = dbgregs->dr6; + vcpu->arch.dr7 = dbgregs->dr7; + + vcpu_put(vcpu); + + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2313,7 +2464,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&mce, argp, sizeof mce)) goto out; + vcpu_load(vcpu); r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + vcpu_put(vcpu); break; } case KVM_GET_VCPU_EVENTS: { @@ -2337,6 +2490,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); break; } + case KVM_GET_DEBUGREGS: { + struct kvm_debugregs dbgregs; + + kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs); + + r = -EFAULT; + if (copy_to_user(argp, &dbgregs, + sizeof(struct kvm_debugregs))) + break; + r = 0; + break; + } + case KVM_SET_DEBUGREGS: { + struct kvm_debugregs dbgregs; + + r = -EFAULT; + if (copy_from_user(&dbgregs, argp, + sizeof(struct kvm_debugregs))) + break; + + r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); + break; + } default: r = -EINVAL; } @@ -2390,7 +2566,7 @@ gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) struct kvm_mem_alias *alias; struct kvm_mem_aliases *aliases; - aliases = rcu_dereference(kvm->arch.aliases); + aliases = kvm_aliases(kvm); for (i = 0; i < aliases->naliases; ++i) { alias = &aliases->aliases[i]; @@ -2409,7 +2585,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) struct kvm_mem_alias *alias; struct kvm_mem_aliases *aliases; - aliases = rcu_dereference(kvm->arch.aliases); + aliases = kvm_aliases(kvm); for (i = 0; i < aliases->naliases; ++i) { alias = &aliases->aliases[i]; @@ -2804,11 +2980,13 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&irq_event, argp, sizeof irq_event)) goto out; + r = -ENXIO; if (irqchip_in_kernel(kvm)) { __s32 status; status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); if (ioctl == KVM_IRQ_LINE_STATUS) { + r = -EFAULT; irq_event.status = status; if (copy_to_user(argp, &irq_event, sizeof irq_event)) @@ -3024,6 +3202,18 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } +static void kvm_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + kvm_x86_ops->get_segment(vcpu, var, seg); +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; @@ -3104,14 +3294,17 @@ static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); } -static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, u32 *error) +static int kvm_write_guest_virt_system(gva_t addr, void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3134,7 +3327,6 @@ out: return r; } - static int emulator_read_emulated(unsigned long addr, void *val, unsigned int bytes, @@ -3237,9 +3429,9 @@ mmio: } int emulator_write_emulated(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu) + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) { /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { @@ -3257,45 +3449,150 @@ int emulator_write_emulated(unsigned long addr, } EXPORT_SYMBOL_GPL(emulator_write_emulated); +#define CMPXCHG_TYPE(t, ptr, old, new) \ + (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) + +#ifdef CONFIG_X86_64 +# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new) +#else +# define CMPXCHG64(ptr, old, new) \ + (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) +#endif + static int emulator_cmpxchg_emulated(unsigned long addr, const void *old, const void *new, unsigned int bytes, struct kvm_vcpu *vcpu) { - printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); -#ifndef CONFIG_X86_64 - /* guests cmpxchg8b have to be emulated atomically */ - if (bytes == 8) { - gpa_t gpa; - struct page *page; - char *kaddr; - u64 val; + gpa_t gpa; + struct page *page; + char *kaddr; + bool exchanged; - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); + /* guests cmpxchg8b have to be emulated atomically */ + if (bytes > 8 || (bytes & (bytes - 1))) + goto emul_write; - if (gpa == UNMAPPED_GVA || - (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - goto emul_write; + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); - if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) - goto emul_write; + if (gpa == UNMAPPED_GVA || + (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + goto emul_write; - val = *(u64 *)new; + if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) + goto emul_write; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - kaddr = kmap_atomic(page, KM_USER0); - set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); - kunmap_atomic(kaddr, KM_USER0); - kvm_release_page_dirty(page); + kaddr = kmap_atomic(page, KM_USER0); + kaddr += offset_in_page(gpa); + switch (bytes) { + case 1: + exchanged = CMPXCHG_TYPE(u8, kaddr, old, new); + break; + case 2: + exchanged = CMPXCHG_TYPE(u16, kaddr, old, new); + break; + case 4: + exchanged = CMPXCHG_TYPE(u32, kaddr, old, new); + break; + case 8: + exchanged = CMPXCHG64(kaddr, old, new); + break; + default: + BUG(); } + kunmap_atomic(kaddr, KM_USER0); + kvm_release_page_dirty(page); + + if (!exchanged) + return X86EMUL_CMPXCHG_FAILED; + + kvm_mmu_pte_write(vcpu, gpa, new, bytes, 1); + + return X86EMUL_CONTINUE; + emul_write: -#endif + printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); return emulator_write_emulated(addr, new, bytes, vcpu); } +static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) +{ + /* TODO: String I/O for in kernel device */ + int r; + + if (vcpu->arch.pio.in) + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, + vcpu->arch.pio.size, pd); + else + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); + return r; +} + + +static int emulator_pio_in_emulated(int size, unsigned short port, void *val, + unsigned int count, struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.pio.count) + goto data_avail; + + trace_kvm_pio(1, port, size, 1); + + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = 1; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + data_avail: + memcpy(val, vcpu->arch.pio_data, size * count); + vcpu->arch.pio.count = 0; + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = KVM_EXIT_IO_IN; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + + return 0; +} + +static int emulator_pio_out_emulated(int size, unsigned short port, + const void *val, unsigned int count, + struct kvm_vcpu *vcpu) +{ + trace_kvm_pio(0, port, size, 1); + + vcpu->arch.pio.port = port; + vcpu->arch.pio.in = 0; + vcpu->arch.pio.count = count; + vcpu->arch.pio.size = size; + + memcpy(vcpu->arch.pio_data, val, size * count); + + if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { + vcpu->arch.pio.count = 0; + return 1; + } + + vcpu->run->exit_reason = KVM_EXIT_IO; + vcpu->run->io.direction = KVM_EXIT_IO_OUT; + vcpu->run->io.size = size; + vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; + vcpu->run->io.count = count; + vcpu->run->io.port = port; + + return 0; +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) { return kvm_x86_ops->get_segment_base(vcpu, seg); @@ -3316,14 +3613,14 @@ int emulate_clts(struct kvm_vcpu *vcpu) int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest); + return kvm_get_dr(ctxt->vcpu, dr, dest); } int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; - return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask); + return kvm_set_dr(ctxt->vcpu, dr, value & mask); } void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) @@ -3344,12 +3641,167 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +{ + unsigned long value; + + switch (cr) { + case 0: + value = kvm_read_cr0(vcpu); + break; + case 2: + value = vcpu->arch.cr2; + break; + case 3: + value = vcpu->arch.cr3; + break; + case 4: + value = kvm_read_cr4(vcpu); + break; + case 8: + value = kvm_get_cr8(vcpu); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + return 0; + } + + return value; +} + +static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +{ + switch (cr) { + case 0: + kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); + break; + case 2: + vcpu->arch.cr2 = val; + break; + case 3: + kvm_set_cr3(vcpu, val); + break; + case 4: + kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); + break; + case 8: + kvm_set_cr8(vcpu, val & 0xfUL); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + } +} + +static int emulator_get_cpl(struct kvm_vcpu *vcpu) +{ + return kvm_x86_ops->get_cpl(vcpu); +} + +static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_gdt(vcpu, dt); +} + +static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment var; + + kvm_get_segment(vcpu, &var, seg); + + if (var.unusable) + return false; + + if (var.g) + var.limit >>= 12; + set_desc_limit(desc, var.limit); + set_desc_base(desc, (unsigned long)var.base); + desc->type = var.type; + desc->s = var.s; + desc->dpl = var.dpl; + desc->p = var.present; + desc->avl = var.avl; + desc->l = var.l; + desc->d = var.db; + desc->g = var.g; + + return true; +} + +static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment var; + + /* needed to preserve selector */ + kvm_get_segment(vcpu, &var, seg); + + var.base = get_desc_base(desc); + var.limit = get_desc_limit(desc); + if (desc->g) + var.limit = (var.limit << 12) | 0xfff; + var.type = desc->type; + var.present = desc->p; + var.dpl = desc->dpl; + var.db = desc->d; + var.s = desc->s; + var.l = desc->l; + var.g = desc->g; + var.avl = desc->avl; + var.present = desc->p; + var.unusable = !var.present; + var.padding = 0; + + kvm_set_segment(vcpu, &var, seg); + return; +} + +static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +static void emulator_set_segment_selector(u16 sel, int seg, + struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + +static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + kvm_x86_ops->set_rflags(vcpu, rflags); +} + static struct x86_emulate_ops emulate_ops = { .read_std = kvm_read_guest_virt_system, + .write_std = kvm_write_guest_virt_system, .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .pio_in_emulated = emulator_pio_in_emulated, + .pio_out_emulated = emulator_pio_out_emulated, + .get_cached_descriptor = emulator_get_cached_descriptor, + .set_cached_descriptor = emulator_set_cached_descriptor, + .get_segment_selector = emulator_get_segment_selector, + .set_segment_selector = emulator_set_segment_selector, + .get_gdt = emulator_get_gdt, + .get_cr = emulator_get_cr, + .set_cr = emulator_set_cr, + .cpl = emulator_get_cpl, + .set_rflags = emulator_set_rflags, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -3380,14 +3832,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu, cache_all_regs(vcpu); vcpu->mmio_is_write = 0; - vcpu->arch.pio.string = 0; if (!(emulation_type & EMULTYPE_NO_DECODE)) { int cs_db, cs_l; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) @@ -3396,6 +3848,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ @@ -3428,6 +3881,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ++vcpu->stat.insn_emulation; if (r) { ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) return EMULATE_DONE; return EMULATE_FAIL; @@ -3439,16 +3893,20 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } +restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; if (r == 0) kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); - if (vcpu->arch.pio.string) + if (vcpu->arch.pio.count) { + if (!vcpu->arch.pio.in) + vcpu->arch.pio.count = 0; return EMULATE_DO_MMIO; + } - if ((r || vcpu->mmio_is_write) && run) { + if (r || vcpu->mmio_is_write) { run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = vcpu->mmio_phys_addr; memcpy(run->mmio.data, vcpu->mmio_data, 8); @@ -3458,222 +3916,41 @@ int emulate_instruction(struct kvm_vcpu *vcpu, if (r) { if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) - return EMULATE_DONE; + goto done; if (!vcpu->mmio_needed) { + ++vcpu->stat.insn_emulation_fail; + trace_kvm_emulate_insn_failed(vcpu); kvm_report_emulation_failure(vcpu, "mmio"); return EMULATE_FAIL; } return EMULATE_DO_MMIO; } - kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - if (vcpu->mmio_is_write) { vcpu->mmio_needed = 0; return EMULATE_DO_MMIO; } - return EMULATE_DONE; -} -EXPORT_SYMBOL_GPL(emulate_instruction); - -static int pio_copy_data(struct kvm_vcpu *vcpu) -{ - void *p = vcpu->arch.pio_data; - gva_t q = vcpu->arch.pio.guest_gva; - unsigned bytes; - int ret; - u32 error_code; - - bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; - if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); - else - ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); - - if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(vcpu, q, error_code); - - return ret; -} - -int complete_pio(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - long delta; - int r; - unsigned long val; - - if (!io->string) { - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(&val, vcpu->arch.pio_data, io->size); - kvm_register_write(vcpu, VCPU_REGS_RAX, val); - } - } else { - if (io->in) { - r = pio_copy_data(vcpu); - if (r) - goto out; - } - - delta = 1; - if (io->rep) { - delta *= io->cur_count; - /* - * The size of the register should really depend on - * current address size. - */ - val = kvm_register_read(vcpu, VCPU_REGS_RCX); - val -= delta; - kvm_register_write(vcpu, VCPU_REGS_RCX, val); - } - if (io->down) - delta = -delta; - delta *= io->size; - if (io->in) { - val = kvm_register_read(vcpu, VCPU_REGS_RDI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RDI, val); - } else { - val = kvm_register_read(vcpu, VCPU_REGS_RSI); - val += delta; - kvm_register_write(vcpu, VCPU_REGS_RSI, val); - } - } -out: - io->count -= io->cur_count; - io->cur_count = 0; - - return 0; -} - -static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) -{ - /* TODO: String I/O for in kernel device */ - int r; - - if (vcpu->arch.pio.in) - r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); - else - r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - vcpu->arch.pio.port, vcpu->arch.pio.size, - pd); - return r; -} +done: + if (vcpu->arch.exception.pending) + vcpu->arch.emulate_ctxt.restart = false; -static int pio_string_write(struct kvm_vcpu *vcpu) -{ - struct kvm_pio_request *io = &vcpu->arch.pio; - void *pd = vcpu->arch.pio_data; - int i, r = 0; + if (vcpu->arch.emulate_ctxt.restart) + goto restart; - for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, - io->port, io->size, pd)) { - r = -EOPNOTSUPP; - break; - } - pd += io->size; - } - return r; -} - -int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) -{ - unsigned long val; - - trace_kvm_pio(!in, port, size, 1); - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 0; - vcpu->arch.pio.down = 0; - vcpu->arch.pio.rep = 0; - - if (!vcpu->arch.pio.in) { - val = kvm_register_read(vcpu, VCPU_REGS_RAX); - memcpy(vcpu->arch.pio_data, &val, 4); - } - - if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { - complete_pio(vcpu); - return 1; - } - return 0; + return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(kvm_emulate_pio); +EXPORT_SYMBOL_GPL(emulate_instruction); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, - int size, unsigned long count, int down, - gva_t address, int rep, unsigned port) +int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { - unsigned now, in_page; - int ret = 0; - - trace_kvm_pio(!in, port, size, count); - - vcpu->run->exit_reason = KVM_EXIT_IO; - vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; - vcpu->run->io.size = vcpu->arch.pio.size = size; - vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; - vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count; - vcpu->run->io.port = vcpu->arch.pio.port = port; - vcpu->arch.pio.in = in; - vcpu->arch.pio.string = 1; - vcpu->arch.pio.down = down; - vcpu->arch.pio.rep = rep; - - if (!count) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return 1; - } - - if (!down) - in_page = PAGE_SIZE - offset_in_page(address); - else - in_page = offset_in_page(address) + size; - now = min(count, (unsigned long)in_page / size); - if (!now) - now = 1; - if (down) { - /* - * String I/O in reverse. Yuck. Kill the guest, fix later. - */ - pr_unimpl(vcpu, "guest string pio down\n"); - kvm_inject_gp(vcpu, 0); - return 1; - } - vcpu->run->io.count = now; - vcpu->arch.pio.cur_count = now; - - if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count) - kvm_x86_ops->skip_emulated_instruction(vcpu); - - vcpu->arch.pio.guest_gva = address; - - if (!vcpu->arch.pio.in) { - /* string PIO write */ - ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) - return 1; - if (ret == 0 && !pio_string_write(vcpu)) { - complete_pio(vcpu); - if (vcpu->arch.pio.count == 0) - ret = 1; - } - } - /* no string PIO read support yet */ - + unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); + int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + /* do not return to emulator after return from userspace */ + vcpu->arch.pio.count = 0; return ret; } -EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); +EXPORT_SYMBOL_GPL(kvm_fast_pio_out); static void bounce_off(void *info) { @@ -3996,85 +4273,20 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) return emulator_write_emulated(rip, instruction, 3, vcpu); } -static u64 mk_cr_64(u64 curr_cr, u32 new_val) -{ - return (curr_cr & ~((1ULL << 32) - 1)) | new_val; -} - void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct descriptor_table dt = { limit, base }; + struct desc_ptr dt = { limit, base }; kvm_x86_ops->set_gdt(vcpu, &dt); } void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) { - struct descriptor_table dt = { limit, base }; + struct desc_ptr dt = { limit, base }; kvm_x86_ops->set_idt(vcpu, &dt); } -void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, - unsigned long *rflags) -{ - kvm_lmsw(vcpu, msw); - *rflags = kvm_get_rflags(vcpu); -} - -unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) -{ - unsigned long value; - - switch (cr) { - case 0: - value = kvm_read_cr0(vcpu); - break; - case 2: - value = vcpu->arch.cr2; - break; - case 3: - value = vcpu->arch.cr3; - break; - case 4: - value = kvm_read_cr4(vcpu); - break; - case 8: - value = kvm_get_cr8(vcpu); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - return 0; - } - - return value; -} - -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, - unsigned long *rflags) -{ - switch (cr) { - case 0: - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); - *rflags = kvm_get_rflags(vcpu); - break; - case 2: - vcpu->arch.cr2 = val; - break; - case 3: - kvm_set_cr3(vcpu, val); - break; - case 4: - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); - break; - case 8: - kvm_set_cr8(vcpu, val & 0xfUL); - break; - default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); - } -} - static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) { struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; @@ -4138,9 +4350,13 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0); + if (!best || best->eax < 0x80000008) + goto not_found; best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); if (best) return best->eax & 0xff; +not_found: return 36; } @@ -4254,9 +4470,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, - vcpu->arch.exception.error_code); + vcpu->arch.exception.error_code, + vcpu->arch.exception.reinject); return; } @@ -4486,7 +4706,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) } srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); - post_kvm_run_save(vcpu); vapic_exit(vcpu); @@ -4514,26 +4733,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.cur_count) { - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = complete_pio(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r) - goto out; - } - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - + if (vcpu->arch.pio.count || vcpu->mmio_needed || + vcpu->arch.emulate_ctxt.restart) { + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed = 1; + vcpu->mmio_needed = 0; + } vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); + r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { - /* - * Read-modify-write. Back to userspace. - */ r = 0; goto out; } @@ -4545,6 +4755,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = __vcpu_run(vcpu); out: + post_kvm_run_save(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -4616,12 +4827,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -void kvm_get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->get_segment(vcpu, var, seg); -} - void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { struct kvm_segment cs; @@ -4635,7 +4840,7 @@ EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits); int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - struct descriptor_table dt; + struct desc_ptr dt; vcpu_load(vcpu); @@ -4650,11 +4855,11 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_x86_ops->get_idt(vcpu, &dt); - sregs->idt.limit = dt.limit; - sregs->idt.base = dt.base; + sregs->idt.limit = dt.size; + sregs->idt.base = dt.address; kvm_x86_ops->get_gdt(vcpu, &dt); - sregs->gdt.limit = dt.limit; - sregs->gdt.base = dt.base; + sregs->gdt.limit = dt.size; + sregs->gdt.base = dt.address; sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; @@ -4693,563 +4898,33 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -static void kvm_set_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) -{ - kvm_x86_ops->set_segment(vcpu, var, seg); -} - -static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, - struct kvm_segment *kvm_desct) -{ - kvm_desct->base = get_desc_base(seg_desc); - kvm_desct->limit = get_desc_limit(seg_desc); - if (seg_desc->g) { - kvm_desct->limit <<= 12; - kvm_desct->limit |= 0xfff; - } - kvm_desct->selector = selector; - kvm_desct->type = seg_desc->type; - kvm_desct->present = seg_desc->p; - kvm_desct->dpl = seg_desc->dpl; - kvm_desct->db = seg_desc->d; - kvm_desct->s = seg_desc->s; - kvm_desct->l = seg_desc->l; - kvm_desct->g = seg_desc->g; - kvm_desct->avl = seg_desc->avl; - if (!selector) - kvm_desct->unusable = 1; - else - kvm_desct->unusable = 0; - kvm_desct->padding = 0; -} - -static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, - u16 selector, - struct descriptor_table *dtable) -{ - if (selector & 1 << 2) { - struct kvm_segment kvm_seg; - - kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); - - if (kvm_seg.unusable) - dtable->limit = 0; - else - dtable->limit = kvm_seg.limit; - dtable->base = kvm_seg.base; - } - else - kvm_x86_ops->get_gdt(vcpu, dtable); -} - -/* allowed just for 8 bytes segments */ -static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - struct desc_struct *seg_desc) -{ - struct descriptor_table dtable; - u16 index = selector >> 3; - int ret; - u32 err; - gva_t addr; - - get_segment_descriptor_dtable(vcpu, selector, &dtable); - - if (dtable.limit < index * 8 + 7) { - kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); - return X86EMUL_PROPAGATE_FAULT; - } - addr = dtable.base + index * 8; - ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), - vcpu, &err); - if (ret == X86EMUL_PROPAGATE_FAULT) - kvm_inject_page_fault(vcpu, addr, err); - - return ret; -} - -/* allowed just for 8 bytes segments */ -static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - struct desc_struct *seg_desc) -{ - struct descriptor_table dtable; - u16 index = selector >> 3; - - get_segment_descriptor_dtable(vcpu, selector, &dtable); - - if (dtable.limit < index * 8 + 7) - return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL); -} - -static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc) -{ - u32 base_addr = get_desc_base(seg_desc); - - return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL); -} - -static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc) -{ - u32 base_addr = get_desc_base(seg_desc); - - return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL); -} - -static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_segment kvm_seg; - - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; -} - -static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) -{ - struct kvm_segment segvar = { - .base = selector << 4, - .limit = 0xffff, - .selector = selector, - .type = 3, - .present = 1, - .dpl = 3, - .db = 0, - .s = 1, - .l = 0, - .g = 0, - .avl = 0, - .unusable = 0, - }; - kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return X86EMUL_CONTINUE; -} - -static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, + bool has_error_code, u32 error_code) { - return (seg != VCPU_SREG_LDTR) && - (seg != VCPU_SREG_TR) && - (kvm_get_rflags(vcpu) & X86_EFLAGS_VM); -} - -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) -{ - struct kvm_segment kvm_seg; - struct desc_struct seg_desc; - u8 dpl, rpl, cpl; - unsigned err_vec = GP_VECTOR; - u32 err_code = 0; - bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ - int ret; + int cs_db, cs_l, ret; + cache_all_regs(vcpu); - if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu)) - return kvm_load_realmode_segment(vcpu, selector, seg); + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) - && null_selector) - goto exception; + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - /* TR should be in GDT only */ - if (seg == VCPU_SREG_TR && (selector & (1 << 2))) - goto exception; + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + tss_selector, reason, has_error_code, + error_code); - ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); if (ret) - return ret; - - seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); - - if (null_selector) { /* for NULL selector skip all following checks */ - kvm_seg.unusable = 1; - goto load; - } - - err_code = selector & 0xfffc; - err_vec = GP_VECTOR; - - /* can't load system descriptor into segment selecor */ - if (seg <= VCPU_SREG_GS && !kvm_seg.s) - goto exception; - - if (!kvm_seg.present) { - err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; - goto exception; - } - - rpl = selector & 3; - dpl = kvm_seg.dpl; - cpl = kvm_x86_ops->get_cpl(vcpu); - - switch (seg) { - case VCPU_SREG_SS: - /* - * segment is not a writable data segment or segment - * selector's RPL != CPL or segment selector's RPL != CPL - */ - if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) - goto exception; - break; - case VCPU_SREG_CS: - if (!(kvm_seg.type & 8)) - goto exception; - - if (kvm_seg.type & 4) { - /* conforming */ - if (dpl > cpl) - goto exception; - } else { - /* nonconforming */ - if (rpl > cpl || dpl != cpl) - goto exception; - } - /* CS(RPL) <- CPL */ - selector = (selector & 0xfffc) | cpl; - break; - case VCPU_SREG_TR: - if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) - goto exception; - break; - case VCPU_SREG_LDTR: - if (kvm_seg.s || kvm_seg.type != 2) - goto exception; - break; - default: /* DS, ES, FS, or GS */ - /* - * segment is not a data or readable code segment or - * ((segment is a data or nonconforming code segment) - * and (both RPL and CPL > DPL)) - */ - if ((kvm_seg.type & 0xa) == 0x8 || - (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) - goto exception; - break; - } - - if (!kvm_seg.unusable && kvm_seg.s) { - /* mark segment as accessed */ - kvm_seg.type |= 1; - seg_desc.type |= 1; - save_guest_segment_descriptor(vcpu, selector, &seg_desc); - } -load: - kvm_set_segment(vcpu, &kvm_seg, seg); - return X86EMUL_CONTINUE; -exception: - kvm_queue_exception_e(vcpu, err_vec, err_code); - return X86EMUL_PROPAGATE_FAULT; -} - -static void save_state_to_tss32(struct kvm_vcpu *vcpu, - struct tss_segment_32 *tss) -{ - tss->cr3 = vcpu->arch.cr3; - tss->eip = kvm_rip_read(vcpu); - tss->eflags = kvm_get_rflags(vcpu); - tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); - tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); - tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); - tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); - tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); - tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); - tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); - tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); - tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); - tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); - tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); - tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); - tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); - tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); - tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); -} - -static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) -{ - struct kvm_segment kvm_seg; - kvm_get_segment(vcpu, &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); -} - -static int load_state_from_tss32(struct kvm_vcpu *vcpu, - struct tss_segment_32 *tss) -{ - kvm_set_cr3(vcpu, tss->cr3); - - kvm_rip_write(vcpu, tss->eip); - kvm_set_rflags(vcpu, tss->eflags | 2); - - kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); - kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); - kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); - kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); - kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); - kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); - kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); - kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); - kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); - kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); - kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); - kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); - - /* - * Now load segment descriptors. If fault happenes at this stage - * it is handled in a context of new task - */ - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) - return 1; + return EMULATE_FAIL; - if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS)) - return 1; - return 0; -} - -static void save_state_to_tss16(struct kvm_vcpu *vcpu, - struct tss_segment_16 *tss) -{ - tss->ip = kvm_rip_read(vcpu); - tss->flag = kvm_get_rflags(vcpu); - tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); - tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); - tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); - tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); - tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); - tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); - tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); - tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); - - tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); - tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); - tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); - tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); - tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); -} - -static int load_state_from_tss16(struct kvm_vcpu *vcpu, - struct tss_segment_16 *tss) -{ - kvm_rip_write(vcpu, tss->ip); - kvm_set_rflags(vcpu, tss->flag | 2); - kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); - kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); - kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); - kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); - kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); - kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); - kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); - kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - - /* - * SDM says that segment selectors are loaded before segment - * descriptors - */ - kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); - kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); - kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); - kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); - kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - - /* - * Now load segment descriptors. If fault happenes at this stage - * it is handled in a context of new task - */ - if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS)) - return 1; - return 0; -} - -static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - u16 old_tss_sel, u32 old_tss_base, - struct desc_struct *nseg_desc) -{ - struct tss_segment_16 tss_segment_16; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, - sizeof tss_segment_16)) - goto out; - - save_state_to_tss16(vcpu, &tss_segment_16); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, - sizeof tss_segment_16)) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), - &tss_segment_16, sizeof tss_segment_16)) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_16.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr_write(vcpu, nseg_desc), - &tss_segment_16.prev_task_link, - sizeof tss_segment_16.prev_task_link)) - goto out; - } - - if (load_state_from_tss16(vcpu, &tss_segment_16)) - goto out; - - ret = 1; -out: - return ret; -} - -static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - u16 old_tss_sel, u32 old_tss_base, - struct desc_struct *nseg_desc) -{ - struct tss_segment_32 tss_segment_32; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, - sizeof tss_segment_32)) - goto out; - - save_state_to_tss32(vcpu, &tss_segment_32); - - if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, - sizeof tss_segment_32)) - goto out; - - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc), - &tss_segment_32, sizeof tss_segment_32)) - goto out; - - if (old_tss_sel != 0xffff) { - tss_segment_32.prev_task_link = old_tss_sel; - - if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr_write(vcpu, nseg_desc), - &tss_segment_32.prev_task_link, - sizeof tss_segment_32.prev_task_link)) - goto out; - } - - if (load_state_from_tss32(vcpu, &tss_segment_32)) - goto out; - - ret = 1; -out: - return ret; -} - -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) -{ - struct kvm_segment tr_seg; - struct desc_struct cseg_desc; - struct desc_struct nseg_desc; - int ret = 0; - u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); - u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - u32 desc_limit; - - old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL); - - /* FIXME: Handle errors. Failure to read either TSS or their - * descriptors should generate a pagefault. - */ - if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) - goto out; - - if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) - goto out; - - if (reason != TASK_SWITCH_IRET) { - int cpl; - - cpl = kvm_x86_ops->get_cpl(vcpu); - if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { - kvm_queue_exception_e(vcpu, GP_VECTOR, 0); - return 1; - } - } - - desc_limit = get_desc_limit(&nseg_desc); - if (!nseg_desc.p || - ((desc_limit < 0x67 && (nseg_desc.type & 8)) || - desc_limit < 0x2b)) { - kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); - return 1; - } - - if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { - cseg_desc.type &= ~(1 << 1); //clear the B flag - save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); - } - - if (reason == TASK_SWITCH_IRET) { - u32 eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); - } - - /* set back link to prev task only if NT bit is set in eflags - note that old_tss_sel is not used afetr this point */ - if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) - old_tss_sel = 0xffff; - - if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - else - ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); - - if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { - u32 eflags = kvm_get_rflags(vcpu); - kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT); - } - - if (reason != TASK_SWITCH_IRET) { - nseg_desc.type |= (1 << 1); - save_guest_segment_descriptor(vcpu, tss_selector, - &nseg_desc); - } - - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); - seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); - tr_seg.type = 11; - kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); -out: - return ret; + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -5258,15 +4933,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { int mmu_reset_needed = 0; int pending_vec, max_bits; - struct descriptor_table dt; + struct desc_ptr dt; vcpu_load(vcpu); - dt.limit = sregs->idt.limit; - dt.base = sregs->idt.base; + dt.size = sregs->idt.limit; + dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); - dt.limit = sregs->gdt.limit; - dt.base = sregs->gdt.base; + dt.size = sregs->gdt.limit; + dt.address = sregs->gdt.base; kvm_x86_ops->set_gdt(vcpu, &dt); vcpu->arch.cr2 = sregs->cr2; @@ -5365,11 +5040,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); } - if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - vcpu->arch.singlestep_cs = - get_segment_selector(vcpu, VCPU_SREG_CS); - vcpu->arch.singlestep_rip = kvm_rip_read(vcpu); - } + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); /* * Trigger an rflags update that will inject or remove the trace @@ -5860,13 +5533,22 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) return kvm_x86_ops->interrupt_allowed(vcpu); } +bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip) +{ + unsigned long current_rip = kvm_rip_read(vcpu) + + get_segment_base(vcpu, VCPU_SREG_CS); + + return current_rip == linear_rip; +} +EXPORT_SYMBOL_GPL(kvm_is_linear_rip); + unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags; rflags = kvm_x86_ops->get_rflags(vcpu); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) - rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF); + rflags &= ~X86_EFLAGS_TF; return rflags; } EXPORT_SYMBOL_GPL(kvm_get_rflags); @@ -5874,10 +5556,8 @@ EXPORT_SYMBOL_GPL(kvm_get_rflags); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && - vcpu->arch.singlestep_cs == - get_segment_selector(vcpu, VCPU_SREG_CS) && - vcpu->arch.singlestep_rip == kvm_rip_read(vcpu)) - rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; + kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) + rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); } EXPORT_SYMBOL_GPL(kvm_set_rflags); @@ -5893,3 +5573,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a404722d2..f4b54458285 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -65,6 +65,13 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } +static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) +{ + return rcu_dereference_check(kvm->arch.aliases, + srcu_read_lock_held(&kvm->srcu) + || lockdep_is_held(&kvm->slots_lock)); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 2bdf628066b..9257510b483 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1390,7 +1390,6 @@ __init void lguest_init(void) #endif #ifdef CONFIG_ACPI acpi_disabled = 1; - acpi_ht = 0; #endif /* diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 550df481acc..787c52ca49c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -3,12 +3,6 @@ #include <linux/module.h> #include <linux/bootmem.h> -#ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(x...) printk(KERN_DEBUG x) -#else -# define DBG(x...) -#endif - /* * Which logical CPUs are on which nodes */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 8948f47fde0..a7bcc23ef96 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -33,9 +33,6 @@ int numa_off __initdata; static unsigned long __initdata nodemap_addr; static unsigned long __initdata nodemap_size; -DEFINE_PER_CPU(int, node_number) = 0; -EXPORT_PER_CPU_SYMBOL(node_number); - /* * Map cpu index to node index */ @@ -809,7 +806,7 @@ void __cpuinit numa_set_node(int cpu, int node) per_cpu(x86_cpu_to_node_map, cpu) = node; if (node != NUMA_NO_NODE) - per_cpu(node_number, cpu) = node; + set_cpu_numa_node(cpu, node); } void __cpuinit numa_clear_node(int cpu) @@ -867,7 +864,7 @@ void __cpuinit numa_remove_cpu(int cpu) numa_set_cpumask(cpu, 0); } -int cpu_to_node(int cpu) +int __cpu_to_node(int cpu) { if (early_per_cpu_ptr(x86_cpu_to_node_map)) { printk(KERN_WARNING @@ -877,7 +874,7 @@ int cpu_to_node(int cpu) } return per_cpu(x86_cpu_to_node_map, cpu); } -EXPORT_SYMBOL(cpu_to_node); +EXPORT_SYMBOL(__cpu_to_node); /* * Same function as cpu_to_node() but used if called before the diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index bbe5502ee1c..acc15b23b74 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -336,6 +336,7 @@ int free_memtype(u64 start, u64 end) { int err = -EINVAL; int is_range_ram; + struct memtype *entry; if (!pat_enabled) return 0; @@ -355,17 +356,20 @@ int free_memtype(u64 start, u64 end) } spin_lock(&memtype_lock); - err = rbt_memtype_erase(start, end); + entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (err) { + if (!entry) { printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", current->comm, current->pid, start, end); + return -EINVAL; } + kfree(entry); + dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); - return err; + return 0; } diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h index 4f39eefa3e6..77e5ba153fa 100644 --- a/arch/x86/mm/pat_internal.h +++ b/arch/x86/mm/pat_internal.h @@ -28,15 +28,15 @@ static inline char *cattr_name(unsigned long flags) #ifdef CONFIG_X86_PAT extern int rbt_memtype_check_insert(struct memtype *new, unsigned long *new_type); -extern int rbt_memtype_erase(u64 start, u64 end); +extern struct memtype *rbt_memtype_erase(u64 start, u64 end); extern struct memtype *rbt_memtype_lookup(u64 addr); extern int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos); #else static inline int rbt_memtype_check_insert(struct memtype *new, unsigned long *new_type) { return 0; } -static inline int rbt_memtype_erase(u64 start, u64 end) -{ return 0; } +static inline struct memtype *rbt_memtype_erase(u64 start, u64 end) +{ return NULL; } static inline struct memtype *rbt_memtype_lookup(u64 addr) { return NULL; } static inline int rbt_memtype_copy_nth_element(struct memtype *out, loff_t pos) diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 07de4cb8cc3..f537087bb74 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -231,16 +231,17 @@ int rbt_memtype_check_insert(struct memtype *new, unsigned long *ret_type) return err; } -int rbt_memtype_erase(u64 start, u64 end) +struct memtype *rbt_memtype_erase(u64 start, u64 end) { struct memtype *data; data = memtype_rb_exact_match(&memtype_rbroot, start, end); if (!data) - return -EINVAL; + goto out; rb_erase(&data->rb, &memtype_rbroot); - return 0; +out: + return data; } struct memtype *rbt_memtype_lookup(u64 addr) diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index df3d5c861cd..308e32570d8 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -34,7 +34,7 @@ /* IA32 Manual 3, 2-1 */ static unsigned char prefix_codes[] = { 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, - 0x65, 0x2E, 0x3E, 0x66, 0x67 + 0x65, 0x66, 0x67 }; /* IA32 Manual 3, 3-432*/ static unsigned int reg_rop[] = { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 792854003ed..cac71849925 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -9,7 +9,6 @@ #include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/module.h> -#include <linux/quicklist.h> #include <asm/system.h> #include <asm/pgtable.h> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index b110d97fb92..a0207a7fdf3 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -18,6 +18,8 @@ obj-$(CONFIG_X86_MRST) += mrst.o obj-y += common.o early.o obj-y += amd_bus.o bus_numa.o +obj-$(CONFIG_PCI_CNB20LE_QUIRK) += broadcom_bus.o + ifeq ($(CONFIG_PCI_DEBUG),y) EXTRA_CFLAGS += -DDEBUG endif diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 31930fd30ea..2ec04c424a6 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -207,10 +207,9 @@ get_current_resources(struct acpi_device *device, int busnum, if (!info.res) goto res_alloc_fail; - info.name = kmalloc(16, GFP_KERNEL); + info.name = kasprintf(GFP_KERNEL, "PCI Bus %04x:%02x", domain, busnum); if (!info.name) goto name_alloc_fail; - sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum); info.res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, @@ -224,8 +223,11 @@ res_alloc_fail: return; } -struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) +struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) { + struct acpi_device *device = root->device; + int domain = root->segment; + int busnum = root->secondary.start; struct pci_bus *bus; struct pci_sysdata *sd; int node; diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c new file mode 100644 index 00000000000..0846a5bbbfb --- /dev/null +++ b/arch/x86/pci/broadcom_bus.c @@ -0,0 +1,101 @@ +/* + * Read address ranges from a Broadcom CNB20LE Host Bridge + * + * Copyright (c) 2010 Ira W. Snyder <iws@ovro.caltech.edu> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/delay.h> +#include <linux/dmi.h> +#include <linux/pci.h> +#include <linux/init.h> +#include <asm/pci_x86.h> + +#include "bus_numa.h" + +static void __devinit cnb20le_res(struct pci_dev *dev) +{ + struct pci_root_info *info; + struct resource res; + u16 word1, word2; + u8 fbus, lbus; + int i; + + /* + * The x86_pci_root_bus_res_quirks() function already refuses to use + * this information if ACPI _CRS was used. Therefore, we don't bother + * checking if ACPI is enabled, and just generate the information + * for both the ACPI _CRS and no ACPI cases. + */ + + info = &pci_root_info[pci_root_num]; + pci_root_num++; + + /* read the PCI bus numbers */ + pci_read_config_byte(dev, 0x44, &fbus); + pci_read_config_byte(dev, 0x45, &lbus); + info->bus_min = fbus; + info->bus_max = lbus; + + /* + * Add the legacy IDE ports on bus 0 + * + * These do not exist anywhere in the bridge registers, AFAICT. I do + * not have the datasheet, so this is the best I can do. + */ + if (fbus == 0) { + update_res(info, 0x01f0, 0x01f7, IORESOURCE_IO, 0); + update_res(info, 0x03f6, 0x03f6, IORESOURCE_IO, 0); + update_res(info, 0x0170, 0x0177, IORESOURCE_IO, 0); + update_res(info, 0x0376, 0x0376, IORESOURCE_IO, 0); + update_res(info, 0xffa0, 0xffaf, IORESOURCE_IO, 0); + } + + /* read the non-prefetchable memory window */ + pci_read_config_word(dev, 0xc0, &word1); + pci_read_config_word(dev, 0xc2, &word2); + if (word1 != word2) { + res.start = (word1 << 16) | 0x0000; + res.end = (word2 << 16) | 0xffff; + res.flags = IORESOURCE_MEM; + update_res(info, res.start, res.end, res.flags, 0); + } + + /* read the prefetchable memory window */ + pci_read_config_word(dev, 0xc4, &word1); + pci_read_config_word(dev, 0xc6, &word2); + if (word1 != word2) { + res.start = (word1 << 16) | 0x0000; + res.end = (word2 << 16) | 0xffff; + res.flags = IORESOURCE_MEM | IORESOURCE_PREFETCH; + update_res(info, res.start, res.end, res.flags, 0); + } + + /* read the IO port window */ + pci_read_config_word(dev, 0xd0, &word1); + pci_read_config_word(dev, 0xd2, &word2); + if (word1 != word2) { + res.start = word1; + res.end = word2; + res.flags = IORESOURCE_IO; + update_res(info, res.start, res.end, res.flags, 0); + } + + /* print information about this host bridge */ + res.start = fbus; + res.end = lbus; + res.flags = IORESOURCE_BUS; + dev_info(&dev->dev, "CNB20LE PCI Host Bridge (domain %04x %pR)\n", + pci_domain_nr(dev->bus), &res); + + for (i = 0; i < info->res_num; i++) + dev_info(&dev->dev, "host bridge window %pR\n", &info->res[i]); +} + +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, + cnb20le_res); + diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index cf2e93869c4..215a27ae050 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -76,7 +76,7 @@ struct pci_ops pci_root_ops = { * This interrupt-safe spinlock protects all accesses to PCI * configuration space. */ -DEFINE_SPINLOCK(pci_config_lock); +DEFINE_RAW_SPINLOCK(pci_config_lock); static int __devinit can_skip_ioresource_align(const struct dmi_system_id *d) { diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 347d882b3bb..bd33620b007 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -27,7 +27,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, return -EINVAL; } - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); @@ -43,7 +43,7 @@ static int pci_conf1_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -56,7 +56,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 4095)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outl(PCI_CONF1_ADDRESS(bus, devfn, reg), 0xCF8); @@ -72,7 +72,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -108,7 +108,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); @@ -127,7 +127,7 @@ static int pci_conf2_read(unsigned int seg, unsigned int bus, outb(0, 0xCF8); - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -147,7 +147,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, if (dev & 0x10) return PCIBIOS_DEVICE_NOT_FOUND; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); outb((u8)(0xF0 | (fn << 1)), 0xCF8); outb((u8)bus, 0xCFA); @@ -166,7 +166,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, outb(0, 0xCF8); - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 97da2ba9344..6fdb3ec30c3 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -96,6 +96,7 @@ EXPORT_SYMBOL(pcibios_align_resource); * the fact the PCI specs explicitly allow address decoders to be * shared between expansion ROMs and other resource regions, it's * at least dangerous) + * - bad resource sizes or overlaps with other regions * * Our solution: * (1) Allocate resources for all buses behind PCI-to-PCI bridges. @@ -136,6 +137,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) * child resource allocations in this * range. */ + r->start = r->end = 0; r->flags = 0; } } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 5d362b5ba06..9810a0f76c9 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -589,8 +589,6 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_CPT_LPC1: - case PCI_DEVICE_ID_INTEL_CPT_LPC2: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; @@ -605,6 +603,13 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route return 1; } + if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } return 0; } diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 0db5eaf5456..8d460eaf524 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -11,28 +11,14 @@ */ static void __devinit pcibios_fixup_peer_bridges(void) { - int n, devfn; - long node; + int n; if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff) return; DBG("PCI: Peer bridge fixup\n"); - for (n=0; n <= pcibios_last_bus; n++) { - u32 l; - if (pci_find_bus(0, n)) - continue; - node = get_mp_bus_to_node(n); - for (devfn = 0; devfn < 256; devfn += 8) { - if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) && - l != 0x0000 && l != 0xffff) { - DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l); - printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n); - pci_scan_bus_on_node(n, &pci_root_ops, node); - break; - } - } - } + for (n=0; n <= pcibios_last_bus; n++) + pcibios_scan_specific_bus(n); } int __init pci_legacy_init(void) @@ -50,6 +36,28 @@ int __init pci_legacy_init(void) return 0; } +void pcibios_scan_specific_bus(int busn) +{ + int devfn; + long node; + u32 l; + + if (pci_find_bus(0, busn)) + return; + + node = get_mp_bus_to_node(busn); + for (devfn = 0; devfn < 256; devfn += 8) { + if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) && + l != 0x0000 && l != 0xffff) { + DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l); + printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn); + pci_scan_bus_on_node(busn, &pci_root_ops, node); + return; + } + } +} +EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus); + int __init pci_subsys_init(void) { /* diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 39b9ebe8f88..a918553ebc7 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -483,16 +483,17 @@ static void __init pci_mmcfg_reject_broken(int early) list_for_each_entry(cfg, &pci_mmcfg_list, list) { int valid = 0; - if (!early && !acpi_disabled) + if (!early && !acpi_disabled) { valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0); - if (valid) - continue; - - if (!early) - printk(KERN_ERR FW_BUG PREFIX - "MMCONFIG at %pR not reserved in " - "ACPI motherboard resources\n", &cfg->res); + if (valid) + continue; + else + printk(KERN_ERR FW_BUG PREFIX + "MMCONFIG at %pR not reserved in " + "ACPI motherboard resources\n", + &cfg->res); + } /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index 90d5fd476ed..a3d9c54792a 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -64,7 +64,7 @@ err: *value = -1; if (!base) goto err; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); pci_exp_set_dev_base(base, bus, devfn); @@ -79,7 +79,7 @@ err: *value = -1; *value = mmio_config_readl(mmcfg_virt_addr + reg); break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -97,7 +97,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, if (!base) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); pci_exp_set_dev_base(base, bus, devfn); @@ -112,7 +112,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, mmio_config_writel(mmcfg_virt_addr + reg, value); break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8223738ad80..5c9e2458df4 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -37,7 +37,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); write_cf8(bus, devfn, reg); @@ -62,7 +62,7 @@ static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } @@ -76,7 +76,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); write_cf8(bus, devfn, reg); @@ -101,7 +101,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return 0; } diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 59a225c17b8..2492d165096 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -162,7 +162,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, if (!value || (bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); switch (len) { case 1: @@ -213,7 +213,7 @@ static int pci_bios_read(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); } @@ -228,7 +228,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, if ((bus > 255) || (devfn > 255) || (reg > 255)) return -EINVAL; - spin_lock_irqsave(&pci_config_lock, flags); + raw_spin_lock_irqsave(&pci_config_lock, flags); switch (len) { case 1: @@ -269,7 +269,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, break; } - spin_unlock_irqrestore(&pci_config_lock, flags); + raw_spin_unlock_irqrestore(&pci_config_lock, flags); return (int)((result & 0xff00) >> 8); } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 0a979f3e5b8..1290ba54b35 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -105,6 +105,8 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr4 = read_cr4(); ctxt->cr8 = read_cr8(); #endif + ctxt->misc_enable_saved = !rdmsrl_safe(MSR_IA32_MISC_ENABLE, + &ctxt->misc_enable); } /* Needed by apm.c */ @@ -152,6 +154,8 @@ static void fix_processor_context(void) */ static void __restore_processor_state(struct saved_context *ctxt) { + if (ctxt->misc_enable_saved) + wrmsrl(MSR_IA32_MISC_ENABLE, ctxt->misc_enable); /* * control registers */ diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 987267f79bf..a9c66110803 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -60,6 +60,6 @@ static void xen_vcpu_notify_restore(void *data) void xen_arch_resume(void) { - smp_call_function(xen_vcpu_notify_restore, - (void *)CLOCK_EVT_NOTIFY_RESUME, 1); + on_each_cpu(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); } |