diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2013-05-01 08:47:44 -0700 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2013-05-01 08:47:44 -0700 |
commit | bf61c8840efe60fd8f91446860b63338fb424158 (patch) | |
tree | 7a71832407a4f0d6346db773343f4c3ae2257b19 /arch/x86/kvm | |
parent | 5846115b30f3a881e542c8bfde59a699c1c13740 (diff) | |
parent | 0c6a61657da78098472fd0eb71cc01f2387fa1bb (diff) |
Merge branch 'next' into for-linus
Prepare first set of updates for 3.10 merge window.
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/cpuid.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 11 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 681 | ||||
-rw-r--r-- | arch/x86/kvm/i8254.c | 1 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 2 | ||||
-rw-r--r-- | arch/x86/kvm/irq.c | 74 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 142 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 241 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 211 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 72 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 63 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 924 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 757 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 2 |
16 files changed, 2207 insertions, 1017 deletions
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ec79e773342..a20ecb5b6cb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, if (index == 0) { entry->ebx &= kvm_supported_word9_x86_features; cpuid_mask(&entry->ebx, 9); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); } else entry->ebx = 0; entry->eax = 0; @@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } else *eax = *ebx = *ecx = *edx = 0; } +EXPORT_SYMBOL_GPL(kvm_cpuid); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index a10e4601685..b7fd0798488 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -24,10 +24,21 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + best = kvm_find_cpuid_entry(vcpu, 1, 0); return best && (best->ecx & bit(X86_FEATURE_XSAVE)); } +static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); +} + static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 39171cb307e..a335cc6cde7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -24,6 +24,7 @@ #include "kvm_cache_regs.h" #include <linux/module.h> #include <asm/kvm_emulate.h> +#include <linux/stringify.h> #include "x86.h" #include "tss.h" @@ -43,7 +44,7 @@ #define OpCL 9ull /* CL register (for shifts) */ #define OpImmByte 10ull /* 8-bit sign extended immediate */ #define OpOne 11ull /* Implied 1 */ -#define OpImm 12ull /* Sign extended immediate */ +#define OpImm 12ull /* Sign extended up to 32-bit immediate */ #define OpMem16 13ull /* Memory operand (16-bit). */ #define OpMem32 14ull /* Memory operand (32-bit). */ #define OpImmU 15ull /* Immediate operand, zero extended */ @@ -58,6 +59,7 @@ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ #define OpMem8 26ull /* 8-bit zero extended memory operand */ +#define OpImm64 27ull /* Sign extended 16/32/64-bit immediate */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -101,6 +103,7 @@ #define SrcMemFAddr (OpMemFAddr << SrcShift) #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) +#define SrcImm64 (OpImm64 << SrcShift) #define SrcDX (OpDX << SrcShift) #define SrcMem8 (OpMem8 << SrcShift) #define SrcMask (OpMask << SrcShift) @@ -113,6 +116,7 @@ #define GroupDual (2<<15) /* Alternate decoding of mod == 3 */ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ +#define Escape (5<<15) /* Escape to coprocessor instruction */ #define Sse (1<<18) /* SSE Vector instruction */ /* Generic ModRM decode. */ #define ModRM (1<<19) @@ -146,6 +150,8 @@ #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ #define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ #define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ +#define NoWrite ((u64)1 << 45) /* No writeback */ #define X2(x...) x, x #define X3(x...) X2(x), x @@ -156,6 +162,27 @@ #define X8(x...) X4(x), X4(x) #define X16(x...) X8(x), X8(x) +#define NR_FASTOP (ilog2(sizeof(ulong)) + 1) +#define FASTOP_SIZE 8 + +/* + * fastop functions have a special calling convention: + * + * dst: [rdx]:rax (in/out) + * src: rbx (in/out) + * src2: rcx (in) + * flags: rflags (in/out) + * + * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for + * different operand sizes can be reached by calculation, rather than a jump + * table (which would be bigger than the code). + * + * fastop functions are declared as taking a never-defined fastop parameter, + * so they can't be called from C directly. + */ + +struct fastop; + struct opcode { u64 flags : 56; u64 intercept : 8; @@ -164,6 +191,8 @@ struct opcode { const struct opcode *group; const struct group_dual *gdual; const struct gprefix *gprefix; + const struct escape *esc; + void (*fastop)(struct fastop *fake); } u; int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; @@ -180,6 +209,11 @@ struct gprefix { struct opcode pfx_f3; }; +struct escape { + struct opcode op[8]; + struct opcode high[64]; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -407,6 +441,97 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) } \ } while (0) +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); + +#define FOP_ALIGN ".align " __stringify(FASTOP_SIZE) " \n\t" +#define FOP_RET "ret \n\t" + +#define FOP_START(op) \ + extern void em_##op(struct fastop *fake); \ + asm(".pushsection .text, \"ax\" \n\t" \ + ".global em_" #op " \n\t" \ + FOP_ALIGN \ + "em_" #op ": \n\t" + +#define FOP_END \ + ".popsection") + +#define FOPNOP() FOP_ALIGN FOP_RET + +#define FOP1E(op, dst) \ + FOP_ALIGN #op " %" #dst " \n\t" FOP_RET + +#define FASTOP1(op) \ + FOP_START(op) \ + FOP1E(op##b, al) \ + FOP1E(op##w, ax) \ + FOP1E(op##l, eax) \ + ON64(FOP1E(op##q, rax)) \ + FOP_END + +#define FOP2E(op, dst, src) \ + FOP_ALIGN #op " %" #src ", %" #dst " \n\t" FOP_RET + +#define FASTOP2(op) \ + FOP_START(op) \ + FOP2E(op##b, al, bl) \ + FOP2E(op##w, ax, bx) \ + FOP2E(op##l, eax, ebx) \ + ON64(FOP2E(op##q, rax, rbx)) \ + FOP_END + +/* 2 operand, word only */ +#define FASTOP2W(op) \ + FOP_START(op) \ + FOPNOP() \ + FOP2E(op##w, ax, bx) \ + FOP2E(op##l, eax, ebx) \ + ON64(FOP2E(op##q, rax, rbx)) \ + FOP_END + +/* 2 operand, src is CL */ +#define FASTOP2CL(op) \ + FOP_START(op) \ + FOP2E(op##b, al, cl) \ + FOP2E(op##w, ax, cl) \ + FOP2E(op##l, eax, cl) \ + ON64(FOP2E(op##q, rax, cl)) \ + FOP_END + +#define FOP3E(op, dst, src, src2) \ + FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET + +/* 3-operand, word-only, src2=cl */ +#define FASTOP3WCL(op) \ + FOP_START(op) \ + FOPNOP() \ + FOP3E(op##w, ax, bx, cl) \ + FOP3E(op##l, eax, ebx, cl) \ + ON64(FOP3E(op##q, rax, rbx, cl)) \ + FOP_END + +/* Special case for SETcc - 1 instruction per cc */ +#define FOP_SETCC(op) ".align 4; " #op " %al; ret \n\t" + +FOP_START(setcc) +FOP_SETCC(seto) +FOP_SETCC(setno) +FOP_SETCC(setc) +FOP_SETCC(setnc) +FOP_SETCC(setz) +FOP_SETCC(setnz) +FOP_SETCC(setbe) +FOP_SETCC(setnbe) +FOP_SETCC(sets) +FOP_SETCC(setns) +FOP_SETCC(setp) +FOP_SETCC(setnp) +FOP_SETCC(setl) +FOP_SETCC(setnl) +FOP_SETCC(setle) +FOP_SETCC(setnle) +FOP_END; + #define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ @@ -426,8 +551,7 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt) _ASM_EXTABLE(1b, 3b) \ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ - "a" (*rax), "d" (*rdx)); \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ @@ -664,7 +788,7 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, ulong la; u32 lim; u16 sel; - unsigned cpl, rpl; + unsigned cpl; la = seg_base(ctxt, addr.seg) + addr.ea; switch (ctxt->mode) { @@ -677,8 +801,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, addr.seg); if (!usable) goto bad; - /* code segment or read-only data segment */ - if (((desc.type & 8) || !(desc.type & 2)) && write) + /* code segment in protected mode or read-only data segment */ + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) + || !(desc.type & 2)) && write) goto bad; /* unreadable code segment */ if (!fetch && (desc.type & 8) && !(desc.type & 2)) @@ -697,11 +822,6 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, goto bad; } cpl = ctxt->ops->cpl(ctxt); - if (ctxt->mode == X86EMUL_MODE_REAL) - rpl = 0; - else - rpl = sel & 3; - cpl = max(cpl, rpl); if (!(desc.type & 8)) { /* data segment */ if (cpl > desc.dpl) @@ -852,39 +972,50 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, return rc; } -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); +FASTOP2(add); +FASTOP2(or); +FASTOP2(adc); +FASTOP2(sbb); +FASTOP2(and); +FASTOP2(sub); +FASTOP2(xor); +FASTOP2(cmp); +FASTOP2(test); + +FASTOP3WCL(shld); +FASTOP3WCL(shrd); + +FASTOP2W(imul); + +FASTOP1(not); +FASTOP1(neg); +FASTOP1(inc); +FASTOP1(dec); + +FASTOP2CL(rol); +FASTOP2CL(ror); +FASTOP2CL(rcl); +FASTOP2CL(rcr); +FASTOP2CL(shl); +FASTOP2CL(shr); +FASTOP2CL(sar); + +FASTOP2W(bsf); +FASTOP2W(bsr); +FASTOP2W(bt); +FASTOP2W(bts); +FASTOP2W(btr); +FASTOP2W(btc); + +static u8 test_cc(unsigned int condition, unsigned long flags) +{ + u8 rc; + void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); + + flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; + asm("push %[flags]; popf; call *%[fastop]" + : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + return rc; } static void fetch_register_operand(struct operand *op) @@ -994,6 +1125,53 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg) ctxt->ops->put_fpu(ctxt); } +static int em_fninit(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fninit"); + ctxt->ops->put_fpu(ctxt); + return X86EMUL_CONTINUE; +} + +static int em_fnstcw(struct x86_emulate_ctxt *ctxt) +{ + u16 fcw; + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fnstcw %0": "+m"(fcw)); + ctxt->ops->put_fpu(ctxt); + + /* force 2 byte destination */ + ctxt->dst.bytes = 2; + ctxt->dst.val = fcw; + + return X86EMUL_CONTINUE; +} + +static int em_fnstsw(struct x86_emulate_ctxt *ctxt) +{ + u16 fsw; + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + ctxt->ops->get_fpu(ctxt); + asm volatile("fnstsw %0": "+m"(fsw)); + ctxt->ops->put_fpu(ctxt); + + /* force 2 byte destination */ + ctxt->dst.bytes = 2; + ctxt->dst.val = fsw; + + return X86EMUL_CONTINUE; +} + static void decode_register_operand(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -1534,6 +1712,9 @@ static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; + if (ctxt->d & NoWrite) + return X86EMUL_CONTINUE; + switch (ctxt->dst.type) { case OP_REG: write_register_operand(&ctxt->dst); @@ -1918,47 +2099,6 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_grp2(struct x86_emulate_ctxt *ctxt) -{ - switch (ctxt->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB(ctxt, "rol"); - break; - case 1: /* ror */ - emulate_2op_SrcB(ctxt, "ror"); - break; - case 2: /* rcl */ - emulate_2op_SrcB(ctxt, "rcl"); - break; - case 3: /* rcr */ - emulate_2op_SrcB(ctxt, "rcr"); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB(ctxt, "sal"); - break; - case 5: /* shr */ - emulate_2op_SrcB(ctxt, "shr"); - break; - case 7: /* sar */ - emulate_2op_SrcB(ctxt, "sar"); - break; - } - return X86EMUL_CONTINUE; -} - -static int em_not(struct x86_emulate_ctxt *ctxt) -{ - ctxt->dst.val = ~ctxt->dst.val; - return X86EMUL_CONTINUE; -} - -static int em_neg(struct x86_emulate_ctxt *ctxt) -{ - emulate_1op(ctxt, "neg"); - return X86EMUL_CONTINUE; -} - static int em_mul_ex(struct x86_emulate_ctxt *ctxt) { u8 ex = 0; @@ -2000,12 +2140,6 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) int rc = X86EMUL_CONTINUE; switch (ctxt->modrm_reg) { - case 0: /* inc */ - emulate_1op(ctxt, "inc"); - break; - case 1: /* dec */ - emulate_1op(ctxt, "dec"); - break; case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; @@ -2075,7 +2209,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) /* Save real source value, then compare EAX against destination. */ ctxt->src.orig_val = ctxt->src.val; ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); - emulate_2op_SrcV(ctxt, "cmp"); + fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. */ @@ -2843,7 +2977,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt) ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; - emulate_2op_SrcV(ctxt, "or"); + fastop(ctxt, em_or); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2852,6 +2986,24 @@ static int em_das(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_aad(struct x86_emulate_ctxt *ctxt) +{ + u8 al = ctxt->dst.val & 0xff; + u8 ah = (ctxt->dst.val >> 8) & 0xff; + + al = (al + (ah * ctxt->src.val)) & 0xff; + + ctxt->dst.val = (ctxt->dst.val & 0xffff0000) | al; + + /* Set PF, ZF, SF */ + ctxt->src.type = OP_IMM; + ctxt->src.val = 0; + ctxt->src.bytes = 1; + fastop(ctxt, em_or); + + return X86EMUL_CONTINUE; +} + static int em_call(struct x86_emulate_ctxt *ctxt) { long rel = ctxt->src.val; @@ -2900,64 +3052,6 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_add(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "add"); - return X86EMUL_CONTINUE; -} - -static int em_or(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "or"); - return X86EMUL_CONTINUE; -} - -static int em_adc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "adc"); - return X86EMUL_CONTINUE; -} - -static int em_sbb(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sbb"); - return X86EMUL_CONTINUE; -} - -static int em_and(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "and"); - return X86EMUL_CONTINUE; -} - -static int em_sub(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "sub"); - return X86EMUL_CONTINUE; -} - -static int em_xor(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "xor"); - return X86EMUL_CONTINUE; -} - -static int em_cmp(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "cmp"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - -static int em_test(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV(ctxt, "test"); - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - return X86EMUL_CONTINUE; -} - static int em_xchg(struct x86_emulate_ctxt *ctxt) { /* Write back the register source. */ @@ -2970,16 +3064,10 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_imul(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "imul"); - return X86EMUL_CONTINUE; -} - static int em_imul_3op(struct x86_emulate_ctxt *ctxt) { ctxt->dst.val = ctxt->src2.val; - return em_imul(ctxt); + return fastop(ctxt, em_imul); } static int em_cwd(struct x86_emulate_ctxt *ctxt) @@ -3300,47 +3388,6 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_bt(struct x86_emulate_ctxt *ctxt) -{ - /* Disable writeback. */ - ctxt->dst.type = OP_NONE; - /* only subword offset */ - ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - - emulate_2op_SrcV_nobyte(ctxt, "bt"); - return X86EMUL_CONTINUE; -} - -static int em_bts(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bts"); - return X86EMUL_CONTINUE; -} - -static int em_btr(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btr"); - return X86EMUL_CONTINUE; -} - -static int em_btc(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "btc"); - return X86EMUL_CONTINUE; -} - -static int em_bsf(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bsf"); - return X86EMUL_CONTINUE; -} - -static int em_bsr(struct x86_emulate_ctxt *ctxt) -{ - emulate_2op_SrcV_nobyte(ctxt, "bsr"); - return X86EMUL_CONTINUE; -} - static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; @@ -3572,7 +3619,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } #define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) } +#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ @@ -3583,12 +3632,13 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define D2bv(_f) D((_f) | ByteOp), D(_f) #define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) +#define F2bv(_f, _e) F((_f) | ByteOp, _e), F(_f, _e) #define I2bvIP(_f, _e, _i, _p) \ IIP((_f) | ByteOp, _e, _i, _p), IIP(_f, _e, _i, _p) -#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ - I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ - I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +#define F6ALU(_f, _e) F2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + F2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + F2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) static const struct opcode group7_rm1[] = { DI(SrcNone | Priv, monitor), @@ -3614,25 +3664,36 @@ static const struct opcode group7_rm7[] = { }; static const struct opcode group1[] = { - I(Lock, em_add), - I(Lock | PageTable, em_or), - I(Lock, em_adc), - I(Lock, em_sbb), - I(Lock | PageTable, em_and), - I(Lock, em_sub), - I(Lock, em_xor), - I(0, em_cmp), + F(Lock, em_add), + F(Lock | PageTable, em_or), + F(Lock, em_adc), + F(Lock, em_sbb), + F(Lock | PageTable, em_and), + F(Lock, em_sub), + F(Lock, em_xor), + F(NoWrite, em_cmp), }; static const struct opcode group1A[] = { I(DstMem | SrcNone | Mov | Stack, em_pop), N, N, N, N, N, N, N, }; +static const struct opcode group2[] = { + F(DstMem | ModRM, em_rol), + F(DstMem | ModRM, em_ror), + F(DstMem | ModRM, em_rcl), + F(DstMem | ModRM, em_rcr), + F(DstMem | ModRM, em_shl), + F(DstMem | ModRM, em_shr), + F(DstMem | ModRM, em_shl), + F(DstMem | ModRM, em_sar), +}; + static const struct opcode group3[] = { - I(DstMem | SrcImm, em_test), - I(DstMem | SrcImm, em_test), - I(DstMem | SrcNone | Lock, em_not), - I(DstMem | SrcNone | Lock, em_neg), + F(DstMem | SrcImm | NoWrite, em_test), + F(DstMem | SrcImm | NoWrite, em_test), + F(DstMem | SrcNone | Lock, em_not), + F(DstMem | SrcNone | Lock, em_neg), I(SrcMem, em_mul_ex), I(SrcMem, em_imul_ex), I(SrcMem, em_div_ex), @@ -3640,14 +3701,14 @@ static const struct opcode group3[] = { }; static const struct opcode group4[] = { - I(ByteOp | DstMem | SrcNone | Lock, em_grp45), - I(ByteOp | DstMem | SrcNone | Lock, em_grp45), + F(ByteOp | DstMem | SrcNone | Lock, em_inc), + F(ByteOp | DstMem | SrcNone | Lock, em_dec), N, N, N, N, N, N, }; static const struct opcode group5[] = { - I(DstMem | SrcNone | Lock, em_grp45), - I(DstMem | SrcNone | Lock, em_grp45), + F(DstMem | SrcNone | Lock, em_inc), + F(DstMem | SrcNone | Lock, em_dec), I(SrcMem | Stack, em_grp45), I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), I(SrcMem | Stack, em_grp45), @@ -3682,10 +3743,10 @@ static const struct group_dual group7 = { { static const struct opcode group8[] = { N, N, N, N, - I(DstMem | SrcImmByte, em_bt), - I(DstMem | SrcImmByte | Lock | PageTable, em_bts), - I(DstMem | SrcImmByte | Lock, em_btr), - I(DstMem | SrcImmByte | Lock | PageTable, em_btc), + F(DstMem | SrcImmByte | NoWrite, em_bt), + F(DstMem | SrcImmByte | Lock | PageTable, em_bts), + F(DstMem | SrcImmByte | Lock, em_btr), + F(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; static const struct group_dual group9 = { { @@ -3707,33 +3768,96 @@ static const struct gprefix pfx_vmovntpx = { I(0, em_mov), N, N, N, }; +static const struct escape escape_d9 = { { + N, N, N, N, N, N, N, I(DstMem, em_fnstcw), +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct escape escape_db = { { + N, N, N, N, N, N, N, N, +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, I(ImplicitOps, em_fninit), N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + +static const struct escape escape_dd = { { + N, N, N, N, N, N, N, I(DstMem, em_fnstsw), +}, { + /* 0xC0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xC7 */ + N, N, N, N, N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + N, N, N, N, N, N, N, N, + /* 0xE8 - 0xEF */ + N, N, N, N, N, N, N, N, + /* 0xF0 - 0xF7 */ + N, N, N, N, N, N, N, N, + /* 0xF8 - 0xFF */ + N, N, N, N, N, N, N, N, +} }; + static const struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - I6ALU(Lock, em_add), + F6ALU(Lock, em_add), I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ - I6ALU(Lock | PageTable, em_or), + F6ALU(Lock | PageTable, em_or), I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), N, /* 0x10 - 0x17 */ - I6ALU(Lock, em_adc), + F6ALU(Lock, em_adc), I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ - I6ALU(Lock, em_sbb), + F6ALU(Lock, em_sbb), I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ - I6ALU(Lock | PageTable, em_and), N, N, + F6ALU(Lock | PageTable, em_and), N, N, /* 0x28 - 0x2F */ - I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), + F6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - I6ALU(Lock, em_xor), N, N, + F6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - I6ALU(0, em_cmp), N, N, + F6ALU(NoWrite, em_cmp), N, N, /* 0x40 - 0x4F */ - X16(D(DstReg)), + X8(F(DstReg, em_inc)), X8(F(DstReg, em_dec)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ @@ -3757,7 +3881,7 @@ static const struct opcode opcode_table[256] = { G(DstMem | SrcImm, group1), G(ByteOp | DstMem | SrcImm | No64, group1), G(DstMem | SrcImmByte, group1), - I2bv(DstMem | SrcReg | ModRM, em_test), + F2bv(DstMem | SrcReg | ModRM | NoWrite, em_test), I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_xchg), /* 0x88 - 0x8F */ I2bv(DstMem | SrcReg | ModRM | Mov | PageTable, em_mov), @@ -3777,18 +3901,18 @@ static const struct opcode opcode_table[256] = { I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - I2bv(SrcSI | DstDI | String, em_cmp), + F2bv(SrcSI | DstDI | String | NoWrite, em_cmp), /* 0xA8 - 0xAF */ - I2bv(DstAcc | SrcImm, em_test), + F2bv(DstAcc | SrcImm | NoWrite, em_test), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - I2bv(SrcAcc | DstDI | String, em_cmp), + F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ - X8(I(DstReg | SrcImm | Mov, em_mov)), + X8(I(DstReg | SrcImm64 | Mov, em_mov)), /* 0xC0 - 0xC7 */ - D2bv(DstMem | SrcImmByte | ModRM), + G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), @@ -3800,10 +3924,11 @@ static const struct opcode opcode_table[256] = { D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ - D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), - N, N, N, N, + G(Src2One | ByteOp, group2), G(Src2One, group2), + G(Src2CL | ByteOp, group2), G(Src2CL, group2), + N, I(DstAcc | SrcImmByte | No64, em_aad), N, N, /* 0xD8 - 0xDF */ - N, N, N, N, N, N, N, N, + N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N, /* 0xE0 - 0xE7 */ X3(I(SrcImmByte, em_loop)), I(SrcImmByte, em_jcxz), @@ -3870,28 +3995,29 @@ static const struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), N, N, + II(ImplicitOps, em_cpuid, cpuid), + F(DstMem | SrcReg | ModRM | BitOp | NoWrite, em_bt), + F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shld), + F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), - D(DstMem | SrcReg | Src2ImmByte | ModRM), - D(DstMem | SrcReg | Src2CL | ModRM), - D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), + F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), + F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), + F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), + D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), - I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), + F(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), - I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), - I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), + F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), + F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcReg | ModRM | Lock), @@ -3950,6 +4076,9 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, case 4: op->val = insn_fetch(s32, ctxt); break; + case 8: + op->val = insn_fetch(s64, ctxt); + break; } if (!sign_extension) { switch (op->bytes) { @@ -4028,6 +4157,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpImm64: + rc = decode_imm(ctxt, op, ctxt->op_bytes, true); + break; case OpMem8: ctxt->memop.bytes = 1; goto mem_common; @@ -4222,6 +4354,12 @@ done_prefixes: case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; } break; + case Escape: + if (ctxt->modrm > 0xbf) + opcode = opcode.u.esc->high[ctxt->modrm - 0xc0]; + else + opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7]; + break; default: return EMULATION_FAILED; } @@ -4354,6 +4492,16 @@ static void fetch_possible_mmx_operand(struct x86_emulate_ctxt *ctxt, read_mmx_reg(ctxt, &op->mm_val, op->addr.mm); } +static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) +{ + ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; + fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; + asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" + : "+a"(ctxt->dst.val), "+b"(ctxt->src.val), [flags]"+D"(flags) + : "c"(ctxt->src2.val), [fastop]"S"(fop)); + ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); + return X86EMUL_CONTINUE; +} int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { @@ -4483,6 +4631,13 @@ special_insn: } if (ctxt->execute) { + if (ctxt->d & Fastop) { + void (*fop)(struct fastop *) = (void *)ctxt->execute; + rc = fastop(ctxt, fop); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } rc = ctxt->execute(ctxt); if (rc != X86EMUL_CONTINUE) goto done; @@ -4493,12 +4648,6 @@ special_insn: goto twobyte_insn; switch (ctxt->b) { - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op(ctxt, "inc"); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op(ctxt, "dec"); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; @@ -4523,9 +4672,6 @@ special_insn: case 8: ctxt->dst.val = (s32)ctxt->dst.val; break; } break; - case 0xc0 ... 0xc1: - rc = em_grp2(ctxt); - break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; @@ -4536,13 +4682,6 @@ special_insn: if (ctxt->eflags & EFLG_OF) rc = emulate_int(ctxt, 4); break; - case 0xd0 ... 0xd1: /* Grp2 */ - rc = em_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - ctxt->src.val = reg_read(ctxt, VCPU_REGS_RCX); - rc = em_grp2(ctxt); - break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ jmp_rel(ctxt, ctxt->src.val); @@ -4661,14 +4800,6 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa4: /* shld imm8, r, r/m */ - case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl(ctxt, "shld"); - break; - case 0xac: /* shrd imm8, r, r/m */ - case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl(ctxt, "shrd"); - break; case 0xae: /* clflush */ break; case 0xb6 ... 0xb7: /* movzx */ @@ -4682,7 +4813,7 @@ twobyte_insn: (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV(ctxt, "add"); + fastop(ctxt, em_add); /* Write back the register source. */ ctxt->src.val = ctxt->dst.orig_val; write_register_operand(&ctxt->src); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 11300d2fa71..c1d30b2fc9b 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -122,7 +122,6 @@ static s64 __kpit_elapsed(struct kvm *kvm) */ remaining = hrtimer_get_remaining(&ps->timer); elapsed = ps->period - ktime_to_ns(remaining); - elapsed = mod_64(elapsed, ps->period); return elapsed; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 848206df096..cc31f7c06d3 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -241,6 +241,8 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); + s->output = 0; + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 7e06ba1618b..484bc874688 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -38,49 +38,81 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* + * check if there is pending interrupt from + * non-APIC source without intack. + */ +static int kvm_cpu_has_extint(struct kvm_vcpu *v) +{ + if (kvm_apic_accept_pic_intr(v)) + return pic_irqchip(v->kvm)->output; /* PIC */ + else + return 0; +} + +/* + * check if there is injectable interrupt: + * when virtual interrupt delivery enabled, + * interrupt from apic will handled by hardware, + * we don't need to check it here. + */ +int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) +{ + if (!irqchip_in_kernel(v->kvm)) + return v->arch.interrupt.pending; + + if (kvm_cpu_has_extint(v)) + return 1; + + if (kvm_apic_vid_enabled(v->kvm)) + return 0; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ +} + +/* * check if there is pending interrupt without * intack. */ int kvm_cpu_has_interrupt(struct kvm_vcpu *v) { - struct kvm_pic *s; - if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.pending; - if (kvm_apic_has_interrupt(v) == -1) { /* LAPIC */ - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); /* PIC */ - return s->output; - } else - return 0; - } - return 1; + if (kvm_cpu_has_extint(v)) + return 1; + + return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ } EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt); /* + * Read pending interrupt(from non-APIC source) + * vector and intack. + */ +static int kvm_cpu_get_extint(struct kvm_vcpu *v) +{ + if (kvm_cpu_has_extint(v)) + return kvm_pic_read_irq(v->kvm); /* PIC */ + return -1; +} + +/* * Read pending interrupt vector and intack. */ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) { - struct kvm_pic *s; int vector; if (!irqchip_in_kernel(v->kvm)) return v->arch.interrupt.nr; - vector = kvm_get_apic_interrupt(v); /* APIC */ - if (vector == -1) { - if (kvm_apic_accept_pic_intr(v)) { - s = pic_irqchip(v->kvm); - s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(v->kvm); - } - } - return vector; + vector = kvm_cpu_get_extint(v); + + if (kvm_apic_vid_enabled(v->kvm) || vector != -1) + return vector; /* PIC */ + + return kvm_get_apic_interrupt(v); /* APIC */ } -EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 43e9fadca5d..02b51dd4e4a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -140,31 +140,56 @@ static inline int apic_enabled(struct kvm_lapic *apic) (LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \ APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER) -static inline int apic_x2apic_mode(struct kvm_lapic *apic) -{ - return apic->vcpu->arch.apic_base & X2APIC_ENABLE; -} - static inline int kvm_apic_id(struct kvm_lapic *apic) { return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff; } -static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + struct kvm_lapic_irq *irq, + u64 *eoi_exit_bitmap) { - u16 cid; - ldr >>= 32 - map->ldr_bits; - cid = (ldr >> map->cid_shift) & map->cid_mask; + struct kvm_lapic **dst; + struct kvm_apic_map *map; + unsigned long bitmap = 1; + int i; - BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + rcu_read_lock(); + map = rcu_dereference(vcpu->kvm->arch.apic_map); - return cid; -} + if (unlikely(!map)) { + __set_bit(irq->vector, (unsigned long *)eoi_exit_bitmap); + goto out; + } -static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) -{ - ldr >>= (32 - map->ldr_bits); - return ldr & map->lid_mask; + if (irq->dest_mode == 0) { /* physical mode */ + if (irq->delivery_mode == APIC_DM_LOWEST || + irq->dest_id == 0xff) { + __set_bit(irq->vector, + (unsigned long *)eoi_exit_bitmap); + goto out; + } + dst = &map->phys_map[irq->dest_id & 0xff]; + } else { + u32 mda = irq->dest_id << (32 - map->ldr_bits); + + dst = map->logical_map[apic_cluster_id(map, mda)]; + + bitmap = apic_logical_id(map, mda); + } + + for_each_set_bit(i, &bitmap, 16) { + if (!dst[i]) + continue; + if (dst[i]->vcpu == vcpu) { + __set_bit(irq->vector, + (unsigned long *)eoi_exit_bitmap); + break; + } + } + +out: + rcu_read_unlock(); } static void recalculate_apic_map(struct kvm *kvm) @@ -230,6 +255,8 @@ out: if (old) kfree_rcu(old, rcu); + + kvm_ioapic_make_eoibitmap_request(kvm); } static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id) @@ -345,6 +372,10 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) { int result; + /* + * Note that irr_pending is just a hint. It will be always + * true with virtual interrupt delivery enabled. + */ if (!apic->irr_pending) return -1; @@ -461,6 +492,8 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + + /* Note that isr_count is always 1 with vid enabled */ if (!apic->isr_count) return -1; if (likely(apic->highest_isr_cache != -1)) @@ -740,6 +773,19 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } +static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector) +{ + if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + } +} + static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); @@ -756,19 +802,26 @@ static int apic_set_eoi(struct kvm_lapic *apic) apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && - kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { - int trigger_mode; - if (apic_test_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - } + kvm_ioapic_send_eoi(apic, vector); kvm_make_request(KVM_REQ_EVENT, apic->vcpu); return vector; } +/* + * this interface assumes a trap-like exit, which has already finished + * desired side effect including vISR and vPPR update. + */ +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + trace_kvm_eoi(apic, vector); + + kvm_ioapic_send_eoi(apic, vector); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); +} +EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); + static void apic_send_ipi(struct kvm_lapic *apic) { u32 icr_low = kvm_apic_get_reg(apic, APIC_ICR); @@ -1011,7 +1064,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); @@ -1212,6 +1265,21 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); +/* emulate APIC access in a trap manner */ +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset) +{ + u32 val = 0; + + /* hw has done the conditional check and inst decode */ + offset &= 0xff0; + + apic_reg_read(vcpu->arch.apic, offset, 4, &val); + + /* TODO: optimize to just emulate side effect w/o one more write */ + apic_reg_write(vcpu->arch.apic, offset, val); +} +EXPORT_SYMBOL_GPL(kvm_apic_write_nodecode); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -1288,6 +1356,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) { + u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; if (!apic) { @@ -1309,11 +1378,16 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) value &= ~MSR_IA32_APICBASE_BSP; vcpu->arch.apic_base = value; - if (apic_x2apic_mode(apic)) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + if ((old_value ^ value) & X2APIC_ENABLE) { + if (value & X2APIC_ENABLE) { + u32 id = kvm_apic_id(apic); + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + kvm_apic_set_ldr(apic, ldr); + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); + } else + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); } + apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -1359,8 +1433,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_ISR + 0x10 * i, 0); apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } - apic->irr_pending = false; - apic->isr_count = 0; + apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm); apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); @@ -1575,8 +1649,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; - apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ? + 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; + kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic)); kvm_make_request(KVM_REQ_EVENT, vcpu); } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index e5ebf9f3571..1676d34ddb4 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -64,6 +64,9 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); +void kvm_apic_write_nodecode(struct kvm_vcpu *vcpu, u32 offset); +void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); @@ -124,4 +127,35 @@ static inline int kvm_lapic_enabled(struct kvm_vcpu *vcpu) return kvm_apic_present(vcpu) && kvm_apic_sw_enabled(vcpu->arch.apic); } +static inline int apic_x2apic_mode(struct kvm_lapic *apic) +{ + return apic->vcpu->arch.apic_base & X2APIC_ENABLE; +} + +static inline bool kvm_apic_vid_enabled(struct kvm *kvm) +{ + return kvm_x86_ops->vm_has_apicv(kvm); +} + +static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr) +{ + u16 cid; + ldr >>= 32 - map->ldr_bits; + cid = (ldr >> map->cid_shift) & map->cid_mask; + + BUG_ON(cid >= ARRAY_SIZE(map->logical_map)); + + return cid; +} + +static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr) +{ + ldr >>= (32 - map->ldr_bits); + return ldr & map->lid_mask; +} + +void kvm_calculate_eoi_exitmap(struct kvm_vcpu *vcpu, + struct kvm_lapic_irq *irq, + u64 *eoi_bitmap); + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6f85fe0bf95..956ca358108 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte) static bool spte_is_locklessly_modifiable(u64 spte) { - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); + return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == + (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); } static bool spte_has_volatile_bits(u64 spte) @@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - max_level = kvm_x86_ops->get_lpage_level() < host_level ? - kvm_x86_ops->get_lpage_level() : host_level; + max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) @@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) } static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, - int level, bool pt_protect) + bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), PT_PAGE_TABLE_LEVEL, slot); - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); + __rmap_write_protect(kvm, rmapp, false); /* clear the first set bit */ mask &= mask - 1; @@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i, true); + write_protected |= __rmap_write_protect(kvm, rmapp, true); } return write_protected; @@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) percpu_counter_add(&kvm_total_used_mmu_pages, nr); } -/* - * Remove the sp from shadow page cache, after call it, - * we can not find this sp from the cache, and the shadow - * page table is still valid. - * It should be under the protection of mmu lock. - */ -static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp) +static void kvm_mmu_free_page(struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); - if (!sp->role.direct) - free_page((unsigned long)sp->gfns); -} - -/* - * Free the shadow page table and the sp, we can do it - * out of the protection of mmu lock. - */ -static void kvm_mmu_free_page(struct kvm_mmu_page *sp) -{ list_del(&sp->link); free_page((unsigned long)sp->spt); + if (!sp->role.direct) + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); } @@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); - bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); kvm_mod_used_mmu_pages(vcpu->kvm, +1); @@ -1659,13 +1644,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); -#define for_each_gfn_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ +#define for_each_gfn_sp(kvm, sp, gfn) \ + hlist_for_each_entry(sp, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn)) {} else -#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ - hlist_for_each_entry(sp, pos, \ +#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn) \ + hlist_for_each_entry(sp, \ &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ if ((sp)->gfn != (gfn) || (sp)->role.direct || \ (sp)->role.invalid) {} else @@ -1721,11 +1706,10 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node; LIST_HEAD(invalid_list); bool flush = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!s->unsync) continue; @@ -1863,7 +1847,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, union kvm_mmu_page_role role; unsigned quadrant; struct kvm_mmu_page *sp; - struct hlist_node *node; bool need_sync = false; role = vcpu->arch.mmu.base_role; @@ -1878,7 +1861,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { + for_each_gfn_sp(vcpu->kvm, sp, gfn) { if (!need_sync && sp->unsync) need_sync = true; @@ -1973,9 +1956,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) { u64 spte; - spte = __pa(sp->spt) - | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; + spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | + shadow_user_mask | shadow_x_mask | shadow_accessed_mask; + mmu_spte_set(sptep, spte); } @@ -2126,7 +2109,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, do { sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); WARN_ON(!sp->role.invalid || sp->root_count); - kvm_mmu_isolate_page(sp); kvm_mmu_free_page(sp); } while (!list_empty(invalid_list)); } @@ -2144,6 +2126,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) * change the value */ + spin_lock(&kvm->mmu_lock); + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { @@ -2158,19 +2142,20 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) } kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; + + spin_unlock(&kvm->mmu_lock); } int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) { struct kvm_mmu_page *sp; - struct hlist_node *node; LIST_HEAD(invalid_list); int r; pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; spin_lock(&kvm->mmu_lock); - for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { + for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; @@ -2183,14 +2168,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) -{ - int slot = memslot_id(kvm, gfn); - struct kvm_mmu_page *sp = page_header(__pa(pte)); - - __set_bit(slot, sp->slot_bitmap); -} - /* * The function is based on mtrr_type_lookup() in * arch/x86/kernel/cpu/mtrr/generic.c @@ -2308,9 +2285,8 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_mmu_page *s; - struct hlist_node *node; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (s->unsync) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); @@ -2322,19 +2298,17 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync) { struct kvm_mmu_page *s; - struct hlist_node *node; bool need_unsync = false; - for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { if (!can_unsync) return 1; if (s->role.level != PT_PAGE_TABLE_LEVEL) return 1; - if (!need_unsync && !s->unsync) { + if (!s->unsync) need_unsync = true; - } } if (need_unsync) kvm_unsync_pages(vcpu, gfn); @@ -2342,8 +2316,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, } static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pte_access, int user_fault, - int write_fault, int level, + unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { @@ -2378,32 +2351,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; - if ((pte_access & ACC_WRITE_MASK) - || (!vcpu->arch.mmu.direct_map && write_fault - && !is_write_protection(vcpu) && !user_fault)) { + if (pte_access & ACC_WRITE_MASK) { + /* + * Other vcpu creates new sp in the window between + * mapping_level() and acquiring mmu-lock. We can + * allow guest to retry the access, the mapping can + * be fixed if guest refault. + */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) { - ret = 1; - drop_spte(vcpu->kvm, sptep); + has_wrprotected_page(vcpu->kvm, gfn, level)) goto done; - } spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; - if (!vcpu->arch.mmu.direct_map - && !(pte_access & ACC_WRITE_MASK)) { - spte &= ~PT_USER_MASK; - /* - * If we converted a user page to a kernel page, - * so that the kernel can write to it when cr0.wp=0, - * then we should prevent the kernel from executing it - * if SMEP is enabled. - */ - if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - spte |= PT64_NX_MASK; - } - /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection @@ -2433,19 +2394,15 @@ done: } static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, - int *emulate, int level, gfn_t gfn, - pfn_t pfn, bool speculative, + unsigned pte_access, int write_fault, int *emulate, + int level, gfn_t gfn, pfn_t pfn, bool speculative, bool host_writable) { int was_rmapped = 0; int rmap_count; - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %llx\n", - __func__, *sptep, pt_access, - write_fault, user_fault, gfn); + pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__, + *sptep, write_fault, gfn); if (is_rmap_spte(*sptep)) { /* @@ -2469,9 +2426,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, was_rmapped = 1; } - if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - level, gfn, pfn, speculative, true, - host_writable)) { + if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, + true, host_writable)) { if (write_fault) *emulate = 1; kvm_mmu_flush_tlb(vcpu); @@ -2489,7 +2445,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, ++vcpu->kvm->stat.lpages; if (is_shadow_present_pte(*sptep)) { - page_header_update_slot(vcpu->kvm, sptep, gfn); if (!was_rmapped) { rmap_count = rmap_add(vcpu, sptep, gfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) @@ -2505,6 +2460,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + int bit7; + + bit7 = (gpte >> 7) & 1; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; +} + static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { @@ -2517,6 +2480,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, return gfn_to_pfn_memslot_atomic(slot, gfn); } +static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + u64 gpte) +{ + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + goto no_present; + + if (!is_present_gpte(gpte)) + goto no_present; + + if (!(gpte & PT_ACCESSED_MASK)) + goto no_present; + + return false; + +no_present: + drop_spte(vcpu->kvm, spte); + return true; +} + static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *start, u64 *end) @@ -2535,10 +2518,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, return -1; for (i = 0; i < ret; i++, gfn++, start++) - mmu_set_spte(vcpu, start, ACC_ALL, - access, 0, 0, NULL, - sp->role.level, gfn, - page_to_pfn(pages[i]), true, true); + mmu_set_spte(vcpu, start, access, 0, NULL, + sp->role.level, gfn, page_to_pfn(pages[i]), + true, true); return 0; } @@ -2597,11 +2579,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - unsigned pte_access = ACC_ALL; - - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, - 0, write, &emulate, - level, gfn, pfn, prefault, map_writable); + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, + write, &emulate, level, gfn, pfn, + prefault, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2616,11 +2596,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, iterator.level - 1, 1, ACC_ALL, iterator.sptep); - mmu_spte_set(iterator.sptep, - __pa(sp->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask - | shadow_accessed_mask); + link_shadow_page(iterator.sptep, sp); } } return emulate; @@ -2671,7 +2647,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * PT_PAGE_TABLE_LEVEL and there would be no adjustment done * here. */ - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { @@ -2699,18 +2675,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, } } -static bool mmu_invalid_pfn(pfn_t pfn) -{ - return unlikely(is_invalid_pfn(pfn)); -} - static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, pfn_t pfn, unsigned access, int *ret_val) { bool ret = true; /* The pfn is invalid, report the error! */ - if (unlikely(is_invalid_pfn(pfn))) { + if (unlikely(is_error_pfn(pfn))) { *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); goto exit; } @@ -2862,7 +2833,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3331,7 +3302,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, return r; spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); if (likely(!force_pt_level)) @@ -3399,14 +3370,6 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) -{ - int bit7; - - bit7 = (gpte >> 7) & 1; - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; -} - static inline void protect_clean_gpte(unsigned *access, unsigned gpte) { unsigned mask; @@ -3696,6 +3659,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) else r = paging32_init_context(vcpu, context); + vcpu->arch.mmu.base_role.nxe = is_nx(vcpu); vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); vcpu->arch.mmu.base_role.smep_andnot_wp @@ -3862,7 +3826,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8)); + r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -3964,7 +3928,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn = gpa >> PAGE_SHIFT; union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; - struct hlist_node *node; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; @@ -3995,7 +3958,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; - for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, @@ -4016,7 +3979,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word) && rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); - if (!remote_flush && need_remote_flush(entry, *spte)) + if (need_remote_flush(entry, *spte)) remote_flush = true; ++spte; } @@ -4175,26 +4138,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { - struct kvm_mmu_page *sp; - bool flush = false; + struct kvm_memory_slot *memslot; + gfn_t last_gfn; + int i; - list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { - int i; - u64 *pt; + memslot = id_to_memslot(kvm->memslots, slot); + last_gfn = memslot->base_gfn + memslot->npages - 1; - if (!test_bit(slot, sp->slot_bitmap)) - continue; + spin_lock(&kvm->mmu_lock); - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (!is_shadow_present_pte(pt[i]) || - !is_last_spte(pt[i], sp->role.level)) - continue; + for (i = PT_PAGE_TABLE_LEVEL; + i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + unsigned long *rmapp; + unsigned long last_index, index; + + rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; + last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - spte_write_protect(kvm, &pt[i], &flush, false); + for (index = 0; index <= last_index; ++index, ++rmapp) { + if (*rmapp) + __rmap_write_protect(kvm, rmapp, false); + + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + kvm_flush_remote_tlbs(kvm); + cond_resched_lock(&kvm->mmu_lock); + } } } + kvm_flush_remote_tlbs(kvm); + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index cd6e98333ba..b8f6172f417 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -195,12 +195,6 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); -DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages, - TP_PROTO(struct kvm_mmu_page *sp), - - TP_ARGS(sp) -); - TRACE_EVENT( mark_mmio_spte, TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access), diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 714e2c01a6f..105dd5bd550 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access, accessed_dirty, shift; + unsigned index, pt_access, pte_access, accessed_dirty; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -249,16 +249,12 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - - /* - * On a write fault, fold the dirty bit into accessed_dirty by shifting it one - * place right. - * - * On a read fault, do nothing. - */ - shift = write_fault >> ilog2(PFERR_WRITE_MASK); - shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; - accessed_dirty &= pte >> shift; + else + /* + * On a write fault, fold the dirty bit into accessed_dirty by + * shifting it one place right. + */ + accessed_dirty &= pte >> (PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); @@ -305,51 +301,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); } -static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, u64 *spte, - pt_element_t gpte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, pt_element_t gpte, bool no_dirty_log) { - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) - goto no_present; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) + return false; - if (!is_present_gpte(gpte)) - goto no_present; + pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - if (!(gpte & PT_ACCESSED_MASK)) - goto no_present; + gfn = gpte_to_gfn(gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log && (pte_access & ACC_WRITE_MASK)); + if (is_error_pfn(pfn)) + return false; - return false; + /* + * we call mmu_set_spte() with host_writable = true because + * pte_prefetch_gfn_to_pfn always gets a writable pfn. + */ + mmu_set_spte(vcpu, spte, pte_access, 0, NULL, PT_PAGE_TABLE_LEVEL, + gfn, pfn, true, true); -no_present: - drop_spte(vcpu->kvm, spte); return true; } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { - pt_element_t gpte; - unsigned pte_access; - pfn_t pfn; - - gpte = *(const pt_element_t *)pte; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - return; - - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); - if (mmu_invalid_pfn(pfn)) - return; + pt_element_t gpte = *(const pt_element_t *)pte; - /* - * we call mmu_set_spte() with host_writable = true because that - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). - */ - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, - gpte_to_gfn(gpte), pfn, true, true); + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, @@ -395,53 +383,31 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { - pt_element_t gpte; - unsigned pte_access; - gfn_t gfn; - pfn_t pfn; - if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; - gpte = gptep[i]; - - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) - continue; - - pte_access = sp->role.access & gpte_access(vcpu, gpte); - protect_clean_gpte(&pte_access, gpte); - gfn = gpte_to_gfn(gpte); - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, - pte_access & ACC_WRITE_MASK); - if (mmu_invalid_pfn(pfn)) + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) break; - - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, - NULL, PT_PAGE_TABLE_LEVEL, gfn, - pfn, true, true); } } /* * Fetch a shadow pte for a specific level in the paging hierarchy. + * If the guest tries to write a write-protected page, we need to + * emulate this operation, return 1 to indicate this case. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int hlevel, - int *emulate, pfn_t pfn, bool map_writable, - bool prefault) + int write_fault, int hlevel, + pfn_t pfn, bool map_writable, bool prefault) { - unsigned access = gw->pt_access; struct kvm_mmu_page *sp = NULL; - int top_level; - unsigned direct_access; struct kvm_shadow_walk_iterator it; - - if (!is_present_gpte(gw->ptes[gw->level - 1])) - return NULL; + unsigned direct_access, access = gw->pt_access; + int top_level, emulate = 0; direct_access = gw->pte_access; @@ -504,18 +470,57 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, } clear_sp_write_flooding_count(it.sptep); - mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, - user_fault, write_fault, emulate, it.level, - gw->gfn, pfn, prefault, map_writable); + mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault, &emulate, + it.level, gw->gfn, pfn, prefault, map_writable); FNAME(pte_prefetch)(vcpu, gw, it.sptep); - return it.sptep; + return emulate; out_gpte_changed: if (sp) kvm_mmu_put_page(sp, it.sptep); kvm_release_pfn_clean(pfn); - return NULL; + return 0; +} + + /* + * To see whether the mapped gfn can write its page table in the current + * mapping. + * + * It is the helper function of FNAME(page_fault). When guest uses large page + * size to map the writable gfn which is used as current page table, we should + * force kvm to use small page size to map it because new shadow page will be + * created when kvm establishes shadow page table that stop kvm using large + * page size. Do it early can avoid unnecessary #PF and emulation. + * + * @write_fault_to_shadow_pgtable will return true if the fault gfn is + * currently used as its page table. + * + * Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok + * since the PDPT is always shadowed, that means, we can not use large page + * size to map the gfn which is used as PDPT. + */ +static bool +FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, + struct guest_walker *walker, int user_fault, + bool *write_fault_to_shadow_pgtable) +{ + int level; + gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1); + bool self_changed = false; + + if (!(walker->pte_access & ACC_WRITE_MASK || + (!is_write_protection(vcpu) && !user_fault))) + return false; + + for (level = walker->level; level <= walker->max_level; level++) { + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1]; + + self_changed |= !(gfn & mask); + *write_fault_to_shadow_pgtable |= !gfn; + } + + return self_changed; } /* @@ -538,14 +543,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; struct guest_walker walker; - u64 *sptep; - int emulate = 0; int r; pfn_t pfn; int level = PT_PAGE_TABLE_LEVEL; int force_pt_level; unsigned long mmu_seq; - bool map_writable; + bool map_writable, is_self_change_mapping; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -573,8 +576,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, return 0; } + vcpu->arch.write_fault_to_shadow_pgtable = false; + + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, + &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); + if (walker.level >= PT_DIRECTORY_LEVEL) - force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn); + force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn) + || is_self_change_mapping; else force_pt_level = 1; if (!force_pt_level) { @@ -593,25 +602,41 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, walker.gfn, pfn, walker.pte_access, &r)) return r; + /* + * Do not change pte_access if the pfn is a mmio page, otherwise + * we will cache the incorrect access into mmio spte. + */ + if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) && + !is_write_protection(vcpu) && !user_fault && + !is_noslot_pfn(pfn)) { + walker.pte_access |= ACC_WRITE_MASK; + walker.pte_access &= ~ACC_USER_MASK; + + /* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + walker.pte_access &= ~ACC_EXEC_MASK; + } + spin_lock(&vcpu->kvm->mmu_lock); - if (mmu_notifier_retry(vcpu, mmu_seq)) + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); if (!force_pt_level) transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); - sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - level, &emulate, pfn, map_writable, prefault); - (void)sptep; - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, - sptep, *sptep, emulate); - + r = FNAME(fetch)(vcpu, addr, &walker, write_fault, + level, pfn, map_writable, prefault); ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); - return emulate; + return r; out_unlock: spin_unlock(&vcpu->kvm->mmu_lock); @@ -757,7 +782,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) sizeof(pt_element_t))) return -EINVAL; - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { + if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { vcpu->kvm->tlbs_dirty++; continue; } @@ -780,7 +805,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; - set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + set_spte(vcpu, &sp->spt[i], pte_access, PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d017df3899e..e1b1ce21bc0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -20,6 +20,7 @@ #include "mmu.h" #include "kvm_cache_regs.h" #include "x86.h" +#include "cpuid.h" #include <linux/module.h> #include <linux/mod_devicetable.h> @@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage) return -EBUSY; if (!has_svm()) { - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", - me); + pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); return -EINVAL; } sd = per_cpu(svm_data, me); - if (!sd) { - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", - me); + pr_err("%s: svm_data is NULL on %d\n", __func__, me); return -EINVAL; } @@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) svm->tsc_ratio = ratio; } +static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->vmcb->control.tsc_offset; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm) static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + u32 dummy; + u32 eax = 1; init_vmcb(svm); @@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } - vcpu->arch.regs_avail = ~0; - vcpu->arch.regs_dirty = ~0; + + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); + kvm_register_write(vcpu, VCPU_REGS_RDX, eax); return 0; } @@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - kvm_write_tsc(&svm->vcpu, 0); - - err = fx_init(&svm->vcpu); - if (err) - goto free_page4; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) @@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; -free_page4: - __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } -u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, host_tsc); } static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) @@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) return 0; } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ecx = msr->index; + u64 data = msr->data; switch (ecx) { case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr); break; case MSR_STAR: svm->vmcb->save.star = data; @@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: - return kvm_set_msr_common(vcpu, ecx, data); + return kvm_set_msr_common(vcpu, msr); } return 0; } static int wrmsr_interception(struct vcpu_svm *svm) { + struct msr_data msr; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) { + if (svm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { @@ -3564,6 +3571,26 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + return; +} + +static int svm_vm_has_apicv(struct kvm *kvm) +{ + return 0; +} + +static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + return; +} + +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ + return; +} + static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4283,6 +4310,10 @@ static struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, + .vm_has_apicv = svm_vm_has_apicv, + .load_eoi_exitmap = svm_load_eoi_exitmap, + .hwapic_isr_update = svm_hwapic_isr_update, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, @@ -4302,6 +4333,7 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, .set_tsc_khz = svm_set_tsc_khz, + .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index bca63f04dcc..fe5e00ed703 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -4,6 +4,7 @@ #include <linux/tracepoint.h> #include <asm/vmx.h> #include <asm/svm.h> +#include <asm/clocksource.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -754,6 +755,68 @@ TRACE_EVENT( __entry->write ? "Write" : "Read", __entry->gpa_match ? "GPA" : "GVA") ); + +#ifdef CONFIG_X86_64 + +#define host_clocks \ + {VCLOCK_NONE, "none"}, \ + {VCLOCK_TSC, "tsc"}, \ + {VCLOCK_HPET, "hpet"} \ + +TRACE_EVENT(kvm_update_master_clock, + TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), + TP_ARGS(use_master_clock, host_clock, offset_matched), + + TP_STRUCT__entry( + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + __field( bool, offset_matched ) + ), + + TP_fast_assign( + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + __entry->offset_matched = offset_matched; + ), + + TP_printk("masterclock %d hostclock %s offsetmatched %u", + __entry->use_master_clock, + __print_symbolic(__entry->host_clock, host_clocks), + __entry->offset_matched) +); + +TRACE_EVENT(kvm_track_tsc, + TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, + unsigned int online_vcpus, bool use_master_clock, + unsigned int host_clock), + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, + host_clock), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( unsigned int, nr_vcpus_matched_tsc ) + __field( unsigned int, online_vcpus ) + __field( bool, use_master_clock ) + __field( unsigned int, host_clock ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->nr_vcpus_matched_tsc = nr_matched; + __entry->online_vcpus = online_vcpus; + __entry->use_master_clock = use_master_clock; + __entry->host_clock = host_clock; + ), + + TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u" + " hostclock %s", + __entry->vcpu_id, __entry->use_master_clock, + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, + __print_symbolic(__entry->host_clock, host_clocks)) +); + +#endif /* CONFIG_X86_64 */ + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ad6b1dd06f8..6667042714c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -42,6 +42,7 @@ #include <asm/i387.h> #include <asm/xcr.h> #include <asm/perf_event.h> +#include <asm/kexec.h> #include "trace.h" @@ -83,6 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO); static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); +static bool __read_mostly enable_apicv_reg_vid; + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -91,12 +94,8 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly nested = 0; module_param(nested, bool, S_IRUGO); -#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) -#define KVM_GUEST_CR0_MASK \ - (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) -#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE) +#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_CR4_GUEST_OWNED_BITS \ @@ -623,6 +622,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +static bool guest_state_valid(struct kvm_vcpu *vcpu); +static u32 vmx_segment_access_rights(struct kvm_segment *var); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -637,6 +638,8 @@ static unsigned long *vmx_io_bitmap_a; static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; +static unsigned long *vmx_msr_bitmap_legacy_x2apic; +static unsigned long *vmx_msr_bitmap_longmode_x2apic; static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -761,6 +764,24 @@ static inline bool cpu_has_vmx_virtualize_apic_accesses(void) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; } +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; +} + +static inline bool cpu_has_vmx_apic_register_virt(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_APIC_REGISTER_VIRT; +} + +static inline bool cpu_has_vmx_virtual_intr_delivery(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -802,11 +823,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void) return vmx_capability.ept & VMX_EPT_AD_BIT; } -static inline bool cpu_has_vmx_invept_individual_addr(void) -{ - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; -} - static inline bool cpu_has_vmx_invept_context(void) { return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; @@ -992,6 +1008,46 @@ static void vmcs_load(struct vmcs *vmcs) vmcs, phys_addr); } +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct loaded_vmcs *v; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), + loaded_vmcss_on_cpu_link) + vmcs_clear(v->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC */ + static void __loaded_vmcs_clear(void *arg) { struct loaded_vmcs *loaded_vmcs = arg; @@ -1001,15 +1057,28 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; + crash_disable_local_vmclear(cpu); list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); + + /* + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link + * is before setting loaded_vmcs->vcpu to -1 which is done in + * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist + * then adds the vmcs into percpu list before it is deleted. + */ + smp_wmb(); + loaded_vmcs_init(loaded_vmcs); + crash_enable_local_vmclear(cpu); } static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) { - if (loaded_vmcs->cpu != -1) - smp_call_function_single( - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); + int cpu = loaded_vmcs->cpu; + + if (cpu != -1) + smp_call_function_single(cpu, + __loaded_vmcs_clear, loaded_vmcs, 1); } static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) @@ -1051,17 +1120,6 @@ static inline void ept_sync_context(u64 eptp) } } -static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) -{ - if (enable_ept) { - if (cpu_has_vmx_invept_individual_addr()) - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, - eptp, gpa); - else - ept_sync_context(eptp); - } -} - static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -1535,8 +1593,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); + crash_disable_local_vmclear(cpu); + + /* + * Read loaded_vmcs->cpu should be before fetching + * loaded_vmcs->loaded_vmcss_on_cpu_link. + * See the comments in __loaded_vmcs_clear(). + */ + smp_rmb(); + list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); local_irq_enable(); /* @@ -1646,7 +1714,6 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; @@ -1772,6 +1839,25 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) vmx->guest_msrs[from] = tmp; } +static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) +{ + unsigned long *msr_bitmap; + + if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + else + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + } else { + if (is_long_mode(vcpu)) + msr_bitmap = vmx_msr_bitmap_longmode; + else + msr_bitmap = vmx_msr_bitmap_legacy; + } + + vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); +} + /* * Set up the vmcs to automatically save and restore system * msrs. Don't touch the 64-bit msrs if the guest is in legacy @@ -1780,7 +1866,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) static void setup_msrs(struct vcpu_vmx *vmx) { int save_nmsrs, index; - unsigned long *msr_bitmap; save_nmsrs = 0; #ifdef CONFIG_X86_64 @@ -1812,14 +1897,8 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx->save_nmsrs = save_nmsrs; - if (cpu_has_vmx_msr_bitmap()) { - if (is_long_mode(&vmx->vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode; - else - msr_bitmap = vmx_msr_bitmap_legacy; - - vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); - } + if (cpu_has_vmx_msr_bitmap()) + vmx_set_msr_bitmap(&vmx->vcpu); } /* @@ -1839,11 +1918,10 @@ static u64 guest_read_tsc(void) * Like guest_read_tsc, but always returns L1's notion of the timestamp * counter, even if a nested guest (L2) is currently running. */ -u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 host_tsc, tsc_offset; + u64 tsc_offset; - rdtscll(host_tsc); tsc_offset = is_guest_mode(vcpu) ? to_vmx(vcpu)->nested.vmcs01_tsc_offset : vmcs_read64(TSC_OFFSET); @@ -1866,6 +1944,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) WARN(1, "user requested TSC rate below hardware speed\n"); } +static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) +{ + return vmcs_read64(TSC_OFFSET); +} + /* * writes 'offset' into guest's timestamp counter offset register */ @@ -2202,15 +2285,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; switch (msr_index) { case MSR_EFER: - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -2236,7 +2321,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - kvm_write_tsc(vcpu, data); + kvm_write_tsc(vcpu, msr_info); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -2244,7 +2329,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vcpu->arch.pat = data; break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); + break; + case MSR_IA32_TSC_ADJUST: + ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_TSC_AUX: if (!vmx->rdtscp_enabled) @@ -2267,7 +2355,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) } break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -2341,6 +2429,18 @@ static int hardware_enable(void *garbage) return -EBUSY; INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the loaded_vmcss_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the loaded_vmcss_on_cpu list is empty! + */ + crash_enable_local_vmclear(cpu); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); test_bits = FEATURE_CONTROL_LOCKED; @@ -2464,13 +2564,16 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min2 = 0; opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | SECONDARY_EXEC_RDTSCP | - SECONDARY_EXEC_ENABLE_INVPCID; + SECONDARY_EXEC_ENABLE_INVPCID | + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2481,6 +2584,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif + + if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) + _cpu_based_2nd_exec_control &= ~( + SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { /* CR3 accesses and invlpg don't need to cause VM Exits when EPT enabled */ @@ -2678,6 +2788,15 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ple()) ple_gap = 0; + if (!cpu_has_vmx_apic_register_virt() || + !cpu_has_vmx_virtual_intr_delivery()) + enable_apicv_reg_vid = 0; + + if (enable_apicv_reg_vid) + kvm_x86_ops->update_cr8_intercept = NULL; + else + kvm_x86_ops->hwapic_irr_update = NULL; + if (nested) nested_vmx_setup_ctls_msrs(); @@ -2689,17 +2808,28 @@ static __exit void hardware_unsetup(void) free_kvm_area(); } -static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment *save) +static bool emulation_required(struct kvm_vcpu *vcpu) { - const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - struct kvm_segment tmp = *save; + return emulate_invalid_guest_state && !guest_state_valid(vcpu); +} - if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { - tmp.base = vmcs_readl(sf->base); - tmp.selector = vmcs_read16(sf->selector); - tmp.s = 1; +static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, + struct kvm_segment *save) +{ + if (!emulate_invalid_guest_state) { + /* + * CS and SS RPL should be equal during guest entry according + * to VMX spec, but in reality it is not always so. Since vcpu + * is in the middle of the transition from real mode to + * protected mode it is safe to assume that RPL 0 is a good + * default value. + */ + if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS) + save->selector &= ~SELECTOR_RPL_MASK; + save->dpl = save->selector & SELECTOR_RPL_MASK; + save->s = 1; } - vmx_set_segment(vcpu, &tmp, seg); + vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2707,7 +2837,17 @@ static void enter_pmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); - vmx->emulation_required = 1; + /* + * Update real mode segment cache. It may be not up-to-date if sement + * register was written while vcpu was in a guest mode. + */ + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); + vmx->rmode.vm86_active = 0; vmx_segment_cache_clear(vmx); @@ -2724,22 +2864,16 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); - if (emulate_invalid_guest_state) - return; + fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); + fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - fix_pmode_dataseg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); - fix_pmode_dataseg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); - fix_pmode_dataseg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - fix_pmode_dataseg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); - - vmx_segment_cache_clear(vmx); - - vmcs_write16(GUEST_SS_SELECTOR, 0); - vmcs_write32(GUEST_SS_AR_BYTES, 0x93); - - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); + /* CPL is always 0 when CPU enters protected mode */ + __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + vmx->cpl = 0; } static gva_t rmode_tss_base(struct kvm *kvm) @@ -2761,36 +2895,51 @@ static gva_t rmode_tss_base(struct kvm *kvm) static void fix_rmode_seg(int seg, struct kvm_segment *save) { const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xffff0); - vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); - if (save->base & 0xf) - printk_once(KERN_WARNING "kvm: segment base is not paragraph" - " aligned when entering protected mode (seg=%d)", - seg); + struct kvm_segment var = *save; + + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; + + if (!emulate_invalid_guest_state) { + var.selector = var.base >> 4; + var.base = var.base & 0xffff0; + var.limit = 0xffff; + var.g = 0; + var.db = 0; + var.present = 1; + var.s = 1; + var.l = 0; + var.unusable = 0; + var.type = 0x3; + var.avl = 0; + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not " + "paragraph aligned when entering " + "protected mode (seg=%d)", seg); + } + + vmcs_write16(sf->selector, var.selector); + vmcs_write32(sf->base, var.base); + vmcs_write32(sf->limit, var.limit); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); } static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_segment var; - - if (enable_unrestricted_guest) - return; vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); + vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); - vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; - /* * Very old userspace does not call KVM_SET_TSS_ADDR before entering * vcpu. Call it here with phys address pointing 16M below 4G. @@ -2818,28 +2967,13 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); - if (emulate_invalid_guest_state) - goto continue_rmode; + fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); + fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - vmx_get_segment(vcpu, &var, VCPU_SREG_SS); - vmx_set_segment(vcpu, &var, VCPU_SREG_SS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_CS); - vmx_set_segment(vcpu, &var, VCPU_SREG_CS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_ES); - vmx_set_segment(vcpu, &var, VCPU_SREG_ES); - - vmx_get_segment(vcpu, &var, VCPU_SREG_DS); - vmx_set_segment(vcpu, &var, VCPU_SREG_DS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_GS); - vmx_set_segment(vcpu, &var, VCPU_SREG_GS); - - vmx_get_segment(vcpu, &var, VCPU_SREG_FS); - vmx_set_segment(vcpu, &var, VCPU_SREG_FS); - -continue_rmode: kvm_mmu_reset_context(vcpu); } @@ -2998,17 +3132,18 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); if (enable_unrestricted_guest) - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) - | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; - else - hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else { + hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) - enter_pmode(vcpu); + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) + enter_pmode(vcpu); - if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) - enter_rmode(vcpu); + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) + enter_rmode(vcpu); + } #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { @@ -3028,7 +3163,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* depends on vcpu->arch.cr0 to be set to a new value */ + vmx->emulation_required = emulation_required(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -3085,6 +3222,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!is_paging(vcpu)) { hw_cr4 &= ~X86_CR4_PAE; hw_cr4 |= X86_CR4_PSE; + /* + * SMEP is disabled if CPU is in non-paging mode in + * hardware. However KVM always uses paging mode to + * emulate guest non-paging mode with TDP. + * To emulate this behavior, SMEP needs to be manually + * disabled when guest switches to non-paging mode. + */ + hw_cr4 &= ~X86_CR4_SMEP; } else if (!(cr4 & X86_CR4_PAE)) { hw_cr4 &= ~X86_CR4_PAE; } @@ -3101,10 +3246,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); u32 ar; - if (vmx->rmode.vm86_active - && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES - || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS - || seg == VCPU_SREG_GS)) { + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { *var = vmx->rmode.segs[seg]; if (seg == VCPU_SREG_TR || var->selector == vmx_read_guest_seg_selector(vmx, seg)) @@ -3117,8 +3259,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = vmx_read_guest_seg_limit(vmx, seg); var->selector = vmx_read_guest_seg_selector(vmx, seg); ar = vmx_read_guest_seg_ar(vmx, seg); - if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) - ar = 0; var->type = ar & 15; var->s = (ar >> 4) & 1; var->dpl = (ar >> 5) & 3; @@ -3141,8 +3281,10 @@ static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -static int __vmx_get_cpl(struct kvm_vcpu *vcpu) +static int vmx_get_cpl(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + if (!is_protmode(vcpu)) return 0; @@ -3150,24 +3292,9 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; - return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; -} - -static int vmx_get_cpl(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations - * fail; use the cache instead. - */ - if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { - return vmx->cpl; - } - if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - vmx->cpl = __vmx_get_cpl(vcpu); + vmx->cpl = vmx_read_guest_seg_selector(vmx, VCPU_SREG_CS) & 3; } return vmx->cpl; @@ -3199,28 +3326,23 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - u32 ar; vmx_segment_cache_clear(vmx); + if (seg == VCPU_SREG_CS) + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { - vmcs_write16(sf->selector, var->selector); - vmx->rmode.segs[VCPU_SREG_TR] = *var; - return; + if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { + vmx->rmode.segs[seg] = *var; + if (seg == VCPU_SREG_TR) + vmcs_write16(sf->selector, var->selector); + else if (var->s) + fix_rmode_seg(seg, &vmx->rmode.segs[seg]); + goto out; } + vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vmx->rmode.vm86_active && var->s) { - vmx->rmode.segs[seg] = *var; - /* - * Hack real-mode segments into vm86 compatibility. - */ - if (var->base == 0xffff0000 && var->selector == 0xf000) - vmcs_writel(sf->base, 0xf0000); - ar = 0xf3; - } else - ar = vmx_segment_access_rights(var); /* * Fix the "Accessed" bit in AR field of segment registers for older @@ -3234,42 +3356,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, * kvm hack. */ if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) - ar |= 0x1; /* Accessed */ + var->type |= 0x1; /* Accessed */ - vmcs_write32(sf->ar_bytes, ar); - __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); - /* - * Fix segments for real mode guest in hosts that don't have - * "unrestricted_mode" or it was disabled. - * This is done to allow migration of the guests from hosts with - * unrestricted guest like Westmere to older host that don't have - * unrestricted guest like Nehelem. - */ - if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { - switch (seg) { - case VCPU_SREG_CS: - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, - vmcs_readl(GUEST_CS_BASE) >> 4); - break; - case VCPU_SREG_ES: - case VCPU_SREG_DS: - case VCPU_SREG_GS: - case VCPU_SREG_FS: - fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - break; - case VCPU_SREG_SS: - vmcs_write16(GUEST_SS_SELECTOR, - vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); - break; - } - } +out: + vmx->emulation_required |= emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3310,13 +3402,16 @@ static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) u32 ar; vmx_get_segment(vcpu, &var, seg); + var.dpl = 0x3; + if (seg == VCPU_SREG_CS) + var.type = 0x3; ar = vmx_segment_access_rights(&var); if (var.base != (var.selector << 4)) return false; - if (var.limit < 0xffff) + if (var.limit != 0xffff) return false; - if (((ar | (3 << AR_DPL_SHIFT)) & ~(AR_G_MASK | AR_DB_MASK)) != 0xf3) + if (ar != 0xf3) return false; return true; @@ -3451,6 +3546,9 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) */ static bool guest_state_valid(struct kvm_vcpu *vcpu) { + if (enable_unrestricted_guest) + return true; + /* real mode guest state checks */ if (!is_protmode(vcpu)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) @@ -3574,12 +3672,9 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - if (enable_unrestricted_guest) { - ar = 0x93; - if (seg == VCPU_SREG_CS) - ar |= 0x08; /* code segment */ - } else - ar = 0xf3; + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ vmcs_write32(sf->ar_bytes, ar); } @@ -3597,7 +3692,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); if (r) goto out; @@ -3627,7 +3722,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); if (r) goto out; @@ -3669,7 +3764,45 @@ static void free_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 +static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) +{ + int f = sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + if (msr <= 0x1fff) { + if (type & MSR_TYPE_R) + /* read-low */ + __clear_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __clear_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if (type & MSR_TYPE_R) + /* read-high */ + __clear_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); + + } +} + +static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, + u32 msr, int type) { int f = sizeof(unsigned long); @@ -3682,20 +3815,58 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ + if (type & MSR_TYPE_R) + /* read-low */ + __set_bit(msr, msr_bitmap + 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + __set_bit(msr, msr_bitmap + 0x800 / f); + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { msr &= 0x1fff; - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ + if (type & MSR_TYPE_R) + /* read-high */ + __set_bit(msr, msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* write-high */ + __set_bit(msr, msr_bitmap + 0xc00 / f); + } } static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) { if (!longmode_only) - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + msr, MSR_TYPE_R | MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, + msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void vmx_enable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_read_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_R); +} + +static void vmx_disable_intercept_msr_write_x2apic(u32 msr) +{ + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, + msr, MSR_TYPE_W); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, + msr, MSR_TYPE_W); } /* @@ -3774,6 +3945,11 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx) return exec_control; } +static int vmx_vm_has_apicv(struct kvm *kvm) +{ + return enable_apicv_reg_vid && irqchip_in_kernel(kvm); +} + static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -3791,6 +3967,10 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; if (!ple_gap) exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; + if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) + exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); + exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; return exec_control; } @@ -3835,6 +4015,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx_secondary_exec_control(vmx)); } + if (enable_apicv_reg_vid) { + vmcs_write64(EOI_EXIT_BITMAP0, 0); + vmcs_write64(EOI_EXIT_BITMAP1, 0); + vmcs_write64(EOI_EXIT_BITMAP2, 0); + vmcs_write64(EOI_EXIT_BITMAP3, 0); + + vmcs_write16(GUEST_INTR_STATUS, 0); + } + if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); vmcs_write32(PLE_WINDOW, ple_window); @@ -3897,8 +4086,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); set_cr4_guest_host_mask(vmx); - kvm_write_tsc(&vmx->vcpu, 0); - return 0; } @@ -3908,8 +4095,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -3921,21 +4106,12 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) msr |= MSR_IA32_APICBASE_BSP; kvm_set_apic_base(&vmx->vcpu, msr); - ret = fx_init(&vmx->vcpu); - if (ret != 0) - goto out; - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); - /* - * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode - * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. - */ - if (kvm_vcpu_is_bsp(&vmx->vcpu)) { + if (kvm_vcpu_is_bsp(&vmx->vcpu)) vmcs_write16(GUEST_CS_SELECTOR, 0xf000); - vmcs_writel(GUEST_CS_BASE, 0x000f0000); - } else { + else { vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } @@ -3965,7 +4141,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) kvm_rip_write(vcpu, 0xfff0); else kvm_rip_write(vcpu, 0); - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4012,10 +4187,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ret = 0; - /* HACK: Don't enable emulation on guest boot/reset */ - vmx->emulation_required = 0; - -out: return ret; } @@ -4191,7 +4362,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem, 0); + ret = kvm_set_memory_region(kvm, &tss_mem, false); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -4201,28 +4372,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static int handle_rmode_exception(struct kvm_vcpu *vcpu, - int vec, u32 err_code) +static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) { - /* - * Instruction with address size override prefix opcode 0x67 - * Cause the #SS fault with 0 error code in VM86 mode. - */ - if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, 0) == EMULATE_DONE) - return 1; - /* - * Forward all other exceptions that are valid in real mode. - * FIXME: Breaks guest debugging in real mode, needs to be fixed with - * the required debugging infrastructure rework. - */ switch (vec) { - case DB_VECTOR: - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return 0; - kvm_queue_exception(vcpu, vec); - return 1; case BP_VECTOR: /* * Update instruction length as we may reinject the exception @@ -4231,7 +4383,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) - return 0; + return false; + /* fall through */ + case DB_VECTOR: + if (vcpu->guest_debug & + (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) + return false; /* fall through */ case DE_VECTOR: case OF_VECTOR: @@ -4241,10 +4398,37 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, case SS_VECTOR: case GP_VECTOR: case MF_VECTOR: - kvm_queue_exception(vcpu, vec); - return 1; + return true; + break; } - return 0; + return false; +} + +static int handle_rmode_exception(struct kvm_vcpu *vcpu, + int vec, u32 err_code) +{ + /* + * Instruction with address size override prefix opcode 0x67 + * Cause the #SS fault with 0 error code in VM86 mode. + */ + if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { + if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { + if (vcpu->arch.halt_request) { + vcpu->arch.halt_request = 0; + return kvm_emulate_halt(vcpu); + } + return 1; + } + return 0; + } + + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + kvm_queue_exception(vcpu, vec); + return 1; } /* @@ -4287,16 +4471,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 2; - vcpu->run->internal.data[0] = vect_info; - vcpu->run->internal.data[1] = intr_info; - return 0; - } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -4315,6 +4489,22 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = 0; if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); + + /* + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing + * MMIO, it is better to report an internal error. + * See the comments in vmx_handle_exit. + */ + if ((vect_info & VECTORING_INFO_VALID_MASK) && + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } + if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ BUG_ON(enable_ept); @@ -4326,17 +4516,11 @@ static int handle_exception(struct kvm_vcpu *vcpu) return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); } - if (vmx->rmode.vm86_active && - handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, - error_code)) { - if (vcpu->arch.halt_request) { - vcpu->arch.halt_request = 0; - return kvm_emulate_halt(vcpu); - } - return 1; - } - ex_no = intr_info & INTR_INFO_VECTOR_MASK; + + if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) + return handle_rmode_exception(vcpu, ex_no, error_code); + switch (ex_no) { case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); @@ -4626,11 +4810,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) static int handle_wrmsr(struct kvm_vcpu *vcpu) { + struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (vmx_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; @@ -4750,6 +4938,26 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) return emulate_instruction(vcpu, 0) == EMULATE_DONE; } +static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int vector = exit_qualification & 0xff; + + /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_set_eoi_accelerated(vcpu, vector); + return 1; +} + +static int handle_apic_write(struct kvm_vcpu *vcpu) +{ + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + u32 offset = exit_qualification & 0xfff; + + /* APIC-write VM exit is trap-like and thus no need to adjust IP */ + kvm_apic_write_nodecode(vcpu, offset); + return 1; +} + static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4827,11 +5035,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - if (exit_qualification & (1 << 6)) { - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); - return -EINVAL; - } - gla_validity = (exit_qualification >> 7) & 0x3; if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); @@ -5000,7 +5203,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = !guest_state_valid(vcpu); + vmx->emulation_required = emulation_required(vcpu); out: return ret; } @@ -5689,6 +5892,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_VMON] = handle_vmon, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_APIC_WRITE] = handle_apic_write, + [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, [EXIT_REASON_WBINVD] = handle_wbinvd, [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, @@ -5715,7 +5920,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; gpa_t bitmap; - if (!nested_cpu_has(get_vmcs12(vcpu), CPU_BASED_USE_MSR_BITMAPS)) + if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) return 1; /* @@ -5943,7 +6148,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 vectoring_info = vmx->idt_vectoring_info; /* If guest state is invalid, start emulating */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); /* @@ -5979,13 +6184,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } + /* + * Note: + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by + * delivery event since it indicates guest is accessing MMIO. + * The vm-exit can be triggered again after return to guest that + * will cause infinite loop. + */ if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && exit_reason != EXIT_REASON_EPT_VIOLATION && - exit_reason != EXIT_REASON_TASK_SWITCH)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info " - "(0x%x) and exit reason is 0x%x\n", - __func__, vectoring_info, exit_reason); + exit_reason != EXIT_REASON_TASK_SWITCH)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vectoring_info; + vcpu->run->internal.data[1] = exit_reason; + return 0; + } if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( @@ -6027,6 +6243,85 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } +static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) +{ + u32 sec_exec_control; + + /* + * There is not point to enable virtualize x2apic without enable + * apicv + */ + if (!cpu_has_vmx_virtualize_x2apic_mode() || + !vmx_vm_has_apicv(vcpu->kvm)) + return; + + if (!vm_need_tpr_shadow(vcpu->kvm)) + return; + + sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + if (set) { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + } else { + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); + + vmx_set_msr_bitmap(vcpu); +} + +static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +{ + u16 status; + u8 old; + + if (!vmx_vm_has_apicv(kvm)) + return; + + if (isr == -1) + isr = 0; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = status >> 8; + if (isr != old) { + status &= 0xff; + status |= isr << 8; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_set_rvi(int vector) +{ + u16 status; + u8 old; + + status = vmcs_read16(GUEST_INTR_STATUS); + old = (u8)status & 0xff; + if ((u8)vector != old) { + status &= ~0xff; + status |= (u8)vector; + vmcs_write16(GUEST_INTR_STATUS, status); + } +} + +static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ + if (max_irr == -1) + return; + + vmx_set_rvi(max_irr); +} + +static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) +{ + vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); + vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); + vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); + vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); +} + static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -6215,7 +6510,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ - if (vmx->emulation_required && emulate_invalid_guest_state) + if (vmx->emulation_required) return; if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) @@ -6549,19 +6844,22 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); /* Exposing INVPCID only when PCID is exposed */ best = kvm_find_cpuid_entry(vcpu, 0x7, 0); if (vmx_invpcid_supported() && best && (best->ebx & bit(X86_FEATURE_INVPCID)) && guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } else { - exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, - exec_control); + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } if (best) best->ebx &= ~bit(X86_FEATURE_INVPCID); } @@ -7287,6 +7585,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, + .vm_has_apicv = vmx_vm_has_apicv, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7306,6 +7609,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, .set_tsc_khz = vmx_set_tsc_khz, + .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, @@ -7318,7 +7622,7 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - int r, i; + int r, i, msr; rdmsrl_safe(MSR_EFER, &host_efer); @@ -7339,11 +7643,19 @@ static int __init vmx_init(void) if (!vmx_msr_bitmap_legacy) goto out1; + vmx_msr_bitmap_legacy_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_legacy_x2apic) + goto out2; vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_msr_bitmap_longmode) - goto out2; + goto out3; + vmx_msr_bitmap_longmode_x2apic = + (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_msr_bitmap_longmode_x2apic) + goto out4; /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7364,12 +7676,39 @@ static int __init vmx_init(void) if (r) goto out3; +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false); vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); + memcpy(vmx_msr_bitmap_legacy_x2apic, + vmx_msr_bitmap_legacy, PAGE_SIZE); + memcpy(vmx_msr_bitmap_longmode_x2apic, + vmx_msr_bitmap_longmode, PAGE_SIZE); + + if (enable_apicv_reg_vid) { + for (msr = 0x800; msr <= 0x8ff; msr++) + vmx_disable_intercept_msr_read_x2apic(msr); + + /* According SDM, in x2apic mode, the whole id reg is used. + * But in KVM, it only use the highest eight bits. Need to + * intercept it */ + vmx_enable_intercept_msr_read_x2apic(0x802); + /* TMCCT */ + vmx_enable_intercept_msr_read_x2apic(0x839); + /* TPR */ + vmx_disable_intercept_msr_write_x2apic(0x808); + /* EOI */ + vmx_disable_intercept_msr_write_x2apic(0x80b); + /* SELF-IPI */ + vmx_disable_intercept_msr_write_x2apic(0x83f); + } if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, @@ -7383,8 +7722,10 @@ static int __init vmx_init(void) return 0; -out3: +out4: free_page((unsigned long)vmx_msr_bitmap_longmode); +out3: + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); out2: free_page((unsigned long)vmx_msr_bitmap_legacy); out1: @@ -7396,11 +7737,18 @@ out: static void __exit vmx_exit(void) { + free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); + free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); free_page((unsigned long)vmx_msr_bitmap_legacy); free_page((unsigned long)vmx_msr_bitmap_longmode); free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + kvm_exit(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 224a7e78cb6..f71500af1f8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -46,6 +46,8 @@ #include <linux/uaccess.h> #include <linux/hash.h> #include <linux/pci.h> +#include <linux/timekeeper_internal.h> +#include <linux/pvclock_gtod.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -118,7 +120,7 @@ struct kvm_shared_msrs { }; static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; -static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); +static struct kvm_shared_msrs __percpu *shared_msrs; struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { @@ -187,10 +191,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn) static void shared_msr_update(unsigned slot, u32 msr) { - struct kvm_shared_msrs *smsr; u64 value; + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); - smsr = &__get_cpu_var(shared_msrs); /* only read, and nobody should modify it at this time, * so don't need lock */ if (slot >= shared_msrs_global.nr) { @@ -222,7 +226,8 @@ static void kvm_shared_msr_cpu_online(void) void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); if (((value ^ smsr->values[slot].curr) & mask) == 0) return; @@ -238,7 +243,8 @@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr); static void drop_user_return_notifiers(void *ignore) { - struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + unsigned int cpu = smp_processor_id(); + struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); if (smsr->registered) kvm_on_user_return(&smsr->urn); @@ -633,7 +639,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) return 1; } else @@ -827,6 +833,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static const u32 emulated_msrs[] = { + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, @@ -865,8 +872,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer) kvm_x86_ops->set_efer(vcpu, efer); - vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; - /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) kvm_mmu_reset_context(vcpu); @@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); + return kvm_x86_ops->set_msr(vcpu, msr); } /* @@ -896,8 +901,62 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return kvm_set_msr(vcpu, index, *data); + struct msr_data msr; + + msr.data = *data; + msr.index = index; + msr.host_initiated = true; + return kvm_set_msr(vcpu, &msr); +} + +#ifdef CONFIG_X86_64 +struct pvclock_gtod_data { + seqcount_t seq; + + struct { /* extract of a clocksource struct */ + int vclock_mode; + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; + + /* open coded 'struct timespec' */ + u64 monotonic_time_snsec; + time_t monotonic_time_sec; +}; + +static struct pvclock_gtod_data pvclock_gtod_data; + +static void update_pvclock_gtod(struct timekeeper *tk) +{ + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; + + write_seqcount_begin(&vdata->seq); + + /* copy pvclock gtod data */ + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->clock->cycle_last; + vdata->clock.mask = tk->clock->mask; + vdata->clock.mult = tk->mult; + vdata->clock.shift = tk->shift; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->xtime_nsec + + (tk->wall_to_monotonic.tv_nsec + << tk->shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->shift; + vdata->monotonic_time_sec++; + } + + write_seqcount_end(&vdata->seq); } +#endif + static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { @@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void) return timespec_to_ns(&ts); } +#ifdef CONFIG_X86_64 +static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); +#endif + static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; @@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_X86_64 + bool vcpus_matched; + bool do_request = false; + struct kvm_arch *ka = &vcpu->kvm->arch; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&vcpu->kvm->online_vcpus)); + + if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) + if (!ka->use_master_clock) + do_request = 1; + + if (!vcpus_matched && ka->use_master_clock) + do_request = 1; + + if (do_request) + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + + trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, + atomic_read(&vcpu->kvm->online_vcpus), + ka->use_master_clock, gtod->clock.vclock_mode); +#endif +} + +static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) +{ + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; s64 usdiff; + bool matched; + u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + matched = true; } else { /* * We split periods of matched TSC writes into generations. @@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.cur_tsc_nsec = ns; kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; + matched = false; pr_debug("kvm: new tsc generation %u, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) + update_ia32_tsc_adjust_msr(vcpu, offset); kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + if (matched) + kvm->arch.nr_vcpus_matched_tsc++; + else + kvm->arch.nr_vcpus_matched_tsc = 0; + + kvm_track_tsc_matching(vcpu); + spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); } EXPORT_SYMBOL_GPL(kvm_write_tsc); +#ifdef CONFIG_X86_64 + +static cycle_t read_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = pvclock_gtod_data.clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +static inline u64 vgettsc(cycle_t *cycle_now) +{ + long v; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + *cycle_now = read_tsc(); + + v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; + return v * gtod->clock.mult; +} + +static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) +{ + unsigned long seq; + u64 ns; + int mode; + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + + ts->tv_nsec = 0; + do { + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); + + return mode; +} + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) +{ + struct timespec ts; + + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) + return false; + + monotonic_to_bootbased(&ts); + *kernel_ns = timespec_to_ns(&ts); + + return true; +} +#endif + +/* + * + * Assuming a stable TSC across physical CPUS, and a stable TSC + * across virtual CPUs, the following condition is possible. + * Each numbered line represents an event visible to both + * CPUs at the next numbered event. + * + * "timespecX" represents host monotonic time. "tscX" represents + * RDTSC value. + * + * VCPU0 on CPU0 | VCPU1 on CPU1 + * + * 1. read timespec0,tsc0 + * 2. | timespec1 = timespec0 + N + * | tsc1 = tsc0 + M + * 3. transition to guest | transition to guest + * 4. ret0 = timespec0 + (rdtsc - tsc0) | + * 5. | ret1 = timespec1 + (rdtsc - tsc1) + * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) + * + * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: + * + * - ret0 < ret1 + * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) + * ... + * - 0 < N - M => M < N + * + * That is, when timespec0 != timespec1, M < N. Unfortunately that is not + * always the case (the difference between two distinct xtime instances + * might be smaller then the difference between corresponding TSC reads, + * when updating guest vcpus pvclock areas). + * + * To avoid that problem, do not allow visibility of distinct + * system_timestamp/tsc_timestamp values simultaneously: use a master + * copy of host monotonic time values. Update that master copy + * in lockstep. + * + * Rely on synchronization of host TSCs and guest TSCs for monotonicity. + * + */ + +static void pvclock_update_vm_gtod_copy(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + struct kvm_arch *ka = &kvm->arch; + int vclock_mode; + bool host_tsc_clocksource, vcpus_matched; + + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == + atomic_read(&kvm->online_vcpus)); + + /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + host_tsc_clocksource = kvm_get_time_and_clockread( + &ka->master_kernel_ns, + &ka->master_cycle_now); + + ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + + if (ka->use_master_clock) + atomic_set(&kvm_guest_has_master_clock, 1); + + vclock_mode = pvclock_gtod_data.clock.vclock_mode; + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, + vcpus_matched); +#endif +} + static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags; + unsigned long flags, this_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; + struct kvm_arch *ka = &v->kvm->arch; void *shared_kaddr; - unsigned long this_tsc_khz; s64 kernel_ns, max_kernel_ns; - u64 tsc_timestamp; + u64 tsc_timestamp, host_tsc; + struct pvclock_vcpu_time_info *guest_hv_clock; u8 pvclock_flags; + bool use_master_clock; + + kernel_ns = 0; + host_tsc = 0; /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); - kernel_ns = get_kernel_ns(); this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); @@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) } /* + * If the host uses TSC clock, then passthrough TSC as stable + * to the guest. + */ + spin_lock(&ka->pvclock_gtod_sync_lock); + use_master_clock = ka->use_master_clock; + if (use_master_clock) { + host_tsc = ka->master_cycle_now; + kernel_ns = ka->master_kernel_ns; + } + spin_unlock(&ka->pvclock_gtod_sync_lock); + if (!use_master_clock) { + host_tsc = native_read_tsc(); + kernel_ns = get_kernel_ns(); + } + + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + + /* * We may have to catch up the TSC to match elapsed wall clock * time for two reasons, even if kvmclock is used. * 1) CPU could have been running below the maximum TSC rate @@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->hw_tsc_khz = this_tsc_khz; } - if (max_kernel_ns > kernel_ns) - kernel_ns = max_kernel_ns; - + /* with a master <monotonic time, tsc value> tuple, + * pvclock clock reads always increase at the (scaled) rate + * of guest TSC - no need to deal with sampling errors. + */ + if (!use_master_clock) { + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + } /* With all the info we got, fill in the values */ vcpu->hv_clock.tsc_timestamp = tsc_timestamp; vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; vcpu->last_kernel_ns = kernel_ns; vcpu->last_guest_tsc = tsc_timestamp; - pvclock_flags = 0; - if (vcpu->pvclock_set_guest_stopped_request) { - pvclock_flags |= PVCLOCK_GUEST_STOPPED; - vcpu->pvclock_set_guest_stopped_request = false; - } - - vcpu->hv_clock.flags = pvclock_flags; - /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) shared_kaddr = kmap_atomic(vcpu->time_page); + guest_hv_clock = shared_kaddr + vcpu->time_offset; + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + /* If the host uses TSC clocksource, then it is stable */ + if (use_master_clock) + pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; + + vcpu->hv_clock.flags = pvclock_flags; + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); @@ -1572,11 +1872,21 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; switch (msr) { + case MSR_AMD64_NB_CFG: + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case MSR_VM_HSAVE_PA: + case MSR_AMD64_PATCH_LOADER: + case MSR_AMD64_BU_CFG2: + break; + case MSR_EFER: return set_efer(vcpu, data); case MSR_K7_HWCR: @@ -1596,8 +1906,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 1; } break; - case MSR_AMD64_NB_CFG: - break; case MSR_IA32_DEBUGCTLMSR: if (!data) { /* We support the non-activated case already */ @@ -1610,11 +1918,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", __func__, data); break; - case MSR_IA32_UCODE_REV: - case MSR_IA32_UCODE_WRITE: - case MSR_VM_HSAVE_PA: - case MSR_AMD64_PATCH_LOADER: - break; case 0x200 ... 0x2ff: return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: @@ -1625,6 +1928,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_TSCDEADLINE: kvm_set_lapic_tscdeadline_msr(vcpu, data); break; + case MSR_IA32_TSC_ADJUST: + if (guest_cpuid_has_tsc_adjust(vcpu)) { + if (!msr_info->host_initiated) { + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); + } + vcpu->arch.ia32_tsc_adjust_msr = data; + } + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1940,6 +2252,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: + case MSR_AMD64_BU_CFG2: data = 0; break; case MSR_P6_PERFCTR0: @@ -1984,6 +2297,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_TSCDEADLINE: data = kvm_get_lapic_tscdeadline_msr(vcpu); break; + case MSR_IA32_TSC_ADJUST: + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -2204,7 +2520,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: - r = KVM_MEMORY_SLOTS; + r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_PV_MMU: /* obsolete */ r = 0; @@ -2342,7 +2658,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + /* + * On a host with synchronized TSC, there is no need to update + * kvmclock on vcpu->cpu migration + */ + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2691,15 +3012,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!vcpu->arch.apic) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) { - r = PTR_ERR(u.lapic); - goto out; - } + if (IS_ERR(u.lapic)) + return PTR_ERR(u.lapic); r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); - if (r) - goto out; - r = 0; break; } case KVM_INTERRUPT: { @@ -2709,16 +3025,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&irq, argp, sizeof irq)) goto out; r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - if (r) - goto out; - r = 0; break; } case KVM_NMI: { r = kvm_vcpu_ioctl_nmi(vcpu); - if (r) - goto out; - r = 0; break; } case KVM_SET_CPUID: { @@ -2729,8 +3039,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) goto out; r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_SET_CPUID2: { @@ -2742,8 +3050,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp, goto out; r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, cpuid_arg->entries); - if (r) - goto out; break; } case KVM_GET_CPUID2: { @@ -2875,10 +3181,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) { - r = PTR_ERR(u.xsave); - goto out; - } + if (IS_ERR(u.xsave)) + return PTR_ERR(u.xsave); r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -2900,10 +3204,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) { - r = PTR_ERR(u.xcrs); - goto out; - } + if (IS_ERR(u.xcrs)) + return PTR_ERR(u.xcrs); r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -2951,7 +3253,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) int ret; if (addr > (unsigned int)(-3 * PAGE_SIZE)) - return -1; + return -EINVAL; ret = kvm_x86_ops->set_tss_addr(kvm, addr); return ret; } @@ -2970,12 +3272,10 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return -EINVAL; mutex_lock(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; - spin_unlock(&kvm->mmu_lock); mutex_unlock(&kvm->slots_lock); return 0; } @@ -3135,7 +3435,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) mutex_lock(&kvm->slots_lock); r = -EINVAL; - if (log->slot >= KVM_MEMORY_SLOTS) + if (log->slot >= KVM_USER_MEM_SLOTS) goto out; memslot = id_to_memslot(kvm->memslots, log->slot); @@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp, switch (ioctl) { case KVM_SET_TSS_ADDR: r = kvm_vm_ioctl_set_tss_addr(kvm, arg); - if (r < 0) - goto out; break; case KVM_SET_IDENTITY_MAP_ADDR: { u64 ident_addr; @@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) goto out; r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); - if (r < 0) - goto out; break; } case KVM_SET_NR_MMU_PAGES: r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); - if (r) - goto out; break; case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); @@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; get_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_SET_IRQCHIP: { @@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; set_irqchip_out: kfree(chip); - if (r) - goto out; break; } case KVM_GET_PIT: { @@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit(kvm, &u.ps); - if (r) - goto out; - r = 0; break; } case KVM_GET_PIT2: { @@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (!kvm->arch.vpit) goto out; r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); - if (r) - goto out; - r = 0; break; } case KVM_REINJECT_CONTROL: { @@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp, if (copy_from_user(&control, argp, sizeof(control))) goto out; r = kvm_vm_ioctl_reinject(kvm, &control); - if (r) - goto out; - r = 0; break; } case KVM_XEN_HVM_CONFIG: { @@ -4210,8 +4491,10 @@ static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); *selector = var.selector; - if (var.unusable) + if (var.unusable) { + memset(desc, 0, sizeof(*desc)); return false; + } if (var.g) var.limit >>= 12; @@ -4273,7 +4556,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data) { - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); + struct msr_data msr; + + msr.data = data; + msr.index = msr_index; + msr.host_initiated = false; + return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -4467,26 +4755,26 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) return r; } -static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, + bool write_fault_to_shadow_pgtable) { - gpa_t gpa; + gpa_t gpa = cr2; pfn_t pfn; - if (tdp_enabled) - return false; - - /* - * if emulation was due to access to shadowed page table - * and it failed try to unshadow page and re-enter the - * guest to let CPU execute the instruction. - */ - if (kvm_mmu_unprotect_page_virt(vcpu, gva)) - return true; - - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + if (!vcpu->arch.mmu.direct_map) { + /* + * Write permission should be allowed since only + * write access need to be emulated. + */ + gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - if (gpa == UNMAPPED_GVA) - return true; /* let cpu generate fault */ + /* + * If the mapping is invalid in guest, let cpu retry + * it to generate fault. + */ + if (gpa == UNMAPPED_GVA) + return true; + } /* * Do not retry the unhandleable instruction if it faults on the @@ -4495,12 +4783,43 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) * instruction -> ... */ pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); - if (!is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); + + /* + * If the instruction failed on the error pfn, it can not be fixed, + * report the error to userspace. + */ + if (is_error_noslot_pfn(pfn)) + return false; + + kvm_release_pfn_clean(pfn); + + /* The instructions are well-emulated on direct mmu. */ + if (vcpu->arch.mmu.direct_map) { + unsigned int indirect_shadow_pages; + + spin_lock(&vcpu->kvm->mmu_lock); + indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages; + spin_unlock(&vcpu->kvm->mmu_lock); + + if (indirect_shadow_pages) + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + return true; } - return false; + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-enter the + * guest to let CPU execute the instruction. + */ + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); + + /* + * If the access faults on its page table, it can not + * be fixed by unprotecting shadow page and it should + * be reported to userspace. + */ + return !write_fault_to_shadow_pgtable; } static bool retry_instruction(struct x86_emulate_ctxt *ctxt, @@ -4542,7 +4861,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, if (!vcpu->arch.mmu.direct_map) gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL); - kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)); return true; } @@ -4559,7 +4878,13 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, int r; struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; bool writeback = true; + bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; + /* + * Clear write_fault_to_shadow_pgtable here to ensure it is + * never reused. + */ + vcpu->arch.write_fault_to_shadow_pgtable = false; kvm_clear_exception_queue(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { @@ -4578,7 +4903,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; - if (reexecute_instruction(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2, + write_fault_to_spt)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) return EMULATE_FAIL; @@ -4608,7 +4934,7 @@ restart: return EMULATE_DONE; if (r == EMULATION_FAILED) { - if (reexecute_instruction(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2, write_fault_to_spt)) return EMULATE_DONE; return handle_emulation_failure(vcpu); @@ -4881,6 +5207,50 @@ static void kvm_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(mask); } +#ifdef CONFIG_X86_64 +static void pvclock_gtod_update_fn(struct work_struct *work) +{ + struct kvm *kvm; + + struct kvm_vcpu *vcpu; + int i; + + raw_spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); + atomic_set(&kvm_guest_has_master_clock, 0); + raw_spin_unlock(&kvm_lock); +} + +static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); + +/* + * Notification about pvclock gtod data update. + */ +static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, + void *priv) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + struct timekeeper *tk = priv; + + update_pvclock_gtod(tk); + + /* disable master clock if host does not trust, or does not + * use, TSC clocksource + */ + if (gtod->clock.vclock_mode != VCLOCK_TSC && + atomic_read(&kvm_guest_has_master_clock) != 0) + queue_work(system_long_wq, &pvclock_gtod_work); + + return 0; +} + +static struct notifier_block pvclock_gtod_notifier = { + .notifier_call = pvclock_gtod_notify, +}; +#endif + int kvm_arch_init(void *opaque) { int r; @@ -4903,9 +5273,16 @@ int kvm_arch_init(void *opaque) goto out; } + r = -ENOMEM; + shared_msrs = alloc_percpu(struct kvm_shared_msrs); + if (!shared_msrs) { + printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n"); + goto out; + } + r = kvm_mmu_module_init(); if (r) - goto out; + goto out_free_percpu; kvm_set_mmio_spte_mask(); kvm_init_msr_list(); @@ -4922,8 +5299,14 @@ int kvm_arch_init(void *opaque) host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); kvm_lapic_init(); +#ifdef CONFIG_X86_64 + pvclock_gtod_register_notifier(&pvclock_gtod_notifier); +#endif + return 0; +out_free_percpu: + free_percpu(shared_msrs); out: return r; } @@ -4936,8 +5319,12 @@ void kvm_arch_exit(void) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#ifdef CONFIG_X86_64 + pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); +#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); + free_percpu(shared_msrs); } int kvm_emulate_halt(struct kvm_vcpu *vcpu) @@ -5059,7 +5446,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) +static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; @@ -5190,7 +5577,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } - } else if (kvm_cpu_has_interrupt(vcpu)) { + } else if (kvm_cpu_has_injectable_intr(vcpu)) { if (kvm_x86_ops->interrupt_allowed(vcpu)) { kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); @@ -5235,6 +5622,39 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +static void kvm_gen_update_masterclock(struct kvm *kvm) +{ +#ifdef CONFIG_X86_64 + int i; + struct kvm_vcpu *vcpu; + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(i, vcpu, kvm) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + + /* guest entries allowed */ + kvm_for_each_vcpu(i, vcpu, kvm) + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + + spin_unlock(&ka->pvclock_gtod_sync_lock); +#endif +} + +static void update_eoi_exitmap(struct kvm_vcpu *vcpu) +{ + u64 eoi_exit_bitmap[4]; + + memset(eoi_exit_bitmap, 0, 32); + + kvm_ioapic_calculate_eoi_exitmap(vcpu, eoi_exit_bitmap); + kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; @@ -5247,6 +5667,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); + if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) + kvm_gen_update_masterclock(vcpu->kvm); if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { r = kvm_guest_time_update(vcpu); if (unlikely(r)) @@ -5286,6 +5708,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_handle_pmu_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) kvm_deliver_pmi(vcpu); + if (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu)) + update_eoi_exitmap(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -5294,10 +5718,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* enable NMI/IRQ window open exits if needed */ if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { + /* + * Update architecture specific hints for APIC + * virtual interrupt delivery. + */ + if (kvm_x86_ops->hwapic_irr_update) + kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_lapic_find_highest_irr(vcpu)); update_cr8_intercept(vcpu); kvm_lapic_sync_to_vapic(vcpu); } @@ -5362,7 +5793,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, + native_read_tsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -5419,7 +5851,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -5781,6 +6213,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int pending_vec, max_bits, idx; struct desc_ptr dt; + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) + return -EINVAL; + dt.size = sregs->idt.limit; dt.address = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -6044,7 +6479,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); if (r) return r; - r = kvm_arch_vcpu_reset(vcpu); + r = kvm_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); vcpu_put(vcpu); @@ -6052,6 +6487,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) return r; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + int r; + struct msr_data msr; + + r = vcpu_load(vcpu); + if (r) + return r; + msr.data = 0x0; + msr.index = MSR_IA32_TSC; + msr.host_initiated = true; + kvm_write_tsc(vcpu, &msr); + vcpu_put(vcpu); + + return r; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { int r; @@ -6066,7 +6518,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_free(vcpu); } -int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) +static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) { atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; @@ -6089,6 +6541,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_pmu_reset(vcpu); + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -6165,6 +6621,8 @@ int kvm_arch_hardware_enable(void *garbage) kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; vcpu->arch.last_host_tsc = local_tsc; + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, + &vcpu->requests); } /* @@ -6255,10 +6713,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) goto fail_free_mce_banks; + r = fx_init(vcpu); + if (r) + goto fail_free_wbinvd_dirty_mask; + + vcpu->arch.ia32_tsc_adjust_msr = 0x0; kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); return 0; +fail_free_wbinvd_dirty_mask: + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: @@ -6302,6 +6767,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); return 0; } @@ -6440,48 +6908,43 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, struct kvm_userspace_memory_region *mem, - int user_alloc) + bool user_alloc) { int npages = memslot->npages; - int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; - - /* Prevent internal slot pages from being moved by fork()/COW. */ - if (memslot->id >= KVM_MEMORY_SLOTS) - map_flags = MAP_SHARED | MAP_ANONYMOUS; - /*To keep backward compatibility with older userspace, - *x86 needs to handle !user_alloc case. + /* + * Only private memory slots need to be mapped here since + * KVM_SET_MEMORY_REGION ioctl is no longer supported. */ - if (!user_alloc) { - if (npages && !old.npages) { - unsigned long userspace_addr; + if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) { + unsigned long userspace_addr; - userspace_addr = vm_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - map_flags, - 0); + /* + * MAP_SHARED to prevent internal slot pages from being moved + * by fork()/COW. + */ + userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, 0); - if (IS_ERR((void *)userspace_addr)) - return PTR_ERR((void *)userspace_addr); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); - memslot->userspace_addr = userspace_addr; - } + memslot->userspace_addr = userspace_addr; } - return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, - int user_alloc) + bool user_alloc) { int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; - if (!user_alloc && !old.user_alloc && old.npages && !npages) { + if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) { int ret; ret = vm_munmap(old.userspace_addr, @@ -6495,11 +6958,15 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (!kvm->arch.n_requested_mmu_pages) nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - spin_lock(&kvm->mmu_lock); if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - kvm_mmu_slot_remove_write_access(kvm, mem->slot); - spin_unlock(&kvm->mmu_lock); + /* + * Write protect all pages for dirty logging. + * Existing largepage mappings are destroyed here and new ones will + * not be created until the end of the logging. + */ + if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) + kvm_mmu_slot_remove_write_access(kvm, mem->slot); /* * If memory slot is created, or moved, we need to clear all * mmio sptes. diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2b5219c12ac..e224f7a671b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, |