summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/emulate.c867
-rw-r--r--arch/x86/kvm/i8254.c6
-rw-r--r--arch/x86/kvm/i8259.c123
-rw-r--r--arch/x86/kvm/irq.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h7
-rw-r--r--arch/x86/kvm/kvm_timer.h2
-rw-r--r--arch/x86/kvm/lapic.c167
-rw-r--r--arch/x86/kvm/lapic.h4
-rw-r--r--arch/x86/kvm/mmu.c8
-rw-r--r--arch/x86/kvm/mmu_audit.c6
-rw-r--r--arch/x86/kvm/paging_tmpl.h24
-rw-r--r--arch/x86/kvm/svm.c93
-rw-r--r--arch/x86/kvm/trace.h118
-rw-r--r--arch/x86/kvm/vmx.c131
-rw-r--r--arch/x86/kvm/x86.c277
15 files changed, 1070 insertions, 767 deletions
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6f08bc940fa..f1e3be18a08 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -29,6 +29,39 @@
#include "tss.h"
/*
+ * Operand types
+ */
+#define OpNone 0ull
+#define OpImplicit 1ull /* No generic decode */
+#define OpReg 2ull /* Register */
+#define OpMem 3ull /* Memory */
+#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */
+#define OpDI 5ull /* ES:DI/EDI/RDI */
+#define OpMem64 6ull /* Memory, 64-bit */
+#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */
+#define OpDX 8ull /* DX register */
+#define OpCL 9ull /* CL register (for shifts) */
+#define OpImmByte 10ull /* 8-bit sign extended immediate */
+#define OpOne 11ull /* Implied 1 */
+#define OpImm 12ull /* Sign extended immediate */
+#define OpMem16 13ull /* Memory operand (16-bit). */
+#define OpMem32 14ull /* Memory operand (32-bit). */
+#define OpImmU 15ull /* Immediate operand, zero extended */
+#define OpSI 16ull /* SI/ESI/RSI */
+#define OpImmFAddr 17ull /* Immediate far address */
+#define OpMemFAddr 18ull /* Far address in memory */
+#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */
+#define OpES 20ull /* ES */
+#define OpCS 21ull /* CS */
+#define OpSS 22ull /* SS */
+#define OpDS 23ull /* DS */
+#define OpFS 24ull /* FS */
+#define OpGS 25ull /* GS */
+
+#define OpBits 5 /* Width of operand field */
+#define OpMask ((1ull << OpBits) - 1)
+
+/*
* Opcode effective-address decode tables.
* Note that we only emulate instructions that have at least one memory
* operand (excluding implicit stack references). We assume that stack
@@ -40,37 +73,35 @@
/* Operand sizes: 8-bit operands or specified/overridden size. */
#define ByteOp (1<<0) /* 8-bit operands. */
/* Destination operand type. */
-#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
-#define DstReg (2<<1) /* Register operand. */
-#define DstMem (3<<1) /* Memory operand. */
-#define DstAcc (4<<1) /* Destination Accumulator */
-#define DstDI (5<<1) /* Destination is in ES:(E)DI */
-#define DstMem64 (6<<1) /* 64bit memory operand */
-#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
-#define DstDX (8<<1) /* Destination is in DX register */
-#define DstMask (0xf<<1)
+#define DstShift 1
+#define ImplicitOps (OpImplicit << DstShift)
+#define DstReg (OpReg << DstShift)
+#define DstMem (OpMem << DstShift)
+#define DstAcc (OpAcc << DstShift)
+#define DstDI (OpDI << DstShift)
+#define DstMem64 (OpMem64 << DstShift)
+#define DstImmUByte (OpImmUByte << DstShift)
+#define DstDX (OpDX << DstShift)
+#define DstMask (OpMask << DstShift)
/* Source operand type. */
-#define SrcNone (0<<5) /* No source operand. */
-#define SrcReg (1<<5) /* Register operand. */
-#define SrcMem (2<<5) /* Memory operand. */
-#define SrcMem16 (3<<5) /* Memory operand (16-bit). */
-#define SrcMem32 (4<<5) /* Memory operand (32-bit). */
-#define SrcImm (5<<5) /* Immediate operand. */
-#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */
-#define SrcOne (7<<5) /* Implied '1' */
-#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */
-#define SrcImmU (9<<5) /* Immediate operand, unsigned */
-#define SrcSI (0xa<<5) /* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */
-#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */
-#define SrcAcc (0xd<<5) /* Source Accumulator */
-#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */
-#define SrcDX (0xf<<5) /* Source is in DX register */
-#define SrcMask (0xf<<5)
-/* Generic ModRM decode. */
-#define ModRM (1<<9)
-/* Destination is only written; never read. */
-#define Mov (1<<10)
+#define SrcShift 6
+#define SrcNone (OpNone << SrcShift)
+#define SrcReg (OpReg << SrcShift)
+#define SrcMem (OpMem << SrcShift)
+#define SrcMem16 (OpMem16 << SrcShift)
+#define SrcMem32 (OpMem32 << SrcShift)
+#define SrcImm (OpImm << SrcShift)
+#define SrcImmByte (OpImmByte << SrcShift)
+#define SrcOne (OpOne << SrcShift)
+#define SrcImmUByte (OpImmUByte << SrcShift)
+#define SrcImmU (OpImmU << SrcShift)
+#define SrcSI (OpSI << SrcShift)
+#define SrcImmFAddr (OpImmFAddr << SrcShift)
+#define SrcMemFAddr (OpMemFAddr << SrcShift)
+#define SrcAcc (OpAcc << SrcShift)
+#define SrcImmU16 (OpImmU16 << SrcShift)
+#define SrcDX (OpDX << SrcShift)
+#define SrcMask (OpMask << SrcShift)
#define BitOp (1<<11)
#define MemAbs (1<<12) /* Memory operand is absolute displacement */
#define String (1<<13) /* String instruction (rep capable) */
@@ -81,6 +112,10 @@
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
#define Sse (1<<18) /* SSE Vector instruction */
+/* Generic ModRM decode. */
+#define ModRM (1<<19)
+/* Destination is only written; never read. */
+#define Mov (1<<20)
/* Misc flags */
#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
#define VendorSpecific (1<<22) /* Vendor specific instruction */
@@ -91,12 +126,19 @@
#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
#define No64 (1<<28)
/* Source 2 operand type */
-#define Src2None (0<<29)
-#define Src2CL (1<<29)
-#define Src2ImmByte (2<<29)
-#define Src2One (3<<29)
-#define Src2Imm (4<<29)
-#define Src2Mask (7<<29)
+#define Src2Shift (29)
+#define Src2None (OpNone << Src2Shift)
+#define Src2CL (OpCL << Src2Shift)
+#define Src2ImmByte (OpImmByte << Src2Shift)
+#define Src2One (OpOne << Src2Shift)
+#define Src2Imm (OpImm << Src2Shift)
+#define Src2ES (OpES << Src2Shift)
+#define Src2CS (OpCS << Src2Shift)
+#define Src2SS (OpSS << Src2Shift)
+#define Src2DS (OpDS << Src2Shift)
+#define Src2FS (OpFS << Src2Shift)
+#define Src2GS (OpGS << Src2Shift)
+#define Src2Mask (OpMask << Src2Shift)
#define X2(x...) x, x
#define X3(x...) X2(x), x
@@ -108,8 +150,8 @@
#define X16(x...) X8(x), X8(x)
struct opcode {
- u32 flags;
- u8 intercept;
+ u64 flags : 56;
+ u64 intercept : 8;
union {
int (*execute)(struct x86_emulate_ctxt *ctxt);
struct opcode *group;
@@ -205,105 +247,100 @@ struct gprefix {
#define ON64(x)
#endif
-#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
+#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \
do { \
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "4", "2") \
_op _suffix " %"_x"3,%1; " \
_POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
+ : "=m" ((ctxt)->eflags), \
+ "+q" (*(_dsttype*)&(ctxt)->dst.val), \
"=&r" (_tmp) \
- : _y ((_src).val), "i" (EFLAGS_MASK)); \
+ : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \
} while (0)
/* Raw emulation: instruction has two explicit operands. */
-#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
+#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
unsigned long _tmp; \
\
- switch ((_dst).bytes) { \
+ switch ((ctxt)->dst.bytes) { \
case 2: \
- ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
+ ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \
break; \
case 4: \
- ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
+ ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \
break; \
case 8: \
- ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
+ ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \
break; \
} \
} while (0)
-#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
+#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
unsigned long _tmp; \
- switch ((_dst).bytes) { \
+ switch ((ctxt)->dst.bytes) { \
case 1: \
- ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
+ ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \
break; \
default: \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
+ __emulate_2op_nobyte(ctxt, _op, \
_wx, _wy, _lx, _ly, _qx, _qy); \
break; \
} \
} while (0)
/* Source operand is byte-sized and may be restricted to just %cl. */
-#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "c", "b", "c", "b", "c", "b", "c")
+#define emulate_2op_SrcB(ctxt, _op) \
+ __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c")
/* Source operand is byte, word, long or quad sized. */
-#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \
- __emulate_2op(_op, _src, _dst, _eflags, \
- "b", "q", "w", "r", _LO32, "r", "", "r")
+#define emulate_2op_SrcV(ctxt, _op) \
+ __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r")
/* Source operand is word, long or quad sized. */
-#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \
- __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
- "w", "r", _LO32, "r", "", "r")
+#define emulate_2op_SrcV_nobyte(ctxt, _op) \
+ __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r")
/* Instruction has three operands and one operand is stored in ECX register */
-#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \
do { \
unsigned long _tmp; \
- _type _clv = (_cl).val; \
- _type _srcv = (_src).val; \
- _type _dstv = (_dst).val; \
+ _type _clv = (ctxt)->src2.val; \
+ _type _srcv = (ctxt)->src.val; \
+ _type _dstv = (ctxt)->dst.val; \
\
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "5", "2") \
_op _suffix " %4,%1 \n" \
_POST_EFLAGS("0", "5", "2") \
- : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \
: "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
); \
\
- (_cl).val = (unsigned long) _clv; \
- (_src).val = (unsigned long) _srcv; \
- (_dst).val = (unsigned long) _dstv; \
+ (ctxt)->src2.val = (unsigned long) _clv; \
+ (ctxt)->src2.val = (unsigned long) _srcv; \
+ (ctxt)->dst.val = (unsigned long) _dstv; \
} while (0)
-#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+#define emulate_2op_cl(ctxt, _op) \
do { \
- switch ((_dst).bytes) { \
+ switch ((ctxt)->dst.bytes) { \
case 2: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "w", unsigned short); \
+ __emulate_2op_cl(ctxt, _op, "w", u16); \
break; \
case 4: \
- __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "l", unsigned int); \
+ __emulate_2op_cl(ctxt, _op, "l", u32); \
break; \
case 8: \
- ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
- "q", unsigned long)); \
+ ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \
break; \
} \
} while (0)
-#define __emulate_1op(_op, _dst, _eflags, _suffix) \
+#define __emulate_1op(ctxt, _op, _suffix) \
do { \
unsigned long _tmp; \
\
@@ -311,39 +348,27 @@ struct gprefix {
_PRE_EFLAGS("0", "3", "2") \
_op _suffix " %1; " \
_POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "+m" ((_dst).val), \
+ : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \
"=&r" (_tmp) \
: "i" (EFLAGS_MASK)); \
} while (0)
/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
+#define emulate_1op(ctxt, _op) \
do { \
- switch ((_dst).bytes) { \
- case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
- case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
- case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
- case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+ switch ((ctxt)->dst.bytes) { \
+ case 1: __emulate_1op(ctxt, _op, "b"); break; \
+ case 2: __emulate_1op(ctxt, _op, "w"); break; \
+ case 4: __emulate_1op(ctxt, _op, "l"); break; \
+ case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \
} \
} while (0)
-#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
- do { \
- unsigned long _tmp; \
- \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "1") \
- _op _suffix " %5; " \
- _POST_EFLAGS("0", "4", "1") \
- : "=m" (_eflags), "=&r" (_tmp), \
- "+a" (_rax), "+d" (_rdx) \
- : "i" (EFLAGS_MASK), "m" ((_src).val), \
- "a" (_rax), "d" (_rdx)); \
- } while (0)
-
-#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
+#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \
do { \
unsigned long _tmp; \
+ ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \
+ ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \
\
__asm__ __volatile__ ( \
_PRE_EFLAGS("0", "5", "1") \
@@ -356,53 +381,27 @@ struct gprefix {
"jmp 2b \n\t" \
".popsection \n\t" \
_ASM_EXTABLE(1b, 3b) \
- : "=m" (_eflags), "=&r" (_tmp), \
- "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
- : "i" (EFLAGS_MASK), "m" ((_src).val), \
- "a" (_rax), "d" (_rdx)); \
+ : "=m" ((ctxt)->eflags), "=&r" (_tmp), \
+ "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \
+ : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \
+ "a" (*rax), "d" (*rdx)); \
} while (0)
/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
-#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
+#define emulate_1op_rax_rdx(ctxt, _op, _ex) \
do { \
- switch((_src).bytes) { \
+ switch((ctxt)->src.bytes) { \
case 1: \
- __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
- _eflags, "b"); \
+ __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \
break; \
case 2: \
- __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
- _eflags, "w"); \
+ __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \
break; \
case 4: \
- __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
- _eflags, "l"); \
- break; \
- case 8: \
- ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
- _eflags, "q")); \
- break; \
- } \
- } while (0)
-
-#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
- do { \
- switch((_src).bytes) { \
- case 1: \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "b", _ex); \
- break; \
- case 2: \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "w", _ex); \
- break; \
- case 4: \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "l", _ex); \
+ __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \
break; \
case 8: ON64( \
- __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
- _eflags, "q", _ex)); \
+ __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \
break; \
} \
} while (0)
@@ -651,41 +650,50 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
}
-static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt,
- unsigned long eip, u8 *dest)
+/*
+ * Fetch the next byte of the instruction being emulated which is pointed to
+ * by ctxt->_eip, then increment ctxt->_eip.
+ *
+ * Also prefetch the remaining bytes of the instruction without crossing page
+ * boundary if they are not in fetch_cache yet.
+ */
+static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest)
{
struct fetch_cache *fc = &ctxt->fetch;
int rc;
int size, cur_size;
- if (eip == fc->end) {
+ if (ctxt->_eip == fc->end) {
unsigned long linear;
- struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = ctxt->_eip };
cur_size = fc->end - fc->start;
- size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
+ size = min(15UL - cur_size,
+ PAGE_SIZE - offset_in_page(ctxt->_eip));
rc = __linearize(ctxt, addr, size, false, true, &linear);
- if (rc != X86EMUL_CONTINUE)
+ if (unlikely(rc != X86EMUL_CONTINUE))
return rc;
rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size,
size, &ctxt->exception);
- if (rc != X86EMUL_CONTINUE)
+ if (unlikely(rc != X86EMUL_CONTINUE))
return rc;
fc->end += size;
}
- *dest = fc->data[eip - fc->start];
+ *dest = fc->data[ctxt->_eip - fc->start];
+ ctxt->_eip++;
return X86EMUL_CONTINUE;
}
static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
- unsigned long eip, void *dest, unsigned size)
+ void *dest, unsigned size)
{
int rc;
/* x86 instructions are limited to 15 bytes. */
- if (eip + size - ctxt->eip > 15)
+ if (unlikely(ctxt->_eip + size - ctxt->eip > 15))
return X86EMUL_UNHANDLEABLE;
while (size--) {
- rc = do_insn_fetch_byte(ctxt, eip++, dest++);
+ rc = do_insn_fetch_byte(ctxt, dest++);
if (rc != X86EMUL_CONTINUE)
return rc;
}
@@ -693,20 +701,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
}
/* Fetch next part of the instruction being emulated. */
-#define insn_fetch(_type, _size, _eip) \
+#define insn_fetch(_type, _ctxt) \
({ unsigned long _x; \
- rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \
+ rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
- (_eip) += (_size); \
(_type)_x; \
})
-#define insn_fetch_arr(_arr, _size, _eip) \
-({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \
+#define insn_fetch_arr(_arr, _size, _ctxt) \
+({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \
if (rc != X86EMUL_CONTINUE) \
goto done; \
- (_eip) += (_size); \
})
/*
@@ -894,7 +900,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */
}
- ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+ ctxt->modrm = insn_fetch(u8, ctxt);
ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6;
ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3;
ctxt->modrm_rm |= (ctxt->modrm & 0x07);
@@ -928,13 +934,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
switch (ctxt->modrm_mod) {
case 0:
if (ctxt->modrm_rm == 6)
- modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
+ modrm_ea += insn_fetch(u16, ctxt);
break;
case 1:
- modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
+ modrm_ea += insn_fetch(s8, ctxt);
break;
case 2:
- modrm_ea += insn_fetch(u16, 2, ctxt->_eip);
+ modrm_ea += insn_fetch(u16, ctxt);
break;
}
switch (ctxt->modrm_rm) {
@@ -971,13 +977,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
} else {
/* 32/64-bit ModR/M decode. */
if ((ctxt->modrm_rm & 7) == 4) {
- sib = insn_fetch(u8, 1, ctxt->_eip);
+ sib = insn_fetch(u8, ctxt);
index_reg |= (sib >> 3) & 7;
base_reg |= sib & 7;
scale = sib >> 6;
if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0)
- modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
+ modrm_ea += insn_fetch(s32, ctxt);
else
modrm_ea += ctxt->regs[base_reg];
if (index_reg != 4)
@@ -990,13 +996,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
switch (ctxt->modrm_mod) {
case 0:
if (ctxt->modrm_rm == 5)
- modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
+ modrm_ea += insn_fetch(s32, ctxt);
break;
case 1:
- modrm_ea += insn_fetch(s8, 1, ctxt->_eip);
+ modrm_ea += insn_fetch(s8, ctxt);
break;
case 2:
- modrm_ea += insn_fetch(s32, 4, ctxt->_eip);
+ modrm_ea += insn_fetch(s32, ctxt);
break;
}
}
@@ -1013,13 +1019,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
op->type = OP_MEM;
switch (ctxt->ad_bytes) {
case 2:
- op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip);
+ op->addr.mem.ea = insn_fetch(u16, ctxt);
break;
case 4:
- op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip);
+ op->addr.mem.ea = insn_fetch(u32, ctxt);
break;
case 8:
- op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip);
+ op->addr.mem.ea = insn_fetch(u64, ctxt);
break;
}
done:
@@ -1452,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt)
return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes);
}
-static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
{
+ int seg = ctxt->src2.val;
+
ctxt->src.val = get_segment_selector(ctxt, seg);
return em_push(ctxt);
}
-static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
{
+ int seg = ctxt->src2.val;
unsigned long selector;
int rc;
@@ -1674,64 +1683,74 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt)
{
switch (ctxt->modrm_reg) {
case 0: /* rol */
- emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "rol");
break;
case 1: /* ror */
- emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "ror");
break;
case 2: /* rcl */
- emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "rcl");
break;
case 3: /* rcr */
- emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "rcr");
break;
case 4: /* sal/shl */
case 6: /* sal/shl */
- emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "sal");
break;
case 5: /* shr */
- emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "shr");
break;
case 7: /* sar */
- emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcB(ctxt, "sar");
break;
}
return X86EMUL_CONTINUE;
}
-static int em_grp3(struct x86_emulate_ctxt *ctxt)
+static int em_not(struct x86_emulate_ctxt *ctxt)
+{
+ ctxt->dst.val = ~ctxt->dst.val;
+ return X86EMUL_CONTINUE;
+}
+
+static int em_neg(struct x86_emulate_ctxt *ctxt)
+{
+ emulate_1op(ctxt, "neg");
+ return X86EMUL_CONTINUE;
+}
+
+static int em_mul_ex(struct x86_emulate_ctxt *ctxt)
+{
+ u8 ex = 0;
+
+ emulate_1op_rax_rdx(ctxt, "mul", ex);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_imul_ex(struct x86_emulate_ctxt *ctxt)
+{
+ u8 ex = 0;
+
+ emulate_1op_rax_rdx(ctxt, "imul", ex);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_div_ex(struct x86_emulate_ctxt *ctxt)
{
- unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX];
- unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX];
u8 de = 0;
- switch (ctxt->modrm_reg) {
- case 0 ... 1: /* test */
- emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
- break;
- case 2: /* not */
- ctxt->dst.val = ~ctxt->dst.val;
- break;
- case 3: /* neg */
- emulate_1op("neg", ctxt->dst, ctxt->eflags);
- break;
- case 4: /* mul */
- emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags);
- break;
- case 5: /* imul */
- emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags);
- break;
- case 6: /* div */
- emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx,
- ctxt->eflags, de);
- break;
- case 7: /* idiv */
- emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx,
- ctxt->eflags, de);
- break;
- default:
- return X86EMUL_UNHANDLEABLE;
- }
+ emulate_1op_rax_rdx(ctxt, "div", de);
+ if (de)
+ return emulate_de(ctxt);
+ return X86EMUL_CONTINUE;
+}
+
+static int em_idiv_ex(struct x86_emulate_ctxt *ctxt)
+{
+ u8 de = 0;
+
+ emulate_1op_rax_rdx(ctxt, "idiv", de);
if (de)
return emulate_de(ctxt);
return X86EMUL_CONTINUE;
@@ -1743,10 +1762,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt)
switch (ctxt->modrm_reg) {
case 0: /* inc */
- emulate_1op("inc", ctxt->dst, ctxt->eflags);
+ emulate_1op(ctxt, "inc");
break;
case 1: /* dec */
- emulate_1op("dec", ctxt->dst, ctxt->eflags);
+ emulate_1op(ctxt, "dec");
break;
case 2: /* call near abs */ {
long int old_eip;
@@ -1812,8 +1831,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg)
+static int em_lseg(struct x86_emulate_ctxt *ctxt)
{
+ int seg = ctxt->src2.val;
unsigned short sel;
int rc;
@@ -2452,7 +2472,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt)
ctxt->src.type = OP_IMM;
ctxt->src.val = 0;
ctxt->src.bytes = 1;
- emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "or");
ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
if (cf)
ctxt->eflags |= X86_EFLAGS_CF;
@@ -2502,49 +2522,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
static int em_add(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "add");
return X86EMUL_CONTINUE;
}
static int em_or(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "or");
return X86EMUL_CONTINUE;
}
static int em_adc(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "adc");
return X86EMUL_CONTINUE;
}
static int em_sbb(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "sbb");
return X86EMUL_CONTINUE;
}
static int em_and(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "and");
return X86EMUL_CONTINUE;
}
static int em_sub(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "sub");
return X86EMUL_CONTINUE;
}
static int em_xor(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "xor");
return X86EMUL_CONTINUE;
}
static int em_cmp(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "cmp");
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
return X86EMUL_CONTINUE;
@@ -2552,7 +2572,9 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt)
static int em_test(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "test");
+ /* Disable writeback. */
+ ctxt->dst.type = OP_NONE;
return X86EMUL_CONTINUE;
}
@@ -2570,7 +2592,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt)
static int em_imul(struct x86_emulate_ctxt *ctxt)
{
- emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte(ctxt, "imul");
return X86EMUL_CONTINUE;
}
@@ -3025,9 +3047,14 @@ static struct opcode group1A[] = {
};
static struct opcode group3[] = {
- D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
- D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
- X4(D(SrcMem | ModRM)),
+ I(DstMem | SrcImm | ModRM, em_test),
+ I(DstMem | SrcImm | ModRM, em_test),
+ I(DstMem | SrcNone | ModRM | Lock, em_not),
+ I(DstMem | SrcNone | ModRM | Lock, em_neg),
+ I(SrcMem | ModRM, em_mul_ex),
+ I(SrcMem | ModRM, em_imul_ex),
+ I(SrcMem | ModRM, em_div_ex),
+ I(SrcMem | ModRM, em_idiv_ex),
};
static struct opcode group4[] = {
@@ -3090,16 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = {
static struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
I6ALU(Lock, em_add),
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg),
/* 0x08 - 0x0F */
I6ALU(Lock, em_or),
- D(ImplicitOps | Stack | No64), N,
+ I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg),
+ N,
/* 0x10 - 0x17 */
I6ALU(Lock, em_adc),
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg),
/* 0x18 - 0x1F */
I6ALU(Lock, em_sbb),
- D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg),
+ I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg),
/* 0x20 - 0x27 */
I6ALU(Lock, em_and), N, N,
/* 0x28 - 0x2F */
@@ -3167,7 +3198,8 @@ static struct opcode opcode_table[256] = {
D2bv(DstMem | SrcImmByte | ModRM),
I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
I(ImplicitOps | Stack, em_ret),
- D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
/* 0xC8 - 0xCF */
N, N, N, I(ImplicitOps | Stack, em_ret_far),
@@ -3242,20 +3274,22 @@ static struct opcode twobyte_table[256] = {
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
- D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg),
DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM), N, N,
/* 0xA8 - 0xAF */
- D(ImplicitOps | Stack), D(ImplicitOps | Stack),
+ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
D(DstMem | SrcReg | Src2ImmByte | ModRM),
D(DstMem | SrcReg | Src2CL | ModRM),
D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
/* 0xB0 - 0xB7 */
D2bv(DstMem | SrcReg | ModRM | Lock),
- D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
- D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
+ I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg),
+ D(DstMem | SrcReg | ModRM | BitOp | Lock),
+ I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg),
+ I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg),
D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xB8 - 0xBF */
N, N,
@@ -3309,13 +3343,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
/* NB. Immediates are sign-extended as necessary. */
switch (op->bytes) {
case 1:
- op->val = insn_fetch(s8, 1, ctxt->_eip);
+ op->val = insn_fetch(s8, ctxt);
break;
case 2:
- op->val = insn_fetch(s16, 2, ctxt->_eip);
+ op->val = insn_fetch(s16, ctxt);
break;
case 4:
- op->val = insn_fetch(s32, 4, ctxt->_eip);
+ op->val = insn_fetch(s32, ctxt);
break;
}
if (!sign_extension) {
@@ -3335,6 +3369,125 @@ done:
return rc;
}
+static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
+ unsigned d)
+{
+ int rc = X86EMUL_CONTINUE;
+
+ switch (d) {
+ case OpReg:
+ decode_register_operand(ctxt, op,
+ op == &ctxt->dst &&
+ ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
+ break;
+ case OpImmUByte:
+ rc = decode_imm(ctxt, op, 1, false);
+ break;
+ case OpMem:
+ ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ mem_common:
+ *op = ctxt->memop;
+ ctxt->memopp = op;
+ if ((ctxt->d & BitOp) && op == &ctxt->dst)
+ fetch_bit_operand(ctxt);
+ op->orig_val = op->val;
+ break;
+ case OpMem64:
+ ctxt->memop.bytes = 8;
+ goto mem_common;
+ case OpAcc:
+ op->type = OP_REG;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.reg = &ctxt->regs[VCPU_REGS_RAX];
+ fetch_register_operand(op);
+ op->orig_val = op->val;
+ break;
+ case OpDI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
+ op->addr.mem.seg = VCPU_SREG_ES;
+ op->val = 0;
+ break;
+ case OpDX:
+ op->type = OP_REG;
+ op->bytes = 2;
+ op->addr.reg = &ctxt->regs[VCPU_REGS_RDX];
+ fetch_register_operand(op);
+ break;
+ case OpCL:
+ op->bytes = 1;
+ op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff;
+ break;
+ case OpImmByte:
+ rc = decode_imm(ctxt, op, 1, true);
+ break;
+ case OpOne:
+ op->bytes = 1;
+ op->val = 1;
+ break;
+ case OpImm:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), true);
+ break;
+ case OpMem16:
+ ctxt->memop.bytes = 2;
+ goto mem_common;
+ case OpMem32:
+ ctxt->memop.bytes = 4;
+ goto mem_common;
+ case OpImmU16:
+ rc = decode_imm(ctxt, op, 2, false);
+ break;
+ case OpImmU:
+ rc = decode_imm(ctxt, op, imm_size(ctxt), false);
+ break;
+ case OpSI:
+ op->type = OP_MEM;
+ op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
+ op->addr.mem.ea =
+ register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
+ op->addr.mem.seg = seg_override(ctxt);
+ op->val = 0;
+ break;
+ case OpImmFAddr:
+ op->type = OP_IMM;
+ op->addr.mem.ea = ctxt->_eip;
+ op->bytes = ctxt->op_bytes + 2;
+ insn_fetch_arr(op->valptr, op->bytes, ctxt);
+ break;
+ case OpMemFAddr:
+ ctxt->memop.bytes = ctxt->op_bytes + 2;
+ goto mem_common;
+ case OpES:
+ op->val = VCPU_SREG_ES;
+ break;
+ case OpCS:
+ op->val = VCPU_SREG_CS;
+ break;
+ case OpSS:
+ op->val = VCPU_SREG_SS;
+ break;
+ case OpDS:
+ op->val = VCPU_SREG_DS;
+ break;
+ case OpFS:
+ op->val = VCPU_SREG_FS;
+ break;
+ case OpGS:
+ op->val = VCPU_SREG_GS;
+ break;
+ case OpImplicit:
+ /* Special instructions do their own operand decoding. */
+ default:
+ op->type = OP_NONE; /* Disable writeback. */
+ break;
+ }
+
+done:
+ return rc;
+}
+
int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
int rc = X86EMUL_CONTINUE;
@@ -3342,8 +3495,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
bool op_prefix = false;
struct opcode opcode;
- struct operand memop = { .type = OP_NONE }, *memopp = NULL;
+ ctxt->memop.type = OP_NONE;
+ ctxt->memopp = NULL;
ctxt->_eip = ctxt->eip;
ctxt->fetch.start = ctxt->_eip;
ctxt->fetch.end = ctxt->fetch.start + insn_len;
@@ -3366,7 +3520,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
break;
#endif
default:
- return -1;
+ return EMULATION_FAILED;
}
ctxt->op_bytes = def_op_bytes;
@@ -3374,7 +3528,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
/* Legacy prefixes. */
for (;;) {
- switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) {
+ switch (ctxt->b = insn_fetch(u8, ctxt)) {
case 0x66: /* operand-size override */
op_prefix = true;
/* switch between 2/4 bytes */
@@ -3430,7 +3584,7 @@ done_prefixes:
/* Two-byte opcode? */
if (ctxt->b == 0x0f) {
ctxt->twobyte = 1;
- ctxt->b = insn_fetch(u8, 1, ctxt->_eip);
+ ctxt->b = insn_fetch(u8, ctxt);
opcode = twobyte_table[ctxt->b];
}
ctxt->d = opcode.flags;
@@ -3438,13 +3592,13 @@ done_prefixes:
while (ctxt->d & GroupMask) {
switch (ctxt->d & GroupMask) {
case Group:
- ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+ ctxt->modrm = insn_fetch(u8, ctxt);
--ctxt->_eip;
goffset = (ctxt->modrm >> 3) & 7;
opcode = opcode.u.group[goffset];
break;
case GroupDual:
- ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip);
+ ctxt->modrm = insn_fetch(u8, ctxt);
--ctxt->_eip;
goffset = (ctxt->modrm >> 3) & 7;
if ((ctxt->modrm >> 6) == 3)
@@ -3458,7 +3612,7 @@ done_prefixes:
break;
case Prefix:
if (ctxt->rep_prefix && op_prefix)
- return X86EMUL_UNHANDLEABLE;
+ return EMULATION_FAILED;
simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix;
switch (simd_prefix) {
case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
@@ -3468,10 +3622,10 @@ done_prefixes:
}
break;
default:
- return X86EMUL_UNHANDLEABLE;
+ return EMULATION_FAILED;
}
- ctxt->d &= ~GroupMask;
+ ctxt->d &= ~(u64)GroupMask;
ctxt->d |= opcode.flags;
}
@@ -3481,10 +3635,10 @@ done_prefixes:
/* Unrecognised? */
if (ctxt->d == 0 || (ctxt->d & Undefined))
- return -1;
+ return EMULATION_FAILED;
if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
- return -1;
+ return EMULATION_FAILED;
if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
ctxt->op_bytes = 8;
@@ -3501,96 +3655,27 @@ done_prefixes:
/* ModRM and SIB bytes. */
if (ctxt->d & ModRM) {
- rc = decode_modrm(ctxt, &memop);
+ rc = decode_modrm(ctxt, &ctxt->memop);
if (!ctxt->has_seg_override)
set_seg_override(ctxt, ctxt->modrm_seg);
} else if (ctxt->d & MemAbs)
- rc = decode_abs(ctxt, &memop);
+ rc = decode_abs(ctxt, &ctxt->memop);
if (rc != X86EMUL_CONTINUE)
goto done;
if (!ctxt->has_seg_override)
set_seg_override(ctxt, VCPU_SREG_DS);
- memop.addr.mem.seg = seg_override(ctxt);
+ ctxt->memop.addr.mem.seg = seg_override(ctxt);
- if (memop.type == OP_MEM && ctxt->ad_bytes != 8)
- memop.addr.mem.ea = (u32)memop.addr.mem.ea;
+ if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8)
+ ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea;
/*
* Decode and fetch the source operand: register, memory
* or immediate.
*/
- switch (ctxt->d & SrcMask) {
- case SrcNone:
- break;
- case SrcReg:
- decode_register_operand(ctxt, &ctxt->src, 0);
- break;
- case SrcMem16:
- memop.bytes = 2;
- goto srcmem_common;
- case SrcMem32:
- memop.bytes = 4;
- goto srcmem_common;
- case SrcMem:
- memop.bytes = (ctxt->d & ByteOp) ? 1 :
- ctxt->op_bytes;
- srcmem_common:
- ctxt->src = memop;
- memopp = &ctxt->src;
- break;
- case SrcImmU16:
- rc = decode_imm(ctxt, &ctxt->src, 2, false);
- break;
- case SrcImm:
- rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true);
- break;
- case SrcImmU:
- rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false);
- break;
- case SrcImmByte:
- rc = decode_imm(ctxt, &ctxt->src, 1, true);
- break;
- case SrcImmUByte:
- rc = decode_imm(ctxt, &ctxt->src, 1, false);
- break;
- case SrcAcc:
- ctxt->src.type = OP_REG;
- ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
- fetch_register_operand(&ctxt->src);
- break;
- case SrcOne:
- ctxt->src.bytes = 1;
- ctxt->src.val = 1;
- break;
- case SrcSI:
- ctxt->src.type = OP_MEM;
- ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- ctxt->src.addr.mem.ea =
- register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]);
- ctxt->src.addr.mem.seg = seg_override(ctxt);
- ctxt->src.val = 0;
- break;
- case SrcImmFAddr:
- ctxt->src.type = OP_IMM;
- ctxt->src.addr.mem.ea = ctxt->_eip;
- ctxt->src.bytes = ctxt->op_bytes + 2;
- insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip);
- break;
- case SrcMemFAddr:
- memop.bytes = ctxt->op_bytes + 2;
- goto srcmem_common;
- break;
- case SrcDX:
- ctxt->src.type = OP_REG;
- ctxt->src.bytes = 2;
- ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
- fetch_register_operand(&ctxt->src);
- break;
- }
-
+ rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3598,85 +3683,18 @@ done_prefixes:
* Decode and fetch the second source operand: register, memory
* or immediate.
*/
- switch (ctxt->d & Src2Mask) {
- case Src2None:
- break;
- case Src2CL:
- ctxt->src2.bytes = 1;
- ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0x8;
- break;
- case Src2ImmByte:
- rc = decode_imm(ctxt, &ctxt->src2, 1, true);
- break;
- case Src2One:
- ctxt->src2.bytes = 1;
- ctxt->src2.val = 1;
- break;
- case Src2Imm:
- rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true);
- break;
- }
-
+ rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask);
if (rc != X86EMUL_CONTINUE)
goto done;
/* Decode and fetch the destination operand: register or memory. */
- switch (ctxt->d & DstMask) {
- case DstReg:
- decode_register_operand(ctxt, &ctxt->dst,
- ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7));
- break;
- case DstImmUByte:
- ctxt->dst.type = OP_IMM;
- ctxt->dst.addr.mem.ea = ctxt->_eip;
- ctxt->dst.bytes = 1;
- ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip);
- break;
- case DstMem:
- case DstMem64:
- ctxt->dst = memop;
- memopp = &ctxt->dst;
- if ((ctxt->d & DstMask) == DstMem64)
- ctxt->dst.bytes = 8;
- else
- ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- if (ctxt->d & BitOp)
- fetch_bit_operand(ctxt);
- ctxt->dst.orig_val = ctxt->dst.val;
- break;
- case DstAcc:
- ctxt->dst.type = OP_REG;
- ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX];
- fetch_register_operand(&ctxt->dst);
- ctxt->dst.orig_val = ctxt->dst.val;
- break;
- case DstDI:
- ctxt->dst.type = OP_MEM;
- ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
- ctxt->dst.addr.mem.ea =
- register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]);
- ctxt->dst.addr.mem.seg = VCPU_SREG_ES;
- ctxt->dst.val = 0;
- break;
- case DstDX:
- ctxt->dst.type = OP_REG;
- ctxt->dst.bytes = 2;
- ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX];
- fetch_register_operand(&ctxt->dst);
- break;
- case ImplicitOps:
- /* Special instructions do their own operand decoding. */
- default:
- ctxt->dst.type = OP_NONE; /* Disable writeback. */
- break;
- }
+ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
done:
- if (memopp && memopp->type == OP_MEM && ctxt->rip_relative)
- memopp->addr.mem.ea += ctxt->_eip;
+ if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative)
+ ctxt->memopp->addr.mem.ea += ctxt->_eip;
- return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
+ return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
}
static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -3825,32 +3843,11 @@ special_insn:
goto twobyte_insn;
switch (ctxt->b) {
- case 0x06: /* push es */
- rc = emulate_push_sreg(ctxt, VCPU_SREG_ES);
- break;
- case 0x07: /* pop es */
- rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES);
- break;
- case 0x0e: /* push cs */
- rc = emulate_push_sreg(ctxt, VCPU_SREG_CS);
- break;
- case 0x16: /* push ss */
- rc = emulate_push_sreg(ctxt, VCPU_SREG_SS);
- break;
- case 0x17: /* pop ss */
- rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS);
- break;
- case 0x1e: /* push ds */
- rc = emulate_push_sreg(ctxt, VCPU_SREG_DS);
- break;
- case 0x1f: /* pop ds */
- rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS);
- break;
case 0x40 ... 0x47: /* inc r16/r32 */
- emulate_1op("inc", ctxt->dst, ctxt->eflags);
+ emulate_1op(ctxt, "inc");
break;
case 0x48 ... 0x4f: /* dec r16/r32 */
- emulate_1op("dec", ctxt->dst, ctxt->eflags);
+ emulate_1op(ctxt, "dec");
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -3891,12 +3888,6 @@ special_insn:
case 0xc0 ... 0xc1:
rc = em_grp2(ctxt);
break;
- case 0xc4: /* les */
- rc = emulate_load_segment(ctxt, VCPU_SREG_ES);
- break;
- case 0xc5: /* lds */
- rc = emulate_load_segment(ctxt, VCPU_SREG_DS);
- break;
case 0xcc: /* int3 */
rc = emulate_int(ctxt, 3);
break;
@@ -3953,9 +3944,6 @@ special_insn:
/* complement carry flag from eflags reg */
ctxt->eflags ^= EFLG_CF;
break;
- case 0xf6 ... 0xf7: /* Grp3 */
- rc = em_grp3(ctxt);
- break;
case 0xf8: /* clc */
ctxt->eflags &= ~EFLG_CF;
break;
@@ -4103,36 +4091,24 @@ twobyte_insn:
case 0x90 ... 0x9f: /* setcc r/m8 */
ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags);
break;
- case 0xa0: /* push fs */
- rc = emulate_push_sreg(ctxt, VCPU_SREG_FS);
- break;
- case 0xa1: /* pop fs */
- rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS);
- break;
case 0xa3:
bt: /* bt */
ctxt->dst.type = OP_NONE;
/* only subword offset */
ctxt->src.val &= (ctxt->dst.bytes << 3) - 1;
- emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte(ctxt, "bt");
break;
case 0xa4: /* shld imm8, r, r/m */
case 0xa5: /* shld cl, r, r/m */
- emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
- break;
- case 0xa8: /* push gs */
- rc = emulate_push_sreg(ctxt, VCPU_SREG_GS);
- break;
- case 0xa9: /* pop gs */
- rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS);
+ emulate_2op_cl(ctxt, "shld");
break;
case 0xab:
bts: /* bts */
- emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte(ctxt, "bts");
break;
case 0xac: /* shrd imm8, r, r/m */
case 0xad: /* shrd cl, r, r/m */
- emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_cl(ctxt, "shrd");
break;
case 0xae: /* clflush */
break;
@@ -4143,7 +4119,7 @@ twobyte_insn:
*/
ctxt->src.orig_val = ctxt->src.val;
ctxt->src.val = ctxt->regs[VCPU_REGS_RAX];
- emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "cmp");
if (ctxt->eflags & EFLG_ZF) {
/* Success: write back to memory. */
ctxt->dst.val = ctxt->src.orig_val;
@@ -4153,18 +4129,9 @@ twobyte_insn:
ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX];
}
break;
- case 0xb2: /* lss */
- rc = emulate_load_segment(ctxt, VCPU_SREG_SS);
- break;
case 0xb3:
btr: /* btr */
- emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags);
- break;
- case 0xb4: /* lfs */
- rc = emulate_load_segment(ctxt, VCPU_SREG_FS);
- break;
- case 0xb5: /* lgs */
- rc = emulate_load_segment(ctxt, VCPU_SREG_GS);
+ emulate_2op_SrcV_nobyte(ctxt, "btr");
break;
case 0xb6 ... 0xb7: /* movzx */
ctxt->dst.bytes = ctxt->op_bytes;
@@ -4185,7 +4152,7 @@ twobyte_insn:
break;
case 0xbb:
btc: /* btc */
- emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV_nobyte(ctxt, "btc");
break;
case 0xbc: { /* bsf */
u8 zf;
@@ -4217,7 +4184,7 @@ twobyte_insn:
(s16) ctxt->src.val;
break;
case 0xc0 ... 0xc1: /* xadd */
- emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags);
+ emulate_2op_SrcV(ctxt, "add");
/* Write back the register source. */
ctxt->src.val = ctxt->dst.orig_val;
write_register_operand(&ctxt->src);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad7238505..76e3f1cd036 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
+ KVM_PIT_MEM_LENGTH, &pit->dev);
if (ret < 0)
goto fail;
if (flags & KVM_PIT_SPEAKER_DUMMY) {
kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
- &pit->speaker_dev);
+ KVM_SPEAKER_BASE_ADDRESS, 4,
+ &pit->speaker_dev);
if (ret < 0)
goto fail_unregister;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 19fe855e795..cac4746d7ff 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -34,6 +34,9 @@
#include <linux/kvm_host.h>
#include "trace.h"
+#define pr_pic_unimpl(fmt, ...) \
+ pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__)
+
static void pic_irq_request(struct kvm *kvm, int level);
static void pic_lock(struct kvm_pic *s)
@@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
}
s->init_state = 1;
if (val & 0x02)
- printk(KERN_ERR "single mode not supported");
+ pr_pic_unimpl("single mode not supported");
if (val & 0x08)
- printk(KERN_ERR
- "level sensitive irq not supported");
+ pr_pic_unimpl(
+ "level sensitive irq not supported");
} else if (val & 0x08) {
if (val & 0x04)
s->poll = 1;
@@ -459,22 +462,15 @@ static int picdev_in_range(gpa_t addr)
}
}
-static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
-{
- return container_of(dev, struct kvm_pic, dev);
-}
-
-static int picdev_write(struct kvm_io_device *this,
+static int picdev_write(struct kvm_pic *s,
gpa_t addr, int len, const void *val)
{
- struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
if (!picdev_in_range(addr))
return -EOPNOTSUPP;
if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte write\n");
+ pr_pic_unimpl("non byte write\n");
return 0;
}
pic_lock(s);
@@ -494,17 +490,15 @@ static int picdev_write(struct kvm_io_device *this,
return 0;
}
-static int picdev_read(struct kvm_io_device *this,
+static int picdev_read(struct kvm_pic *s,
gpa_t addr, int len, void *val)
{
- struct kvm_pic *s = to_pic(this);
unsigned char data = 0;
if (!picdev_in_range(addr))
return -EOPNOTSUPP;
if (len != 1) {
- if (printk_ratelimit())
- printk(KERN_ERR "PIC: non byte read\n");
+ pr_pic_unimpl("non byte read\n");
return 0;
}
pic_lock(s);
@@ -525,6 +519,48 @@ static int picdev_read(struct kvm_io_device *this,
return 0;
}
+static int picdev_master_write(struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_master_read(struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_master),
+ addr, len, val);
+}
+
+static int picdev_slave_write(struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_slave_read(struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
+ addr, len, val);
+}
+
+static int picdev_eclr_write(struct kvm_io_device *dev,
+ gpa_t addr, int len, const void *val)
+{
+ return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
+ addr, len, val);
+}
+
+static int picdev_eclr_read(struct kvm_io_device *dev,
+ gpa_t addr, int len, void *val)
+{
+ return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
+ addr, len, val);
+}
+
/*
* callback when PIC0 irq status changed
*/
@@ -537,9 +573,19 @@ static void pic_irq_request(struct kvm *kvm, int level)
s->output = level;
}
-static const struct kvm_io_device_ops picdev_ops = {
- .read = picdev_read,
- .write = picdev_write,
+static const struct kvm_io_device_ops picdev_master_ops = {
+ .read = picdev_master_read,
+ .write = picdev_master_write,
+};
+
+static const struct kvm_io_device_ops picdev_slave_ops = {
+ .read = picdev_slave_read,
+ .write = picdev_slave_write,
+};
+
+static const struct kvm_io_device_ops picdev_eclr_ops = {
+ .read = picdev_eclr_read,
+ .write = picdev_eclr_write,
};
struct kvm_pic *kvm_create_pic(struct kvm *kvm)
@@ -560,16 +606,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
/*
* Initialize PIO device
*/
- kvm_iodevice_init(&s->dev, &picdev_ops);
+ kvm_iodevice_init(&s->dev_master, &picdev_master_ops);
+ kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops);
+ kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops);
mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2,
+ &s->dev_master);
+ if (ret < 0)
+ goto fail_unlock;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave);
+ if (ret < 0)
+ goto fail_unreg_2;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr);
+ if (ret < 0)
+ goto fail_unreg_1;
+
mutex_unlock(&kvm->slots_lock);
- if (ret < 0) {
- kfree(s);
- return NULL;
- }
return s;
+
+fail_unreg_1:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
+
+fail_unreg_2:
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master);
+
+fail_unlock:
+ mutex_unlock(&kvm->slots_lock);
+
+ kfree(s);
+
+ return NULL;
}
void kvm_destroy_pic(struct kvm *kvm)
@@ -577,7 +646,9 @@ void kvm_destroy_pic(struct kvm *kvm)
struct kvm_pic *vpic = kvm->arch.vpic;
if (vpic) {
- kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr);
kvm->arch.vpic = NULL;
kfree(vpic);
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 53e2d084bff..2086f2bfba3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -66,7 +66,9 @@ struct kvm_pic {
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
int output; /* intr from master PIC */
- struct kvm_io_device dev;
+ struct kvm_io_device dev_master;
+ struct kvm_io_device dev_slave;
+ struct kvm_io_device dev_eclr;
void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[16];
};
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 3377d53fcd3..544076c4f44 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
return vcpu->arch.walk_mmu->pdptrs[index];
}
-static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
-{
- load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
-
- return mmu->pdptrs[index];
-}
-
static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
{
ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
index 64bc6ea78d9..497dbaa366d 100644
--- a/arch/x86/kvm/kvm_timer.h
+++ b/arch/x86/kvm/kvm_timer.h
@@ -2,6 +2,8 @@
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
+ u32 timer_mode_mask;
+ u64 tscdeadline;
atomic_t pending; /* accumulated triggered timers */
bool reinject;
struct kvm_timer_ops *t_ops;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57dcbd4308f..54abb40199d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,9 @@
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
+static unsigned int min_timer_period_us = 500;
+module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+
static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off)
{
return *((u32 *) (apic->regs + reg_off));
@@ -135,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK;
}
+static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
+{
+ return ((apic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+}
+
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
+ return ((apic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+}
+
+static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
+{
+ return ((apic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask) ==
+ APIC_LVT_TIMER_TSCDEADLINE);
}
static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -166,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
}
static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
- LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
+ LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
LVT_MASK | APIC_MODE_MASK, /* LVTPC */
LINT_MASK, LINT_MASK, /* LVT0-1 */
@@ -316,8 +333,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
result = 1;
break;
default:
- printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n",
- apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
+ apic_debug("Bad DFR vcpu %d: %08x\n",
+ apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR));
break;
}
@@ -354,8 +371,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
result = (target != source);
break;
default:
- printk(KERN_WARNING "Bad dest shorthand value %x\n",
- short_hand);
+ apic_debug("kvm: apic: Bad dest shorthand value %x\n",
+ short_hand);
break;
}
@@ -401,11 +418,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_REMRD:
- printk(KERN_DEBUG "Ignoring delivery mode 3\n");
+ apic_debug("Ignoring delivery mode 3\n");
break;
case APIC_DM_SMI:
- printk(KERN_DEBUG "Ignoring guest SMI\n");
+ apic_debug("Ignoring guest SMI\n");
break;
case APIC_DM_NMI:
@@ -565,11 +582,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
val = kvm_apic_id(apic) << 24;
break;
case APIC_ARBPRI:
- printk(KERN_WARNING "Access APIC ARBPRI register "
- "which is for P6\n");
+ apic_debug("Access APIC ARBPRI register which is for P6\n");
break;
case APIC_TMCCT: /* Timer CCR */
+ if (apic_lvtt_tscdeadline(apic))
+ return 0;
+
val = apic_get_tmcct(apic);
break;
@@ -664,29 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic)
static void start_apic_timer(struct kvm_lapic *apic)
{
- ktime_t now = apic->lapic_timer.timer.base->get_time();
-
- apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) *
- APIC_BUS_CYCLE_NS * apic->divide_count;
+ ktime_t now;
atomic_set(&apic->lapic_timer.pending, 0);
- if (!apic->lapic_timer.period)
- return;
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- if (apic->lapic_timer.period < NSEC_PER_MSEC/2)
- apic->lapic_timer.period = NSEC_PER_MSEC/2;
- }
+ if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
+ /* lapic timer in oneshot or peroidic mode */
+ now = apic->lapic_timer.timer.base->get_time();
+ apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+
+ if (!apic->lapic_timer.period)
+ return;
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic)) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, apic->lapic_timer.period),
- HRTIMER_MODE_ABS);
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, apic->lapic_timer.period),
+ HRTIMER_MODE_ABS);
- apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
+ apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
"expire @ 0x%016" PRIx64 ".\n", __func__,
@@ -695,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic->lapic_timer.period,
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
+ } else if (apic_lvtt_tscdeadline(apic)) {
+ /* lapic timer in tsc deadline mode */
+ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ u64 ns = 0;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu);
+ unsigned long flags;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = apic->lapic_timer.timer.base->get_time();
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ if (likely(tscdeadline > guest_tsc)) {
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+ }
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+
+ local_irq_restore(flags);
+ }
}
static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -782,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_LVT0:
apic_manage_nmi_watchdog(apic, val);
- case APIC_LVTT:
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
@@ -796,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
+ case APIC_LVTT:
+ if ((apic_get_reg(apic, APIC_LVTT) &
+ apic->lapic_timer.timer_mode_mask) !=
+ (val & apic->lapic_timer.timer_mode_mask))
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ if (!apic_sw_enabled(apic))
+ val |= APIC_LVT_MASKED;
+ val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
+ apic_set_reg(apic, APIC_LVTT, val);
+ break;
+
case APIC_TMICT:
+ if (apic_lvtt_tscdeadline(apic))
+ break;
+
hrtimer_cancel(&apic->lapic_timer.timer);
apic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
@@ -804,14 +872,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
case APIC_TDCR:
if (val & 4)
- printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val);
+ apic_debug("KVM_WRITE:TDCR %x\n", val);
apic_set_reg(apic, APIC_TDCR, val);
update_divide_count(apic);
break;
case APIC_ESR:
if (apic_x2apic_mode(apic) && val != 0) {
- printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val);
+ apic_debug("KVM_WRITE:ESR not zero %x\n", val);
ret = 1;
}
break;
@@ -864,6 +932,15 @@ static int apic_mmio_write(struct kvm_io_device *this,
return 0;
}
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ if (apic)
+ apic_reg_write(vcpu->arch.apic, APIC_EOI, 0);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi);
+
void kvm_free_lapic(struct kvm_vcpu *vcpu)
{
if (!vcpu->arch.apic)
@@ -883,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
*----------------------------------------------------------------------
*/
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ if (!apic)
+ return 0;
+
+ if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ return 0;
+
+ return apic->lapic_timer.tscdeadline;
+}
+
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ if (!apic)
+ return;
+
+ if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic))
+ return;
+
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ apic->lapic_timer.tscdeadline = data;
+ start_apic_timer(apic);
+}
+
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
struct kvm_lapic *apic = vcpu->arch.apic;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 52c9e6b9e72..138e8cc6fea 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
void kvm_lapic_reset(struct kvm_vcpu *vcpu);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
+void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
void kvm_apic_set_version(struct kvm_vcpu *vcpu);
@@ -41,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
bool kvm_apic_present(struct kvm_vcpu *vcpu);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
+u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
+void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data);
+
void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu);
void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1c5b69373a0..f1b36cf3e3d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -400,7 +400,8 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/* xchg acts as a barrier before the setting of the high bits */
orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
- orig.spte_high = ssptep->spte_high = sspte.spte_high;
+ orig.spte_high = ssptep->spte_high;
+ ssptep->spte_high = sspte.spte_high;
count_spte_clear(sptep, spte);
return orig.spte;
@@ -2769,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
- pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
+ pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
if (!is_present_gpte(pdptr)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
@@ -3317,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->direct_map = true;
context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
context->get_cr3 = get_cr3;
+ context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
context->nx = is_nx(vcpu);
@@ -3375,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
vcpu->arch.walk_mmu->get_cr3 = get_cr3;
+ vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read;
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
return r;
@@ -3385,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
g_context->get_cr3 = get_cr3;
+ g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
/*
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 2460a265be2..746ec259d02 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
gfn_t gfn;
-
rev_sp = page_header(__pa(sptep));
gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
if (!gfn_to_memslot(kvm, gfn)) {
- if (!printk_ratelimit())
+ if (!__ratelimit(&ratelimit_state))
return;
audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
@@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
if (!*rmapp) {
- if (!printk_ratelimit())
+ if (!__ratelimit(&ratelimit_state))
return;
audit_printk(kvm, "no rmap for writable spte %llx\n",
*sptep);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 507e2b844cf..92994100638 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
gfn_t table_gfn;
unsigned index, pt_access, uninitialized_var(pte_access);
gpa_t pte_gpa;
- bool eperm;
+ bool eperm, last_gpte;
int offset;
const int write_fault = access & PFERR_WRITE_MASK;
const int user_fault = access & PFERR_USER_MASK;
@@ -163,7 +163,7 @@ retry_walk:
#if PTTYPE == 64
if (walker->level == PT32E_ROOT_LEVEL) {
- pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
+ pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
trace_kvm_mmu_paging_element(pte, walker->level);
if (!is_present_gpte(pte))
goto error;
@@ -221,6 +221,17 @@ retry_walk:
eperm = true;
#endif
+ last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte);
+ if (last_gpte) {
+ pte_access = pt_access &
+ FNAME(gpte_access)(vcpu, pte, true);
+ /* check if the kernel is fetching from user page */
+ if (unlikely(pte_access & PT_USER_MASK) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
+ if (fetch_fault && !user_fault)
+ eperm = true;
+ }
+
if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) {
int ret;
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
@@ -238,18 +249,12 @@ retry_walk:
walker->ptes[walker->level - 1] = pte;
- if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) {
+ if (last_gpte) {
int lvl = walker->level;
gpa_t real_gpa;
gfn_t gfn;
u32 ac;
- /* check if the kernel is fetching from user page */
- if (unlikely(pte_access & PT_USER_MASK) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
- if (fetch_fault && !user_fault)
- eperm = true;
-
gfn = gpte_to_gfn_lvl(pte, lvl);
gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
@@ -295,7 +300,6 @@ retry_walk:
walker->ptes[walker->level - 1] = pte;
}
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 475d1c94850..e32243eac2f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- clr_intercept(svm, INTERCEPT_TASK_SWITCH);
clr_intercept(svm, INTERCEPT_INVLPG);
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
@@ -1844,6 +1843,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
return svm->nested.nested_cr3;
}
+static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr3 = svm->nested.nested_cr3;
+ u64 pdpte;
+ int ret;
+
+ ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
+ if (ret)
+ return 0;
+ return pdpte;
+}
+
static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
unsigned long root)
{
@@ -1875,6 +1888,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
vcpu->arch.mmu.shadow_root_level = get_npt_level();
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
@@ -2182,7 +2196,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
vmcb->control.exit_info_1,
vmcb->control.exit_info_2,
vmcb->control.exit_int_info,
- vmcb->control.exit_int_info_err);
+ vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
if (!nested_vmcb)
@@ -2894,15 +2909,20 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
+u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+ struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
+ return vmcb->control.tsc_offset +
+ svm_scale_tsc(vcpu, native_read_tsc());
+}
+
static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
{
struct vcpu_svm *svm = to_svm(vcpu);
switch (ecx) {
case MSR_IA32_TSC: {
- struct vmcb *vmcb = get_host_vmcb(svm);
-
- *data = vmcb->control.tsc_offset +
+ *data = svm->vmcb->control.tsc_offset +
svm_scale_tsc(vcpu, native_read_tsc());
break;
@@ -3314,8 +3334,6 @@ static int handle_exit(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
-
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
@@ -3335,7 +3353,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
svm->vmcb->control.exit_info_1,
svm->vmcb->control.exit_info_2,
svm->vmcb->control.exit_int_info,
- svm->vmcb->control.exit_int_info_err);
+ svm->vmcb->control.exit_int_info_err,
+ KVM_ISA_SVM);
vmexit = nested_svm_exit_special(svm);
@@ -3768,6 +3787,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+ trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM);
+
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_handle_nmi(&svm->vcpu);
@@ -3897,60 +3918,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
}
}
-static const struct trace_print_flags svm_exit_reasons_str[] = {
- { SVM_EXIT_READ_CR0, "read_cr0" },
- { SVM_EXIT_READ_CR3, "read_cr3" },
- { SVM_EXIT_READ_CR4, "read_cr4" },
- { SVM_EXIT_READ_CR8, "read_cr8" },
- { SVM_EXIT_WRITE_CR0, "write_cr0" },
- { SVM_EXIT_WRITE_CR3, "write_cr3" },
- { SVM_EXIT_WRITE_CR4, "write_cr4" },
- { SVM_EXIT_WRITE_CR8, "write_cr8" },
- { SVM_EXIT_READ_DR0, "read_dr0" },
- { SVM_EXIT_READ_DR1, "read_dr1" },
- { SVM_EXIT_READ_DR2, "read_dr2" },
- { SVM_EXIT_READ_DR3, "read_dr3" },
- { SVM_EXIT_WRITE_DR0, "write_dr0" },
- { SVM_EXIT_WRITE_DR1, "write_dr1" },
- { SVM_EXIT_WRITE_DR2, "write_dr2" },
- { SVM_EXIT_WRITE_DR3, "write_dr3" },
- { SVM_EXIT_WRITE_DR5, "write_dr5" },
- { SVM_EXIT_WRITE_DR7, "write_dr7" },
- { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" },
- { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" },
- { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" },
- { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" },
- { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" },
- { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" },
- { SVM_EXIT_INTR, "interrupt" },
- { SVM_EXIT_NMI, "nmi" },
- { SVM_EXIT_SMI, "smi" },
- { SVM_EXIT_INIT, "init" },
- { SVM_EXIT_VINTR, "vintr" },
- { SVM_EXIT_CPUID, "cpuid" },
- { SVM_EXIT_INVD, "invd" },
- { SVM_EXIT_HLT, "hlt" },
- { SVM_EXIT_INVLPG, "invlpg" },
- { SVM_EXIT_INVLPGA, "invlpga" },
- { SVM_EXIT_IOIO, "io" },
- { SVM_EXIT_MSR, "msr" },
- { SVM_EXIT_TASK_SWITCH, "task_switch" },
- { SVM_EXIT_SHUTDOWN, "shutdown" },
- { SVM_EXIT_VMRUN, "vmrun" },
- { SVM_EXIT_VMMCALL, "hypercall" },
- { SVM_EXIT_VMLOAD, "vmload" },
- { SVM_EXIT_VMSAVE, "vmsave" },
- { SVM_EXIT_STGI, "stgi" },
- { SVM_EXIT_CLGI, "clgi" },
- { SVM_EXIT_SKINIT, "skinit" },
- { SVM_EXIT_WBINVD, "wbinvd" },
- { SVM_EXIT_MONITOR, "monitor" },
- { SVM_EXIT_MWAIT, "mwait" },
- { SVM_EXIT_XSETBV, "xsetbv" },
- { SVM_EXIT_NPF, "npf" },
- { -1, NULL }
-};
-
static int svm_get_lpage_level(void)
{
return PT_PDPE_LEVEL;
@@ -4223,7 +4190,6 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_mt_mask = svm_get_mt_mask,
.get_exit_info = svm_get_exit_info,
- .exit_reasons_str = svm_exit_reasons_str,
.get_lpage_level = svm_get_lpage_level,
@@ -4239,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
.compute_tsc_offset = svm_compute_tsc_offset,
+ .read_l1_tsc = svm_read_l1_tsc,
.set_tdp_cr3 = set_tdp_cr3,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 3ff898c104f..911d2641f14 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -2,6 +2,8 @@
#define _TRACE_KVM_H
#include <linux/tracepoint.h>
+#include <asm/vmx.h>
+#include <asm/svm.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
@@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic,
#define KVM_ISA_VMX 1
#define KVM_ISA_SVM 2
+#define VMX_EXIT_REASONS \
+ { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
+ { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \
+ { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \
+ { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \
+ { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \
+ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \
+ { EXIT_REASON_CPUID, "CPUID" }, \
+ { EXIT_REASON_HLT, "HLT" }, \
+ { EXIT_REASON_INVLPG, "INVLPG" }, \
+ { EXIT_REASON_RDPMC, "RDPMC" }, \
+ { EXIT_REASON_RDTSC, "RDTSC" }, \
+ { EXIT_REASON_VMCALL, "VMCALL" }, \
+ { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \
+ { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \
+ { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \
+ { EXIT_REASON_VMPTRST, "VMPTRST" }, \
+ { EXIT_REASON_VMREAD, "VMREAD" }, \
+ { EXIT_REASON_VMRESUME, "VMRESUME" }, \
+ { EXIT_REASON_VMWRITE, "VMWRITE" }, \
+ { EXIT_REASON_VMOFF, "VMOFF" }, \
+ { EXIT_REASON_VMON, "VMON" }, \
+ { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \
+ { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \
+ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \
+ { EXIT_REASON_MSR_READ, "MSR_READ" }, \
+ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \
+ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \
+ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \
+ { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \
+ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \
+ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \
+ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \
+ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \
+ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \
+ { EXIT_REASON_WBINVD, "WBINVD" }
+
+#define SVM_EXIT_REASONS \
+ { SVM_EXIT_READ_CR0, "read_cr0" }, \
+ { SVM_EXIT_READ_CR3, "read_cr3" }, \
+ { SVM_EXIT_READ_CR4, "read_cr4" }, \
+ { SVM_EXIT_READ_CR8, "read_cr8" }, \
+ { SVM_EXIT_WRITE_CR0, "write_cr0" }, \
+ { SVM_EXIT_WRITE_CR3, "write_cr3" }, \
+ { SVM_EXIT_WRITE_CR4, "write_cr4" }, \
+ { SVM_EXIT_WRITE_CR8, "write_cr8" }, \
+ { SVM_EXIT_READ_DR0, "read_dr0" }, \
+ { SVM_EXIT_READ_DR1, "read_dr1" }, \
+ { SVM_EXIT_READ_DR2, "read_dr2" }, \
+ { SVM_EXIT_READ_DR3, "read_dr3" }, \
+ { SVM_EXIT_WRITE_DR0, "write_dr0" }, \
+ { SVM_EXIT_WRITE_DR1, "write_dr1" }, \
+ { SVM_EXIT_WRITE_DR2, "write_dr2" }, \
+ { SVM_EXIT_WRITE_DR3, "write_dr3" }, \
+ { SVM_EXIT_WRITE_DR5, "write_dr5" }, \
+ { SVM_EXIT_WRITE_DR7, "write_dr7" }, \
+ { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \
+ { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \
+ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \
+ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \
+ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \
+ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \
+ { SVM_EXIT_INTR, "interrupt" }, \
+ { SVM_EXIT_NMI, "nmi" }, \
+ { SVM_EXIT_SMI, "smi" }, \
+ { SVM_EXIT_INIT, "init" }, \
+ { SVM_EXIT_VINTR, "vintr" }, \
+ { SVM_EXIT_CPUID, "cpuid" }, \
+ { SVM_EXIT_INVD, "invd" }, \
+ { SVM_EXIT_HLT, "hlt" }, \
+ { SVM_EXIT_INVLPG, "invlpg" }, \
+ { SVM_EXIT_INVLPGA, "invlpga" }, \
+ { SVM_EXIT_IOIO, "io" }, \
+ { SVM_EXIT_MSR, "msr" }, \
+ { SVM_EXIT_TASK_SWITCH, "task_switch" }, \
+ { SVM_EXIT_SHUTDOWN, "shutdown" }, \
+ { SVM_EXIT_VMRUN, "vmrun" }, \
+ { SVM_EXIT_VMMCALL, "hypercall" }, \
+ { SVM_EXIT_VMLOAD, "vmload" }, \
+ { SVM_EXIT_VMSAVE, "vmsave" }, \
+ { SVM_EXIT_STGI, "stgi" }, \
+ { SVM_EXIT_CLGI, "clgi" }, \
+ { SVM_EXIT_SKINIT, "skinit" }, \
+ { SVM_EXIT_WBINVD, "wbinvd" }, \
+ { SVM_EXIT_MONITOR, "monitor" }, \
+ { SVM_EXIT_MWAIT, "mwait" }, \
+ { SVM_EXIT_XSETBV, "xsetbv" }, \
+ { SVM_EXIT_NPF, "npf" }
+
/*
* Tracepoint for kvm guest exit:
*/
@@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit,
),
TP_printk("reason %s rip 0x%lx info %llx %llx",
- ftrace_print_symbols_seq(p, __entry->exit_reason,
- kvm_x86_ops->exit_reasons_str),
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS),
__entry->guest_rip, __entry->info1, __entry->info2)
);
@@ -486,9 +578,9 @@ TRACE_EVENT(kvm_nested_intercepts,
TRACE_EVENT(kvm_nested_vmexit,
TP_PROTO(__u64 rip, __u32 exit_code,
__u64 exit_info1, __u64 exit_info2,
- __u32 exit_int_info, __u32 exit_int_info_err),
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
TP_ARGS(rip, exit_code, exit_info1, exit_info2,
- exit_int_info, exit_int_info_err),
+ exit_int_info, exit_int_info_err, isa),
TP_STRUCT__entry(
__field( __u64, rip )
@@ -497,6 +589,7 @@ TRACE_EVENT(kvm_nested_vmexit,
__field( __u64, exit_info2 )
__field( __u32, exit_int_info )
__field( __u32, exit_int_info_err )
+ __field( __u32, isa )
),
TP_fast_assign(
@@ -506,12 +599,14 @@ TRACE_EVENT(kvm_nested_vmexit,
__entry->exit_info2 = exit_info2;
__entry->exit_int_info = exit_int_info;
__entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
),
TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
"ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
__entry->rip,
- ftrace_print_symbols_seq(p, __entry->exit_code,
- kvm_x86_ops->exit_reasons_str),
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
__entry->exit_info1, __entry->exit_info2,
__entry->exit_int_info, __entry->exit_int_info_err)
);
@@ -522,9 +617,9 @@ TRACE_EVENT(kvm_nested_vmexit,
TRACE_EVENT(kvm_nested_vmexit_inject,
TP_PROTO(__u32 exit_code,
__u64 exit_info1, __u64 exit_info2,
- __u32 exit_int_info, __u32 exit_int_info_err),
+ __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa),
TP_ARGS(exit_code, exit_info1, exit_info2,
- exit_int_info, exit_int_info_err),
+ exit_int_info, exit_int_info_err, isa),
TP_STRUCT__entry(
__field( __u32, exit_code )
@@ -532,6 +627,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
__field( __u64, exit_info2 )
__field( __u32, exit_int_info )
__field( __u32, exit_int_info_err )
+ __field( __u32, isa )
),
TP_fast_assign(
@@ -540,12 +636,14 @@ TRACE_EVENT(kvm_nested_vmexit_inject,
__entry->exit_info2 = exit_info2;
__entry->exit_int_info = exit_int_info;
__entry->exit_int_info_err = exit_int_info_err;
+ __entry->isa = isa;
),
TP_printk("reason: %s ext_inf1: 0x%016llx "
"ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x",
- ftrace_print_symbols_seq(p, __entry->exit_code,
- kvm_x86_ops->exit_reasons_str),
+ (__entry->isa == KVM_ISA_VMX) ?
+ __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) :
+ __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS),
__entry->exit_info1, __entry->exit_info2,
__entry->exit_int_info, __entry->exit_int_info_err)
);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e65a158dee6..a0d6bd9ad44 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO);
static int __read_mostly yield_on_hlt = 1;
module_param(yield_on_hlt, bool, S_IRUGO);
+static int __read_mostly fasteoi = 1;
+module_param(fasteoi, bool, S_IRUGO);
+
/*
* If nested=1, nested virtualization is supported, i.e., guests may use
* VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -1748,6 +1751,21 @@ static u64 guest_read_tsc(void)
}
/*
+ * Like guest_read_tsc, but always returns L1's notion of the timestamp
+ * counter, even if a nested guest (L2) is currently running.
+ */
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+{
+ u64 host_tsc, tsc_offset;
+
+ rdtscll(host_tsc);
+ tsc_offset = is_guest_mode(vcpu) ?
+ to_vmx(vcpu)->nested.vmcs01_tsc_offset :
+ vmcs_read64(TSC_OFFSET);
+ return host_tsc + tsc_offset;
+}
+
+/*
* Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
* ioctl. In this case the call-back should update internal vmx state to make
* the changes effective.
@@ -1762,15 +1780,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
*/
static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- vmcs_write64(TSC_OFFSET, offset);
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu)) {
/*
- * We're here if L1 chose not to trap the TSC MSR. Since
- * prepare_vmcs12() does not copy tsc_offset, we need to also
- * set the vmcs12 field here.
+ * We're here if L1 chose not to trap WRMSR to TSC. According
+ * to the spec, this should set L1's TSC; The offset that L1
+ * set for L2 remains unchanged, and still needs to be added
+ * to the newly set TSC to get L2's TSC.
*/
- get_vmcs12(vcpu)->tsc_offset = offset -
- to_vmx(vcpu)->nested.vmcs01_tsc_offset;
+ struct vmcs12 *vmcs12;
+ to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
+ /* recalculate vmcs02.TSC_OFFSET: */
+ vmcs12 = get_vmcs12(vcpu);
+ vmcs_write64(TSC_OFFSET, offset +
+ (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
+ vmcs12->tsc_offset : 0));
+ } else {
+ vmcs_write64(TSC_OFFSET, offset);
+ }
}
static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -2736,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
- printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
- __func__);
+ pr_debug_ratelimited("%s: tss fixup for long mode. \n",
+ __func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
@@ -4115,8 +4141,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
- if (enable_ept)
- BUG();
+ BUG_ON(enable_ept);
cr2 = vmcs_readl(EXIT_QUALIFICATION);
trace_kvm_page_fault(cr2, error_code);
@@ -4518,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
+ if (likely(fasteoi)) {
+ unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ int access_type, offset;
+
+ access_type = exit_qualification & APIC_ACCESS_TYPE;
+ offset = exit_qualification & APIC_ACCESS_OFFSET;
+ /*
+ * Sane guest uses MOV to write EOI, with written value
+ * not cared. So make a short-circuit here by avoiding
+ * heavy instruction emulation.
+ */
+ if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
+ (offset == APIC_EOI)) {
+ kvm_lapic_set_eoi(vcpu);
+ skip_emulated_instruction(vcpu);
+ return 1;
+ }
+ }
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
@@ -5591,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return 0;
if (unlikely(vmx->fail)) {
- printk(KERN_INFO "%s failed vm entry %x\n",
- __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
+ pr_info_ratelimited("%s failed vm entry %x\n", __func__,
+ vmcs_read32(VM_INSTRUCTION_ERROR));
return 1;
}
@@ -5696,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
-
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
@@ -6101,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+ trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
vmx_complete_atomic_exit(vmx);
vmx_recover_nmi_blocking(vmx);
@@ -6241,49 +6283,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
return ret;
}
-#define _ER(x) { EXIT_REASON_##x, #x }
-
-static const struct trace_print_flags vmx_exit_reasons_str[] = {
- _ER(EXCEPTION_NMI),
- _ER(EXTERNAL_INTERRUPT),
- _ER(TRIPLE_FAULT),
- _ER(PENDING_INTERRUPT),
- _ER(NMI_WINDOW),
- _ER(TASK_SWITCH),
- _ER(CPUID),
- _ER(HLT),
- _ER(INVLPG),
- _ER(RDPMC),
- _ER(RDTSC),
- _ER(VMCALL),
- _ER(VMCLEAR),
- _ER(VMLAUNCH),
- _ER(VMPTRLD),
- _ER(VMPTRST),
- _ER(VMREAD),
- _ER(VMRESUME),
- _ER(VMWRITE),
- _ER(VMOFF),
- _ER(VMON),
- _ER(CR_ACCESS),
- _ER(DR_ACCESS),
- _ER(IO_INSTRUCTION),
- _ER(MSR_READ),
- _ER(MSR_WRITE),
- _ER(MWAIT_INSTRUCTION),
- _ER(MONITOR_INSTRUCTION),
- _ER(PAUSE_INSTRUCTION),
- _ER(MCE_DURING_VMENTRY),
- _ER(TPR_BELOW_THRESHOLD),
- _ER(APIC_ACCESS),
- _ER(EPT_VIOLATION),
- _ER(EPT_MISCONFIG),
- _ER(WBINVD),
- { -1, NULL }
-};
-
-#undef _ER
-
static int vmx_get_lpage_level(void)
{
if (enable_ept && !cpu_has_vmx_ept_1g_page())
@@ -6514,8 +6513,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
set_cr4_guest_host_mask(vmx);
- vmcs_write64(TSC_OFFSET,
- vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ vmcs_write64(TSC_OFFSET,
+ vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+ else
+ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
if (enable_vpid) {
/*
@@ -6610,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
if (vmcs12->vm_entry_msr_load_count > 0 ||
vmcs12->vm_exit_msr_load_count > 0 ||
vmcs12->vm_exit_msr_store_count > 0) {
- if (printk_ratelimit())
- printk(KERN_WARNING
- "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__);
+ pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
+ __func__);
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
return 1;
}
@@ -6922,7 +6923,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu)
load_vmcs12_host_state(vcpu, vmcs12);
- /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */
+ /* Update TSC_OFFSET if TSC was changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
/* This is needed for same reason as it was needed in prepare_vmcs02 */
@@ -7039,7 +7040,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_mt_mask = vmx_get_mt_mask,
.get_exit_info = vmx_get_exit_info,
- .exit_reasons_str = vmx_exit_reasons_str,
.get_lpage_level = vmx_get_lpage_level,
@@ -7055,6 +7055,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
.compute_tsc_offset = vmx_compute_tsc_offset,
+ .read_l1_tsc = vmx_read_l1_tsc,
.set_tdp_cr3 = vmx_set_cr3,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84a28ea45fa..c38efd7b792 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -44,6 +44,7 @@
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
+#include <linux/pci.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -83,6 +84,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
+static void process_nmi(struct kvm_vcpu *vcpu);
struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
@@ -359,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.nmi_pending = 1;
+ atomic_inc(&vcpu->arch.nmi_queued);
+ kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -599,6 +601,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
static void update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 timer_mode_mask;
best = kvm_find_cpuid_entry(vcpu, 1, 0);
if (!best)
@@ -610,6 +614,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
best->ecx |= bit(X86_FEATURE_OSXSAVE);
}
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+ best->function == 0x1) {
+ best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER);
+ timer_mode_mask = 3 << 17;
+ } else
+ timer_mode_mask = 1 << 17;
+
+ if (apic)
+ apic->lapic_timer.timer_mode_mask = timer_mode_mask;
}
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -825,6 +839,7 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
static u32 emulated_msrs[] = {
+ MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -1000,7 +1015,7 @@ static inline int kvm_tsc_changes_freq(void)
return ret;
}
-static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
+u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.virtual_tsc_khz)
return vcpu->arch.virtual_tsc_khz;
@@ -1098,7 +1113,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
+ tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
kernel_ns = get_kernel_ns();
this_tsc_khz = vcpu_tsc_khz(v);
if (unlikely(this_tsc_khz == 0)) {
@@ -1564,6 +1579,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
return kvm_x2apic_msr_write(vcpu, msr, data);
+ case MSR_IA32_TSCDEADLINE:
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ break;
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
@@ -1825,6 +1843,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
case HV_X64_MSR_TPR:
return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
+ case HV_X64_MSR_APIC_ASSIST_PAGE:
+ data = vcpu->arch.hv_vapic;
+ break;
default:
pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -1839,7 +1860,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
switch (msr) {
case MSR_IA32_PLATFORM_ID:
- case MSR_IA32_UCODE_REV:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
case MSR_IA32_LASTBRANCHFROMIP:
@@ -1860,6 +1880,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_FAM10H_MMIO_CONF_BASE:
data = 0;
break;
+ case MSR_IA32_UCODE_REV:
+ data = 0x100000000ULL;
+ break;
case MSR_MTRRcap:
data = 0x500 | KVM_NR_VAR_MTRR;
break;
@@ -1888,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
return kvm_x2apic_msr_read(vcpu, msr, pdata);
break;
+ case MSR_IA32_TSCDEADLINE:
+ data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ break;
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
@@ -2086,6 +2112,9 @@ int kvm_dev_ioctl_check_extension(long ext)
r = !kvm_x86_ops->cpu_has_accelerated_tpr();
break;
case KVM_CAP_NR_VCPUS:
+ r = KVM_SOFT_MAX_VCPUS;
+ break;
+ case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
case KVM_CAP_NR_MEMSLOTS:
@@ -2095,7 +2124,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = 0;
break;
case KVM_CAP_IOMMU:
- r = iommu_found();
+ r = iommu_present(&pci_bus_type);
break;
case KVM_CAP_MCE:
r = KVM_MAX_MCE_BANKS;
@@ -2210,7 +2239,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
s64 tsc_delta;
u64 tsc;
- kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
+ tsc = kvm_x86_ops->read_l1_tsc(vcpu);
tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
tsc - vcpu->arch.last_guest_tsc;
@@ -2234,7 +2263,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
kvm_x86_ops->vcpu_put(vcpu);
kvm_put_guest_fpu(vcpu);
- kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+ vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
}
static int is_efer_nx(void)
@@ -2819,6 +2848,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
+ process_nmi(vcpu);
events->exception.injected =
vcpu->arch.exception.pending &&
!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2836,7 +2866,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI);
events->nmi.injected = vcpu->arch.nmi_injected;
- events->nmi.pending = vcpu->arch.nmi_pending;
+ events->nmi.pending = vcpu->arch.nmi_pending != 0;
events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
events->nmi.pad = 0;
@@ -2856,6 +2886,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
| KVM_VCPUEVENT_VALID_SHADOW))
return -EINVAL;
+ process_nmi(vcpu);
vcpu->arch.exception.pending = events->exception.injected;
vcpu->arch.exception.nr = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -3556,7 +3587,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (r) {
mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
- &vpic->dev);
+ &vpic->dev_master);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &vpic->dev_slave);
+ kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+ &vpic->dev_eclr);
mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
@@ -4045,84 +4080,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
return 0;
}
-static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned long addr,
- void *val,
- unsigned int bytes,
- struct x86_exception *exception)
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- gpa_t gpa;
- int handled, ret;
+ int ret;
+ ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ if (ret < 0)
+ return 0;
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
+ return 1;
+}
+
+struct read_write_emulator_ops {
+ int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
+ int bytes);
+ int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ int bytes, void *val);
+ int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes);
+ bool write;
+};
+
+static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
+{
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
vcpu->mmio_phys_addr, *(u64 *)val);
vcpu->mmio_read_completed = 0;
- return X86EMUL_CONTINUE;
+ return 1;
}
- ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);
-
- if (ret < 0)
- return X86EMUL_PROPAGATE_FAULT;
-
- if (ret)
- goto mmio;
-
- if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
- == X86EMUL_CONTINUE)
- return X86EMUL_CONTINUE;
+ return 0;
+}
-mmio:
- /*
- * Is this MMIO handled locally?
- */
- handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
+static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+}
- if (handled == bytes)
- return X86EMUL_CONTINUE;
+static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
+ return emulator_write_phys(vcpu, gpa, val, bytes);
+}
- gpa += handled;
- bytes -= handled;
- val += handled;
+static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
+{
+ trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+ return vcpu_mmio_write(vcpu, gpa, bytes, val);
+}
+static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
+{
trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
-
- vcpu->mmio_needed = 1;
- vcpu->run->exit_reason = KVM_EXIT_MMIO;
- vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
- vcpu->mmio_size = bytes;
- vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
- vcpu->mmio_index = 0;
-
return X86EMUL_IO_NEEDED;
}
-int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
+ void *val, int bytes)
{
- int ret;
-
- ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- if (ret < 0)
- return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
- return 1;
+ memcpy(vcpu->mmio_data, val, bytes);
+ memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+ return X86EMUL_CONTINUE;
}
-static int emulator_write_emulated_onepage(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct x86_exception *exception,
- struct kvm_vcpu *vcpu)
+static struct read_write_emulator_ops read_emultor = {
+ .read_write_prepare = read_prepare,
+ .read_write_emulate = read_emulate,
+ .read_write_mmio = vcpu_mmio_read,
+ .read_write_exit_mmio = read_exit_mmio,
+};
+
+static struct read_write_emulator_ops write_emultor = {
+ .read_write_emulate = write_emulate,
+ .read_write_mmio = write_mmio,
+ .read_write_exit_mmio = write_exit_mmio,
+ .write = true,
+};
+
+static int emulator_read_write_onepage(unsigned long addr, void *val,
+ unsigned int bytes,
+ struct x86_exception *exception,
+ struct kvm_vcpu *vcpu,
+ struct read_write_emulator_ops *ops)
{
gpa_t gpa;
int handled, ret;
+ bool write = ops->write;
- ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);
+ if (ops->read_write_prepare &&
+ ops->read_write_prepare(vcpu, val, bytes))
+ return X86EMUL_CONTINUE;
+
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;
@@ -4131,15 +4187,14 @@ static int emulator_write_emulated_onepage(unsigned long addr,
if (ret)
goto mmio;
- if (emulator_write_phys(vcpu, gpa, val, bytes))
+ if (ops->read_write_emulate(vcpu, gpa, val, bytes))
return X86EMUL_CONTINUE;
mmio:
- trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
/*
* Is this MMIO handled locally?
*/
- handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
+ handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
if (handled == bytes)
return X86EMUL_CONTINUE;
@@ -4148,23 +4203,20 @@ mmio:
val += handled;
vcpu->mmio_needed = 1;
- memcpy(vcpu->mmio_data, val, bytes);
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
- vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
- memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
+ vcpu->run->mmio.is_write = vcpu->mmio_is_write = write;
vcpu->mmio_index = 0;
- return X86EMUL_CONTINUE;
+ return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
}
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
- unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct x86_exception *exception)
+int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+ void *val, unsigned int bytes,
+ struct x86_exception *exception,
+ struct read_write_emulator_ops *ops)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
@@ -4173,16 +4225,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, exception,
- vcpu);
+ rc = emulator_read_write_onepage(addr, val, now, exception,
+ vcpu, ops);
+
if (rc != X86EMUL_CONTINUE)
return rc;
addr += now;
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, exception,
- vcpu);
+
+ return emulator_read_write_onepage(addr, val, bytes, exception,
+ vcpu, ops);
+}
+
+static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, val, bytes,
+ exception, &read_emultor);
+}
+
+int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+ unsigned long addr,
+ const void *val,
+ unsigned int bytes,
+ struct x86_exception *exception)
+{
+ return emulator_read_write(ctxt, addr, (void *)val, bytes,
+ exception, &write_emultor);
}
#define CMPXCHG_TYPE(t, ptr, old, new) \
@@ -4712,7 +4786,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
kvm_set_rflags(vcpu, ctxt->eflags);
if (irq == NMI_VECTOR)
- vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_pending = 0;
else
vcpu->arch.interrupt.pending = false;
@@ -4788,7 +4862,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
trace_kvm_emulate_insn_start(vcpu);
++vcpu->stat.insn_emulation;
- if (r) {
+ if (r != EMULATION_OK) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
if (reexecute_instruction(vcpu, cr2))
@@ -5521,7 +5595,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
/* try to inject new event if pending */
if (vcpu->arch.nmi_pending) {
if (kvm_x86_ops->nmi_allowed(vcpu)) {
- vcpu->arch.nmi_pending = false;
+ --vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
}
@@ -5553,10 +5627,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
}
}
+static void process_nmi(struct kvm_vcpu *vcpu)
+{
+ unsigned limit = 2;
+
+ /*
+ * x86 is limited to one NMI running, and one NMI pending after it.
+ * If an NMI is already in progress, limit further NMIs to just one.
+ * Otherwise, allow two (and we'll inject the first one immediately).
+ */
+ if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ limit = 1;
+
+ vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
- bool nmi_pending;
bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
vcpu->run->request_interrupt_window;
@@ -5596,6 +5686,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_NMI, vcpu))
+ process_nmi(vcpu);
}
@@ -5603,19 +5695,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (unlikely(r))
goto out;
- /*
- * An NMI can be injected between local nmi_pending read and
- * vcpu->arch.nmi_pending read inside inject_pending_event().
- * But in that case, KVM_REQ_EVENT will be set, which makes
- * the race described above benign.
- */
- nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
-
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
inject_pending_event(vcpu);
/* enable NMI/IRQ window open exits if needed */
- if (nmi_pending)
+ if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
kvm_x86_ops->enable_irq_window(vcpu);
@@ -5678,7 +5762,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
+ vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -6323,7 +6407,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
- vcpu->arch.nmi_pending = false;
+ atomic_set(&vcpu->arch.nmi_queued, 0);
+ vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
vcpu->arch.switch_db_regs = 0;
@@ -6598,7 +6683,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
!vcpu->arch.apf.halted)
|| !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || vcpu->arch.nmi_pending ||
+ || atomic_read(&vcpu->arch.nmi_queued) ||
(kvm_arch_interrupt_allowed(vcpu) &&
kvm_cpu_has_interrupt(vcpu));
}