From af24a4e4aec77ef16c1971cf4465f767ba946034 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 15 May 2009 18:42:05 +0530 Subject: KVM: Replace MSR_IA32_TIME_STAMP_COUNTER with MSR_IA32_TSC of msr-index.h Use standard msr-index.h's MSR declaration. MSR_IA32_TSC is better than MSR_IA32_TIME_STAMP_COUNTER as it also solves 80 column issue. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index eabdc1cfab5..79561752af9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -752,8 +752,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define MSR_IA32_TIME_STAMP_COUNTER 0x010 - #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) -- cgit v1.2.3-70-g09d2 From 890ca9aefa78f7831f8f633cab9e4803636dffe4 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 11 May 2009 16:48:15 +0800 Subject: KVM: Add MCE support The related MSRs are emulated. MCE capability is exported via extension KVM_CAP_MCE and ioctl KVM_X86_GET_MCE_CAP_SUPPORTED. A new vcpu ioctl command KVM_X86_SETUP_MCE is used to setup MCE emulation such as the mcg_cap. MCE is injected via vcpu ioctl command KVM_X86_SET_MCE. Extended machine-check state (MCG_EXT_P) and CMCI are not implemented. Signed-off-by: Huang Ying Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 1 + arch/x86/include/asm/kvm_host.h | 5 + arch/x86/kvm/x86.c | 220 +++++++++++++++++++++++++++++++++++----- include/linux/kvm.h | 20 ++++ 4 files changed, 222 insertions(+), 24 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 125be8b1956..708b9c32a5d 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -17,6 +17,7 @@ #define __KVM_HAVE_USER_NMI #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX +#define __KVM_HAVE_MCE /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 79561752af9..81c68f630b1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -373,6 +373,11 @@ struct kvm_vcpu_arch { unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; + + u64 mcg_cap; + u64 mcg_status; + u64 mcg_ctl; + u64 *mce_banks; }; struct kvm_mem_alias { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d46079a901..55a9dd182de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -42,6 +42,7 @@ #include #include #include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -55,6 +56,10 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + +#define KVM_MAX_MCE_BANKS 32 +#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -777,23 +782,43 @@ static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) return 0; } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + switch (msr) { - case MSR_EFER: - set_efer(vcpu, data); - break; - case MSR_IA32_MC0_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __func__, data); - break; case MSR_IA32_MCG_STATUS: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __func__, data); + vcpu->arch.mcg_status = data; break; case MSR_IA32_MCG_CTL: - pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", - __func__, data); + if (!(mcg_cap & MCG_CTL_P)) + return 1; + if (data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mcg_ctl = data; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + /* only 0 or all 1s can be written to IA32_MCi_CTL */ + if ((offset & 0x3) == 0 && + data != 0 && data != ~(u64)0) + return -1; + vcpu->arch.mce_banks[offset] = data; + break; + } + return 1; + } + return 0; +} + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { + case MSR_EFER: + set_efer(vcpu, data); break; case MSR_IA32_DEBUGCTLMSR: if (!data) { @@ -849,6 +874,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_request_guest_time_update(vcpu); break; } + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return set_msr_mce(vcpu, msr, data); default: pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); return 1; @@ -904,26 +933,49 @@ static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; switch (msr) { - case 0xc0010010: /* SYSCFG */ - case 0xc0010015: /* HWCR */ - case MSR_IA32_PLATFORM_ID: case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: - case MSR_IA32_MC0_CTL: - case MSR_IA32_MCG_STATUS: + data = 0; + break; case MSR_IA32_MCG_CAP: + data = vcpu->arch.mcg_cap; + break; case MSR_IA32_MCG_CTL: - case MSR_IA32_MC0_MISC: - case MSR_IA32_MC0_MISC+4: - case MSR_IA32_MC0_MISC+8: - case MSR_IA32_MC0_MISC+12: - case MSR_IA32_MC0_MISC+16: - case MSR_IA32_MC0_MISC+20: + if (!(mcg_cap & MCG_CTL_P)) + return 1; + data = vcpu->arch.mcg_ctl; + break; + case MSR_IA32_MCG_STATUS: + data = vcpu->arch.mcg_status; + break; + default: + if (msr >= MSR_IA32_MC0_CTL && + msr < MSR_IA32_MC0_CTL + 4 * bank_num) { + u32 offset = msr - MSR_IA32_MC0_CTL; + data = vcpu->arch.mce_banks[offset]; + break; + } + return 1; + } + *pdata = data; + return 0; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -966,6 +1018,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_SYSTEM_TIME: data = vcpu->arch.time; break; + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MCG_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: + return get_msr_mce(vcpu, msr, pdata); default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1087,6 +1146,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOMMU: r = iommu_found(); break; + case KVM_CAP_MCE: + r = KVM_MAX_MCE_BANKS; + break; default: r = 0; break; @@ -1146,6 +1208,16 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } + case KVM_X86_GET_MCE_CAP_SUPPORTED: { + u64 mce_cap; + + mce_cap = KVM_MCE_CAP_SUPPORTED; + r = -EFAULT; + if (copy_to_user(argp, &mce_cap, sizeof mce_cap)) + goto out; + r = 0; + break; + } default: r = -EINVAL; } @@ -1502,6 +1574,80 @@ static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, + u64 mcg_cap) +{ + int r; + unsigned bank_num = mcg_cap & 0xff, bank; + + r = -EINVAL; + if (!bank_num) + goto out; + if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000)) + goto out; + r = 0; + vcpu->arch.mcg_cap = mcg_cap; + /* Init IA32_MCG_CTL to all 1s */ + if (mcg_cap & MCG_CTL_P) + vcpu->arch.mcg_ctl = ~(u64)0; + /* Init IA32_MCi_CTL to all 1s */ + for (bank = 0; bank < bank_num; bank++) + vcpu->arch.mce_banks[bank*4] = ~(u64)0; +out: + return r; +} + +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, + struct kvm_x86_mce *mce) +{ + u64 mcg_cap = vcpu->arch.mcg_cap; + unsigned bank_num = mcg_cap & 0xff; + u64 *banks = vcpu->arch.mce_banks; + + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) + return -EINVAL; + /* + * if IA32_MCG_CTL is not all 1s, the uncorrected error + * reporting is disabled + */ + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && + vcpu->arch.mcg_ctl != ~(u64)0) + return 0; + banks += 4 * mce->bank; + /* + * if IA32_MCi_CTL is not all 1s, the uncorrected error + * reporting is disabled for the bank + */ + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) + return 0; + if (mce->status & MCI_STATUS_UC) { + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || + !(vcpu->arch.cr4 & X86_CR4_MCE)) { + printk(KERN_DEBUG "kvm: set_mce: " + "injects mce exception while " + "previous one is in progress!\n"); + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return 0; + } + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + vcpu->arch.mcg_status = mce->mcg_status; + banks[1] = mce->status; + kvm_queue_exception(vcpu, MC_VECTOR); + } else if (!(banks[1] & MCI_STATUS_VAL) + || !(banks[1] & MCI_STATUS_UC)) { + if (banks[1] & MCI_STATUS_VAL) + mce->status |= MCI_STATUS_OVER; + banks[2] = mce->addr; + banks[3] = mce->misc; + banks[1] = mce->status; + } else + banks[1] |= MCI_STATUS_OVER; + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1635,6 +1781,24 @@ long kvm_arch_vcpu_ioctl(struct file *filp, kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } + case KVM_X86_SETUP_MCE: { + u64 mcg_cap; + + r = -EFAULT; + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + goto out; + r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + break; + } + case KVM_X86_SET_MCE: { + struct kvm_x86_mce mce; + + r = -EFAULT; + if (copy_from_user(&mce, argp, sizeof mce)) + goto out; + r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + break; + } default: r = -EINVAL; } @@ -4440,6 +4604,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_mmu_destroy; } + vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4, + GFP_KERNEL); + if (!vcpu->arch.mce_banks) { + r = -ENOMEM; + goto fail_mmu_destroy; + } + vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; + return 0; fail_mmu_destroy: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 3db5d8d3748..7b17141c47c 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -415,6 +415,9 @@ struct kvm_trace_rec { #define KVM_CAP_ASSIGN_DEV_IRQ 29 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */ #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30 +#ifdef __KVM_HAVE_MCE +#define KVM_CAP_MCE 31 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -454,6 +457,19 @@ struct kvm_irq_routing { #endif +#ifdef KVM_CAP_MCE +/* x86 MCE */ +struct kvm_x86_mce { + __u64 status; + __u64 addr; + __u64 misc; + __u64 mcg_status; + __u8 bank; + __u8 pad1[7]; + __u64 pad2[3]; +}; +#endif + /* * ioctls for VM fds */ @@ -541,6 +557,10 @@ struct kvm_irq_routing { #define KVM_NMI _IO(KVMIO, 0x9a) /* Available with KVM_CAP_SET_GUEST_DEBUG */ #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) +/* MCE for x86 */ +#define KVM_X86_SETUP_MCE _IOW(KVMIO, 0x9c, __u64) +#define KVM_X86_GET_MCE_CAP_SUPPORTED _IOR(KVMIO, 0x9d, __u64) +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9e, struct kvm_x86_mce) /* * Deprecated interfaces -- cgit v1.2.3-70-g09d2 From 6de4f3ada40b336522250a7832a0cc4de8856589 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 31 May 2009 22:58:47 +0300 Subject: KVM: Cache pdptrs Instead of reloading the pdptrs on every entry and exit (vmcs writes on vmx, guest memory access on svm) extract them on demand. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 4 ++++ arch/x86/kvm/kvm_cache_regs.h | 9 +++++++++ arch/x86/kvm/mmu.c | 7 +++++-- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 24 ++++++++++++++++++------ arch/x86/kvm/vmx.c | 22 ++++++++++++++++++---- arch/x86/kvm/x86.c | 8 ++++++++ 7 files changed, 63 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 81c68f630b1..1cc901ec4ba 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -120,6 +120,10 @@ enum kvm_reg { NR_VCPU_REGS }; +enum kvm_reg_ex { + VCPU_EXREG_PDPTR = NR_VCPU_REGS, +}; + enum { VCPU_SREG_ES, VCPU_SREG_CS, diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 1ff819dce7d..7bcc5b6a440 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) kvm_register_write(vcpu, VCPU_REGS_RIP, val); } +static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) +{ + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + + return vcpu->arch.pdptrs[index]; +} + #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0ef5bb2b404..8ee67e3fb9d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,6 +18,7 @@ */ #include "mmu.h" +#include "kvm_cache_regs.h" #include #include @@ -1954,6 +1955,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn; struct kvm_mmu_page *sp; int direct = 0; + u64 pdptr; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1981,11 +1983,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - if (!is_present_pte(vcpu->arch.pdptrs[i])) { + pdptr = kvm_pdptr_read(vcpu, i); + if (!is_present_pte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } - root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT; + root_gfn = pdptr >> PAGE_SHIFT; } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 67785f63539..4cb1dbfd7c2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -131,7 +131,7 @@ walk: pte = vcpu->arch.cr3; #if PTTYPE == 64 if (!is_long_mode(vcpu)) { - pte = vcpu->arch.pdptrs[(addr >> 30) & 3]; + pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); if (!is_present_pte(pte)) goto not_present; --walker->level; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 522e69597a1..7749b0692cb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -777,6 +777,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) to_svm(vcpu)->vmcb->save.rflags = rflags; } +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) +{ + switch (reg) { + case VCPU_EXREG_PDPTR: + BUG_ON(!npt_enabled); + load_pdptrs(vcpu, vcpu->arch.cr3); + break; + default: + BUG(); + } +} + static void svm_set_vintr(struct vcpu_svm *svm) { svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; @@ -2285,12 +2297,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } vcpu->arch.cr0 = svm->vmcb->save.cr0; vcpu->arch.cr3 = svm->vmcb->save.cr3; - if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - } if (mmu_reload) { kvm_mmu_reset_context(vcpu); kvm_mmu_load(vcpu); @@ -2641,6 +2647,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; + if (npt_enabled) { + vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); + vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); + } + svm_complete_interrupts(svm); } @@ -2749,6 +2760,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, + .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 752465f98bf..d726dec6952 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -161,6 +161,8 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static void ept_save_pdptrs(struct kvm_vcpu *vcpu); + /* * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. @@ -1047,6 +1049,10 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) case VCPU_REGS_RIP: vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); break; + case VCPU_EXREG_PDPTR: + if (enable_ept) + ept_save_pdptrs(vcpu); + break; default: break; } @@ -1546,6 +1552,10 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty)) + return; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); @@ -1562,6 +1572,11 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } + + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); @@ -3255,10 +3270,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ - if (enable_ept && is_paging(vcpu)) { + if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - ept_save_pdptrs(vcpu); - } if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; @@ -3567,7 +3580,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_PDPTR)); vcpu->arch.regs_dirty = 0; get_debugreg(vcpu->arch.dr6, 6); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 75e9df09784..2ad8c97f58c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -246,6 +246,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) ret = 1; memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail); + __set_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_dirty); out: return ret; @@ -261,6 +265,10 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; + if (!test_bit(VCPU_EXREG_PDPTR, + (unsigned long *)&vcpu->arch.regs_avail)) + return true; + r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; -- cgit v1.2.3-70-g09d2 From 3a624e29c7587b79abab60e279f9d1a62a3d4716 Mon Sep 17 00:00:00 2001 From: Nitin A Kamble Date: Mon, 8 Jun 2009 11:34:16 -0700 Subject: KVM: VMX: Support Unrestricted Guest feature "Unrestricted Guest" feature is added in the VMX specification. Intel Westmere and onwards processors will support this feature. It allows kvm guests to run real mode and unpaged mode code natively in the VMX mode when EPT is turned on. With the unrestricted guest there is no need to emulate the guest real mode code in the vm86 container or in the emulator. Also the guest big real mode code works like native. The attached patch enhances KVM to use the unrestricted guest feature if available on the processor. It also adds a new kernel/module parameter to disable the unrestricted guest feature at the boot time. Signed-off-by: Nitin A Kamble Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 12 +++++---- arch/x86/include/asm/vmx.h | 1 + arch/x86/kvm/vmx.c | 60 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1cc901ec4ba..a1a96a57bb9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,12 +37,14 @@ #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) -#define KVM_GUEST_CR0_MASK \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \ - | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) +#define KVM_GUEST_CR0_MASK \ + (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) +#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ + (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ - (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \ - | X86_CR0_MP) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_GUEST_CR4_MASK \ (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 11be5ad2e0e..e7927a639d6 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -55,6 +55,7 @@ #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 +#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define PIN_BASED_EXT_INTR_MASK 0x00000001 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 959cb59cfae..f0f9773f0b0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -51,6 +51,10 @@ module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO); static int __read_mostly enable_ept = 1; module_param_named(ept, enable_ept, bool, S_IRUGO); +static int __read_mostly enable_unrestricted_guest = 1; +module_param_named(unrestricted_guest, + enable_unrestricted_guest, bool, S_IRUGO); + static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); @@ -279,6 +283,12 @@ static inline int cpu_has_vmx_ept(void) SECONDARY_EXEC_ENABLE_EPT; } +static inline int cpu_has_vmx_unrestricted_guest(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_UNRESTRICTED_GUEST; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -1210,7 +1220,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | - SECONDARY_EXEC_ENABLE_EPT; + SECONDARY_EXEC_ENABLE_EPT | + SECONDARY_EXEC_UNRESTRICTED_GUEST; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1340,8 +1351,13 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_vpid()) enable_vpid = 0; - if (!cpu_has_vmx_ept()) + if (!cpu_has_vmx_ept()) { enable_ept = 0; + enable_unrestricted_guest = 0; + } + + if (!cpu_has_vmx_unrestricted_guest()) + enable_unrestricted_guest = 0; if (!cpu_has_vmx_flexpriority()) flexpriority_enabled = 0; @@ -1440,6 +1456,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + if (enable_unrestricted_guest) + return; + vmx->emulation_required = 1; vcpu->arch.rmode.vm86_active = 1; @@ -1593,7 +1612,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu->arch.cr4); - *hw_cr0 |= X86_CR0_PE | X86_CR0_PG; *hw_cr0 &= ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ @@ -1620,8 +1638,13 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | - KVM_VM_CR0_ALWAYS_ON; + unsigned long hw_cr0; + + if (enable_unrestricted_guest) + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST) + | KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; + else + hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; vmx_fpu_deactivate(vcpu); @@ -1786,6 +1809,21 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar = 0xf3; } else ar = vmx_segment_access_rights(var); + + /* + * Fix the "Accessed" bit in AR field of segment registers for older + * qemu binaries. + * IA32 arch specifies that at the time of processor reset the + * "Accessed" bit in the AR field of segment registers is 1. And qemu + * is setting it to 0 in the usedland code. This causes invalid guest + * state vmexit when "unrestricted guest" mode is turned on. + * Fix for this setup issue in cpu_reset is being pushed in the qemu + * tree. Newer qemu binaries with that qemu fix would not need this + * kvm hack. + */ + if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) + ar |= 0x1; /* Accessed */ + vmcs_write32(sf->ar_bytes, ar); } @@ -2082,11 +2120,19 @@ out: static void seg_setup(int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + unsigned int ar; vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0xf3); + if (enable_unrestricted_guest) { + ar = 0x93; + if (seg == VCPU_SREG_CS) + ar |= 0x08; /* code segment */ + } else + ar = 0xf3; + + vmcs_write32(sf->ar_bytes, ar); } static int alloc_apic_access_page(struct kvm *kvm) @@ -2229,6 +2275,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; if (!enable_ept) exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + if (!enable_unrestricted_guest) + exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -- cgit v1.2.3-70-g09d2 From 7ffd92c53c5ebd0ad5a68ac3ca033c3a06374d19 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 9 Jun 2009 14:10:45 +0300 Subject: KVM: VMX: Move rmode structure to vmx-specific code rmode is only used in vmx, so move it to vmx.c Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 ------ arch/x86/kvm/vmx.c | 78 +++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 44 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a1a96a57bb9..c7b0cc2b702 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -340,16 +340,6 @@ struct kvm_vcpu_arch { u8 nr; } interrupt; - struct { - int vm86_active; - u8 save_iopl; - struct kvm_save_segment { - u16 selector; - unsigned long base; - u32 limit; - u32 ar; - } tr, es, ds, fs, gs; - } rmode; int halt_request; /* real mode on Intel only */ int cpuid_nent; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f0f9773f0b0..ae682929a64 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -88,6 +88,14 @@ struct vcpu_vmx { int guest_efer_loaded; } host_state; struct { + int vm86_active; + u8 save_iopl; + struct kvm_save_segment { + u16 selector; + unsigned long base; + u32 limit; + u32 ar; + } tr, es, ds, fs, gs; struct { bool pending; u8 vector; @@ -516,7 +524,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) eb |= 1u << BP_VECTOR; } - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ @@ -752,7 +760,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (vcpu->arch.rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; vmcs_writel(GUEST_RFLAGS, rflags); } @@ -809,7 +817,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_INFO_DELIVER_CODE_MASK; } - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = nr; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -1395,15 +1403,15 @@ static void enter_pmode(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; - vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); - vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit); - vmcs_write32(GUEST_TR_AR_BYTES, vcpu->arch.rmode.tr.ar); + vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); + vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); + vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vcpu->arch.rmode.save_iopl << IOPL_SHIFT); + flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1414,10 +1422,10 @@ static void enter_pmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) return; - fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_pmode_dataseg(VCPU_SREG_ES, &vmx->rmode.es); + fix_pmode_dataseg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1460,19 +1468,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu) return; vmx->emulation_required = 1; - vcpu->arch.rmode.vm86_active = 1; + vmx->rmode.vm86_active = 1; - vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); + vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); - vcpu->arch.rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); + vmx->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT); vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); - vcpu->arch.rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); + vmx->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES); vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vcpu->arch.rmode.save_iopl + vmx->rmode.save_iopl = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1494,10 +1502,10 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CS_BASE, 0xf0000); vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - fix_rmode_seg(VCPU_SREG_ES, &vcpu->arch.rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -1638,6 +1646,7 @@ static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long hw_cr0; if (enable_unrestricted_guest) @@ -1648,10 +1657,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx_fpu_deactivate(vcpu); - if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE)) + if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); - if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE)) + if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) enter_rmode(vcpu); #ifdef CONFIG_X86_64 @@ -1707,7 +1716,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ? + unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); vcpu->arch.cr4 = cr4; @@ -1787,20 +1796,21 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; - if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) { - vcpu->arch.rmode.tr.selector = var->selector; - vcpu->arch.rmode.tr.base = var->base; - vcpu->arch.rmode.tr.limit = var->limit; - vcpu->arch.rmode.tr.ar = vmx_segment_access_rights(var); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmx->rmode.tr.selector = var->selector; + vmx->rmode.tr.base = var->base; + vmx->rmode.tr.limit = var->limit; + vmx->rmode.tr.ar = vmx_segment_access_rights(var); return; } vmcs_writel(sf->base, var->base); vmcs_write32(sf->limit, var->limit); vmcs_write16(sf->selector, var->selector); - if (vcpu->arch.rmode.vm86_active && var->s) { + if (vmx->rmode.vm86_active && var->s) { /* * Hack real-mode segments into vm86 compatibility. */ @@ -2394,7 +2404,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) goto out; } - vmx->vcpu.arch.rmode.vm86_active = 0; + vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; @@ -2532,7 +2542,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); ++vcpu->stat.irq_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2573,7 +2583,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; - if (vcpu->arch.rmode.vm86_active) { + if (vmx->rmode.vm86_active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = NMI_VECTOR; vmx->rmode.irq.rip = kvm_rip_read(vcpu); @@ -2737,7 +2747,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return kvm_mmu_page_fault(vcpu, cr2, error_code); } - if (vcpu->arch.rmode.vm86_active && + if (vmx->rmode.vm86_active && handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK, error_code)) { if (vcpu->arch.halt_request) { -- cgit v1.2.3-70-g09d2 From e799794e02a368f79c3fae26aabaaadd0b7466ce Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 11 Jun 2009 12:07:40 -0300 Subject: KVM: VMX: more MSR_IA32_VMX_EPT_VPID_CAP capability bits Required for EPT misconfiguration handler. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/vmx.h | 7 +++++++ arch/x86/kvm/vmx.c | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index e7927a639d6..272514c2d45 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -352,9 +352,16 @@ enum vmcs_field { #define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 + +#define VMX_EPT_EXECUTE_ONLY_BIT (1ull) +#define VMX_EPT_PAGE_WALK_4_BIT (1ull << 6) +#define VMX_EPTP_UC_BIT (1ull << 8) +#define VMX_EPTP_WB_BIT (1ull << 14) +#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) + #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6ee929255a3..6610181267b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -270,6 +270,26 @@ static inline bool cpu_has_vmx_flexpriority(void) cpu_has_vmx_virtualize_apic_accesses(); } +static inline bool cpu_has_vmx_ept_execute_only(void) +{ + return !!(vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT); +} + +static inline bool cpu_has_vmx_eptp_uncacheable(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_UC_BIT); +} + +static inline bool cpu_has_vmx_eptp_writeback(void) +{ + return !!(vmx_capability.ept & VMX_EPTP_WB_BIT); +} + +static inline bool cpu_has_vmx_ept_2m_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); -- cgit v1.2.3-70-g09d2 From 0367b4330e463c45981437083991b90d25a9d78d Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 15 Jun 2009 15:21:22 +0200 Subject: x86: Add definition for IGNNE MSR Hyper-V accesses MSR_IGNNE while running under KVM. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/include/asm/msr-index.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 6be7fc254b5..bd5549034a9 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -374,6 +374,7 @@ /* AMD-V MSRs */ #define MSR_VM_CR 0xc0010114 +#define MSR_VM_IGNNE 0xc0010115 #define MSR_VM_HSAVE_PA 0xc0010117 #endif /* _ASM_X86_MSR_INDEX_H */ -- cgit v1.2.3-70-g09d2 From 229456fc34b1c9031b04f7581e7b755d1cebfe9c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 17 Jun 2009 09:22:14 -0300 Subject: KVM: convert custom marker based tracing to event traces This allows use of the powerful ftrace infrastructure. See Documentation/trace/ for usage information. [avi, stephen: various build fixes] [sheng: fix control register breakage] Signed-off-by: Marcelo Tosatti Signed-off-by: Stephen Rothwell Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/Makefile | 4 + arch/x86/kvm/lapic.c | 7 +- arch/x86/kvm/svm.c | 84 +++++++++---- arch/x86/kvm/trace.h | 260 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/vmx.c | 78 +++++++----- arch/x86/kvm/x86.c | 48 +++----- include/trace/events/kvm.h | 57 +++++++++ virt/kvm/irq_comm.c | 5 + virt/kvm/kvm_main.c | 4 + 10 files changed, 463 insertions(+), 86 deletions(-) create mode 100644 arch/x86/kvm/trace.h create mode 100644 include/trace/events/kvm.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c7b0cc2b702..19027ab2041 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -527,6 +528,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + const struct trace_print_flags *exit_reasons_str; }; extern struct kvm_x86_ops *kvm_x86_ops; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 01e3c61f749..7c56850b82c 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm +CFLAGS_x86.o := -I. +CFLAGS_svm.o := -I. +CFLAGS_vmx.o := -I. + kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o) kvm-$(CONFIG_KVM_TRACE) += $(addprefix ../../../virt/kvm/, kvm_trace.o) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3bde43c3789..2e028659638 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -34,6 +34,7 @@ #include #include "kvm_cache_regs.h" #include "irq.h" +#include "trace.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -515,8 +516,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); - if (offset >= LAPIC_MMIO_LENGTH) return 0; @@ -562,6 +561,8 @@ static void apic_mmio_read(struct kvm_io_device *this, } result = __apic_read(apic, offset & ~0xf); + trace_kvm_apic_read(offset, result); + switch (len) { case 1: case 2: @@ -657,7 +658,7 @@ static void apic_mmio_write(struct kvm_io_device *this, offset &= 0xff0; - KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + trace_kvm_apic_write(offset, val); switch (offset) { case APIC_ID: /* Local APIC ID */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 45666618377..b1c44620886 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -25,10 +25,12 @@ #include #include #include +#include #include #include +#include "trace.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -1096,7 +1098,6 @@ static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) val = 0; } - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); return val; } @@ -1105,8 +1106,6 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, { struct vcpu_svm *svm = to_svm(vcpu); - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)value, handler); - *exception = 0; switch (dr) { @@ -1154,14 +1153,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; - if (!npt_enabled) - KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); - else - KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, - (u32)fault_address, (u32)(fault_address >> 32), - handler); + trace_kvm_page_fault(fault_address, error_code); /* * FIXME: Tis shouldn't be necessary here, but there is a flush * missing in the MMU code. Until we find this bug, flush the @@ -1288,14 +1280,12 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(NMI, &svm->vcpu, handler); return 1; } static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm->vcpu.stat.irq_exits; - KVMTRACE_0D(INTR, &svm->vcpu, handler); return 1; } @@ -2077,8 +2067,7 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (svm_get_msr(&svm->vcpu, ecx, &data)) kvm_inject_gp(&svm->vcpu, 0); else { - KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, - (u32)(data >> 32), handler); + trace_kvm_msr_read(ecx, data); svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; @@ -2163,8 +2152,7 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) @@ -2185,8 +2173,6 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); - svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2265,8 +2251,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; - KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, - (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + trace_kvm_exit(exit_code, svm->vmcb->save.rip); if (is_nested(svm)) { nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", @@ -2354,7 +2339,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; - KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; @@ -2717,6 +2702,59 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } +static const struct trace_print_flags svm_exit_reasons_str[] = { + { SVM_EXIT_READ_CR0, "read_cr0" }, + { SVM_EXIT_READ_CR3, "read_cr3" }, + { SVM_EXIT_READ_CR4, "read_cr4" }, + { SVM_EXIT_READ_CR8, "read_cr8" }, + { SVM_EXIT_WRITE_CR0, "write_cr0" }, + { SVM_EXIT_WRITE_CR3, "write_cr3" }, + { SVM_EXIT_WRITE_CR4, "write_cr4" }, + { SVM_EXIT_WRITE_CR8, "write_cr8" }, + { SVM_EXIT_READ_DR0, "read_dr0" }, + { SVM_EXIT_READ_DR1, "read_dr1" }, + { SVM_EXIT_READ_DR2, "read_dr2" }, + { SVM_EXIT_READ_DR3, "read_dr3" }, + { SVM_EXIT_WRITE_DR0, "write_dr0" }, + { SVM_EXIT_WRITE_DR1, "write_dr1" }, + { SVM_EXIT_WRITE_DR2, "write_dr2" }, + { SVM_EXIT_WRITE_DR3, "write_dr3" }, + { SVM_EXIT_WRITE_DR5, "write_dr5" }, + { SVM_EXIT_WRITE_DR7, "write_dr7" }, + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, + { SVM_EXIT_INTR, "interrupt" }, + { SVM_EXIT_NMI, "nmi" }, + { SVM_EXIT_SMI, "smi" }, + { SVM_EXIT_INIT, "init" }, + { SVM_EXIT_VINTR, "vintr" }, + { SVM_EXIT_CPUID, "cpuid" }, + { SVM_EXIT_INVD, "invd" }, + { SVM_EXIT_HLT, "hlt" }, + { SVM_EXIT_INVLPG, "invlpg" }, + { SVM_EXIT_INVLPGA, "invlpga" }, + { SVM_EXIT_IOIO, "io" }, + { SVM_EXIT_MSR, "msr" }, + { SVM_EXIT_TASK_SWITCH, "task_switch" }, + { SVM_EXIT_SHUTDOWN, "shutdown" }, + { SVM_EXIT_VMRUN, "vmrun" }, + { SVM_EXIT_VMMCALL, "hypercall" }, + { SVM_EXIT_VMLOAD, "vmload" }, + { SVM_EXIT_VMSAVE, "vmsave" }, + { SVM_EXIT_STGI, "stgi" }, + { SVM_EXIT_CLGI, "clgi" }, + { SVM_EXIT_SKINIT, "skinit" }, + { SVM_EXIT_WBINVD, "wbinvd" }, + { SVM_EXIT_MONITOR, "monitor" }, + { SVM_EXIT_MWAIT, "mwait" }, + { SVM_EXIT_NPF, "npf" }, + { -1, NULL } +}; + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2778,6 +2816,8 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, + + .exit_reasons_str = svm_exit_reasons_str, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h new file mode 100644 index 00000000000..cd8c90db41a --- /dev/null +++ b/arch/x86/kvm/trace.h @@ -0,0 +1,260 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH arch/x86/kvm +#define TRACE_INCLUDE_FILE trace + +/* + * Tracepoint for guest mode entry. + */ +TRACE_EVENT(kvm_entry, + TP_PROTO(unsigned int vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("vcpu %u", __entry->vcpu_id) +); + +/* + * Tracepoint for hypercall. + */ +TRACE_EVENT(kvm_hypercall, + TP_PROTO(unsigned long nr, unsigned long a0, unsigned long a1, + unsigned long a2, unsigned long a3), + TP_ARGS(nr, a0, a1, a2, a3), + + TP_STRUCT__entry( + __field( unsigned long, nr ) + __field( unsigned long, a0 ) + __field( unsigned long, a1 ) + __field( unsigned long, a2 ) + __field( unsigned long, a3 ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->a0 = a0; + __entry->a1 = a1; + __entry->a2 = a2; + __entry->a3 = a3; + ), + + TP_printk("nr 0x%lx a0 0x%lx a1 0x%lx a2 0x%lx a3 0x%lx", + __entry->nr, __entry->a0, __entry->a1, __entry->a2, + __entry->a3) +); + +/* + * Tracepoint for PIO. + */ +TRACE_EVENT(kvm_pio, + TP_PROTO(unsigned int rw, unsigned int port, unsigned int size, + unsigned int count), + TP_ARGS(rw, port, size, count), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, port ) + __field( unsigned int, size ) + __field( unsigned int, count ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->port = port; + __entry->size = size; + __entry->count = count; + ), + + TP_printk("pio_%s at 0x%x size %d count %d", + __entry->rw ? "write" : "read", + __entry->port, __entry->size, __entry->count) +); + +/* + * Tracepoint for cpuid. + */ +TRACE_EVENT(kvm_cpuid, + TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, + unsigned long rcx, unsigned long rdx), + TP_ARGS(function, rax, rbx, rcx, rdx), + + TP_STRUCT__entry( + __field( unsigned int, function ) + __field( unsigned long, rax ) + __field( unsigned long, rbx ) + __field( unsigned long, rcx ) + __field( unsigned long, rdx ) + ), + + TP_fast_assign( + __entry->function = function; + __entry->rax = rax; + __entry->rbx = rbx; + __entry->rcx = rcx; + __entry->rdx = rdx; + ), + + TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx", + __entry->function, __entry->rax, + __entry->rbx, __entry->rcx, __entry->rdx) +); + +/* + * Tracepoint for apic access. + */ +TRACE_EVENT(kvm_apic, + TP_PROTO(unsigned int rw, unsigned int reg, unsigned int val), + TP_ARGS(rw, reg, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, reg ) + __field( unsigned int, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->reg = reg; + __entry->val = val; + ), + + TP_printk("apic_%s 0x%x = 0x%x", + __entry->rw ? "write" : "read", + __entry->reg, __entry->val) +); + +#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) +#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) + +/* + * Tracepoint for kvm guest exit: + */ +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), + TP_ARGS(exit_reason, guest_rip), + + TP_STRUCT__entry( + __field( unsigned int, exit_reason ) + __field( unsigned long, guest_rip ) + ), + + TP_fast_assign( + __entry->exit_reason = exit_reason; + __entry->guest_rip = guest_rip; + ), + + TP_printk("reason %s rip 0x%lx", + ftrace_print_symbols_seq(p, __entry->exit_reason, + kvm_x86_ops->exit_reasons_str), + __entry->guest_rip) +); + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_virq, + TP_PROTO(unsigned int irq), + TP_ARGS(irq), + + TP_STRUCT__entry( + __field( unsigned int, irq ) + ), + + TP_fast_assign( + __entry->irq = irq; + ), + + TP_printk("irq %u", __entry->irq) +); + +/* + * Tracepoint for page fault. + */ +TRACE_EVENT(kvm_page_fault, + TP_PROTO(unsigned long fault_address, unsigned int error_code), + TP_ARGS(fault_address, error_code), + + TP_STRUCT__entry( + __field( unsigned long, fault_address ) + __field( unsigned int, error_code ) + ), + + TP_fast_assign( + __entry->fault_address = fault_address; + __entry->error_code = error_code; + ), + + TP_printk("address %lx error_code %x", + __entry->fault_address, __entry->error_code) +); + +/* + * Tracepoint for guest MSR access. + */ +TRACE_EVENT(kvm_msr, + TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), + TP_ARGS(rw, ecx, data), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, ecx ) + __field( unsigned long, data ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->ecx = ecx; + __entry->data = data; + ), + + TP_printk("msr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->ecx, __entry->data) +); + +#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) +#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) + +/* + * Tracepoint for guest CR access. + */ +TRACE_EVENT(kvm_cr, + TP_PROTO(unsigned int rw, unsigned int cr, unsigned long val), + TP_ARGS(rw, cr, val), + + TP_STRUCT__entry( + __field( unsigned int, rw ) + __field( unsigned int, cr ) + __field( unsigned long, val ) + ), + + TP_fast_assign( + __entry->rw = rw; + __entry->cr = cr; + __entry->val = val; + ), + + TP_printk("cr_%s %x = 0x%lx", + __entry->rw ? "write" : "read", + __entry->cr, __entry->val) +); + +#define trace_kvm_cr_read(cr, val) trace_kvm_cr(0, cr, val) +#define trace_kvm_cr_write(cr, val) trace_kvm_cr(1, cr, val) + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a84ca191cd..c6256b98f07 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -34,6 +35,8 @@ #include #include +#include "trace.h" + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -2550,7 +2553,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) uint32_t intr; int irq = vcpu->arch.interrupt.nr; - KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + trace_kvm_inj_virq(irq); ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { @@ -2751,8 +2754,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (enable_ept) BUG(); cr2 = vmcs_readl(EXIT_QUALIFICATION); - KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, - (u32)((u64)cr2 >> 32), handler); + trace_kvm_page_fault(cr2, error_code); + if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); @@ -2799,7 +2802,6 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { ++vcpu->stat.irq_exits; - KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } @@ -2847,7 +2849,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - unsigned long exit_qualification; + unsigned long exit_qualification, val; int cr; int reg; @@ -2856,21 +2858,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + val = kvm_register_read(vcpu, reg); + trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr0(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 3: - kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr3(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 4: - kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); + kvm_set_cr4(vcpu, val); skip_emulated_instruction(vcpu); return 1; case 8: { @@ -2892,23 +2892,19 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); - KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ switch (cr) { case 3: kvm_register_write(vcpu, reg, vcpu->arch.cr3); - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), - (u32)((u64)kvm_register_read(vcpu, reg) >> 32), - handler); + trace_kvm_cr_read(cr, vcpu->arch.cr3); skip_emulated_instruction(vcpu); return 1; case 8: - kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); - KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)kvm_register_read(vcpu, reg), handler); + val = kvm_get_cr8(vcpu); + kvm_register_write(vcpu, reg, val); + trace_kvm_cr_read(cr, val); skip_emulated_instruction(vcpu); return 1; } @@ -2976,7 +2972,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) val = 0; } kvm_register_write(vcpu, reg, val); - KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { val = vcpu->arch.regs[reg]; switch (dr) { @@ -3009,7 +3004,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } break; } - KVMTRACE_2D(DR_WRITE, vcpu, (u32)dr, (u32)val, handler); } skip_emulated_instruction(vcpu); return 1; @@ -3031,8 +3025,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } - KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_read(ecx, data); /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; @@ -3047,8 +3040,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); - KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), - handler); + trace_kvm_msr_write(ecx, data); if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); @@ -3075,7 +3067,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); - KVMTRACE_0D(PEND_INTR, vcpu, handler); ++vcpu->stat.irq_window_exits; /* @@ -3227,6 +3218,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); + trace_kvm_page_fault(gpa, exit_qualification); return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); } @@ -3410,8 +3402,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), - (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ @@ -3500,10 +3491,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { - KVMTRACE_0D(NMI, &vmx->vcpu, handler); + (exit_intr_info & INTR_INFO_VALID_MASK)) asm("int $2"); - } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3891,6 +3880,29 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } +static const struct trace_print_flags vmx_exit_reasons_str[] = { + { EXIT_REASON_EXCEPTION_NMI, "exception" }, + { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, + { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, + { EXIT_REASON_NMI_WINDOW, "nmi_window" }, + { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, + { EXIT_REASON_CR_ACCESS, "cr_access" }, + { EXIT_REASON_DR_ACCESS, "dr_access" }, + { EXIT_REASON_CPUID, "cpuid" }, + { EXIT_REASON_MSR_READ, "rdmsr" }, + { EXIT_REASON_MSR_WRITE, "wrmsr" }, + { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, + { EXIT_REASON_HLT, "halt" }, + { EXIT_REASON_INVLPG, "invlpg" }, + { EXIT_REASON_VMCALL, "hypercall" }, + { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, + { EXIT_REASON_APIC_ACCESS, "apic_access" }, + { EXIT_REASON_WBINVD, "wbinvd" }, + { EXIT_REASON_TASK_SWITCH, "task_switch" }, + { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, + { -1, NULL } +}; + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3950,6 +3962,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, + + .exit_reasons_str = vmx_exit_reasons_str, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a066876f137..892a7a60c81 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -37,6 +37,8 @@ #include #include #include +#define CREATE_TRACE_POINTS +#include "trace.h" #include #include @@ -347,9 +349,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); - KVMTRACE_1D(LMSW, vcpu, - (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), - handler); } EXPORT_SYMBOL_GPL(kvm_lmsw); @@ -2568,7 +2567,6 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { - KVMTRACE_0D(CLTS, vcpu, handler); kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } @@ -2851,12 +2849,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, 1); val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); @@ -2892,12 +2886,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - if (vcpu->run->io.direction == KVM_EXIT_IO_IN) - KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, - handler); - else - KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, - handler); + trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, + size, count); if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -3075,7 +3065,6 @@ void kvm_arch_exit(void) int kvm_emulate_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; - KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; return 1; @@ -3106,7 +3095,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); - KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); + trace_kvm_hypercall(nr, a0, a1, a2, a3); if (!is_long_mode(vcpu)) { nr &= 0xFFFFFFFF; @@ -3206,8 +3195,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } - KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, - (u32)((u64)value >> 32), handler); return value; } @@ -3215,9 +3202,6 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, unsigned long *rflags) { - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, - (u32)((u64)val >> 32), handler); - switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); @@ -3327,11 +3311,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } kvm_x86_ops->skip_emulated_instruction(vcpu); - KVMTRACE_5D(CPUID, vcpu, function, - (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), - (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); + trace_kvm_cpuid(function, + kvm_register_read(vcpu, VCPU_REGS_RAX), + kvm_register_read(vcpu, VCPU_REGS_RBX), + kvm_register_read(vcpu, VCPU_REGS_RCX), + kvm_register_read(vcpu, VCPU_REGS_RDX)); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -3527,7 +3511,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_debugreg(vcpu->arch.eff_db[3], 3); } - KVMTRACE_0D(VMENTRY, vcpu, entryexit); + trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); if (unlikely(vcpu->arch.switch_db_regs)) { @@ -4842,3 +4826,9 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { return kvm_x86_ops->interrupt_allowed(vcpu); } + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h new file mode 100644 index 00000000000..d74b23d803f --- /dev/null +++ b/include/trace/events/kvm.h @@ -0,0 +1,57 @@ +#if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_MAIN_H + +#include + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_FILE kvm + +#if defined(__KVM_HAVE_IOAPIC) +TRACE_EVENT(kvm_set_irq, + TP_PROTO(unsigned int gsi), + TP_ARGS(gsi), + + TP_STRUCT__entry( + __field( unsigned int, gsi ) + ), + + TP_fast_assign( + __entry->gsi = gsi; + ), + + TP_printk("gsi %u", __entry->gsi) +); + + +#define kvm_irqchips \ + {KVM_IRQCHIP_PIC_MASTER, "PIC master"}, \ + {KVM_IRQCHIP_PIC_SLAVE, "PIC slave"}, \ + {KVM_IRQCHIP_IOAPIC, "IOAPIC"} + +TRACE_EVENT(kvm_ack_irq, + TP_PROTO(unsigned int irqchip, unsigned int pin), + TP_ARGS(irqchip, pin), + + TP_STRUCT__entry( + __field( unsigned int, irqchip ) + __field( unsigned int, pin ) + ), + + TP_fast_assign( + __entry->irqchip = irqchip; + __entry->pin = pin; + ), + + TP_printk("irqchip %s pin %u", + __print_symbolic(__entry->irqchip, kvm_irqchips), + __entry->pin) +); + + + +#endif /* defined(__KVM_HAVE_IOAPIC) */ +#endif /* _TRACE_KVM_MAIN_H */ + +/* This part must be outside protection */ +#include diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index bb8a1b5e41c..94759ed96b6 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,6 +20,7 @@ */ #include +#include #include #ifdef CONFIG_IA64 @@ -125,6 +126,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) unsigned long *irq_state, sig_level; int ret = -1; + trace_kvm_set_irq(irq); + WARN_ON(!mutex_is_locked(&kvm->irq_lock)); if (irq < KVM_IOAPIC_NUM_PINS) { @@ -161,6 +164,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) struct hlist_node *n; unsigned gsi = pin; + trace_kvm_ack_irq(irqchip, pin); + list_for_each_entry(e, &kvm->irq_routing, link) if (e->type == KVM_IRQ_ROUTING_IRQCHIP && e->irqchip.irqchip == irqchip && diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 48d5e697bf4..04bdeddebda 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -59,6 +59,9 @@ #include "irq.h" #endif +#define CREATE_TRACE_POINTS +#include + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -2718,6 +2721,7 @@ EXPORT_SYMBOL_GPL(kvm_init); void kvm_exit(void) { kvm_trace_cleanup(); + tracepoint_synchronize_unregister(); misc_deregister(&kvm_dev); kmem_cache_destroy(kvm_vcpu_cache); sysdev_unregister(&kvm_sysdev); -- cgit v1.2.3-70-g09d2 From ec04b2604c3707a46db1d26d98f82b11d0844669 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 19 Jun 2009 15:16:23 +0200 Subject: KVM: Prepare memslot data structures for multiple hugepage sizes [avi: fix build on non-x86] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 +- arch/powerpc/include/asm/kvm_host.h | 3 +- arch/s390/include/asm/kvm_host.h | 6 +++- arch/x86/include/asm/kvm_host.h | 12 ++++---- arch/x86/kvm/mmu.c | 30 ++++++++++---------- arch/x86/kvm/paging_tmpl.h | 3 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 56 ++++++++++++++++++++++++++----------- 8 files changed, 73 insertions(+), 42 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 9cf1c4b1f92..d9b6325a932 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -235,7 +235,8 @@ struct kvm_vm_data { #define KVM_REQ_PTC_G 32 #define KVM_REQ_RESUME 33 -#define KVM_PAGES_PER_HPAGE 1 +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) 1 struct kvm; struct kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index d4caa6127f5..c9c930ed11d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -34,7 +34,8 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 /* We don't currently support large pages. */ -#define KVM_PAGES_PER_HPAGE (1UL << 31) +#define KVM_NR_PAGE_SIZES 1 +#define KVM_PAGES_PER_HPAGE(x) (1UL<<31) struct kvm; struct kvm_run; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 75535d4d7a0..78e07a622b4 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -40,7 +40,11 @@ struct sca_block { struct sca_entry cpu[64]; } __attribute__((packed)); -#define KVM_PAGES_PER_HPAGE 256 +#define KVM_NR_PAGE_SIZES 2 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define CPUSTAT_HOST 0x80000000 #define CPUSTAT_WAIT 0x10000000 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 19027ab2041..30b625d8e5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -54,12 +54,12 @@ #define INVALID_PAGE (~(hpa_t)0) #define UNMAPPED_GVA (~(gpa_t)0) -/* shadow tables are PAE even on non-PAE hosts */ -#define KVM_HPAGE_SHIFT 21 -#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT) -#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1)) - -#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE) +/* KVM Hugepage definitions for x86 */ +#define KVM_NR_PAGE_SIZES 2 +#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) +#define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) +#define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) +#define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define DE_VECTOR 0 #define DB_VECTOR 1 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 12974de88aa..b67585c1ef0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -394,9 +394,9 @@ static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) { unsigned long idx; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); - return &slot->lpage_info[idx].write_count; + idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); + return &slot->lpage_info[0][idx].write_count; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) @@ -485,10 +485,10 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) if (!lpage) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn / KVM_PAGES_PER_HPAGE) - - (slot->base_gfn / KVM_PAGES_PER_HPAGE); + idx = (gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL)); - return &slot->lpage_info[idx].rmap_pde; + return &slot->lpage_info[0][idx].rmap_pde; } /* @@ -731,11 +731,11 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + int idx = gfn_offset / + KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); retval |= handler(kvm, &memslot->rmap[gfn_offset]); retval |= handler(kvm, - &memslot->lpage_info[ - gfn_offset / - KVM_PAGES_PER_HPAGE].rmap_pde); + &memslot->lpage_info[0][idx].rmap_pde); } } @@ -1876,8 +1876,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } @@ -2082,8 +2083,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; @@ -2485,7 +2487,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { - gfn &= ~(KVM_PAGES_PER_HPAGE-1); + gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 322e8113aee..53e129cec5f 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -401,7 +401,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; - large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); + large_gfn = walker.gfn & + ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); if (is_largepage_backed(vcpu, large_gfn)) { walker.gfn = large_gfn; largepage = 1; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6988858dc56..06af936a250 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -103,7 +103,7 @@ struct kvm_memory_slot { struct { unsigned long rmap_pde; int write_count; - } *lpage_info; + } *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned long userspace_addr; int user_alloc; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1da8072d61b..8361662e7e0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1001,19 +1001,25 @@ out: static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + int i; + if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) vfree(free->dirty_bitmap); - if (!dont || free->lpage_info != dont->lpage_info) - vfree(free->lpage_info); + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { + vfree(free->lpage_info[i]); + free->lpage_info[i] = NULL; + } + } free->npages = 0; free->dirty_bitmap = NULL; free->rmap = NULL; - free->lpage_info = NULL; } void kvm_free_physmem(struct kvm *kvm) @@ -1087,7 +1093,8 @@ int __kvm_set_memory_region(struct kvm *kvm, int r; gfn_t base_gfn; unsigned long npages, ugfn; - unsigned long largepages, i; + int lpages; + unsigned long i, j; struct kvm_memory_slot *memslot; struct kvm_memory_slot old, new; @@ -1161,33 +1168,48 @@ int __kvm_set_memory_region(struct kvm *kvm, else new.userspace_addr = 0; } - if (npages && !new.lpage_info) { - largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; - largepages -= base_gfn / KVM_PAGES_PER_HPAGE; + if (!npages) + goto skip_lpage; - new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + int level = i + 2; - if (!new.lpage_info) + /* Avoid unused variable warning if no large pages */ + (void)level; + + if (new.lpage_info[i]) + continue; + + lpages = 1 + (base_gfn + npages - 1) / + KVM_PAGES_PER_HPAGE(level); + lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); + + new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); + + if (!new.lpage_info[i]) goto out_free; - memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); + memset(new.lpage_info[i], 0, + lpages * sizeof(*new.lpage_info[i])); - if (base_gfn % KVM_PAGES_PER_HPAGE) - new.lpage_info[0].write_count = 1; - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) - new.lpage_info[largepages-1].write_count = 1; + if (base_gfn % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][0].write_count = 1; + if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) + new.lpage_info[i][lpages - 1].write_count = 1; ugfn = new.userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each * other, or if explicitly asked to, disable large page * support for this slot */ - if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1) || + if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || !largepages_enabled) - for (i = 0; i < largepages; ++i) - new.lpage_info[i].write_count = 1; + for (j = 0; j < lpages; ++j) + new.lpage_info[i][j].write_count = 1; } +skip_lpage: + /* Allocate page dirty bitmap if needed */ if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; -- cgit v1.2.3-70-g09d2 From fc61b800f9506ca47bf1439342a79847f2353562 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 5 Jul 2009 17:39:35 +0300 Subject: KVM: Add Directed EOI support to APIC emulation Directed EOI is specified by x2APIC, but is available even when lapic is in xAPIC mode. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/apicdef.h | 2 ++ arch/x86/kvm/lapic.c | 38 ++++++++++++++++++++++++++++++-------- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/x86.c | 4 ++-- arch/x86/kvm/x86.h | 4 ++++ 5 files changed, 39 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 7ddb36ab933..74ca38f6a4d 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -14,6 +14,7 @@ #define APIC_LVR 0x30 #define APIC_LVR_MASK 0xFF00FF +#define APIC_LVR_DIRECTED_EOI (1 << 24) #define GET_APIC_VERSION(x) ((x) & 0xFFu) #define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) #ifdef CONFIG_X86_32 @@ -40,6 +41,7 @@ #define APIC_DFR_CLUSTER 0x0FFFFFFFul #define APIC_DFR_FLAT 0xFFFFFFFFul #define APIC_SPIV 0xF0 +#define APIC_SPIV_DIRECTED_EOI (1 << 12) #define APIC_SPIV_FOCUS_DISABLED (1 << 9) #define APIC_SPIV_APIC_ENABLED (1 << 8) #define APIC_ISR 0x100 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 265a765f038..62ea2abfd96 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -35,6 +35,7 @@ #include "kvm_cache_regs.h" #include "irq.h" #include "trace.h" +#include "x86.h" #ifndef CONFIG_X86_64 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y)) @@ -142,6 +143,21 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; } +void kvm_apic_set_version(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *feat; + u32 v = APIC_VERSION; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + v |= APIC_LVR_DIRECTED_EOI; + apic_set_reg(apic, APIC_LVR, v); +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -442,9 +458,11 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - mutex_lock(&apic->vcpu->kvm->irq_lock); - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - mutex_unlock(&apic->vcpu->kvm->irq_lock); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { + mutex_lock(&apic->vcpu->kvm->irq_lock); + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + mutex_unlock(&apic->vcpu->kvm->irq_lock); + } } static void apic_send_ipi(struct kvm_lapic *apic) @@ -694,8 +712,11 @@ static int apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); break; - case APIC_SPIV: - apic_set_reg(apic, APIC_SPIV, val & 0x3ff); + case APIC_SPIV: { + u32 mask = 0x3ff; + if (apic_get_reg(apic, APIC_LVR) & APIC_LVR_DIRECTED_EOI) + mask |= APIC_SPIV_DIRECTED_EOI; + apic_set_reg(apic, APIC_SPIV, val & mask); if (!(val & APIC_SPIV_APIC_ENABLED)) { int i; u32 lvt_val; @@ -710,7 +731,7 @@ static int apic_mmio_write(struct kvm_io_device *this, } break; - + } case APIC_ICR: /* No delay here, so we always clear the pending bit */ apic_set_reg(apic, APIC_ICR, val & ~(1 << 12)); @@ -837,7 +858,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24); - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); @@ -1041,7 +1062,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) apic->base_address = vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; - apic_set_reg(apic, APIC_LVR, APIC_VERSION); + kvm_apic_set_version(vcpu); + apic_update_ppr(apic); hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 3f3ecc6edbf..bc1c5243c86 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -29,6 +29,7 @@ u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d32e3c6d317..086f93137e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -79,8 +79,6 @@ static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); -struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, - u32 function, u32 index); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -1373,6 +1371,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); r = 0; + kvm_apic_set_version(vcpu); out_free: vfree(cpuid_entries); @@ -1394,6 +1393,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; + kvm_apic_set_version(vcpu); return 0; out: diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 4c8e10af78e..5eadea585d2 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -31,4 +31,8 @@ static inline bool kvm_exception_is_soft(unsigned int nr) { return (nr == BP_VECTOR) || (nr == OF_VECTOR); } + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); + #endif -- cgit v1.2.3-70-g09d2 From e9f4275732add046fed4a548b8dbb98dbe500d2f Mon Sep 17 00:00:00 2001 From: Beth Kon Date: Tue, 7 Jul 2009 11:50:38 -0400 Subject: KVM: PIT support for HPET legacy mode When kvm is in hpet_legacy_mode, the hpet is providing the timer interrupt and the pit should not be. So in legacy mode, the pit timer is destroyed, but the *state* of the pit is maintained. So if kvm or the guest tries to modify the state of the pit, this modification is accepted, *except* that the timer isn't actually started. When we exit hpet_legacy_mode, the current state of the pit (which is up to date since we've been accepting modifications) is used to restart the pit timer. The saved_mode code in kvm_pit_load_count temporarily changes mode to 0xff in order to destroy the timer, but then restores the actual value, again maintaining "current" state of the pit for possible later reenablement. [avi: add some reserved storage in the ioctl; make SET_PIT2 IOW] [marcelo: fix memory corruption due to reserved storage] Signed-off-by: Beth Kon Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm.h | 9 +++++++ arch/x86/kvm/i8254.c | 22 +++++++++++++---- arch/x86/kvm/i8254.h | 3 ++- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/kvm.h | 6 +++++ 5 files changed, 93 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 708b9c32a5d..4a5fe914dc5 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -18,6 +18,7 @@ #define __KVM_HAVE_GUEST_DEBUG #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE +#define __KVM_HAVE_PIT_STATE2 /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -237,6 +238,14 @@ struct kvm_pit_state { struct kvm_pit_channel_state channels[3]; }; +#define KVM_PIT_FLAGS_HPET_LEGACY 0x00000001 + +struct kvm_pit_state2 { + struct kvm_pit_channel_state channels[3]; + __u32 flags; + __u32 reserved[9]; +}; + struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 8c3ac30ef9b..9b62c57ba6e 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -332,20 +332,33 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(ps, val, 0); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { + create_pit_timer(ps, val, 0); + } break; case 2: case 3: - create_pit_timer(ps, val, 1); + if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ + create_pit_timer(ps, val, 1); + } break; default: destroy_pit_timer(&ps->pit_timer); } } -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start) { - pit_load_count(kvm, channel, val); + u8 saved_mode; + if (hpet_legacy_start) { + /* save existing mode for later reenablement */ + saved_mode = kvm->arch.vpit->pit_state.channels[0].mode; + kvm->arch.vpit->pit_state.channels[0].mode = 0xff; /* disable timer */ + pit_load_count(kvm, channel, val); + kvm->arch.vpit->pit_state.channels[0].mode = saved_mode; + } else { + pit_load_count(kvm, channel, val); + } } static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev) @@ -554,6 +567,7 @@ void kvm_pit_reset(struct kvm_pit *pit) struct kvm_kpit_channel_state *c; mutex_lock(&pit->pit_state.lock); + pit->pit_state.flags = 0; for (i = 0; i < 3; i++) { c = &pit->pit_state.channels[i]; c->mode = 0xff; diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index b2670180f22..d4c1c7ffdc0 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -21,6 +21,7 @@ struct kvm_kpit_channel_state { struct kvm_kpit_state { struct kvm_kpit_channel_state channels[3]; + u32 flags; struct kvm_timer pit_timer; bool is_periodic; u32 speaker_data_on; @@ -49,7 +50,7 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); void kvm_pit_reset(struct kvm_pit *pit); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfb0e37b3c6..2214384ff61 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1213,6 +1213,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_IRQFD: case KVM_CAP_PIT2: + case KVM_CAP_PIT_STATE2: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2078,7 +2079,36 @@ static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) mutex_lock(&kvm->arch.vpit->pit_state.lock); memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); - kvm_pit_load_count(kvm, 0, ps->channels[0].count); + kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0; + + mutex_lock(&kvm->arch.vpit->pit_state.lock); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + return r; +} + +static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) +{ + int r = 0, start = 0; + u32 prev_legacy, cur_legacy; + mutex_lock(&kvm->arch.vpit->pit_state.lock); + prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; + cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; + if (!prev_legacy && cur_legacy) + start = 1; + memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, + sizeof(kvm->arch.vpit->pit_state.channels)); + kvm->arch.vpit->pit_state.flags = ps->flags; + kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -2140,6 +2170,7 @@ long kvm_arch_vm_ioctl(struct file *filp, */ union { struct kvm_pit_state ps; + struct kvm_pit_state2 ps2; struct kvm_memory_alias alias; struct kvm_pit_config pit_config; } u; @@ -2322,6 +2353,32 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_PIT2: { + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &u.ps2, sizeof(u.ps2))) + goto out; + r = 0; + break; + } + case KVM_SET_PIT2: { + r = -EFAULT; + if (copy_from_user(&u.ps2, argp, sizeof(u.ps2))) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); + if (r) + goto out; + r = 0; + break; + } case KVM_REINJECT_CONTROL: { struct kvm_reinject_control control; r = -EFAULT; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 76c640834ea..a74a1fcc28e 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -409,6 +409,9 @@ struct kvm_guest_debug { #define KVM_CAP_PIT2 33 #endif #define KVM_CAP_SET_BOOT_CPU_ID 34 +#ifdef __KVM_HAVE_PIT_STATE2 +#define KVM_CAP_PIT_STATE2 35 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -586,6 +589,9 @@ struct kvm_debug_guest { #define KVM_IA64_VCPU_GET_STACK _IOR(KVMIO, 0x9a, void *) #define KVM_IA64_VCPU_SET_STACK _IOW(KVMIO, 0x9b, void *) +#define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) +#define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) + #define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) #define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) #define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -- cgit v1.2.3-70-g09d2 From 0b71785dc05f1f66e6268022b9953c0d6a9985c6 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Jul 2009 15:33:53 +0300 Subject: KVM: Move kvm_cpu_get_interrupt() declaration to x86 code It is implemented only by x86. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + include/linux/kvm_host.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 30b625d8e5f..3f4f00a2353 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -797,5 +797,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void); int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_get_interrupt(struct kvm_vcpu *v); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6ec9fc56a49..2c6a5f00874 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -341,7 +341,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); -int kvm_cpu_get_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -- cgit v1.2.3-70-g09d2 From a1b37100d9e29c1f8dc3e2f5490a205c80180e01 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 9 Jul 2009 15:33:52 +0300 Subject: KVM: Reduce runnability interface with arch support code Remove kvm_cpu_has_interrupt() and kvm_arch_interrupt_allowed() from interface between general code and arch code. kvm_arch_vcpu_runnable() checks for interrupts instead. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 16 ++-------------- arch/powerpc/kvm/powerpc.c | 13 +------------ arch/s390/kvm/interrupt.c | 8 +------- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 6 ++++-- include/linux/kvm_host.h | 2 -- virt/kvm/kvm_main.c | 4 +--- 7 files changed, 11 insertions(+), 40 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d7aa6bb8f47..0ad09f05efa 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1935,19 +1935,6 @@ int kvm_highest_pending_irq(struct kvm_vcpu *vcpu) return find_highest_bits((int *)&vpd->irr[0]); } -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) -{ - if (kvm_highest_pending_irq(vcpu) != -1) - return 1; - return 0; -} - -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - /* do real check here */ - return 1; -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return vcpu->arch.timer_fired; @@ -1960,7 +1947,8 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE; + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) || + (kvm_highest_pending_irq(vcpu) != -1); } int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0341391eff1..2a4551f78f6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,20 +39,9 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) return gfn; } -int kvm_cpu_has_interrupt(struct kvm_vcpu *v) -{ - return !!(v->arch.pending_exceptions); -} - -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - /* do real check here */ - return 1; -} - int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { - return !(v->arch.msr & MSR_WE); + return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4d613415c43..2c2f9835341 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -283,7 +283,7 @@ static int __try_deliver_ckc_interrupt(struct kvm_vcpu *vcpu) return 1; } -int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) +static int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_float_interrupt *fi = vcpu->arch.local_int.float_int; @@ -320,12 +320,6 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu) return rc; } -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) -{ - /* do real check here */ - return 1; -} - int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3f4f00a2353..08732d7b6d9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -797,6 +797,8 @@ asmlinkage void kvm_handle_fault_on_reboot(void); int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); +int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); +int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 11cfd897aac..b87d65d89a0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4890,8 +4890,10 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending || + (kvm_arch_interrupt_allowed(vcpu) && + kvm_cpu_has_interrupt(vcpu)); } void kvm_vcpu_kick(struct kvm_vcpu *vcpu) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c6a5f00874..4af56036a6b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -332,7 +332,6 @@ int kvm_arch_hardware_setup(void); void kvm_arch_hardware_unsetup(void); void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); -int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); void kvm_free_physmem(struct kvm *kvm); @@ -341,7 +340,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm); void kvm_free_all_assigned_devices(struct kvm *kvm); void kvm_arch_sync_events(struct kvm *kvm); -int kvm_cpu_has_interrupt(struct kvm_vcpu *v); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d7b9bbba26d..532af9b41ee 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1666,9 +1666,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) for (;;) { prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - if ((kvm_arch_interrupt_allowed(vcpu) && - kvm_cpu_has_interrupt(vcpu)) || - kvm_arch_vcpu_runnable(vcpu)) { + if (kvm_arch_vcpu_runnable(vcpu)) { set_bit(KVM_REQ_UNHALT, &vcpu->requests); break; } -- cgit v1.2.3-70-g09d2 From b927a3cec081a605142f5b7e90b730611bee28b1 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 21 Jul 2009 10:42:48 +0800 Subject: KVM: VMX: Introduce KVM_SET_IDENTITY_MAP_ADDR ioctl Now KVM allow guest to modify guest's physical address of EPT's identity mapping page. (change from v1, discard unnecessary check, change ioctl to accept parameter address rather than value) Signed-off-by: Sheng Yang Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 15 ++++++++++----- arch/x86/kvm/x86.c | 19 +++++++++++++++++++ include/linux/kvm.h | 2 ++ 4 files changed, 32 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 08732d7b6d9..e210b218df4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -411,6 +411,7 @@ struct kvm_arch{ struct page *ept_identity_pagetable; bool ept_identity_pagetable_done; + gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c6256b98f07..686e1abb681 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1719,7 +1719,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) eptp = construct_eptp(cr3); vmcs_write64(EPT_POINTER, eptp); guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : - VMX_EPT_IDENTITY_PAGETABLE_ADDR; + vcpu->kvm->arch.ept_identity_map_addr; } vmx_flush_tlb(vcpu); @@ -2122,7 +2122,7 @@ static int init_rmode_identity_map(struct kvm *kvm) if (likely(kvm->arch.ept_identity_pagetable_done)) return 1; ret = 0; - identity_map_pfn = VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT; + identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2191,14 +2191,15 @@ static int alloc_identity_pagetable(struct kvm *kvm) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; kvm_userspace_mem.flags = 0; - kvm_userspace_mem.guest_phys_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + kvm_userspace_mem.guest_phys_addr = + kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, 0); if (r) goto out; kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, - VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); + kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: up_write(&kvm->slots_lock); return r; @@ -3814,9 +3815,13 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (alloc_apic_access_page(kvm) != 0) goto free_vmcs; - if (enable_ept) + if (enable_ept) { + if (!kvm->arch.ept_identity_map_addr) + kvm->arch.ept_identity_map_addr = + VMX_EPT_IDENTITY_PAGETABLE_ADDR; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + } return &vmx->vcpu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7ec0c921c0..f4cb1baaa04 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1206,6 +1206,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IOEVENTFD: case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: + case KVM_CAP_SET_IDENTITY_MAP_ADDR: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1906,6 +1907,13 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) return ret; } +static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, + u64 ident_addr) +{ + kvm->arch.ept_identity_map_addr = ident_addr; + return 0; +} + static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, u32 kvm_nr_mmu_pages) { @@ -2173,6 +2181,17 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r < 0) goto out; break; + case KVM_SET_IDENTITY_MAP_ADDR: { + u64 ident_addr; + + r = -EFAULT; + if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) + goto out; + r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); + if (r < 0) + goto out; + break; + } case KVM_SET_MEMORY_REGION: { struct kvm_memory_region kvm_mem; struct kvm_userspace_memory_region kvm_userspace_mem; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 230a91aa61c..f8f8900fc5e 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -435,6 +435,7 @@ struct kvm_ioeventfd { #define KVM_CAP_PIT_STATE2 35 #endif #define KVM_CAP_IOEVENTFD 36 +#define KVM_CAP_SET_IDENTITY_MAP_ADDR 37 #ifdef KVM_CAP_IRQ_ROUTING @@ -512,6 +513,7 @@ struct kvm_irqfd { #define KVM_SET_USER_MEMORY_REGION _IOW(KVMIO, 0x46,\ struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) +#define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) /* Device model IOC */ #define KVM_CREATE_IRQCHIP _IO(KVMIO, 0x60) #define KVM_IRQ_LINE _IOW(KVMIO, 0x61, struct kvm_irq_level) -- cgit v1.2.3-70-g09d2 From 852e3c19ac64b7c3912e8efe42d3ce090ebc0161 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:44 +0200 Subject: KVM: MMU: make direct mapping paths aware of mapping levels Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 83 ++++++++++++++++++++++++----------------- arch/x86/kvm/paging_tmpl.h | 6 +-- 3 files changed, 53 insertions(+), 38 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e210b218df4..e09dc26d96b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -315,7 +315,7 @@ struct kvm_vcpu_arch { struct { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ - int largepage; + int level; unsigned long mmu_seq; } update_pte; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c707936b241..110c224ed1f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -257,7 +257,7 @@ static int is_last_spte(u64 pte, int level) { if (level == PT_PAGE_TABLE_LEVEL) return 1; - if (level == PT_DIRECTORY_LEVEL && is_large_pte(pte)) + if (is_large_pte(pte)) return 1; return 0; } @@ -753,7 +753,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, int (*handler)(struct kvm *kvm, unsigned long *rmapp)) { - int i; + int i, j; int retval = 0; /* @@ -772,11 +772,15 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; - int idx = gfn_offset / - KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL); + retval |= handler(kvm, &memslot->rmap[gfn_offset]); - retval |= handler(kvm, - &memslot->lpage_info[0][idx].rmap_pde); + + for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { + int idx = gfn_offset; + idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + retval |= handler(kvm, + &memslot->lpage_info[j][idx].rmap_pde); + } } } @@ -814,12 +818,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) #define RMAP_RECYCLE_THRESHOLD 1000 -static void rmap_recycle(struct kvm_vcpu *vcpu, gfn_t gfn, int lpage) +static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { unsigned long *rmapp; + struct kvm_mmu_page *sp; + + sp = page_header(__pa(spte)); gfn = unalias_gfn(vcpu->kvm, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); kvm_unmap_rmapp(vcpu->kvm, rmapp); kvm_flush_remote_tlbs(vcpu->kvm); @@ -1734,7 +1741,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, - int write_fault, int dirty, int largepage, + int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { @@ -1757,7 +1764,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (largepage) + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, @@ -1768,7 +1775,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - if (largepage && has_wrprotected_page(vcpu->kvm, gfn, 1)) { + if (level > PT_PAGE_TABLE_LEVEL && + has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; spte = shadow_trap_nonpresent_pte; goto set_pte; @@ -1806,7 +1814,7 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, + int *ptwrite, int level, gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; @@ -1823,7 +1831,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. */ - if (largepage && !is_large_pte(*sptep)) { + if (level > PT_PAGE_TABLE_LEVEL && + !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; @@ -1836,8 +1845,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } else was_rmapped = 1; } + if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + dirty, level, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1857,7 +1867,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!is_rmap_spte(*sptep)) kvm_release_pfn_clean(pfn); if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, gfn, largepage); + rmap_recycle(vcpu, sptep, gfn); } else { if (was_writeble) kvm_release_pfn_dirty(pfn); @@ -1875,7 +1885,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) + int level, gfn_t gfn, pfn_t pfn) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -1883,11 +1893,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, gfn_t pseudo_gfn; for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { - if (iterator.level == PT_PAGE_TABLE_LEVEL - || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == level) { mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, - largepage, gfn, pfn, false); + level, gfn, pfn, false); ++vcpu->stat.pf_fixed; break; } @@ -1915,14 +1924,20 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; - int largepage = 0; + int level; pfn_t pfn; unsigned long mmu_seq; - if (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL) { - gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + /* + * This path builds a PAE pagetable - so we can map 2mb pages at + * maximum. Therefore check if the level is larger than that. + */ + if (level > PT_DIRECTORY_LEVEL) + level = PT_DIRECTORY_LEVEL; + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -1938,7 +1953,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn); + r = __direct_map(vcpu, v, write, level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2114,7 +2129,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, { pfn_t pfn; int r; - int largepage = 0; + int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; @@ -2125,10 +2140,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - if (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL) { - gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - largepage = 1; - } + level = mapping_level(vcpu, gfn); + + gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); @@ -2141,7 +2156,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn); + level, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -2448,7 +2463,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - if (!vcpu->arch.update_pte.largepage || + if (vcpu->arch.update_pte.level == PT_PAGE_TABLE_LEVEL || sp->role.glevels == PT32_ROOT_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; return; @@ -2498,7 +2513,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 gpte = 0; pfn_t pfn; - vcpu->arch.update_pte.largepage = 0; + vcpu->arch.update_pte.level = PT_PAGE_TABLE_LEVEL; if (bytes != 4 && bytes != 8) return; @@ -2530,7 +2545,7 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (is_large_pte(gpte) && (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL)) { gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - vcpu->arch.update_pte.largepage = 1; + vcpu->arch.update_pte.level = PT_DIRECTORY_LEVEL; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 44f0346578c..b167f0d57b5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -253,7 +253,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; - int largepage = vcpu->arch.update_pte.largepage; + int level = vcpu->arch.update_pte.level; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { @@ -272,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, + gpte & PT_DIRTY_MASK, NULL, level, gpte_to_gfn(gpte), pfn, true); } @@ -306,7 +306,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, gw->pte_access & access, user_fault, write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, + ptwrite, level, gw->gfn, pfn, false); break; } -- cgit v1.2.3-70-g09d2 From 7e4e4056f72da51c5dede48515df0ecd20eaf8ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:46 +0200 Subject: KVM: MMU: shadow support for 1gb pages This patch adds support for shadow paging to the 1gb page table code in KVM. With this code the guest can use 1gb pages even if the host does not support them. [ Marcelo: fix shadow page collision on pmd level if a guest 1gb page is mapped with 4kb ptes on host level ] Signed-off-by: Joerg Roedel Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 14 ++------------ arch/x86/kvm/paging_tmpl.h | 43 +++++++++++++++++++---------------------- 3 files changed, 22 insertions(+), 36 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e09dc26d96b..c9fb2bc13a8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -315,7 +315,6 @@ struct kvm_vcpu_arch { struct { gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ - int level; unsigned long mmu_seq; } update_pte; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 09ab6433bf1..1249c12e1d5 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2478,11 +2478,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { - if (vcpu->arch.update_pte.level == PT_PAGE_TABLE_LEVEL || - sp->role.glevels == PT32_ROOT_LEVEL) { - ++vcpu->kvm->stat.mmu_pde_zapped; - return; - } + ++vcpu->kvm->stat.mmu_pde_zapped; + return; } ++vcpu->kvm->stat.mmu_pte_updated; @@ -2528,8 +2525,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, u64 gpte = 0; pfn_t pfn; - vcpu->arch.update_pte.level = PT_PAGE_TABLE_LEVEL; - if (bytes != 4 && bytes != 8) return; @@ -2557,11 +2552,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - if (is_large_pte(gpte) && - (mapping_level(vcpu, gfn) == PT_DIRECTORY_LEVEL)) { - gfn &= ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - vcpu->arch.update_pte.level = PT_DIRECTORY_LEVEL; - } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 578276e34bd..d2fec9c12d2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -256,7 +256,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pt_element_t gpte; unsigned pte_access; pfn_t pfn; - int level = vcpu->arch.update_pte.level; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { @@ -275,7 +274,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, level, + gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, gpte_to_gfn(gpte), pfn, true); } @@ -284,7 +283,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *gw, - int user_fault, int write_fault, int largepage, + int user_fault, int write_fault, int hlevel, int *ptwrite, pfn_t pfn) { unsigned access = gw->pt_access; @@ -303,8 +302,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, for_each_shadow_entry(vcpu, addr, iterator) { level = iterator.level; sptep = iterator.sptep; - if (level == PT_PAGE_TABLE_LEVEL - || (largepage && level == PT_DIRECTORY_LEVEL)) { + if (iterator.level == hlevel) { mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, user_fault, write_fault, @@ -323,12 +321,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, kvm_flush_remote_tlbs(vcpu->kvm); } - if (level == PT_DIRECTORY_LEVEL - && gw->level == PT_DIRECTORY_LEVEL) { + if (level <= gw->level) { + int delta = level - gw->level + 1; direct = 1; - if (!is_dirty_gpte(gw->ptes[level - 1])) + if (!is_dirty_gpte(gw->ptes[level - delta])) access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + table_gfn = gpte_to_gfn(gw->ptes[level - delta]); + /* advance table_gfn when emulating 1gb pages with 4k */ + if (delta == 0) + table_gfn += PT_INDEX(addr, level); } else { direct = 0; table_gfn = gw->table_gfn[level - 2]; @@ -381,7 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int write_pt = 0; int r; pfn_t pfn; - int largepage = 0; + int level = PT_PAGE_TABLE_LEVEL; unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); @@ -407,15 +408,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - if (walker.level == PT_DIRECTORY_LEVEL) { - gfn_t large_gfn; - large_gfn = walker.gfn & - ~(KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL) - 1); - if (mapping_level(vcpu, large_gfn) == PT_DIRECTORY_LEVEL) { - walker.gfn = large_gfn; - largepage = 1; - } + if (walker.level >= PT_DIRECTORY_LEVEL) { + level = min(walker.level, mapping_level(vcpu, walker.gfn)); + walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); @@ -432,8 +429,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, goto out_unlock; kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - largepage, &write_pt, pfn); - + level, &write_pt, pfn); pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, sptep, *sptep, write_pt); @@ -468,8 +464,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) sptep = iterator.sptep; /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || + ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); pte_gpa = (sp->gfn << PAGE_SHIFT); @@ -599,7 +596,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_gpte(gpte), 0, gfn, + is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, spte_to_pfn(sp->spt[i]), true, false); } -- cgit v1.2.3-70-g09d2 From 04326caacff2b162d359c15a2edf634448897d1a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:47 +0200 Subject: KVM: MMU: enable gbpages by increasing nr of pagesizes Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c9fb2bc13a8..3315efaacf9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -55,7 +55,7 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -#define KVM_NR_PAGE_SIZES 2 +#define KVM_NR_PAGE_SIZES 3 #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) -- cgit v1.2.3-70-g09d2 From 344f414fa0f16254dd07195d4cd11b2f92931d3d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 27 Jul 2009 16:30:48 +0200 Subject: KVM: report 1GB page support to userspace If userspace knows that the kernel part supports 1GB pages it can enable the corresponding cpuid bit so that guests actually use GB pages. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ arch/x86/kvm/x86.c | 3 ++- 4 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3315efaacf9..b17d845897b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -528,6 +528,8 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); + bool (*gb_page_enable)(void); + const struct trace_print_flags *exit_reasons_str; }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 92fc0dab505..10e718db990 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2754,6 +2754,11 @@ static const struct trace_print_flags svm_exit_reasons_str[] = { { -1, NULL } }; +static bool svm_gb_page_enable(void) +{ + return true; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -2817,6 +2822,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .exit_reasons_str = svm_exit_reasons_str, + .gb_page_enable = svm_gb_page_enable, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c5aaa1b5fdb..32e6d2031ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3908,6 +3908,11 @@ static const struct trace_print_flags vmx_exit_reasons_str[] = { { -1, NULL } }; +static bool vmx_gb_page_enable(void) +{ + return false; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3969,6 +3974,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .exit_reasons_str = vmx_exit_reasons_str, + .gb_page_enable = vmx_gb_page_enable, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 69de7248083..fa525d511d9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1444,6 +1444,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; + unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 unsigned f_lm = F(LM); #else @@ -1468,7 +1469,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | F(PAT) | F(PSE36) | 0 /* Reserved */ | f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ | + F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = -- cgit v1.2.3-70-g09d2 From 56e8231841301ad38e347e33fd4319c89f697045 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Aug 2009 15:04:37 +0300 Subject: KVM: Rename x86_emulate.c to emulate.c We're in arch/x86, what could we possibly be emulating? Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 187 +++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_x86_emulate.h | 187 --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/emulate.c | 2392 ++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 2 +- arch/x86/kvm/x86_emulate.c | 2392 -------------------------------- 7 files changed, 2582 insertions(+), 2582 deletions(-) create mode 100644 arch/x86/include/asm/kvm_emulate.h delete mode 100644 arch/x86/include/asm/kvm_x86_emulate.h create mode 100644 arch/x86/kvm/emulate.c delete mode 100644 arch/x86/kvm/x86_emulate.c (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h new file mode 100644 index 00000000000..b7ed2c42311 --- /dev/null +++ b/arch/x86/include/asm/kvm_emulate.h @@ -0,0 +1,187 @@ +/****************************************************************************** + * x86_emulate.h + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef _ASM_X86_KVM_X86_EMULATE_H +#define _ASM_X86_KVM_X86_EMULATE_H + +struct x86_emulate_ctxt; + +/* + * x86_emulate_ops: + * + * These operations represent the instruction emulator's interface to memory. + * There are two categories of operation: those that act on ordinary memory + * regions (*_std), and those that act on memory regions known to require + * special treatment or emulation (*_emulated). + * + * The emulator assumes that an instruction accesses only one 'emulated memory' + * location, that this location is the given linear faulting address (cr2), and + * that this is one of the instruction's data operands. Instruction fetches and + * stack operations are assumed never to access emulated memory. The emulator + * automatically deduces which operand of a string-move operation is accessing + * emulated memory, and assumes that the other operand accesses normal memory. + * + * NOTES: + * 1. The emulator isn't very smart about emulated vs. standard memory. + * 'Emulated memory' access addresses should be checked for sanity. + * 'Normal memory' accesses may fault, and the caller must arrange to + * detect and handle reentrancy into the emulator via recursive faults. + * Accesses may be unaligned and may cross page boundaries. + * 2. If the access fails (cannot emulate, or a standard access faults) then + * it is up to the memop to propagate the fault to the guest VM via + * some out-of-band mechanism, unknown to the emulator. The memop signals + * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will + * then immediately bail. + * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only + * cmpxchg8b_emulated need support 8-byte accesses. + * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. + */ +/* Access completed successfully: continue emulation as normal. */ +#define X86EMUL_CONTINUE 0 +/* Access is unhandleable: bail from emulation and return error to caller. */ +#define X86EMUL_UNHANDLEABLE 1 +/* Terminate emulation but return success to the caller. */ +#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ +#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ +#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ +struct x86_emulate_ops { + /* + * read_std: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch, stack operations, and others. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_std)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu); + + /* + * read_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*read_emulated)(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + + /* + * write_emulated: Read bytes from emulated/special memory area. + * @addr: [IN ] Linear address to which to write. + * @val: [IN ] Value to write to memory (low-order bytes used as + * required). + * @bytes: [IN ] Number of bytes to write to memory. + */ + int (*write_emulated)(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu); + + /* + * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an + * emulated/special memory area. + * @addr: [IN ] Linear address to access. + * @old: [IN ] Value expected to be current at @addr. + * @new: [IN ] Value to write to @addr. + * @bytes: [IN ] Number of bytes to access using CMPXCHG. + */ + int (*cmpxchg_emulated)(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu); + +}; + +/* Type, address-of, and value of an instruction's operand. */ +struct operand { + enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + unsigned int bytes; + unsigned long val, orig_val, *ptr; +}; + +struct fetch_cache { + u8 data[15]; + unsigned long start; + unsigned long end; +}; + +struct decode_cache { + u8 twobyte; + u8 b; + u8 lock_prefix; + u8 rep_prefix; + u8 op_bytes; + u8 ad_bytes; + u8 rex_prefix; + struct operand src; + struct operand src2; + struct operand dst; + bool has_seg_override; + u8 seg_override; + unsigned int d; + unsigned long regs[NR_VCPU_REGS]; + unsigned long eip; + /* modrm */ + u8 modrm; + u8 modrm_mod; + u8 modrm_reg; + u8 modrm_rm; + u8 use_modrm_ea; + bool rip_relative; + unsigned long modrm_ea; + void *modrm_ptr; + unsigned long modrm_val; + struct fetch_cache fetch; +}; + +#define X86_SHADOW_INT_MOV_SS 1 +#define X86_SHADOW_INT_STI 2 + +struct x86_emulate_ctxt { + /* Register state before/after emulation. */ + struct kvm_vcpu *vcpu; + + unsigned long eflags; + /* Emulated execution mode, represented by an X86EMUL_MODE value. */ + int mode; + u32 cs_base; + + /* interruptibility state, as a result of execution of STI or MOV SS */ + int interruptibility; + + /* decode cache */ + struct decode_cache decode; +}; + +/* Repeat String Operation Prefix */ +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 + +/* Execution mode, passed to the emulator. */ +#define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ +#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ +#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ + +/* Host execution mode. */ +#if defined(CONFIG_X86_32) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 +#elif defined(CONFIG_X86_64) +#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 +#endif + +int x86_decode_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops); + +#endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b17d845897b..33901be75a3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -138,7 +138,7 @@ enum { VCPU_SREG_LDTR, }; -#include +#include #define KVM_NR_MEM_OBJS 40 diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h deleted file mode 100644 index b7ed2c42311..00000000000 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ /dev/null @@ -1,187 +0,0 @@ -/****************************************************************************** - * x86_emulate.h - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef _ASM_X86_KVM_X86_EMULATE_H -#define _ASM_X86_KVM_X86_EMULATE_H - -struct x86_emulate_ctxt; - -/* - * x86_emulate_ops: - * - * These operations represent the instruction emulator's interface to memory. - * There are two categories of operation: those that act on ordinary memory - * regions (*_std), and those that act on memory regions known to require - * special treatment or emulation (*_emulated). - * - * The emulator assumes that an instruction accesses only one 'emulated memory' - * location, that this location is the given linear faulting address (cr2), and - * that this is one of the instruction's data operands. Instruction fetches and - * stack operations are assumed never to access emulated memory. The emulator - * automatically deduces which operand of a string-move operation is accessing - * emulated memory, and assumes that the other operand accesses normal memory. - * - * NOTES: - * 1. The emulator isn't very smart about emulated vs. standard memory. - * 'Emulated memory' access addresses should be checked for sanity. - * 'Normal memory' accesses may fault, and the caller must arrange to - * detect and handle reentrancy into the emulator via recursive faults. - * Accesses may be unaligned and may cross page boundaries. - * 2. If the access fails (cannot emulate, or a standard access faults) then - * it is up to the memop to propagate the fault to the guest VM via - * some out-of-band mechanism, unknown to the emulator. The memop signals - * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will - * then immediately bail. - * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only - * cmpxchg8b_emulated need support 8-byte accesses. - * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system. - */ -/* Access completed successfully: continue emulation as normal. */ -#define X86EMUL_CONTINUE 0 -/* Access is unhandleable: bail from emulation and return error to caller. */ -#define X86EMUL_UNHANDLEABLE 1 -/* Terminate emulation but return success to the caller. */ -#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ -#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ -#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ -struct x86_emulate_ops { - /* - * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. - * @addr: [IN ] Linear address from which to read. - * @val: [OUT] Value read from memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); - - /* - * read_emulated: Read bytes from emulated/special memory area. - * @addr: [IN ] Linear address from which to read. - * @val: [OUT] Value read from memory, zero-extended to 'u_long'. - * @bytes: [IN ] Number of bytes to read from memory. - */ - int (*read_emulated)(unsigned long addr, - void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - /* - * write_emulated: Read bytes from emulated/special memory area. - * @addr: [IN ] Linear address to which to write. - * @val: [IN ] Value to write to memory (low-order bytes used as - * required). - * @bytes: [IN ] Number of bytes to write to memory. - */ - int (*write_emulated)(unsigned long addr, - const void *val, - unsigned int bytes, - struct kvm_vcpu *vcpu); - - /* - * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an - * emulated/special memory area. - * @addr: [IN ] Linear address to access. - * @old: [IN ] Value expected to be current at @addr. - * @new: [IN ] Value to write to @addr. - * @bytes: [IN ] Number of bytes to access using CMPXCHG. - */ - int (*cmpxchg_emulated)(unsigned long addr, - const void *old, - const void *new, - unsigned int bytes, - struct kvm_vcpu *vcpu); - -}; - -/* Type, address-of, and value of an instruction's operand. */ -struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; - unsigned int bytes; - unsigned long val, orig_val, *ptr; -}; - -struct fetch_cache { - u8 data[15]; - unsigned long start; - unsigned long end; -}; - -struct decode_cache { - u8 twobyte; - u8 b; - u8 lock_prefix; - u8 rep_prefix; - u8 op_bytes; - u8 ad_bytes; - u8 rex_prefix; - struct operand src; - struct operand src2; - struct operand dst; - bool has_seg_override; - u8 seg_override; - unsigned int d; - unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; - /* modrm */ - u8 modrm; - u8 modrm_mod; - u8 modrm_reg; - u8 modrm_rm; - u8 use_modrm_ea; - bool rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; - struct fetch_cache fetch; -}; - -#define X86_SHADOW_INT_MOV_SS 1 -#define X86_SHADOW_INT_STI 2 - -struct x86_emulate_ctxt { - /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - - unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ - int mode; - u32 cs_base; - - /* interruptibility state, as a result of execution of STI or MOV SS */ - int interruptibility; - - /* decode cache */ - struct decode_cache decode; -}; - -/* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 - -/* Execution mode, passed to the emulator. */ -#define X86EMUL_MODE_REAL 0 /* Real mode. */ -#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ -#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ -#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ - -/* Host execution mode. */ -#if defined(CONFIG_X86_32) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 -#elif defined(CONFIG_X86_64) -#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 -#endif - -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); - -#endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index afaaa7627d9..0e7fe78d0f7 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,7 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) -kvm-y += x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o timer.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c new file mode 100644 index 00000000000..2eb807a7b62 --- /dev/null +++ b/arch/x86/kvm/emulate.c @@ -0,0 +1,2392 @@ +/****************************************************************************** + * emulate.c + * + * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. + * + * Copyright (c) 2005 Keir Fraser + * + * Linux coding style, mod r/m decoder, segment base fixes, real-mode + * privileged instructions: + * + * Copyright (C) 2006 Qumranet + * + * Avi Kivity + * Yaniv Kamay + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 + */ + +#ifndef __KERNEL__ +#include +#include +#include +#define DPRINTF(_f, _a ...) printf(_f , ## _a) +#else +#include +#include "kvm_cache_regs.h" +#define DPRINTF(x...) do {} while (0) +#endif +#include +#include + +#include "mmu.h" /* for is_long_mode() */ + +/* + * Opcode effective-address decode tables. + * Note that we only emulate instructions that have at least one memory + * operand (excluding implicit stack references). We assume that stack + * references and instruction fetches will never occur in special memory + * areas that require emulation. So, for example, 'mov ,' need + * not be handled. + */ + +/* Operand sizes: 8-bit operands or specified/overridden size. */ +#define ByteOp (1<<0) /* 8-bit operands. */ +/* Destination operand type. */ +#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ +#define DstReg (2<<1) /* Register operand. */ +#define DstMem (3<<1) /* Memory operand. */ +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstMask (7<<1) +/* Source operand type. */ +#define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcOne (7<<4) /* Implied '1' */ +#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ +#define SrcImmU (9<<4) /* Immediate operand, unsigned */ +#define SrcMask (0xf<<4) +/* Generic ModRM decode. */ +#define ModRM (1<<8) +/* Destination is only written; never read. */ +#define Mov (1<<9) +#define BitOp (1<<10) +#define MemAbs (1<<11) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ +#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ +#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Source 2 operand type */ +#define Src2None (0<<29) +#define Src2CL (1<<29) +#define Src2ImmByte (2<<29) +#define Src2One (3<<29) +#define Src2Imm16 (4<<29) +#define Src2Mask (7<<29) + +enum { + Group1_80, Group1_81, Group1_82, Group1_83, + Group1A, Group3_Byte, Group3, Group4, Group5, Group7, +}; + +static u32 opcode_table[256] = { + /* 0x00 - 0x07 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + /* 0x08 - 0x0F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x10 - 0x17 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x18 - 0x1F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x20 - 0x27 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, + /* 0x28 - 0x2F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x30 - 0x37 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + 0, 0, 0, 0, + /* 0x38 - 0x3F */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, + /* 0x40 - 0x47 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x48 - 0x4F */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x50 - 0x57 */ + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, + /* 0x58 - 0x5F */ + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, + /* 0x60 - 0x67 */ + 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , + 0, 0, 0, 0, + /* 0x68 - 0x6F */ + SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ + /* 0x70 - 0x77 */ + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x78 - 0x7F */ + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, + /* 0x80 - 0x87 */ + Group | Group1_80, Group | Group1_81, + Group | Group1_82, Group | Group1_83, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + /* 0x88 - 0x8F */ + ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, + ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, + DstReg | SrcMem | ModRM | Mov, Group | Group1A, + /* 0x90 - 0x97 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x98 - 0x9F */ + 0, 0, SrcImm | Src2Imm16, 0, + ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, + /* 0xA0 - 0xA7 */ + ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, + ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xA8 - 0xAF */ + 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, + ByteOp | ImplicitOps | String, ImplicitOps | String, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + /* 0xC0 - 0xC7 */ + ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + 0, ImplicitOps | Stack, 0, 0, + ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, + /* 0xC8 - 0xCF */ + 0, 0, 0, ImplicitOps | Stack, + ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, + /* 0xD0 - 0xD7 */ + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, + 0, 0, 0, 0, + /* 0xD8 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xE7 */ + 0, 0, 0, 0, + ByteOp | SrcImmUByte, SrcImmUByte, + ByteOp | SrcImmUByte, SrcImmUByte, + /* 0xE8 - 0xEF */ + SrcImm | Stack, SrcImm | ImplicitOps, + SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + /* 0xF0 - 0xF7 */ + 0, 0, 0, 0, + ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + /* 0xF8 - 0xFF */ + ImplicitOps, 0, ImplicitOps, ImplicitOps, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, +}; + +static u32 twobyte_table[256] = { + /* 0x00 - 0x0F */ + 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + /* 0x10 - 0x1F */ + 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, + /* 0x20 - 0x2F */ + ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x30 - 0x3F */ + ImplicitOps, 0, ImplicitOps, 0, + ImplicitOps, ImplicitOps, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x40 - 0x47 */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x48 - 0x4F */ + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, + /* 0x50 - 0x5F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x60 - 0x6F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x70 - 0x7F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x80 - 0x8F */ + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, + /* 0x90 - 0x9F */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xA0 - 0xA7 */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, 0, 0, + /* 0xA8 - 0xAF */ + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, + ModRM, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, + DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xB8 - 0xBF */ + 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, + DstReg | SrcMem16 | ModRM | Mov, + /* 0xC0 - 0xCF */ + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xD0 - 0xDF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xE0 - 0xEF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + /* 0xF0 - 0xFF */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static u32 group_table[] = { + [Group1_80*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_81*8] = + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + [Group1_82*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_83*8] = + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + [Group1A*8] = + DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + [Group3_Byte*8] = + ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group3*8] = + DstMem | SrcImm | ModRM, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group4*8] = + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, 0, 0, + [Group5*8] = + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, + [Group7*8] = + 0, 0, ModRM | SrcMem, ModRM | SrcMem, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, +}; + +static u32 group2_table[] = { + [Group7*8] = + SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, 0, +}; + +/* EFLAGS bit definitions. */ +#define EFLG_VM (1<<17) +#define EFLG_RF (1<<16) +#define EFLG_OF (1<<11) +#define EFLG_DF (1<<10) +#define EFLG_IF (1<<9) +#define EFLG_SF (1<<7) +#define EFLG_ZF (1<<6) +#define EFLG_AF (1<<4) +#define EFLG_PF (1<<2) +#define EFLG_CF (1<<0) + +/* + * Instruction emulation: + * Most instructions are emulated directly via a fragment of inline assembly + * code. This allows us to save/restore EFLAGS and thus very easily pick up + * any modified flags. + */ + +#if defined(CONFIG_X86_64) +#define _LO32 "k" /* force 32-bit operand */ +#define _STK "%%rsp" /* stack pointer */ +#elif defined(__i386__) +#define _LO32 "" /* force 32-bit operand */ +#define _STK "%%esp" /* stack pointer */ +#endif + +/* + * These EFLAGS bits are restored from saved value during emulation, and + * any changes are written back to the saved value after emulation. + */ +#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) + +/* Before executing instruction: restore necessary bits in EFLAGS. */ +#define _PRE_EFLAGS(_sav, _msk, _tmp) \ + /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ + "movl %"_sav",%"_LO32 _tmp"; " \ + "push %"_tmp"; " \ + "push %"_tmp"; " \ + "movl %"_msk",%"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "pushf; " \ + "notl %"_LO32 _tmp"; " \ + "andl %"_LO32 _tmp",("_STK"); " \ + "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ + "pop %"_tmp"; " \ + "orl %"_LO32 _tmp",("_STK"); " \ + "popf; " \ + "pop %"_sav"; " + +/* After executing instruction: write-back necessary bits in EFLAGS. */ +#define _POST_EFLAGS(_sav, _msk, _tmp) \ + /* _sav |= EFLAGS & _msk; */ \ + "pushf; " \ + "pop %"_tmp"; " \ + "andl %"_msk",%"_LO32 _tmp"; " \ + "orl %"_LO32 _tmp",%"_sav"; " + +#ifdef CONFIG_X86_64 +#define ON64(x) x +#else +#define ON64(x) +#endif + +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _y ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + + +/* Raw emulation: instruction has two explicit operands. */ +#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + break; \ + case 4: \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + break; \ + case 8: \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + break; \ + } \ + } while (0) + +#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ + do { \ + unsigned long _tmp; \ + switch ((_dst).bytes) { \ + case 1: \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ + break; \ + default: \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + _wx, _wy, _lx, _ly, _qx, _qy); \ + break; \ + } \ + } while (0) + +/* Source operand is byte-sized and may be restricted to just %cl. */ +#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "c", "b", "c", "b", "c", "b", "c") + +/* Source operand is byte, word, long or quad sized. */ +#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ + __emulate_2op(_op, _src, _dst, _eflags, \ + "b", "q", "w", "r", _LO32, "r", "", "r") + +/* Source operand is word, long or quad sized. */ +#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ + __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + "w", "r", _LO32, "r", "", "r") + +/* Instruction has three operands and one operand is stored in ECX register */ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ + } while (0) + +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ + } while (0) + +#define __emulate_1op(_op, _dst, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op _suffix " %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "+m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ + } while (0) + +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ + case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ + case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + } \ + } while (0) + +/* Fetch next part of the instruction being emulated. */ +#define insn_fetch(_type, _size, _eip) \ +({ unsigned long _x; \ + rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ + if (rc != 0) \ + goto done; \ + (_eip) += (_size); \ + (_type)_x; \ +}) + +static inline unsigned long ad_mask(struct decode_cache *c) +{ + return (1UL << (c->ad_bytes << 3)) - 1; +} + +/* Access/update address held in a register, based on addressing mode. */ +static inline unsigned long +address_mask(struct decode_cache *c, unsigned long reg) +{ + if (c->ad_bytes == sizeof(unsigned long)) + return reg; + else + return reg & ad_mask(c); +} + +static inline unsigned long +register_address(struct decode_cache *c, unsigned long base, unsigned long reg) +{ + return base + address_mask(c, reg); +} + +static inline void +register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) +{ + if (c->ad_bytes == sizeof(unsigned long)) + *reg += inc; + else + *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); +} + +static inline void jmp_rel(struct decode_cache *c, int rel) +{ + register_address_increment(c, &c->eip, rel); +} + +static void set_seg_override(struct decode_cache *c, int seg) +{ + c->has_seg_override = true; + c->seg_override = seg; +} + +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +{ + if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) + return 0; + + return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); +} + +static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct decode_cache *c) +{ + if (!c->has_seg_override) + return 0; + + return seg_base(ctxt, c->seg_override); +} + +static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_ES); +} + +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_SS); +} + +static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long linear, u8 *dest) +{ + struct fetch_cache *fc = &ctxt->decode.fetch; + int rc; + int size; + + if (linear < fc->start || linear >= fc->end) { + size = min(15UL, PAGE_SIZE - offset_in_page(linear)); + rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); + if (rc) + return rc; + fc->start = linear; + fc->end = linear + size; + } + *dest = fc->data[linear - fc->start]; + return 0; +} + +static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long eip, void *dest, unsigned size) +{ + int rc = 0; + + eip += ctxt->cs_base; + while (size--) { + rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); + if (rc) + return rc; + } + return 0; +} + +/* + * Given the 'reg' portion of a ModRM byte, and a register block, return a + * pointer into the block that addresses the relevant register. + * @highbyte_regs specifies whether to decode AH,CH,DH,BH. + */ +static void *decode_register(u8 modrm_reg, unsigned long *regs, + int highbyte_regs) +{ + void *p; + + p = ®s[modrm_reg]; + if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) + p = (unsigned char *)®s[modrm_reg & 3] + 1; + return p; +} + +static int read_descriptor(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *ptr, + u16 *size, unsigned long *address, int op_bytes) +{ + int rc; + + if (op_bytes == 2) + op_bytes = 3; + *address = 0; + rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, + ctxt->vcpu); + if (rc) + return rc; + rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, + ctxt->vcpu); + return rc; +} + +static int test_cc(unsigned int condition, unsigned int flags) +{ + int rc = 0; + + switch ((condition & 15) >> 1) { + case 0: /* o */ + rc |= (flags & EFLG_OF); + break; + case 1: /* b/c/nae */ + rc |= (flags & EFLG_CF); + break; + case 2: /* z/e */ + rc |= (flags & EFLG_ZF); + break; + case 3: /* be/na */ + rc |= (flags & (EFLG_CF|EFLG_ZF)); + break; + case 4: /* s */ + rc |= (flags & EFLG_SF); + break; + case 5: /* p/pe */ + rc |= (flags & EFLG_PF); + break; + case 7: /* le/ng */ + rc |= (flags & EFLG_ZF); + /* fall through */ + case 6: /* l/nge */ + rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); + break; + } + + /* Odd condition identifiers (lsb == 1) have inverted sense. */ + return (!!rc ^ (condition & 1)); +} + +static void decode_register_operand(struct operand *op, + struct decode_cache *c, + int inhibit_bytereg) +{ + unsigned reg = c->modrm_reg; + int highbyte_regs = c->rex_prefix == 0; + + if (!(c->d & ModRM)) + reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + op->type = OP_REG; + if ((c->d & ByteOp) && !inhibit_bytereg) { + op->ptr = decode_register(reg, c->regs, highbyte_regs); + op->val = *(u8 *)op->ptr; + op->bytes = 1; + } else { + op->ptr = decode_register(reg, c->regs, 0); + op->bytes = c->op_bytes; + switch (op->bytes) { + case 2: + op->val = *(u16 *)op->ptr; + break; + case 4: + op->val = *(u32 *)op->ptr; + break; + case 8: + op->val = *(u64 *) op->ptr; + break; + } + } + op->orig_val = op->val; +} + +static int decode_modrm(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + u8 sib; + int index_reg = 0, base_reg = 0, scale; + int rc = 0; + + if (c->rex_prefix) { + c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ + index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ + c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ + } + + c->modrm = insn_fetch(u8, 1, c->eip); + c->modrm_mod |= (c->modrm & 0xc0) >> 6; + c->modrm_reg |= (c->modrm & 0x38) >> 3; + c->modrm_rm |= (c->modrm & 0x07); + c->modrm_ea = 0; + c->use_modrm_ea = 1; + + if (c->modrm_mod == 3) { + c->modrm_ptr = decode_register(c->modrm_rm, + c->regs, c->d & ByteOp); + c->modrm_val = *(unsigned long *)c->modrm_ptr; + return rc; + } + + if (c->ad_bytes == 2) { + unsigned bx = c->regs[VCPU_REGS_RBX]; + unsigned bp = c->regs[VCPU_REGS_RBP]; + unsigned si = c->regs[VCPU_REGS_RSI]; + unsigned di = c->regs[VCPU_REGS_RDI]; + + /* 16-bit ModR/M decode. */ + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 6) + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(u16, 2, c->eip); + break; + } + switch (c->modrm_rm) { + case 0: + c->modrm_ea += bx + si; + break; + case 1: + c->modrm_ea += bx + di; + break; + case 2: + c->modrm_ea += bp + si; + break; + case 3: + c->modrm_ea += bp + di; + break; + case 4: + c->modrm_ea += si; + break; + case 5: + c->modrm_ea += di; + break; + case 6: + if (c->modrm_mod != 0) + c->modrm_ea += bp; + break; + case 7: + c->modrm_ea += bx; + break; + } + if (c->modrm_rm == 2 || c->modrm_rm == 3 || + (c->modrm_rm == 6 && c->modrm_mod != 0)) + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_SS); + c->modrm_ea = (u16)c->modrm_ea; + } else { + /* 32/64-bit ModR/M decode. */ + if ((c->modrm_rm & 7) == 4) { + sib = insn_fetch(u8, 1, c->eip); + index_reg |= (sib >> 3) & 7; + base_reg |= sib & 7; + scale = sib >> 6; + + if ((base_reg & 7) == 5 && c->modrm_mod == 0) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + else + c->modrm_ea += c->regs[base_reg]; + if (index_reg != 4) + c->modrm_ea += c->regs[index_reg] << scale; + } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { + if (ctxt->mode == X86EMUL_MODE_PROT64) + c->rip_relative = 1; + } else + c->modrm_ea += c->regs[c->modrm_rm]; + switch (c->modrm_mod) { + case 0: + if (c->modrm_rm == 5) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + case 1: + c->modrm_ea += insn_fetch(s8, 1, c->eip); + break; + case 2: + c->modrm_ea += insn_fetch(s32, 4, c->eip); + break; + } + } +done: + return rc; +} + +static int decode_abs(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->ad_bytes) { + case 2: + c->modrm_ea = insn_fetch(u16, 2, c->eip); + break; + case 4: + c->modrm_ea = insn_fetch(u32, 4, c->eip); + break; + case 8: + c->modrm_ea = insn_fetch(u64, 8, c->eip); + break; + } +done: + return rc; +} + +int +x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, group; + + /* Shadow copy of register state. Committed on successful emulation. */ + + memset(c, 0, sizeof(struct decode_cache)); + c->eip = kvm_rip_read(ctxt->vcpu); + ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. */ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix) + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + c->d = opcode_table[c->b]; + if (c->d == 0) { + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + c->d = twobyte_table[c->b]; + } + } + + if (c->d & Group) { + group = c->d & GroupMask; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + group = (group << 3) + ((c->modrm >> 3) & 7); + if ((c->d & GroupDual) && (c->modrm >> 6) == 3) + c->d = group2_table[group]; + else + c->d = group_table[group]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) + rc = decode_modrm(ctxt, ops); + else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops); + if (rc) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, c); + + if (c->ad_bytes != 8) + c->modrm_ea = (u32)c->modrm_ea; + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + c->src.bytes = 2; + goto srcmem_common; + case SrcMem32: + c->src.bytes = 4; + goto srcmem_common; + case SrcMem: + c->src.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + /* Don't fetch the address for invlpg: it could be unmapped. */ + if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) + break; + srcmem_common: + /* + * For instructions with a ModR/M byte, switch to register + * access if Mod = 3. + */ + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->src.type = OP_REG; + c->src.val = c->modrm_val; + c->src.ptr = c->modrm_ptr; + break; + } + c->src.type = OP_MEM; + break; + case SrcImm: + case SrcImmU: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->src.bytes == 8) + c->src.bytes = 4; + /* NB. Immediates are sign-extended as necessary. */ + switch (c->src.bytes) { + case 1: + c->src.val = insn_fetch(s8, 1, c->eip); + break; + case 2: + c->src.val = insn_fetch(s16, 2, c->eip); + break; + case 4: + c->src.val = insn_fetch(s32, 4, c->eip); + break; + } + if ((c->d & SrcMask) == SrcImmU) { + switch (c->src.bytes) { + case 1: + c->src.val &= 0xff; + break; + case 2: + c->src.val &= 0xffff; + break; + case 4: + c->src.val &= 0xffffffff; + break; + } + } + break; + case SrcImmByte: + case SrcImmUByte: + c->src.type = OP_IMM; + c->src.ptr = (unsigned long *)c->eip; + c->src.bytes = 1; + if ((c->d & SrcMask) == SrcImmByte) + c->src.val = insn_fetch(s8, 1, c->eip); + else + c->src.val = insn_fetch(u8, 1, c->eip); + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2Imm16: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 2; + c->src2.val = insn_fetch(u16, 2, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + } + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + return 0; + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstMem: + if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.type = OP_REG; + c->dst.val = c->dst.orig_val = c->modrm_val; + c->dst.ptr = c->modrm_ptr; + break; + } + c->dst.type = OP_MEM; + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; + } + + if (c->rip_relative) + c->modrm_ea += c->eip; + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static inline void emulate_push(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_MEM; + c->dst.bytes = c->op_bytes; + c->dst.val = c->src.val; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]); +} + +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ops->read_emulated(register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + dest, len, ctxt->vcpu); + if (rc != 0) + return rc; + + register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); + return rc; +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + if (rc != 0) + return rc; + return 0; +} + +static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + switch (c->modrm_reg) { + case 0: /* rol */ + emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); + break; + case 1: /* ror */ + emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); + break; + case 2: /* rcl */ + emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); + break; + case 3: /* rcr */ + emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); + break; + case 4: /* sal/shl */ + case 6: /* sal/shl */ + emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); + break; + case 5: /* shr */ + emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); + break; + case 7: /* sar */ + emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); + break; + } +} + +static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = 0; + + switch (c->modrm_reg) { + case 0 ... 1: /* test */ + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 2: /* not */ + c->dst.val = ~c->dst.val; + break; + case 3: /* neg */ + emulate_1op("neg", c->dst, ctxt->eflags); + break; + default: + DPRINTF("Cannot emulate %02x\n", c->b); + rc = X86EMUL_UNHANDLEABLE; + break; + } + return rc; +} + +static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + + switch (c->modrm_reg) { + case 0: /* inc */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 1: /* dec */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt); + break; + } + case 4: /* jmp abs */ + c->eip = c->src.val; + break; + case 6: /* push */ + emulate_push(ctxt); + break; + } + return 0; +} + +static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + unsigned long memop) +{ + struct decode_cache *c = &ctxt->decode; + u64 old, new; + int rc; + + rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); + if (rc != 0) + return rc; + + if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || + ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { + + c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); + c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); + ctxt->eflags &= ~EFLG_ZF; + + } else { + new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | + (u32) c->regs[VCPU_REGS_RBX]; + + rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); + if (rc != 0) + return rc; + ctxt->eflags |= EFLG_ZF; + } + return 0; +} + +static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned long cs; + + rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + if (rc) + return rc; + if (c->op_bytes == 4) + c->eip = (u32)c->eip; + rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + if (rc) + return rc; + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + return rc; +} + +static inline int writeback(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + int rc; + struct decode_cache *c = &ctxt->decode; + + switch (c->dst.type) { + case OP_REG: + /* The 4-byte case *is* correct: + * in 64-bit mode we zero-extend. + */ + switch (c->dst.bytes) { + case 1: + *(u8 *)c->dst.ptr = (u8)c->dst.val; + break; + case 2: + *(u16 *)c->dst.ptr = (u16)c->dst.val; + break; + case 4: + *c->dst.ptr = (u32)c->dst.val; + break; /* 64b: zero-ext */ + case 8: + *c->dst.ptr = c->dst.val; + break; + } + break; + case OP_MEM: + if (c->lock_prefix) + rc = ops->cmpxchg_emulated( + (unsigned long)c->dst.ptr, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + else + rc = ops->write_emulated( + (unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu); + if (rc != 0) + return rc; + break; + case OP_NONE: + /* no writeback */ + break; + default: + break; + } + return 0; +} + +static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) +{ + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); + /* + * an sti; sti; sequence only disable interrupts for the first + * instruction. So, if the last instruction, be it emulated or + * not, left the system with the INT_STI flag enabled, it + * means that the last instruction is an sti. We should not + * leave the flag on in this case. The same goes for mov ss + */ + if (!(int_shadow & mask)) + ctxt->interruptibility = mask; +} + +static inline void +setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, + struct kvm_segment *cs, struct kvm_segment *ss) +{ + memset(cs, 0, sizeof(struct kvm_segment)); + kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); + memset(ss, 0, sizeof(struct kvm_segment)); + + cs->l = 0; /* will be adjusted later */ + cs->base = 0; /* flat segment */ + cs->g = 1; /* 4kb granularity */ + cs->limit = 0xffffffff; /* 4GB limit */ + cs->type = 0x0b; /* Read, Execute, Accessed */ + cs->s = 1; + cs->dpl = 0; /* will be adjusted later */ + cs->present = 1; + cs->db = 1; + + ss->unusable = 0; + ss->base = 0; /* flat segment */ + ss->limit = 0xffffffff; /* 4GB limit */ + ss->g = 1; /* 4kb granularity */ + ss->s = 1; + ss->type = 0x03; /* Read/Write, Accessed */ + ss->db = 1; /* 32bit stack segment */ + ss->dpl = 0; + ss->present = 1; +} + +static int +emulate_syscall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* syscall is not available in real mode */ + if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + msr_data >>= 32; + cs.selector = (u16)(msr_data & 0xfffc); + ss.selector = (u16)(msr_data + 8); + + if (is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->regs[VCPU_REGS_RCX] = c->eip; + if (is_long_mode(ctxt->vcpu)) { +#ifdef CONFIG_X86_64 + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; + + kvm_x86_ops->get_msr(ctxt->vcpu, + ctxt->mode == X86EMUL_MODE_PROT64 ? + MSR_LSTAR : MSR_CSTAR, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ctxt->eflags &= ~(msr_data | EFLG_RF); +#endif + } else { + /* legacy mode */ + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + c->eip = (u32)msr_data; + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + } + + return 0; +} + +static int +emulate_sysenter(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL || + !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* XXX sysenter/sysexit have not been tested in 64bit mode. + * Therefore, we inject an #UD. + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return -1; + + setup_syscalls_segments(ctxt, &cs, &ss); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (ctxt->mode) { + case X86EMUL_MODE_PROT32: + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + case X86EMUL_MODE_PROT64: + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + break; + } + + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + cs.selector = (u16)msr_data; + cs.selector &= ~SELECTOR_RPL_MASK; + ss.selector = cs.selector + 8; + ss.selector &= ~SELECTOR_RPL_MASK; + if (ctxt->mode == X86EMUL_MODE_PROT64 + || is_long_mode(ctxt->vcpu)) { + cs.db = 0; + cs.l = 1; + } + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + c->eip = msr_data; + + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + c->regs[VCPU_REGS_RSP] = msr_data; + + return 0; +} + +static int +emulate_sysexit(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment cs, ss; + u64 msr_data; + int usermode; + + /* inject #UD if LOCK prefix is used */ + if (c->lock_prefix) + return -1; + + /* inject #GP if in real mode or paging is disabled */ + if (ctxt->mode == X86EMUL_MODE_REAL + || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + /* sysexit must be called from CPL 0 */ + if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + + setup_syscalls_segments(ctxt, &cs, &ss); + + if ((c->rex_prefix & 0x8) != 0x0) + usermode = X86EMUL_MODE_PROT64; + else + usermode = X86EMUL_MODE_PROT32; + + cs.dpl = 3; + ss.dpl = 3; + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + switch (usermode) { + case X86EMUL_MODE_PROT32: + cs.selector = (u16)(msr_data + 16); + if ((msr_data & 0xfffc) == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = (u16)(msr_data + 24); + break; + case X86EMUL_MODE_PROT64: + cs.selector = (u16)(msr_data + 32); + if (msr_data == 0x0) { + kvm_inject_gp(ctxt->vcpu, 0); + return -1; + } + ss.selector = cs.selector + 8; + cs.db = 0; + cs.l = 1; + break; + } + cs.selector |= SELECTOR_RPL_MASK; + ss.selector |= SELECTOR_RPL_MASK; + + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); + + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; + + return 0; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +{ + unsigned long memop = 0; + u64 msr_data; + unsigned long saved_eip = 0; + struct decode_cache *c = &ctxt->decode; + unsigned int port; + int io_dir_in; + int rc = 0; + + ctxt->interruptibility = 0; + + /* Shadow copy of register state. Committed on successful emulation. + * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't + * modify them. + */ + + memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + saved_eip = c->eip; + + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) + memop = c->modrm_ea; + + if (c->rep_prefix && (c->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (c->regs[VCPU_REGS_RCX] == 0) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if ((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) { + if ((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + if ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { + kvm_rip_write(ctxt->vcpu, c->eip); + goto done; + } + } + c->regs[VCPU_REGS_RCX]--; + c->eip = kvm_rip_read(ctxt->vcpu); + } + + if (c->src.type == OP_MEM) { + c->src.ptr = (unsigned long *)memop; + c->src.val = 0; + rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu); + if (rc != 0) + goto done; + c->src.orig_val = c->src.val; + } + + if ((c->d & DstMask) == ImplicitOps) + goto special_insn; + + + if (c->dst.type == OP_MEM) { + c->dst.ptr = (unsigned long *)memop; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.val = 0; + if (c->d & BitOp) { + unsigned long mask = ~(c->dst.bytes * 8 - 1); + + c->dst.ptr = (void *)c->dst.ptr + + (c->src.val & mask) / 8; + } + if (!(c->d & Mov) && + /* optimisation - avoid slow emulated read */ + ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0)) + goto done; + } + c->dst.orig_val = c->dst.val; + +special_insn: + + if (c->twobyte) + goto twobyte_insn; + + switch (c->b) { + case 0x00 ... 0x05: + add: /* add */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + break; + case 0x08 ... 0x0d: + or: /* or */ + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + break; + case 0x10 ... 0x15: + adc: /* adc */ + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + break; + case 0x18 ... 0x1d: + sbb: /* sbb */ + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + break; + case 0x20 ... 0x25: + and: /* and */ + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + break; + case 0x28 ... 0x2d: + sub: /* sub */ + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + break; + case 0x30 ... 0x35: + xor: /* xor */ + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + break; + case 0x38 ... 0x3d: + cmp: /* cmp */ + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + break; + case 0x40 ... 0x47: /* inc r16/r32 */ + emulate_1op("inc", c->dst, ctxt->eflags); + break; + case 0x48 ... 0x4f: /* dec r16/r32 */ + emulate_1op("dec", c->dst, ctxt->eflags); + break; + case 0x50 ... 0x57: /* push reg */ + emulate_push(ctxt); + break; + case 0x58 ... 0x5f: /* pop reg */ + pop_instruction: + rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != 0) + goto done; + break; + case 0x63: /* movsxd */ + if (ctxt->mode != X86EMUL_MODE_PROT64) + goto cannot_emulate; + c->dst.val = (s32) c->src.val; + break; + case 0x68: /* push imm */ + case 0x6a: /* push imm8 */ + emulate_push(ctxt); + break; + case 0x6c: /* insb */ + case 0x6d: /* insw/insd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 1, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c, es_base(ctxt), + c->regs[VCPU_REGS_RDI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x6e: /* outsb */ + case 0x6f: /* outsw/outsd */ + if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + 0, + (c->d & ByteOp) ? 1 : c->op_bytes, + c->rep_prefix ? + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, + (ctxt->eflags & EFLG_DF), + register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + c->rep_prefix, + c->regs[VCPU_REGS_RDX]) == 0) { + c->eip = saved_eip; + return -1; + } + return 0; + case 0x70 ... 0x7f: /* jcc (short) */ + if (test_cc(c->b, ctxt->eflags)) + jmp_rel(c, c->src.val); + break; + case 0x80 ... 0x83: /* Grp1 */ + switch (c->modrm_reg) { + case 0: + goto add; + case 1: + goto or; + case 2: + goto adc; + case 3: + goto sbb; + case 4: + goto and; + case 5: + goto sub; + case 6: + goto xor; + case 7: + goto cmp; + } + break; + case 0x84 ... 0x85: + emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + break; + case 0x86 ... 0x87: /* xchg */ + xchg: + /* Write back the register source. */ + switch (c->dst.bytes) { + case 1: + *(u8 *) c->src.ptr = (u8) c->dst.val; + break; + case 2: + *(u16 *) c->src.ptr = (u16) c->dst.val; + break; + case 4: + *c->src.ptr = (u32) c->dst.val; + break; /* 64b reg: zero-extend */ + case 8: + *c->src.ptr = c->dst.val; + break; + } + /* + * Write back the memory destination with implicit LOCK + * prefix. + */ + c->dst.val = c->src.val; + c->lock_prefix = 1; + break; + case 0x88 ... 0x8b: /* mov */ + goto mov; + case 0x8c: { /* mov r/m, sreg */ + struct kvm_segment segreg; + + if (c->modrm_reg <= 5) + kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); + else { + printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + c->dst.val = segreg.selector; + break; + } + case 0x8d: /* lea r16/r32, m */ + c->dst.val = c->modrm_ea; + break; + case 0x8e: { /* mov seg, r/m16 */ + uint16_t sel; + int type_bits; + int err; + + sel = c->src.val; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + if (c->modrm_reg <= 5) { + type_bits = (c->modrm_reg == 1) ? 9 : 1; + err = kvm_load_segment_descriptor(ctxt->vcpu, sel, + type_bits, c->modrm_reg); + } else { + printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + + if (err < 0) + goto cannot_emulate; + + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + } + case 0x8f: /* pop (sole member of Grp1a) */ + rc = emulate_grp1a(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0x90: /* nop / xchg r8,rax */ + if (!(c->rex_prefix & 1)) { /* nop */ + c->dst.type = OP_NONE; + break; + } + case 0x91 ... 0x97: /* xchg reg,rax */ + c->src.type = c->dst.type = OP_REG; + c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.ptr); + goto xchg; + case 0x9c: /* pushf */ + c->src.val = (unsigned long) ctxt->eflags; + emulate_push(ctxt); + break; + case 0x9d: /* popf */ + c->dst.type = OP_REG; + c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.bytes = c->op_bytes; + goto pop_instruction; + case 0xa0 ... 0xa1: /* mov */ + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.val = c->src.val; + break; + case 0xa2 ... 0xa3: /* mov */ + c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; + break; + case 0xa4 ... 0xa5: /* movs */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, ctxt->vcpu)) != 0) + goto done; + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xa6 ... 0xa7: /* cmps */ + c->src.type = OP_NONE; /* Disable writeback. */ + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.ptr = (unsigned long *)register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]); + if ((rc = ops->read_emulated((unsigned long)c->src.ptr, + &c->src.val, + c->src.bytes, + ctxt->vcpu)) != 0) + goto done; + + c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->src.bytes + : c->src.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + + break; + case 0xaa ... 0xab: /* stos */ + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)register_address(c, + es_base(ctxt), + c->regs[VCPU_REGS_RDI]); + c->dst.val = c->regs[VCPU_REGS_RAX]; + register_address_increment(c, &c->regs[VCPU_REGS_RDI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xac ... 0xad: /* lods */ + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + if ((rc = ops->read_emulated(register_address(c, + seg_override_base(ctxt, c), + c->regs[VCPU_REGS_RSI]), + &c->dst.val, + c->dst.bytes, + ctxt->vcpu)) != 0) + goto done; + register_address_increment(c, &c->regs[VCPU_REGS_RSI], + (ctxt->eflags & EFLG_DF) ? -c->dst.bytes + : c->dst.bytes); + break; + case 0xae ... 0xaf: /* scas */ + DPRINTF("Urk! I don't handle SCAS.\n"); + goto cannot_emulate; + case 0xb0 ... 0xbf: /* mov r, imm */ + goto mov; + case 0xc0 ... 0xc1: + emulate_grp2(ctxt); + break; + case 0xc3: /* ret */ + c->dst.type = OP_REG; + c->dst.ptr = &c->eip; + c->dst.bytes = c->op_bytes; + goto pop_instruction; + case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ + mov: + c->dst.val = c->src.val; + break; + case 0xcb: /* ret far */ + rc = emulate_ret_far(ctxt, ops); + if (rc) + goto done; + break; + case 0xd0 ... 0xd1: /* Grp2 */ + c->src.val = 1; + emulate_grp2(ctxt); + break; + case 0xd2 ... 0xd3: /* Grp2 */ + c->src.val = c->regs[VCPU_REGS_RCX]; + emulate_grp2(ctxt); + break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + port = c->src.val; + io_dir_in = 1; + goto do_io; + case 0xe6: /* outb */ + case 0xe7: /* out */ + port = c->src.val; + io_dir_in = 0; + goto do_io; + case 0xe8: /* call (near) */ { + long int rel = c->src.val; + c->src.val = (unsigned long) c->eip; + jmp_rel(c, rel); + emulate_push(ctxt); + break; + } + case 0xe9: /* jmp rel */ + goto jmp; + case 0xea: /* jmp far */ + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, + VCPU_SREG_CS) < 0) { + DPRINTF("jmp far: Failed to load CS descriptor\n"); + goto cannot_emulate; + } + + c->eip = c->src.val; + break; + case 0xeb: + jmp: /* jmp rel short */ + jmp_rel(c, c->src.val); + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; + } + break; + case 0xf4: /* hlt */ + ctxt->vcpu->arch.halt_request = 1; + break; + case 0xf5: /* cmc */ + /* complement carry flag from eflags reg */ + ctxt->eflags ^= EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf6 ... 0xf7: /* Grp3 */ + rc = emulate_grp3(ctxt, ops); + if (rc != 0) + goto done; + break; + case 0xf8: /* clc */ + ctxt->eflags &= ~EFLG_CF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfa: /* cli */ + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfb: /* sti */ + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfe ... 0xff: /* Grp4/Grp5 */ + rc = emulate_grp45(ctxt, ops); + if (rc != 0) + goto done; + break; + } + +writeback: + rc = writeback(ctxt, ops); + if (rc != 0) + goto done; + + /* Commit shadow register state. */ + memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(ctxt->vcpu, c->eip); + +done: + if (rc == X86EMUL_UNHANDLEABLE) { + c->eip = saved_eip; + return -1; + } + return 0; + +twobyte_insn: + switch (c->b) { + case 0x01: /* lgdt, lidt, lmsw */ + switch (c->modrm_reg) { + u16 size; + unsigned long address; + + case 0: /* vmcall */ + if (c->modrm_mod != 3 || c->modrm_rm != 1) + goto cannot_emulate; + + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = kvm_rip_read(ctxt->vcpu); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 2: /* lgdt */ + rc = read_descriptor(ctxt, ops, c->src.ptr, + &size, &address, c->op_bytes); + if (rc) + goto done; + realmode_lgdt(ctxt->vcpu, size, address); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 3: /* lidt/vmmcall */ + if (c->modrm_mod == 3) { + switch (c->modrm_rm) { + case 1: + rc = kvm_fix_hypercall(ctxt->vcpu); + if (rc) + goto done; + break; + default: + goto cannot_emulate; + } + } else { + rc = read_descriptor(ctxt, ops, c->src.ptr, + &size, &address, + c->op_bytes); + if (rc) + goto done; + realmode_lidt(ctxt->vcpu, size, address); + } + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + case 4: /* smsw */ + c->dst.bytes = 2; + c->dst.val = realmode_get_cr(ctxt->vcpu, 0); + break; + case 6: /* lmsw */ + realmode_lmsw(ctxt->vcpu, (u16)c->src.val, + &ctxt->eflags); + c->dst.type = OP_NONE; + break; + case 7: /* invlpg*/ + emulate_invlpg(ctxt->vcpu, memop); + /* Disable writeback. */ + c->dst.type = OP_NONE; + break; + default: + goto cannot_emulate; + } + break; + case 0x05: /* syscall */ + if (emulate_syscall(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x06: + emulate_clts(ctxt->vcpu); + c->dst.type = OP_NONE; + break; + case 0x08: /* invd */ + case 0x09: /* wbinvd */ + case 0x0d: /* GrpP (prefetch) */ + case 0x18: /* Grp16 (prefetch/nop) */ + c->dst.type = OP_NONE; + break; + case 0x20: /* mov cr, reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + c->regs[c->modrm_rm] = + realmode_get_cr(ctxt->vcpu, c->modrm_reg); + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x21: /* mov from dr to reg */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x22: /* mov reg, cr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + realmode_set_cr(ctxt->vcpu, + c->modrm_reg, c->modrm_val, &ctxt->eflags); + c->dst.type = OP_NONE; + break; + case 0x23: /* mov from reg to dr */ + if (c->modrm_mod != 3) + goto cannot_emulate; + rc = emulator_set_dr(ctxt, c->modrm_reg, + c->regs[c->modrm_rm]); + if (rc) + goto cannot_emulate; + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x30: + /* wrmsr */ + msr_data = (u32)c->regs[VCPU_REGS_RAX] + | ((u64)c->regs[VCPU_REGS_RDX] << 32); + rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = kvm_rip_read(ctxt->vcpu); + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x32: + /* rdmsr */ + rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); + if (rc) { + kvm_inject_gp(ctxt->vcpu, 0); + c->eip = kvm_rip_read(ctxt->vcpu); + } else { + c->regs[VCPU_REGS_RAX] = (u32)msr_data; + c->regs[VCPU_REGS_RDX] = msr_data >> 32; + } + rc = X86EMUL_CONTINUE; + c->dst.type = OP_NONE; + break; + case 0x34: /* sysenter */ + if (emulate_sysenter(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x35: /* sysexit */ + if (emulate_sysexit(ctxt) == -1) + goto cannot_emulate; + else + goto writeback; + break; + case 0x40 ... 0x4f: /* cmov */ + c->dst.val = c->dst.orig_val = c->src.val; + if (!test_cc(c->b, ctxt->eflags)) + c->dst.type = OP_NONE; /* no writeback */ + break; + case 0x80 ... 0x8f: /* jnz rel, etc*/ + if (test_cc(c->b, ctxt->eflags)) + jmp_rel(c, c->src.val); + c->dst.type = OP_NONE; + break; + case 0xa3: + bt: /* bt */ + c->dst.type = OP_NONE; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); + break; + case 0xa4: /* shld imm8, r, r/m */ + case 0xa5: /* shld cl, r, r/m */ + emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + break; + case 0xab: + bts: /* bts */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); + break; + case 0xac: /* shrd imm8, r, r/m */ + case 0xad: /* shrd cl, r, r/m */ + emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + break; + case 0xae: /* clflush */ + break; + case 0xb0 ... 0xb1: /* cmpxchg */ + /* + * Save real source value, then compare EAX against + * destination. + */ + c->src.orig_val = c->src.val; + c->src.val = c->regs[VCPU_REGS_RAX]; + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + if (ctxt->eflags & EFLG_ZF) { + /* Success: write back to memory. */ + c->dst.val = c->src.orig_val; + } else { + /* Failure: write the value we saw to EAX. */ + c->dst.type = OP_REG; + c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + } + break; + case 0xb3: + btr: /* btr */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); + break; + case 0xb6 ... 0xb7: /* movzx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (u8) c->src.val + : (u16) c->src.val; + break; + case 0xba: /* Grp8 */ + switch (c->modrm_reg & 3) { + case 0: + goto bt; + case 1: + goto bts; + case 2: + goto btr; + case 3: + goto btc; + } + break; + case 0xbb: + btc: /* btc */ + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; + emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); + break; + case 0xbe ... 0xbf: /* movsx */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : + (s16) c->src.val; + break; + case 0xc3: /* movnti */ + c->dst.bytes = c->op_bytes; + c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : + (u64) c->src.val; + break; + case 0xc7: /* Grp9 (cmpxchg8b) */ + rc = emulate_grp9(ctxt, ops, memop); + if (rc != 0) + goto done; + c->dst.type = OP_NONE; + break; + } + goto writeback; + +cannot_emulate: + DPRINTF("Cannot emulate %02x\n", c->b); + c->eip = saved_eip; + return -1; +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1aa7e6d91d4..c0e942747b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,7 +2759,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; /* - * TODO: fix x86_emulate.c to use guest_read/write_register + * TODO: fix emulate.c to use guest_read/write_register * instead of direct ->regs accesses, can save hundred cycles * on Intel for instructions that don't read/change RSP, for * for example. diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c deleted file mode 100644 index c6663d46f32..00000000000 --- a/arch/x86/kvm/x86_emulate.c +++ /dev/null @@ -1,2392 +0,0 @@ -/****************************************************************************** - * x86_emulate.c - * - * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. - * - * Copyright (c) 2005 Keir Fraser - * - * Linux coding style, mod r/m decoder, segment base fixes, real-mode - * privileged instructions: - * - * Copyright (C) 2006 Qumranet - * - * Avi Kivity - * Yaniv Kamay - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 - */ - -#ifndef __KERNEL__ -#include -#include -#include -#define DPRINTF(_f, _a ...) printf(_f , ## _a) -#else -#include -#include "kvm_cache_regs.h" -#define DPRINTF(x...) do {} while (0) -#endif -#include -#include - -#include "mmu.h" /* for is_long_mode() */ - -/* - * Opcode effective-address decode tables. - * Note that we only emulate instructions that have at least one memory - * operand (excluding implicit stack references). We assume that stack - * references and instruction fetches will never occur in special memory - * areas that require emulation. So, for example, 'mov ,' need - * not be handled. - */ - -/* Operand sizes: 8-bit operands or specified/overridden size. */ -#define ByteOp (1<<0) /* 8-bit operands. */ -/* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstMask (7<<1) -/* Source operand type. */ -#define SrcNone (0<<4) /* No source operand. */ -#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<4) /* Register operand. */ -#define SrcMem (2<<4) /* Memory operand. */ -#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ -#define SrcImm (5<<4) /* Immediate operand. */ -#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ -#define SrcOne (7<<4) /* Implied '1' */ -#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ -#define SrcImmU (9<<4) /* Immediate operand, unsigned */ -#define SrcMask (0xf<<4) -/* Generic ModRM decode. */ -#define ModRM (1<<8) -/* Destination is only written; never read. */ -#define Mov (1<<9) -#define BitOp (1<<10) -#define MemAbs (1<<11) /* Memory operand is absolute displacement */ -#define String (1<<12) /* String instruction (rep capable) */ -#define Stack (1<<13) /* Stack instruction (push/pop) */ -#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0xff /* Group number stored in bits 0:7 */ -/* Source 2 operand type */ -#define Src2None (0<<29) -#define Src2CL (1<<29) -#define Src2ImmByte (2<<29) -#define Src2One (3<<29) -#define Src2Imm16 (4<<29) -#define Src2Mask (7<<29) - -enum { - Group1_80, Group1_81, Group1_82, Group1_83, - Group1A, Group3_Byte, Group3, Group4, Group5, Group7, -}; - -static u32 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x78 - 0x7F */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x80 - 0x87 */ - Group | Group1_80, Group | Group1_81, - Group | Group1_82, Group | Group1_83, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, - DstReg | SrcMem | ModRM | Mov, Group | Group1A, - /* 0x90 - 0x97 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16, 0, - ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xA8 - 0xAF */ - 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, - ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xB7 */ - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - /* 0xB8 - 0xBF */ - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, - ByteOp | SrcImmUByte, SrcImmUByte, - ByteOp | SrcImmUByte, SrcImmUByte, - /* 0xE8 - 0xEF */ - SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, -}; - -static u32 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, 0, 0, - /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, - ModRM, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -static u32 group_table[] = { - [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - [Group1A*8] = - DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, - [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, 0, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group3*8] = - DstMem | SrcImm | ModRM, 0, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, - 0, 0, 0, 0, 0, 0, - [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, - [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, -}; - -static u32 group2_table[] = { - [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, 0, -}; - -/* EFLAGS bit definitions. */ -#define EFLG_VM (1<<17) -#define EFLG_RF (1<<16) -#define EFLG_OF (1<<11) -#define EFLG_DF (1<<10) -#define EFLG_IF (1<<9) -#define EFLG_SF (1<<7) -#define EFLG_ZF (1<<6) -#define EFLG_AF (1<<4) -#define EFLG_PF (1<<2) -#define EFLG_CF (1<<0) - -/* - * Instruction emulation: - * Most instructions are emulated directly via a fragment of inline assembly - * code. This allows us to save/restore EFLAGS and thus very easily pick up - * any modified flags. - */ - -#if defined(CONFIG_X86_64) -#define _LO32 "k" /* force 32-bit operand */ -#define _STK "%%rsp" /* stack pointer */ -#elif defined(__i386__) -#define _LO32 "" /* force 32-bit operand */ -#define _STK "%%esp" /* stack pointer */ -#endif - -/* - * These EFLAGS bits are restored from saved value during emulation, and - * any changes are written back to the saved value after emulation. - */ -#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF) - -/* Before executing instruction: restore necessary bits in EFLAGS. */ -#define _PRE_EFLAGS(_sav, _msk, _tmp) \ - /* EFLAGS = (_sav & _msk) | (EFLAGS & ~_msk); _sav &= ~_msk; */ \ - "movl %"_sav",%"_LO32 _tmp"; " \ - "push %"_tmp"; " \ - "push %"_tmp"; " \ - "movl %"_msk",%"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "pushf; " \ - "notl %"_LO32 _tmp"; " \ - "andl %"_LO32 _tmp",("_STK"); " \ - "andl %"_LO32 _tmp","__stringify(BITS_PER_LONG/4)"("_STK"); " \ - "pop %"_tmp"; " \ - "orl %"_LO32 _tmp",("_STK"); " \ - "popf; " \ - "pop %"_sav"; " - -/* After executing instruction: write-back necessary bits in EFLAGS. */ -#define _POST_EFLAGS(_sav, _msk, _tmp) \ - /* _sav |= EFLAGS & _msk; */ \ - "pushf; " \ - "pop %"_tmp"; " \ - "andl %"_msk",%"_LO32 _tmp"; " \ - "orl %"_LO32 _tmp",%"_sav"; " - -#ifdef CONFIG_X86_64 -#define ON64(x) x -#else -#define ON64(x) -#endif - -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op _suffix " %"_x"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _y ((_src).val), "i" (EFLAGS_MASK)); \ - } while (0) - - -/* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ - break; \ - case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ - break; \ - case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ - break; \ - } \ - } while (0) - -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - switch ((_dst).bytes) { \ - case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ - break; \ - default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - _wx, _wy, _lx, _ly, _qx, _qy); \ - break; \ - } \ - } while (0) - -/* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") - -/* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") - -/* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") - -/* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ - } while (0) - -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ - break; \ - case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ - break; \ - } \ - } while (0) - -#define __emulate_1op(_op, _dst, _eflags, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op _suffix " %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "+m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - } while (0) - -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ - case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ - case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ - } \ - } while (0) - -/* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ -({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, ops, (_eip), &_x, (_size)); \ - if (rc != 0) \ - goto done; \ - (_eip) += (_size); \ - (_type)_x; \ -}) - -static inline unsigned long ad_mask(struct decode_cache *c) -{ - return (1UL << (c->ad_bytes << 3)) - 1; -} - -/* Access/update address held in a register, based on addressing mode. */ -static inline unsigned long -address_mask(struct decode_cache *c, unsigned long reg) -{ - if (c->ad_bytes == sizeof(unsigned long)) - return reg; - else - return reg & ad_mask(c); -} - -static inline unsigned long -register_address(struct decode_cache *c, unsigned long base, unsigned long reg) -{ - return base + address_mask(c, reg); -} - -static inline void -register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) -{ - if (c->ad_bytes == sizeof(unsigned long)) - *reg += inc; - else - *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); -} - -static inline void jmp_rel(struct decode_cache *c, int rel) -{ - register_address_increment(c, &c->eip, rel); -} - -static void set_seg_override(struct decode_cache *c, int seg) -{ - c->has_seg_override = true; - c->seg_override = seg; -} - -static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) -{ - if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) - return 0; - - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); -} - -static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, - struct decode_cache *c) -{ - if (!c->has_seg_override) - return 0; - - return seg_base(ctxt, c->seg_override); -} - -static unsigned long es_base(struct x86_emulate_ctxt *ctxt) -{ - return seg_base(ctxt, VCPU_SREG_ES); -} - -static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) -{ - return seg_base(ctxt, VCPU_SREG_SS); -} - -static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long linear, u8 *dest) -{ - struct fetch_cache *fc = &ctxt->decode.fetch; - int rc; - int size; - - if (linear < fc->start || linear >= fc->end) { - size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) - return rc; - fc->start = linear; - fc->end = linear + size; - } - *dest = fc->data[linear - fc->start]; - return 0; -} - -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long eip, void *dest, unsigned size) -{ - int rc = 0; - - eip += ctxt->cs_base; - while (size--) { - rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) - return rc; - } - return 0; -} - -/* - * Given the 'reg' portion of a ModRM byte, and a register block, return a - * pointer into the block that addresses the relevant register. - * @highbyte_regs specifies whether to decode AH,CH,DH,BH. - */ -static void *decode_register(u8 modrm_reg, unsigned long *regs, - int highbyte_regs) -{ - void *p; - - p = ®s[modrm_reg]; - if (highbyte_regs && modrm_reg >= 4 && modrm_reg < 8) - p = (unsigned char *)®s[modrm_reg & 3] + 1; - return p; -} - -static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *ptr, - u16 *size, unsigned long *address, int op_bytes) -{ - int rc; - - if (op_bytes == 2) - op_bytes = 3; - *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) - return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); - return rc; -} - -static int test_cc(unsigned int condition, unsigned int flags) -{ - int rc = 0; - - switch ((condition & 15) >> 1) { - case 0: /* o */ - rc |= (flags & EFLG_OF); - break; - case 1: /* b/c/nae */ - rc |= (flags & EFLG_CF); - break; - case 2: /* z/e */ - rc |= (flags & EFLG_ZF); - break; - case 3: /* be/na */ - rc |= (flags & (EFLG_CF|EFLG_ZF)); - break; - case 4: /* s */ - rc |= (flags & EFLG_SF); - break; - case 5: /* p/pe */ - rc |= (flags & EFLG_PF); - break; - case 7: /* le/ng */ - rc |= (flags & EFLG_ZF); - /* fall through */ - case 6: /* l/nge */ - rc |= (!(flags & EFLG_SF) != !(flags & EFLG_OF)); - break; - } - - /* Odd condition identifiers (lsb == 1) have inverted sense. */ - return (!!rc ^ (condition & 1)); -} - -static void decode_register_operand(struct operand *op, - struct decode_cache *c, - int inhibit_bytereg) -{ - unsigned reg = c->modrm_reg; - int highbyte_regs = c->rex_prefix == 0; - - if (!(c->d & ModRM)) - reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); - op->type = OP_REG; - if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; - op->bytes = 1; - } else { - op->ptr = decode_register(reg, c->regs, 0); - op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } - } - op->orig_val = op->val; -} - -static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - u8 sib; - int index_reg = 0, base_reg = 0, scale; - int rc = 0; - - if (c->rex_prefix) { - c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (c->rex_prefix & 2) << 2; /* REX.X */ - c->modrm_rm = base_reg = (c->rex_prefix & 1) << 3; /* REG.B */ - } - - c->modrm = insn_fetch(u8, 1, c->eip); - c->modrm_mod |= (c->modrm & 0xc0) >> 6; - c->modrm_reg |= (c->modrm & 0x38) >> 3; - c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; - - if (c->modrm_mod == 3) { - c->modrm_ptr = decode_register(c->modrm_rm, - c->regs, c->d & ByteOp); - c->modrm_val = *(unsigned long *)c->modrm_ptr; - return rc; - } - - if (c->ad_bytes == 2) { - unsigned bx = c->regs[VCPU_REGS_RBX]; - unsigned bp = c->regs[VCPU_REGS_RBP]; - unsigned si = c->regs[VCPU_REGS_RSI]; - unsigned di = c->regs[VCPU_REGS_RDI]; - - /* 16-bit ModR/M decode. */ - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); - break; - } - switch (c->modrm_rm) { - case 0: - c->modrm_ea += bx + si; - break; - case 1: - c->modrm_ea += bx + di; - break; - case 2: - c->modrm_ea += bp + si; - break; - case 3: - c->modrm_ea += bp + di; - break; - case 4: - c->modrm_ea += si; - break; - case 5: - c->modrm_ea += di; - break; - case 6: - if (c->modrm_mod != 0) - c->modrm_ea += bp; - break; - case 7: - c->modrm_ea += bx; - break; - } - if (c->modrm_rm == 2 || c->modrm_rm == 3 || - (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_SS); - c->modrm_ea = (u16)c->modrm_ea; - } else { - /* 32/64-bit ModR/M decode. */ - if ((c->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, c->eip); - index_reg |= (sib >> 3) & 7; - base_reg |= sib & 7; - scale = sib >> 6; - - if ((base_reg & 7) == 5 && c->modrm_mod == 0) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - else - c->modrm_ea += c->regs[base_reg]; - if (index_reg != 4) - c->modrm_ea += c->regs[index_reg] << scale; - } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { - if (ctxt->mode == X86EMUL_MODE_PROT64) - c->rip_relative = 1; - } else - c->modrm_ea += c->regs[c->modrm_rm]; - switch (c->modrm_mod) { - case 0: - if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); - break; - case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); - break; - } - } -done: - return rc; -} - -static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->ad_bytes) { - case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); - break; - case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); - break; - case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); - break; - } -done: - return rc; -} - -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group; - - /* Shadow copy of register state. Committed on successful emulation. */ - - memset(c, 0, sizeof(struct decode_cache)); - c->eip = kvm_rip_read(ctxt->vcpu); - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? */ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - } - - if (c->d & Group) { - group = c->d & GroupMask; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - group = (group << 3) + ((c->modrm >> 3) & 7); - if ((c->d & GroupDual) && (c->modrm >> 6) == 3) - c->d = group2_table[group]; - else - c->d = group_table[group]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc) - goto done; - - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); - - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, c); - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. - */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - break; - case SrcImm: - case SrcImmU: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - if ((c->d & SrcMask) == SrcImmU) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } - break; - case SrcImmByte: - case SrcImmUByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); - else - c->src.val = insn_fetch(u8, 1, c->eip); - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - } - - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); - break; - case Src2Imm16: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 2; - c->src2.val = insn_fetch(u16, 2, c->eip); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - } - - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->op_bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.ptr; - break; - case 2: - c->dst.val = *(u16 *)c->dst.ptr; - break; - case 4: - c->dst.val = *(u32 *)c->dst.ptr; - break; - } - c->dst.orig_val = c->dst.val; - break; - } - - if (c->rip_relative) - c->modrm_ea += c->eip; - -done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; -} - -static inline void emulate_push(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]); -} - -static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - void *dest, int len) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = ops->read_emulated(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - dest, len, ctxt->vcpu); - if (rc != 0) - return rc; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); - return rc; -} - -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - - rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); - if (rc != 0) - return rc; - return 0; -} - -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - switch (c->modrm_reg) { - case 0: /* rol */ - emulate_2op_SrcB("rol", c->src, c->dst, ctxt->eflags); - break; - case 1: /* ror */ - emulate_2op_SrcB("ror", c->src, c->dst, ctxt->eflags); - break; - case 2: /* rcl */ - emulate_2op_SrcB("rcl", c->src, c->dst, ctxt->eflags); - break; - case 3: /* rcr */ - emulate_2op_SrcB("rcr", c->src, c->dst, ctxt->eflags); - break; - case 4: /* sal/shl */ - case 6: /* sal/shl */ - emulate_2op_SrcB("sal", c->src, c->dst, ctxt->eflags); - break; - case 5: /* shr */ - emulate_2op_SrcB("shr", c->src, c->dst, ctxt->eflags); - break; - case 7: /* sar */ - emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); - break; - } -} - -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc = 0; - - switch (c->modrm_reg) { - case 0 ... 1: /* test */ - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 2: /* not */ - c->dst.val = ~c->dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", c->dst, ctxt->eflags); - break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - rc = X86EMUL_UNHANDLEABLE; - break; - } - return rc; -} - -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - - switch (c->modrm_reg) { - case 0: /* inc */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 1: /* dec */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 2: /* call near abs */ { - long int old_eip; - old_eip = c->eip; - c->eip = c->src.val; - c->src.val = old_eip; - emulate_push(ctxt); - break; - } - case 4: /* jmp abs */ - c->eip = c->src.val; - break; - case 6: /* push */ - emulate_push(ctxt); - break; - } - return 0; -} - -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, - unsigned long memop) -{ - struct decode_cache *c = &ctxt->decode; - u64 old, new; - int rc; - - rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); - if (rc != 0) - return rc; - - if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || - ((u32) (old >> 32) != (u32) c->regs[VCPU_REGS_RDX])) { - - c->regs[VCPU_REGS_RAX] = (u32) (old >> 0); - c->regs[VCPU_REGS_RDX] = (u32) (old >> 32); - ctxt->eflags &= ~EFLG_ZF; - - } else { - new = ((u64)c->regs[VCPU_REGS_RCX] << 32) | - (u32) c->regs[VCPU_REGS_RBX]; - - rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); - if (rc != 0) - return rc; - ctxt->eflags |= EFLG_ZF; - } - return 0; -} - -static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - struct decode_cache *c = &ctxt->decode; - int rc; - unsigned long cs; - - rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); - if (rc) - return rc; - if (c->op_bytes == 4) - c->eip = (u32)c->eip; - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - if (rc) - return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); - return rc; -} - -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) -{ - int rc; - struct decode_cache *c = &ctxt->decode; - - switch (c->dst.type) { - case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. - */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } - break; - case OP_MEM: - if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - else - rc = ops->write_emulated( - (unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu); - if (rc != 0) - return rc; - break; - case OP_NONE: - /* no writeback */ - break; - default: - break; - } - return 0; -} - -static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) -{ - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); - /* - * an sti; sti; sequence only disable interrupts for the first - * instruction. So, if the last instruction, be it emulated or - * not, left the system with the INT_STI flag enabled, it - * means that the last instruction is an sti. We should not - * leave the flag on in this case. The same goes for mov ss - */ - if (!(int_shadow & mask)) - ctxt->interruptibility = mask; -} - -static inline void -setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, - struct kvm_segment *cs, struct kvm_segment *ss) -{ - memset(cs, 0, sizeof(struct kvm_segment)); - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); - memset(ss, 0, sizeof(struct kvm_segment)); - - cs->l = 0; /* will be adjusted later */ - cs->base = 0; /* flat segment */ - cs->g = 1; /* 4kb granularity */ - cs->limit = 0xffffffff; /* 4GB limit */ - cs->type = 0x0b; /* Read, Execute, Accessed */ - cs->s = 1; - cs->dpl = 0; /* will be adjusted later */ - cs->present = 1; - cs->db = 1; - - ss->unusable = 0; - ss->base = 0; /* flat segment */ - ss->limit = 0xffffffff; /* 4GB limit */ - ss->g = 1; /* 4kb granularity */ - ss->s = 1; - ss->type = 0x03; /* Read/Write, Accessed */ - ss->db = 1; /* 32bit stack segment */ - ss->dpl = 0; - ss->present = 1; -} - -static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; - u64 msr_data; - - /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; - - setup_syscalls_segments(ctxt, &cs, &ss); - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); - msr_data >>= 32; - cs.selector = (u16)(msr_data & 0xfffc); - ss.selector = (u16)(msr_data + 8); - - if (is_long_mode(ctxt->vcpu)) { - cs.db = 0; - cs.l = 1; - } - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - - c->regs[VCPU_REGS_RCX] = c->eip; - if (is_long_mode(ctxt->vcpu)) { -#ifdef CONFIG_X86_64 - c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - - kvm_x86_ops->get_msr(ctxt->vcpu, - ctxt->mode == X86EMUL_MODE_PROT64 ? - MSR_LSTAR : MSR_CSTAR, &msr_data); - c->eip = msr_data; - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); -#endif - } else { - /* legacy mode */ - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); - c->eip = (u32)msr_data; - - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - } - - return 0; -} - -static int -emulate_sysenter(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; - u64 msr_data; - - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* XXX sysenter/sysexit have not been tested in 64bit mode. - * Therefore, we inject an #UD. - */ - if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; - - setup_syscalls_segments(ctxt, &cs, &ss); - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); - switch (ctxt->mode) { - case X86EMUL_MODE_PROT32: - if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - break; - case X86EMUL_MODE_PROT64: - if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - break; - } - - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); - cs.selector = (u16)msr_data; - cs.selector &= ~SELECTOR_RPL_MASK; - ss.selector = cs.selector + 8; - ss.selector &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 - || is_long_mode(ctxt->vcpu)) { - cs.db = 0; - cs.l = 1; - } - - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); - c->eip = msr_data; - - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); - c->regs[VCPU_REGS_RSP] = msr_data; - - return 0; -} - -static int -emulate_sysexit(struct x86_emulate_ctxt *ctxt) -{ - struct decode_cache *c = &ctxt->decode; - struct kvm_segment cs, ss; - u64 msr_data; - int usermode; - - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - setup_syscalls_segments(ctxt, &cs, &ss); - - if ((c->rex_prefix & 0x8) != 0x0) - usermode = X86EMUL_MODE_PROT64; - else - usermode = X86EMUL_MODE_PROT32; - - cs.dpl = 3; - ss.dpl = 3; - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); - switch (usermode) { - case X86EMUL_MODE_PROT32: - cs.selector = (u16)(msr_data + 16); - if ((msr_data & 0xfffc) == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - ss.selector = (u16)(msr_data + 24); - break; - case X86EMUL_MODE_PROT64: - cs.selector = (u16)(msr_data + 32); - if (msr_data == 0x0) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - ss.selector = cs.selector + 8; - cs.db = 0; - cs.l = 1; - break; - } - cs.selector |= SELECTOR_RPL_MASK; - ss.selector |= SELECTOR_RPL_MASK; - - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); - - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - - return 0; -} - -int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) -{ - unsigned long memop = 0; - u64 msr_data; - unsigned long saved_eip = 0; - struct decode_cache *c = &ctxt->decode; - unsigned int port; - int io_dir_in; - int rc = 0; - - ctxt->interruptibility = 0; - - /* Shadow copy of register state. Committed on successful emulation. - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't - * modify them. - */ - - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); - saved_eip = c->eip; - - if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) - memop = c->modrm_ea; - - if (c->rep_prefix && (c->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (c->regs[VCPU_REGS_RCX] == 0) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - kvm_rip_write(ctxt->vcpu, c->eip); - goto done; - } - } - c->regs[VCPU_REGS_RCX]--; - c->eip = kvm_rip_read(ctxt->vcpu); - } - - if (c->src.type == OP_MEM) { - c->src.ptr = (unsigned long *)memop; - c->src.val = 0; - rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu); - if (rc != 0) - goto done; - c->src.orig_val = c->src.val; - } - - if ((c->d & DstMask) == ImplicitOps) - goto special_insn; - - - if (c->dst.type == OP_MEM) { - c->dst.ptr = (unsigned long *)memop; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); - - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - if (!(c->d & Mov) && - /* optimisation - avoid slow emulated read */ - ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0)) - goto done; - } - c->dst.orig_val = c->dst.val; - -special_insn: - - if (c->twobyte) - goto twobyte_insn; - - switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); - break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; - case 0x20 ... 0x25: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; - case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", c->dst, ctxt->eflags); - break; - case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", c->dst, ctxt->eflags); - break; - case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt); - break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != 0) - goto done; - break; - case 0x63: /* movsxd */ - if (ctxt->mode != X86EMUL_MODE_PROT64) - goto cannot_emulate; - c->dst.val = (s32) c->src.val; - break; - case 0x68: /* push imm */ - case 0x6a: /* push imm8 */ - emulate_push(ctxt); - break; - case 0x6c: /* insb */ - case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 1, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, es_base(ctxt), - c->regs[VCPU_REGS_RDI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x6e: /* outsb */ - case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, - 0, - (c->d & ByteOp) ? 1 : c->op_bytes, - c->rep_prefix ? - address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, - (ctxt->eflags & EFLG_DF), - register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - c->rep_prefix, - c->regs[VCPU_REGS_RDX]) == 0) { - c->eip = saved_eip; - return -1; - } - return 0; - case 0x70 ... 0x7f: /* jcc (short) */ - if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, c->src.val); - break; - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; - case 0x84 ... 0x85: - emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); - break; - case 0x86 ... 0x87: /* xchg */ - xchg: - /* Write back the register source. */ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } - /* - * Write back the memory destination with implicit LOCK - * prefix. - */ - c->dst.val = c->src.val; - c->lock_prefix = 1; - break; - case 0x88 ... 0x8b: /* mov */ - goto mov; - case 0x8c: { /* mov r/m, sreg */ - struct kvm_segment segreg; - - if (c->modrm_reg <= 5) - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); - else { - printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; - } - c->dst.val = segreg.selector; - break; - } - case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_ea; - break; - case 0x8e: { /* mov seg, r/m16 */ - uint16_t sel; - int type_bits; - int err; - - sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; - } - - if (err < 0) - goto cannot_emulate; - - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - } - case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0x90: /* nop / xchg r8,rax */ - if (!(c->rex_prefix & 1)) { /* nop */ - c->dst.type = OP_NONE; - break; - } - case 0x91 ... 0x97: /* xchg reg,rax */ - c->src.type = c->dst.type = OP_REG; - c->src.bytes = c->dst.bytes = c->op_bytes; - c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.ptr); - goto xchg; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt); - break; - case 0x9d: /* popf */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *) &ctxt->eflags; - c->dst.bytes = c->op_bytes; - goto pop_instruction; - case 0xa0 ... 0xa1: /* mov */ - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - c->dst.val = c->src.val; - break; - case 0xa2 ... 0xa3: /* mov */ - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; - break; - case 0xa4 ... 0xa5: /* movs */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, ctxt->vcpu)) != 0) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->src.type = OP_NONE; /* Disable writeback. */ - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]); - if ((rc = ops->read_emulated((unsigned long)c->src.ptr, - &c->src.val, - c->src.bytes, - ctxt->vcpu)) != 0) - goto done; - - c->dst.type = OP_NONE; /* Disable writeback. */ - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); - - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->src.bytes - : c->src.bytes); - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - - break; - case 0xaa ... 0xab: /* stos */ - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address(c, - es_base(ctxt), - c->regs[VCPU_REGS_RDI]); - c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c, &c->regs[VCPU_REGS_RDI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xac ... 0xad: /* lods */ - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address(c, - seg_override_base(ctxt, c), - c->regs[VCPU_REGS_RSI]), - &c->dst.val, - c->dst.bytes, - ctxt->vcpu)) != 0) - goto done; - register_address_increment(c, &c->regs[VCPU_REGS_RSI], - (ctxt->eflags & EFLG_DF) ? -c->dst.bytes - : c->dst.bytes); - break; - case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xb0 ... 0xbf: /* mov r, imm */ - goto mov; - case 0xc0 ... 0xc1: - emulate_grp2(ctxt); - break; - case 0xc3: /* ret */ - c->dst.type = OP_REG; - c->dst.ptr = &c->eip; - c->dst.bytes = c->op_bytes; - goto pop_instruction; - case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; - break; - case 0xcb: /* ret far */ - rc = emulate_ret_far(ctxt, ops); - if (rc) - goto done; - break; - case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; - emulate_grp2(ctxt); - break; - case 0xd2 ... 0xd3: /* Grp2 */ - c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); - break; - case 0xe4: /* inb */ - case 0xe5: /* in */ - port = c->src.val; - io_dir_in = 1; - goto do_io; - case 0xe6: /* outb */ - case 0xe7: /* out */ - port = c->src.val; - io_dir_in = 0; - goto do_io; - case 0xe8: /* call (near) */ { - long int rel = c->src.val; - c->src.val = (unsigned long) c->eip; - jmp_rel(c, rel); - emulate_push(ctxt); - break; - } - case 0xe9: /* jmp rel */ - goto jmp; - case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } - - c->eip = c->src.val; - break; - case 0xeb: - jmp: /* jmp rel short */ - jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xec: /* in al,dx */ - case 0xed: /* in (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 1; - goto do_io; - case 0xee: /* out al,dx */ - case 0xef: /* out (e/r)ax,dx */ - port = c->regs[VCPU_REGS_RDX]; - io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, - (c->d & ByteOp) ? 1 : c->op_bytes, - port) != 0) { - c->eip = saved_eip; - goto cannot_emulate; - } - break; - case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; - break; - case 0xf5: /* cmc */ - /* complement carry flag from eflags reg */ - ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); - if (rc != 0) - goto done; - break; - case 0xf8: /* clc */ - ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfc: /* cld */ - ctxt->eflags &= ~EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfd: /* std */ - ctxt->eflags |= EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ - break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ - rc = emulate_grp45(ctxt, ops); - if (rc != 0) - goto done; - break; - } - -writeback: - rc = writeback(ctxt, ops); - if (rc != 0) - goto done; - - /* Commit shadow register state. */ - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(ctxt->vcpu, c->eip); - -done: - if (rc == X86EMUL_UNHANDLEABLE) { - c->eip = saved_eip; - return -1; - } - return 0; - -twobyte_insn: - switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - - /* Let the processor re-execute the fixed hypercall */ - c->eip = kvm_rip_read(ctxt->vcpu); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, c->op_bytes); - if (rc) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) { - switch (c->modrm_rm) { - case 1: - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) - goto done; - break; - default: - goto cannot_emulate; - } - } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, - &size, &address, - c->op_bytes); - if (rc) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 4: /* smsw */ - c->dst.bytes = 2; - c->dst.val = realmode_get_cr(ctxt->vcpu, 0); - break; - case 6: /* lmsw */ - realmode_lmsw(ctxt->vcpu, (u16)c->src.val, - &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, memop); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - default: - goto cannot_emulate; - } - break; - case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; - else - goto writeback; - break; - case 0x06: - emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 0x08: /* invd */ - case 0x09: /* wbinvd */ - case 0x0d: /* GrpP (prefetch) */ - case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; - break; - case 0x20: /* mov cr, reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - c->regs[c->modrm_rm] = - realmode_get_cr(ctxt->vcpu, c->modrm_reg); - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x21: /* mov from dr to reg */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x22: /* mov reg, cr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); - c->dst.type = OP_NONE; - break; - case 0x23: /* mov from reg to dr */ - if (c->modrm_mod != 3) - goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) - goto cannot_emulate; - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x30: - /* wrmsr */ - msr_data = (u32)c->regs[VCPU_REGS_RAX] - | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x32: - /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { - kvm_inject_gp(ctxt->vcpu, 0); - c->eip = kvm_rip_read(ctxt->vcpu); - } else { - c->regs[VCPU_REGS_RAX] = (u32)msr_data; - c->regs[VCPU_REGS_RDX] = msr_data >> 32; - } - rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; - break; - case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; - else - goto writeback; - break; - case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; - else - goto writeback; - break; - case 0x40 ... 0x4f: /* cmov */ - c->dst.val = c->dst.orig_val = c->src.val; - if (!test_cc(c->b, ctxt->eflags)) - c->dst.type = OP_NONE; /* no writeback */ - break; - case 0x80 ... 0x8f: /* jnz rel, etc*/ - if (test_cc(c->b, ctxt->eflags)) - jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; - break; - case 0xa3: - bt: /* bt */ - c->dst.type = OP_NONE; - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); - break; - case 0xa4: /* shld imm8, r, r/m */ - case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); - break; - case 0xab: - bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); - break; - case 0xac: /* shrd imm8, r, r/m */ - case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); - break; - case 0xae: /* clflush */ - break; - case 0xb0 ... 0xb1: /* cmpxchg */ - /* - * Save real source value, then compare EAX against - * destination. - */ - c->src.orig_val = c->src.val; - c->src.val = c->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - if (ctxt->eflags & EFLG_ZF) { - /* Success: write back to memory. */ - c->dst.val = c->src.orig_val; - } else { - /* Failure: write the value we saw to EAX. */ - c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - } - break; - case 0xb3: - btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); - break; - case 0xb6 ... 0xb7: /* movzx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (u8) c->src.val - : (u16) c->src.val; - break; - case 0xba: /* Grp8 */ - switch (c->modrm_reg & 3) { - case 0: - goto bt; - case 1: - goto bts; - case 2: - goto btr; - case 3: - goto btc; - } - break; - case 0xbb: - btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); - break; - case 0xbe ... 0xbf: /* movsx */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : - (s16) c->src.val; - break; - case 0xc3: /* movnti */ - c->dst.bytes = c->op_bytes; - c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : - (u64) c->src.val; - break; - case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) - goto done; - c->dst.type = OP_NONE; - break; - } - goto writeback; - -cannot_emulate: - DPRINTF("Cannot emulate %02x\n", c->b); - c->eip = saved_eip; - return -1; -} -- cgit v1.2.3-70-g09d2 From fa6870c6b64214aced218d930ec7221e2a9767b8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Aug 2009 15:31:33 +0300 Subject: KVM: Add missing #include Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_para.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305ae09..c584076a47f 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H +#include + /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ -- cgit v1.2.3-70-g09d2 From 3d53c27d05950390712f92c5ad1604c60190ed64 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Sep 2009 12:34:07 +0300 Subject: KVM: Use thread debug register storage instead of kvm specific data Instead of saving the debug registers from the processor to a kvm data structure, rely in the debug registers stored in the thread structure. This allows us not to save dr6 and dr7. Reduces lightweight vmexit cost by 350 cycles, or 11 percent. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 3 --- arch/x86/kvm/x86.c | 22 +++++++--------------- 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 33901be75a3..e8f166a02c7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -362,9 +362,6 @@ struct kvm_vcpu_arch { u32 pat; int switch_db_regs; - unsigned long host_db[KVM_NR_DB_REGS]; - unsigned long host_dr6; - unsigned long host_dr7; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35e7fc54de3..3e893c4fdd3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3623,14 +3623,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_guest_enter(); - get_debugreg(vcpu->arch.host_dr6, 6); - get_debugreg(vcpu->arch.host_dr7, 7); if (unlikely(vcpu->arch.switch_db_regs)) { - get_debugreg(vcpu->arch.host_db[0], 0); - get_debugreg(vcpu->arch.host_db[1], 1); - get_debugreg(vcpu->arch.host_db[2], 2); - get_debugreg(vcpu->arch.host_db[3], 3); - set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); set_debugreg(vcpu->arch.eff_db[1], 1); @@ -3641,15 +3634,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) trace_kvm_entry(vcpu->vcpu_id); kvm_x86_ops->run(vcpu, kvm_run); - if (unlikely(vcpu->arch.switch_db_regs)) { - set_debugreg(0, 7); - set_debugreg(vcpu->arch.host_db[0], 0); - set_debugreg(vcpu->arch.host_db[1], 1); - set_debugreg(vcpu->arch.host_db[2], 2); - set_debugreg(vcpu->arch.host_db[3], 3); + if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); } - set_debugreg(vcpu->arch.host_dr6, 6); - set_debugreg(vcpu->arch.host_dr7, 7); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); -- cgit v1.2.3-70-g09d2 From 0a79b009525b160081d75cef5dbf45817956acf2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Sep 2009 12:03:25 +0300 Subject: KVM: VMX: Check cpl before emulating debug register access Debug registers may only be accessed from cpl 0. Unfortunately, vmx will code to emulate the instruction even though it was issued from guest userspace, possibly leading to an unexpected trap later. Cc: stable@kernel.org Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx.c | 2 ++ arch/x86/kvm/x86.c | 13 +++++++++++++ 3 files changed, 16 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e8f166a02c7..3be000435fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -620,6 +620,7 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, u32 error_code); +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cc6e00a9f72..f3812014bd0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2934,6 +2934,8 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) unsigned long val; int dr, reg; + if (!kvm_require_cpl(vcpu, 0)) + return 1; dr = vmcs_readl(GUEST_DR7); if (dr & DR7_GD) { /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7627ff607a9..4137cc579ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -222,6 +222,19 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); +/* + * Checks if cpl <= required_cpl; if true, return true. Otherwise queue + * a #GP and return false. + */ +bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) +{ + if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + return true; + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return false; +} +EXPORT_SYMBOL_GPL(kvm_require_cpl); + /* * Load the pae pdptrs. Return true is they are all valid. */ -- cgit v1.2.3-70-g09d2