diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/apic.h | 3 | ||||
-rw-r--r-- | arch/x86/include/asm/bitops.h | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/hypervisor.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm.h | 1 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_emulate.h | 6 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 31 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_para.h | 7 | ||||
-rw-r--r-- | arch/x86/include/asm/processor-flags.h | 2 | ||||
-rw-r--r-- | arch/x86/include/asm/vmx.h | 6 | ||||
-rw-r--r-- | arch/x86/kernel/apic/apic.c | 17 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/hypervisor.c | 3 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 64 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 46 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 9 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 273 | ||||
-rw-r--r-- | arch/x86/kvm/i8259.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 194 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 11 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 359 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 45 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 12 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 34 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 189 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 123 |
25 files changed, 1172 insertions, 291 deletions
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 88093c1d44f..3ea51a84a0e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -465,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void) return apic->safe_wait_icr_idle(); } +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -474,6 +476,7 @@ static inline u64 apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} #endif /* CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b27722..72f5009deb5 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. + * + * Note: the operation is performed atomically with respect to + * the local CPU, but not other CPUs. Portable code should not + * rely on this behaviour. + * KVM relies on this behaviour on x86 for modifying memory that is also + * accessed from a hypervisor on the same CPU if running in a VM: don't change + * this without also updating arch/x86/kernel/kvm.c */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153c675..b518c750993 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper; extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm; static inline bool hypervisor_x2apic_available(void) { diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d27..246617efd67 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -12,6 +12,7 @@ /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1ac46c22dd5..c764f43b71c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -192,8 +192,8 @@ struct x86_emulate_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); - bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -280,9 +280,9 @@ struct x86_emulate_ctxt { u8 modrm_seg; bool rip_relative; unsigned long _eip; + struct operand memop; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; - struct operand memop; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2da88c0cda1..09155d64cf7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,12 +48,13 @@ #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) @@ -175,6 +176,13 @@ enum { /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING 1 /* * We don't want allocation failures within the mmu code, so we preallocate @@ -484,6 +492,11 @@ struct kvm_vcpu_arch { u64 length; u64 status; } osvw; + + struct { + u64 msr_val; + struct gfn_to_hva_cache data; + } pv_eoi; }; struct kvm_lpage_info { @@ -661,6 +674,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + bool (*invpcid_supported)(void); void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); -int kvm_pic_set_irq(void *opaque, int irq, int level); +static inline int __kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + __set_bit(irq_source_id, irq_state); + else + __clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); +void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00..2f7712e08b1 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -22,6 +22,7 @@ #define KVM_FEATURE_CLOCKSOURCE2 3 #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -37,6 +38,7 @@ #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 struct kvm_steal_time { __u64 steal; @@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { __u32 enabled; }; +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + #ifdef __KERNEL__ #include <asm/processor.h> diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index f8ab3eaad12..aea1d1d848c 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -44,6 +44,7 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ /* * Intel CPU features in CR4 @@ -61,6 +62,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce..74fcb963595 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -60,6 +60,7 @@ #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -281,6 +282,7 @@ enum vmcs_field { #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_INVPCID 58 /* * Interruption-information format @@ -404,6 +406,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -415,11 +418,14 @@ enum vmcs_field { #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 #define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) #define VMX_EPT_DEFAULT_MT 0x6ull #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c421512ca5e..98e24131ff3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2143,6 +2143,23 @@ int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, } /* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. + */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->eoi_write = eoi_write; + } +} + +/* * Power management */ #ifdef CONFIG_PM diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64fb074..a8f8fa9769d 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe..c1d61ee4b4f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,6 +39,9 @@ #include <asm/desc.h> #include <asm/tlbflush.h> #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/hypervisor.h> static int kvmapf = 1; @@ -283,6 +286,22 @@ static void kvm_register_steal_time(void) cpu, __pa(st)); } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic_write(APIC_EOI, APIC_EOI_ACK); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void) smp_processor_id()); } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + if (has_steal_clock) kvm_register_steal_time(); } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void) { if (!__get_cpu_var(apf_reason).enabled) return; @@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused) smp_processor_id()); } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. + */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); +} + static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) - on_each_cpu(kvm_pv_disable_apf, NULL, 1); + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } @@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { kvm_disable_steal_time(); - kvm_pv_disable_apf(NULL); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); apf_task_wake_all(); } @@ -424,6 +466,9 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -432,6 +477,19 @@ void __init kvm_guest_init(void) #endif } +static bool __init kvm_detect(void) +{ + if (!kvm_para_available()) + return false; + return true; +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + static __init int activate_jump_labels(void) { if (has_steal_clock) { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839f..0595f1397b7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); @@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); if (sched_info_on()) @@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - u32 function, index; + u32 function = *eax, index = *ecx; struct kvm_cpuid_entry2 *best; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); if (!best) best = check_cpuid_limit(vcpu, function, index); if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } + *eax = best->eax; + *ebx = best->ebx; + *ecx = best->ecx; + *edx = best->edx; + } else + *eax = *ebx = *ecx = *edx = 0; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, eax, ebx, ecx, edx; + + function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); + kvm_register_write(vcpu, VCPU_REGS_RAX, eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); + trace_kvm_cpuid(function, eax, ebx, ecx, edx); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 26d1fb437eb..a10e4601685 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) @@ -51,4 +52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_OSVW)); } +static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_PCID)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f95d242ee9f..97d9a9914ba 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, return ctxt->ops->intercept(ctxt, &info, stage); } +static void assign_masked(ulong *dest, ulong src, ulong mask) +{ + *dest = (*dest & ~mask) | (src & mask); +} + static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; } +static ulong stack_mask(struct x86_emulate_ctxt *ctxt) +{ + u16 sel; + struct desc_struct ss; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + return ~0UL; + ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); + return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ +} + +static int stack_size(struct x86_emulate_ctxt *ctxt) +{ + return (__fls(stack_mask(ctxt)) + 1) >> 3; +} + /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) @@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->orig_val = op->val; } +static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) +{ + if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) + ctxt->modrm_seg = VCPU_SREG_SS; +} + static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); - else + else { modrm_ea += ctxt->regs[base_reg]; + adjust_modrm_seg(ctxt, base_reg); + } if (index_reg != 4) modrm_ea += ctxt->regs[index_reg] << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; - } else - modrm_ea += ctxt->regs[ctxt->modrm_rm]; + } else { + base_reg = ctxt->modrm_rm; + modrm_ea += ctxt->regs[base_reg]; + adjust_modrm_seg(ctxt, base_reg); + } switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) @@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc) + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - addr = dt.address + index * 8; + *desc_addr_p = addr = dt.address + index * 8; return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { - struct desc_struct seg_desc; + struct desc_struct seg_desc, old_desc; u8 dpl, rpl, cpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + ulong desc_addr; int ret; memset(&seg_desc, 0, sizeof seg_desc); @@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto load; } - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + rpl = selector & 3; + cpl = ctxt->ops->cpl(ctxt); + + /* NULL selector is not valid for TR, CS and SS (except for long mode) */ + if ((seg == VCPU_SREG_CS + || (seg == VCPU_SREG_SS + && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) + || seg == VCPU_SREG_TR) && null_selector) goto exception; @@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (null_selector) /* for NULL selector skip all following checks */ goto load; - ret = read_segment_descriptor(ctxt, selector, &seg_desc); + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; @@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; } - rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ctxt->ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_push(struct x86_emulate_ctxt *ctxt) +static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; + return segmented_write(ctxt, addr, data, bytes); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ /* Disable writeback. */ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); + return push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } +static int em_enter(struct x86_emulate_ctxt *ctxt) +{ + int rc; + unsigned frame_size = ctxt->src.val; + unsigned nesting_level = ctxt->src2.val & 31; + + if (nesting_level) + return X86EMUL_UNHANDLEABLE; + + rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); + if (rc != X86EMUL_CONTINUE) + return rc; + assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], + stack_mask(ctxt)); + assign_masked(&ctxt->regs[VCPU_REGS_RSP], + ctxt->regs[VCPU_REGS_RSP] - frame_size, + stack_mask(ctxt)); + return X86EMUL_CONTINUE; +} + +static int em_leave(struct x86_emulate_ctxt *ctxt) +{ + assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], + stack_mask(ctxt)); + return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); +} + static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; @@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) u32 eax, ebx, ecx, edx; eax = ecx = 0; - return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) - && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; } @@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) eax = 0x00000000; ecx = 0x00000000; - if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { - /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. - */ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) - return false; + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - } + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; /* default: (not Intel, not AMD), apply Intel's stricter rules... */ return false; @@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; + ulong desc_addr; /* FIXME: old_tss_base == ~0 ? */ - ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); + ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; - ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); + ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; @@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } +static int em_lldt(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); +} + +static int em_ltr(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); +} + static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, + void (*get)(struct x86_emulate_ctxt *ctxt, + struct desc_ptr *ptr)) +{ + struct desc_ptr desc_ptr; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + get(ctxt, &desc_ptr); + if (ctxt->op_bytes == 2) { + ctxt->op_bytes = 4; + desc_ptr.address &= 0x00ffffff; + } + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return segmented_write(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); +} + +static int em_sgdt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); +} + +static int em_sidt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); +} + static int em_lgdt(struct x86_emulate_ctxt *ctxt) { struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_cpuid(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ctxt->regs[VCPU_REGS_RAX]; + ecx = ctxt->regs[VCPU_REGS_RCX]; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->regs[VCPU_REGS_RAX] = eax; + ctxt->regs[VCPU_REGS_RBX] = ebx; + ctxt->regs[VCPU_REGS_RCX] = ecx; + ctxt->regs[VCPU_REGS_RDX] = edx; + return X86EMUL_CONTINUE; +} + +static int em_lahf(struct x86_emulate_ctxt *ctxt) +{ + ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; + ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; + return X86EMUL_CONTINUE; +} + +static int em_bswap(struct x86_emulate_ctxt *ctxt) +{ + switch (ctxt->op_bytes) { +#ifdef CONFIG_X86_64 + case 8: + asm("bswap %0" : "+r"(ctxt->dst.val)); + break; +#endif + default: + asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); + break; + } + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3424,14 +3587,14 @@ static struct opcode group5[] = { static struct opcode group6[] = { DI(Prot, sldt), DI(Prot, str), - DI(Prot | Priv, lldt), - DI(Prot | Priv, ltr), + II(Prot | Priv | SrcMem16, em_lldt, lldt), + II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, }; static struct group_dual group7 = { { - DI(Mov | DstMem | Priv, sgdt), - DI(Mov | DstMem | Priv, sidt), + II(Mov | DstMem | Priv, em_sgdt, sgdt), + II(Mov | DstMem | Priv, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, N, + II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), @@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - N, N, N, I(ImplicitOps | Stack, em_ret_far), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), + N, I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ @@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), + II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ @@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = { I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ + /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + X8(I(DstReg, em_bswap)), /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ @@ -4426,12 +4591,12 @@ twobyte_insn: break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val + ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : + ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2b..1df8fb9e1d5 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - struct kvm_pic *s = opaque; int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + int irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); @@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) return ret; } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ + int i; + + pic_lock(s); + for (i = 0; i < PIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &s->irq_states[i]); + pic_unlock(s); +} + /* * acknowledge interrupt 'irq' */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1e..ce878788a39 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap) clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int __apic_test_and_set_vector(int vec, void *bitmap) +{ + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) +{ + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline int apic_hw_enabled(struct kvm_lapic *apic) { return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; @@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap) return fls(word[word_offset << 2]) - 1 + (word_offset << 5); } +static u8 count_vectors(void *bitmap) +{ + u32 *word = bitmap; + int word_offset; + u8 count = 0; + for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) + count += hweight32(word[word_offset << 2]); + return count; +} + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; @@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + --apic->isr_count; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) irq->level, irq->trig_mode); } +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; result = find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); @@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; - apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_clear_isr(vector, apic); apic_update_ppr(apic); if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && @@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + return vector; } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = false; + apic->isr_count = 0; + apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) if (vector == -1) return -1; - apic_set_vector(vector, apic->regs + APIC_ISR); + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); return vector; @@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->highest_isr_cache = -1; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + bool pending; + int vector; + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + BUG_ON(!pv_eoi_enabled(vcpu)); + pending = pv_eoi_get_pending(vcpu); + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ + pv_eoi_clr_pending(vcpu); + if (pending) + return; + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; void *vapic; + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; @@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) apic_set_tpr(vcpu->arch.apic, data & 0xff); } +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + if (!pv_eoi_enabled(vcpu) || + /* IRR set or many bits in ISR: could be nested. */ + apic->irr_pending || + /* Cache not set: could be safe but we don't bother. */ + apic->highest_isr_cache == -1 || + /* Need EOI to update ioapic. */ + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + /* + * PV EOI was disabled by apic_sync_pv_eoi_from_guest + * so we need not do anything here. + */ + return; + } + + pv_eoi_set_pending(apic->vcpu); +} + void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; - struct kvm_lapic *apic; + struct kvm_lapic *apic = vcpu->arch.apic; void *vapic; + apic_sync_pv_eoi_to_guest(vcpu, apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - apic = vcpu->arch.apic; tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) @@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (!IS_ALIGNED(addr, 4)) + return 1; + + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d0..4af5405ae1e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,6 +13,15 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /* Number of bits set in ISR. */ + s16 isr_count; + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ + int highest_isr_cache; + /** + * APIC register page. The layout matches the register layout seen by + * the guest 1:1, because it is accessed by the vmx microcode. + * Note: Only one register, the TPR, is used by the microcode. + */ void *regs; gpa_t vapic_addr; struct page *vapic_page; @@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 57e168e27b5..01ca0042393 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_LEVEL_BITS 9 @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); #define CREATE_TRACE_POINTS #include "mmutrace.h" -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { @@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + static bool spte_has_volatile_bits(u64 spte) { + /* + * Always atomicly update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (spte_is_locklessly_modifiable(spte)) + return true; + if (!shadow_accessed_mask) return false; @@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changged. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. */ -static void mmu_spte_update(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 mask, old_spte = *sptep; + u64 old_spte = *sptep; + bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); - if (!is_shadow_present_pte(old_spte)) - return mmu_spte_set(sptep, new_spte); - - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; + if (!is_shadow_present_pte(old_spte)) { + mmu_spte_set(sptep, new_spte); + return ret; + } - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); + /* + * For the spte updated out of mmu-lock is safe, since + * we always atomicly update it, see the comments in + * spte_has_volatile_bits(). + */ + if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + ret = true; + if (!shadow_accessed_mask) - return; + return ret; if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + return ret; } /* @@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) + +static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; + } + + return false; +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (__drop_large_spte(vcpu->kvm, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); +} + +/* + * Write-protect on the specified @sptep, @pt_protect indicates whether + * spte writ-protection is caused by protecting shadow page table. + * @flush indicates whether tlb need be flushed. + * + * Note: write protection is difference between drity logging and spte + * protection: + * - for dirty logging, the spte can be set to writable at anytime if + * its dirty bitmap is properly set. + * - for spte protection, the spte can be writable only after unsync-ing + * shadow page. + * + * Return true if the spte is dropped. + */ +static bool +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte) && + !(pt_protect && spte_is_locklessly_modifiable(spte))) + return false; + + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + + if (__drop_large_spte(kvm, sptep)) { + *flush |= true; + return true; + } + + if (pt_protect) + spte &= ~SPTE_MMU_WRITEABLE; + spte = spte & ~PT_WRITABLE_MASK; + + *flush |= mmu_spte_update(sptep, spte); + return false; +} + +static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, + int level, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; - int write_protected = 0; + bool flush = false; for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(&iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); - sptep = rmap_get_next(&iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm->stat.lpages; + if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { sptep = rmap_get_first(*rmapp, &iter); + continue; } - write_protected = 1; + sptep = rmap_get_next(&iter); } - return write_protected; + return flush; } /** @@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ mask &= mask - 1; } } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; - int write_protected = 0; + bool write_protected = false; slot = gfn_to_memslot(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); + write_protected |= __rmap_write_protect(kvm, rmapp, i, true); } return write_protected; @@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { u64 *sptep; - struct rmap_iterator iter; + struct rmap_iterator uninitialized_var(iter); int young = 0; /* - * Emulate the accessed bit for EPT, by checking if this page has + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has * an EPT mapping, and clearing it if it does. On the next access, * a new EPT mapping will be established. * This has some overhead, but not as much as the cost of swapping @@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); } } @@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; break; } @@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); @@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { - int protected = 0; + bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu->kvm, sp->gfn); @@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) mmu_spte_set(sptep, spte); } -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep); - --vcpu->kvm->stat.lpages; - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte, entry = *sptep; + u64 spte; int ret = 0; if (set_mmio_spte(sptep, gfn, pfn, pte_access)) @@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_x_mask; else spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) @@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto done; } - spte |= PT_WRITABLE_MASK; + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; if (!vcpu->arch.mmu.direct_map && !(pte_access & ACC_WRITE_MASK)) { @@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte &= ~PT_WRITABLE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } } @@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - mmu_spte_update(sptep, spte); - /* - * If we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. - */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); done: return ret; @@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { + mmu_free_roots(vcpu); } static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2625,18 +2685,116 @@ exit: return ret; } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +{ + /* + * #PF can be fast only if the shadow page table is present and it + * is caused by write-protect, that means we just need change the + * W bit of the spte which can be done out of mmu-lock. + */ + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + gfn_t gfn; + + WARN_ON(!sp->role.direct); + + /* + * The gfn of direct spte is stable since it is calculated + * by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + mark_page_dirty(vcpu->kvm, gfn); + + return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. + */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, + u32 error_code) +{ + struct kvm_shadow_walk_iterator iterator; + bool ret = false; + u64 spte = 0ull; + + if (!page_fault_can_be_fast(vcpu, error_code)) + return false; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || iterator.level < level) + break; + + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_rmap_spte(spte)) { + ret = true; + goto exit; + } + + if (!is_last_spte(spte, level)) + goto exit; + + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + ret = true; + goto exit; + } + + /* + * Currently, to simplify the code, only the spte write-protected + * by dirty-log can be fast fixed. + */ + if (!spte_is_locklessly_modifiable(spte)) + goto exit; + + /* + * Currently, fast page fault only works for direct mapping since + * the gfn is not stable for indirect shadow page. + * See Documentation/virtual/kvm/locking.txt to get more detail. + */ + ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); +exit: + trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + spte, ret); + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + gfn_t gfn, bool prefault) { int r; int level; int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; + bool map_writable, write = error_code & PFERR_WRITE_MASK; force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); if (likely(!force_pt_level)) { @@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, v, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); + error_code, gfn, prefault); } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, gpa, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + bool flush = false; list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; @@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp->role.level)) continue; - if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i]); - --kvm->stat.lpages; - continue; - } - - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(&pt[i], - pt[i] & ~PT_WRITABLE_MASK); + spte_write_protect(kvm, &pt[i], &flush, false); } } kvm_flush_remote_tlbs(kvm); @@ -3945,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; - struct kvm *kvm_freed = NULL; int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) @@ -3957,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * n_used_mmu_pages is accessed without holding kvm->mmu_lock + * here. We may skip a VM instance errorneosly, but we do not + * want to shrink a VM that only started to populate its MMU + * anyway. + */ + if (kvm->arch.n_used_mmu_pages > 0) { + if (!nr_to_scan--) + break; + continue; + } + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; - } - nr_to_scan--; + kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + + list_move_tail(&kvm->vm_list, &vm_list); + break; } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); raw_spin_unlock(&kvm_lock); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322..cd6e98333ba 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -54,8 +54,8 @@ */ TRACE_EVENT( kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), - TP_ARGS(addr, write_fault, user_fault, fetch_fault), + TP_PROTO(u64 addr, u32 pferr), + TP_ARGS(addr, pferr), TP_STRUCT__entry( __field(__u64, addr) @@ -64,8 +64,7 @@ TRACE_EVENT( TP_fast_assign( __entry->addr = addr; - __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) - | (!!fetch_fault << 4); + __entry->pferr = pferr; ), TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, @@ -243,6 +242,44 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); + +#define __spte_satisfied(__spte) \ + (__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gva_t, gva) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gva = gva; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; + __entry->new_spte = *sptep; + __entry->retry = retry; + ), + + TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, + __entry->gva, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f970937ef..bb7cf01cae7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, - fetch_fault); + trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: eperm = false; walker->level = mmu->root_level; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b26..baead950d6c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: return kvm_set_msr_common(vcpu, ecx, data); @@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_invpcid_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, + .invpcid_supported = svm_invpcid_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 62d02e3c3ed..a71faf727ff 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + /* * Tracepoint for nested VMRUN */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb5886629..c39b60707e0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static bool __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; @@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void) SECONDARY_EXEC_RDTSCP; } +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void) return cpu_has_vmx_rdtscp(); } +static bool vmx_invpcid_supported(void) +{ + return cpu_has_vmx_invpcid() && enable_ept; +} + /* * Swap MSR entry in host/guest MSR entry array. */ @@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_INVPCID; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2645,8 +2669,12 @@ static __init int hardware_setup(void) !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; } + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; @@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_segment var; if (enable_unrestricted_guest) return; @@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) goto continue_rmode; - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + vmx_get_segment(vcpu, &var, VCPU_SREG_SS); + vmx_set_segment(vcpu, &var, VCPU_SREG_SS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_CS); + vmx_set_segment(vcpu, &var, VCPU_SREG_CS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_ES); + vmx_set_segment(vcpu, &var, VCPU_SREG_ES); + + vmx_get_segment(vcpu, &var, VCPU_SREG_DS); + vmx_set_segment(vcpu, &var, VCPU_SREG_DS); - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + vmx_get_segment(vcpu, &var, VCPU_SREG_GS); + vmx_set_segment(vcpu, &var, VCPU_SREG_GS); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_get_segment(vcpu, &var, VCPU_SREG_FS); + vmx_set_segment(vcpu, &var, VCPU_SREG_FS); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa) /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + if (enable_ept_ad_bits) + eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; @@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) static int vmx_get_cpl(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations + * fail; use the cache instead. + */ + if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { + return vmx->cpl; + } + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + vmx->cpl = __vmx_get_cpl(vcpu); } - return to_vmx(vcpu)->cpl; + + return vmx->cpl; } @@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable) + if (var->unusable || !var->present) ar = 1 << 16; else { ar = var->type & 15; @@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) ar |= (var->db & 1) << 14; ar |= (var->g & 1) << 15; } - if (ar == 0) /* a 0 value means unusable */ - ar = AR_UNUSABLE_MASK; return ar; } @@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, ar); __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* + * Fix segments for real mode guest in hosts that don't have + * "unrestricted_mode" or it was disabled. + * This is done to allow migration of the guests from hosts with + * unrestricted guest like Westmere to older host that don't have + * unrestricted guest like Nehelem. + */ + if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + switch (seg) { + case VCPU_SREG_CS: + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_readl(GUEST_CS_BASE) >> 4); + break; + case VCPU_SREG_ES: + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + break; + case VCPU_SREG_DS: + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + break; + case VCPU_SREG_GS: + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + break; + case VCPU_SREG_FS: + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + break; + case VCPU_SREG_SS: + vmcs_write16(GUEST_SS_SELECTOR, + vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + break; + } + } } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; + /* Enable INVPCID for non-ept guests may cause performance regression. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; } vcpu->run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } @@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; + u32 error_code; int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); + + /* It is a write fault? */ + error_code = exit_qualification & (1U << 1); + /* ept page table is present? */ + error_code |= (exit_qualification >> 3) & 0x1; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } static u64 ept_rsvd_mask(u64 spte, int level) @@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int ret = 1; u32 cpu_exec_ctrl; bool intr_window_requested; + unsigned count = 130; cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu)) { - if (intr_window_requested - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + while (!guest_state_valid(vcpu) && count-- != 0) { + if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); + if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + return 1; + err = emulate_instruction(vcpu, 0); if (err == EMULATE_DO_MMIO) { @@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; + } if (signal_pending(current)) goto out; @@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = 0; + vmx->emulation_required = !guest_state_valid(vcpu); out: return ret; } @@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } } + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + /* Exposing INVPCID only when PCID is exposed */ + best = kvm_find_cpuid_entry(vcpu, 0x7, 0); + if (vmx_invpcid_supported() && + best && (best->ecx & bit(X86_FEATURE_INVPCID)) && + guest_cpuid_has_pcid(vcpu)) { + exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } else { + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + if (best) + best->ecx &= ~bit(X86_FEATURE_INVPCID); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, + .invpcid_supported = vmx_invpcid_supported, .set_supported_cpuid = vmx_set_supported_cpuid, @@ -7230,23 +7345,21 @@ static int __init vmx_init(void) if (!vmx_io_bitmap_a) return -ENOMEM; + r = -ENOMEM; + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) { - r = -ENOMEM; + if (!vmx_io_bitmap_b) goto out; - } vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) { - r = -ENOMEM; + if (!vmx_msr_bitmap_legacy) goto out1; - } + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) { - r = -ENOMEM; + if (!vmx_msr_bitmap_longmode) goto out2; - } + /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7275,8 +7388,10 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa..59b59508ff0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } + if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return 1; + kvm_x86_ops->set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { @@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; + if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { + if (!guest_cpuid_has_pcid(vcpu)) + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + return 1; + } + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - if ((cr4 ^ old_cr4) & pdptr_bits) + if (((cr4 ^ old_cr4) & pdptr_bits) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) @@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; + if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) + return 1; + } else + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) @@ -795,6 +812,7 @@ static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } return 0; @@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } @@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { - pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); return 1; } break; @@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) thus reserved and should throw a #GP */ return 1; } - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: @@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: if (data != 0) - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. @@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_pmu_set_msr(vcpu, msr, data); if (pr || data != 0) - pr_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_K7_CLK_CTL: /* @@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) @@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); return 1; } else { - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); break; } } @@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = kvm->arch.hv_hypercall; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } @@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.hv_vapic; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } *pdata = data; @@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; } else { - pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); data = 0; } break; @@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res = kvm_set_cr8(vcpu, val); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } @@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } -static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, +static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - struct kvm_cpuid_entry2 *cpuid = NULL; - - if (eax && ecx) - cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), - *eax, *ecx); - - if (cpuid) { - *eax = cpuid->eax; - *ecx = cpuid->ecx; - if (ebx) - *ebx = cpuid->ebx; - if (edx) - *edx = cpuid->edx; - return true; - } - - return false; + kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); } static struct x86_emulate_ops emulate_ops = { @@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = kvm_mmu_reload(vcpu); if (unlikely(r)) { - kvm_x86_ops->cancel_injection(vcpu); - goto out; + goto cancel_injection; } preempt_disable(); @@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); r = 1; - goto out; + goto cancel_injection; } srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.tsc_always_catchup)) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_lapic_sync_from_vapic(vcpu); + if (vcpu->arch.apic_attention) + kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(vcpu); + return r; + +cancel_injection: + kvm_x86_ops->cancel_injection(vcpu); + if (unlikely(vcpu->arch.apic_attention)) + kvm_lapic_sync_from_vapic(vcpu); out: return r; } @@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - vfree(free->arch.lpage_info[i]); + kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; } } @@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) slot->base_gfn, level) + 1; slot->arch.lpage_info[i] = - vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) goto out_free; @@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) out_free: for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->arch.lpage_info[i]); + kvm_kvfree(slot->arch.lpage_info[i]); slot->arch.lpage_info[i] = NULL; } return -ENOMEM; |