From e08b96371625aaa84cb03f51acc4c8e0be27403a Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:20 +0100 Subject: KVM: s390: add parameter for KVM_CREATE_VM This patch introduces a new config option for user controlled kernel virtual machines. It introduces a parameter to KVM_CREATE_VM that allows to set bits that alter the capabilities of the newly created virtual machine. The parameter is passed to kvm_arch_init_vm for all architectures. The only valid modifier bit for now is KVM_VM_S390_UCONTROL. This requires CAP_SYS_ADMIN privileges and creates a user controlled virtual machine on s390 architectures. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9cbfc069811..06925b4bcc2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6031,8 +6031,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); -- cgit v1.2.3-70-g09d2 From 5b1c1493afe8d69909f9df3221bb2fffdf479f4a Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 4 Jan 2012 10:25:23 +0100 Subject: KVM: s390: ucontrol: export SIE control block to user This patch exports the s390 SIE hardware control block to userspace via the mapping of the vcpu file descriptor. In order to do so, a new arch callback named kvm_arch_vcpu_fault is introduced for all architectures. It allows to map architecture specific pages. Signed-off-by: Carsten Otte Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 5 +++++ arch/ia64/kvm/kvm-ia64.c | 5 +++++ arch/powerpc/kvm/powerpc.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 5 +++++ include/linux/kvm.h | 2 ++ include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 2 +- 8 files changed, 37 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 6e53ff51422..5ebf47d99e5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -218,6 +218,11 @@ allocation of vcpu ids. For example, if userspace wants single-threaded guest vcpus, it should make all vcpu ids be a multiple of the number of vcpus per vcore. +For virtual cpus that have been created with S390 user controlled virtual +machines, the resulting vcpu fd can be memory mapped at page offset +KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual +cpu's hardware control block. + 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index df6b1419405..8ca7261e7b3 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1566,6 +1566,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 83f24456987..a5671616af8 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -659,6 +659,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { u32 inst_lis = 0x3c000000; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index af05328aca2..d6bc65aeb95 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -763,6 +763,19 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ +#ifdef CONFIG_KVM_S390_UCONTROL + if ((vmf->pgoff == KVM_S390_SIE_PAGE_OFFSET) + && (kvm_is_ucontrol(vcpu->kvm))) { + vmf->page = virt_to_page(vcpu->arch.sie_block); + get_page(vmf->page); + return 0; + } +#endif + return VM_FAULT_SIGBUS; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 06925b4bcc2..a3ce196d21f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2814,6 +2814,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 7f686f6708b..8f888df206a 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -440,6 +440,8 @@ struct kvm_ppc_pvinfo { /* machine type bits, to be used as argument to KVM_CREATE_VM */ #define KVM_VM_S390_UCONTROL 1 +#define KVM_S390_SIE_PAGE_OFFSET 1 + /* * ioctls for /dev/kvm fds: */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 82375e145e6..d4d4d709211 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -450,6 +450,7 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_dev_ioctl_check_extension(long ext); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 32e3b048a6c..64be836f334 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1657,7 +1657,7 @@ static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif else - return VM_FAULT_SIGBUS; + return kvm_arch_vcpu_fault(vcpu, vmf); get_page(page); vmf->page = page; return 0; -- cgit v1.2.3-70-g09d2 From 2b036c6b861dc5da295c6fe19a3edcff7093fdeb Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 9 Jan 2012 14:00:35 -0500 Subject: KVM: SVM: Add support for AMD's OSVW feature in guests In some cases guests should not provide workarounds for errata even when the physical processor is affected. For example, because of erratum 400 on family 10h processors a Linux guest will read an MSR (resulting in VMEXIT) before going to idle in order to avoid getting stuck in a non-C0 state. This is not necessary: HLT and IO instructions are intercepted and therefore there is no reason for erratum 400 workaround in the guest. This patch allows us to present a guest with certain errata as fixed, regardless of the state of actual hardware. Signed-off-by: Boris Ostrovsky Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 6 +++++ arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/cpuid.h | 8 ++++++ arch/x86/kvm/svm.c | 59 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 20 ++++++++++++++ 5 files changed, 94 insertions(+), 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52d6640a5ca..bd69c93da8f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -478,6 +478,12 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; } apf; + + /* OSVW MSRs (AMD only) */ + struct { + u64 length; + u64 status; + } osvw; }; struct kvm_arch { diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89b02bfaaca..9fed5bedaad 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5b97e1797a6..26d1fb437eb 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->ecx & bit(X86_FEATURE_OSVW)); +} + #endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5fa553babe5..fce3ba0f207 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -110,6 +110,12 @@ struct nested_state { #define MSRPM_OFFSETS 16 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -556,6 +562,27 @@ static void svm_init_erratum_383(void) erratum_383_found = true; } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; + vcpu->arch.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU's osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor's + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don't know). + */ + if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.osvw.status |= 1; +} + static int has_svm(void) { const char *msg; @@ -620,6 +647,36 @@ static int svm_hardware_enable(void *garbage) __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; } + + /* + * Get OSVW bits. + * + * Note that it is possible to have a system with mixed processor + * revisions and therefore different OSVW bits. If bits are not the same + * on different processors then choose the worst case (i.e. if erratum + * is present on one processor and not on another then assume that the + * erratum is present everywhere). + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { + uint64_t len, status = 0; + int err; + + len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + if (!err) + status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, + &err); + + if (err) + osvw_status = osvw_len = 0; + else { + if (len < osvw_len) + osvw_len = len; + osvw_status |= status; + osvw_status &= (1ULL << osvw_len) - 1; + } + } else + osvw_status = osvw_len = 0; + svm_init_erratum_383(); return 0; @@ -1186,6 +1243,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm_init_osvw(&svm->vcpu); + return &svm->vcpu; free_page4: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3ce196d21f..2bd77a3a41e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1675,6 +1675,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) */ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.status = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1959,6 +1969,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) */ data = 0xbe702111; break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.status; + break; default: if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); -- cgit v1.2.3-70-g09d2 From cc578287e3224d0da196cc1d226bdae6b068faa7 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:50 -0200 Subject: KVM: Infrastructure for software and hardware based TSC rate scaling This requires some restructuring; rather than use 'virtual_tsc_khz' to indicate whether hardware rate scaling is in effect, we consider each VCPU to always have a virtual TSC rate. Instead, there is new logic above the vendor-specific hardware scaling that decides whether it is even necessary to use and updates all rate variables used by common code. This means we can simply query the virtual rate at any point, which is needed for software rate scaling. There is also now a threshold added to the TSC rate scaling; minor differences and variations of measured TSC rate can accidentally provoke rate scaling to be used when it is not needed. Instead, we have a tolerance variable called tsc_tolerance_ppm, which is the maximum variation from user requested rate at which scaling will be used. The default is 250ppm, which is the half the threshold for NTP adjustment, allowing for some hardware variation. In the event that hardware rate scaling is not available, we can kludge a bit by forcing TSC catchup to turn on when a faster than hardware speed has been requested, but there is nothing available yet for the reverse case; this requires a trap and emulate software implementation for RDTSC, which is still forthcoming. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 9 +++-- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 20 ++++++---- arch/x86/kvm/vmx.c | 16 +++++--- arch/x86/kvm/x86.c | 82 +++++++++++++++++++++-------------------- 5 files changed, 71 insertions(+), 58 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 782d973b071..ddebbe01fff 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,10 +422,11 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; - u32 virtual_tsc_khz; bool tsc_catchup; - u32 tsc_catchup_mult; - s8 tsc_catchup_shift; + bool tsc_always_catchup; + s8 virtual_tsc_shift; + u32 virtual_tsc_mult; + u32 virtual_tsc_khz; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -651,7 +652,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 3ee1d83c695..72975f758c8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic) u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; if (unlikely(!tscdeadline || !this_tsc_khz)) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7bbd17cc348..e12026e5244 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -964,20 +964,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + svm->tsc_ratio = TSC_RATIO_DEFAULT; return; + } - /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ - if (user_tsc_khz == 0) { - vcpu->arch.virtual_tsc_khz = 0; - svm->tsc_ratio = TSC_RATIO_DEFAULT; + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); return; } @@ -992,7 +997,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) user_tsc_khz); return; } - vcpu->arch.virtual_tsc_khz = user_tsc_khz; svm->tsc_ratio = ratio; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b4c8d8ad90..e6bf61fa1c0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1817,13 +1817,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) } /* - * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ - * ioctl. In this case the call-back should update internal vmx state to make - * the changes effective. + * Engage any workarounds for mis-matched TSC rates. Currently limited to + * software catchup for faster rates on slower CPUs. */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - /* Nothing to do here */ + if (!scale) + return; + + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2bd77a3a41e..41bb90acb23 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -96,6 +96,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ +static u32 tsc_tolerance_ppm = 250; +module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -968,49 +972,50 @@ static inline u64 get_kernel_ns(void) static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; -static inline int kvm_tsc_changes_freq(void) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); } -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +static u32 adjust_tsc_khz(u32 khz, s32 ppm) { - if (vcpu->arch.virtual_tsc_khz) - return vcpu->arch.virtual_tsc_khz; - else - return __this_cpu_read(cpu_tsc_khz); + u64 v = (u64)khz * (1000000 + ppm); + do_div(v, 1000000); + return v; } -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { - u64 ret; + u32 thresh_lo, thresh_hi; + int use_scaling = 0; - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * vcpu_tsc_khz(vcpu); - do_div(ret, USEC_PER_SEC); - return ret; -} - -static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.tsc_catchup_shift, - &vcpu->arch.tsc_catchup_mult); + &vcpu->arch.virtual_tsc_shift, + &vcpu->arch.virtual_tsc_mult); + vcpu->arch.virtual_tsc_khz = this_tsc_khz; + + /* + * Compute the variation in TSC rate which is acceptable + * within the range of tolerance and decide if the + * rate being applied is within that bounds of the hardware + * rate. If so, no scaling or compensation need be done. + */ + thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); + thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); + if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + use_scaling = 1; + } + kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->arch.tsc_catchup_mult, - vcpu->arch.tsc_catchup_shift); + vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1077,7 +1082,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); - this_tsc_khz = vcpu_tsc_khz(v); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -2804,26 +2809,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; - if (!kvm_has_tsc_control) - break; - user_tsc_khz = (u32)arg; if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + kvm_set_tsc_khz(vcpu, user_tsc_khz); r = 0; goto out; } case KVM_GET_TSC_KHZ: { - r = -EIO; - if (check_tsc_unstable()) - goto out; - - r = vcpu_tsc_khz(vcpu); - + r = vcpu->arch.virtual_tsc_khz; goto out; } default: @@ -5312,6 +5312,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) profile_hit(KVM_PROFILING, (void *)rip); } + if (unlikely(vcpu->arch.tsc_always_catchup)) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_lapic_sync_from_vapic(vcpu); @@ -6004,7 +6006,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - kvm_init_tsc_catchup(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) -- cgit v1.2.3-70-g09d2 From 5d3cb0f6a8e3af018a522ae8d36f8f7d2511b5d8 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:51 -0200 Subject: KVM: Improve TSC offset matching There are a few improvements that can be made to the TSC offset matching code. First, we don't need to call the 128-bit multiply (especially on a constant number), the code works much nicer to do computation in nanosecond units. Second, the way everything is setup with software TSC rate scaling, we currently have per-cpu rates. Obviously this isn't too desirable to use in practice, but if for some reason we do change the rate of all VCPUs at runtime, then reset the TSCs, we will only want to match offsets for VCPUs running at the same rate. Finally, for the case where we have an unstable host TSC, but rate scaling is being done in hardware, we should call the platform code to compute the TSC offset, so the math is reorganized to recompute the base instead, then transform the base into an offset using the existing API. [avi: fix 64-bit division on i386] Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti KVM: Fix 64-bit division in kvm_write_tsc() Breaks i386 build. Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 44 +++++++++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 15 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ddebbe01fff..8a34fca6c57 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -513,6 +513,7 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; + u32 last_tsc_khz; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41bb90acb23..4390f42b371 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,33 +1025,46 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 sdiff; + s64 nsdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; + + /* n.b - signed multiplication and division required */ + nsdiff = data - kvm->arch.last_tsc_write; +#ifdef CONFIG_X86_64 + nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; +#else + /* do_div() only does unsigned */ + asm("idivl %2; xor %%edx, %%edx" + : "=A"(nsdiff) + : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); +#endif + nsdiff -= elapsed; + if (nsdiff < 0) + nsdiff = -nsdiff; /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accommodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. - */ - if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt to + * synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an unstable + * TSC, we add elapsed time in this computation. We could let the + * compensation code attempt to catch up if we fall behind, but + * it's better to try to match offsets from the beginning. + */ + if (nsdiff < NSEC_PER_SEC && + vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); - offset += delta; + data += delta; + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } ns = kvm->arch.last_tsc_nsec; @@ -1059,6 +1072,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; kvm_x86_ops->write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); -- cgit v1.2.3-70-g09d2 From 4dd7980b21408624e9b6f3df05719c3c61db6e9f Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:52 -0200 Subject: KVM: Leave TSC synchronization window open with each new sync Currently, when the TSC is written by the guest, the variable ns is updated to force the current write to appear to have taken place at the time of the first write in this sync phase. This leaves a cliff at the end of the match window where updates will fall of the end. There are two scenarios where this can be a problem in practe - first, on a system with a large number of VCPUs, the sync period may last for an extended period of time. The second way this can happen is if the VM reboots very rapidly and we catch a VCPU TSC synchronization just around the edge. We may be unaware of the reboot, and thus the first VCPU might synchronize with an old set of the timer (at, say 0.97 seconds ago, when first powered on). The second VCPU can come in 0.04 seconds later to try to synchronize, but it misses the window because it is just over the threshold. Instead, stop doing this artificial setback of the ns variable and just update it with every write of the TSC. It may be observed that doing so causes values computed by compute_guest_tsc to diverge slightly across CPUs - note that the last_tsc_ns and last_tsc_write variable are used here, and now they last_tsc_ns will be different for each VCPU, reflecting the actual time of the update. However, compute_guest_tsc is used only for guests which already have TSC stability issues, and further, note that the previous patch has caused last_tsc_write to be incremented by the difference in nanoseconds, converted back into guest cycles. As such, only boundary rounding errors should be visible, which given the resolution in nanoseconds, is going to only be a few cycles and only visible in cross-CPU consistency tests. The problem can be fixed by adding a new set of variables to track the start offset and start write value for the current sync cycle. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4390f42b371..030d495e5c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1067,7 +1067,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } - ns = kvm->arch.last_tsc_nsec; } kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; -- cgit v1.2.3-70-g09d2 From b183aa580a3a09b5d79224a9022418508532c778 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:53 -0200 Subject: KVM: Fix last_guest_tsc / tsc_offset semantics The variable last_guest_tsc was being used as an ad-hoc indicator that guest TSC has been initialized and recorded correctly. However, it may not have been, it could be that guest TSC has been set to some large value, the back to a small value (by, say, a software reboot). This defeats the logic and causes KVM to falsely assume that the guest TSC has gone backwards, marking the host TSC unstable, which is undesirable behavior. In addition, rather than try to compute an offset adjustment for the TSC on unstable platforms, just recompute the whole offset. This allows us to get rid of one callsite for adjust_tsc_offset, which is problematic because the units it takes are in guest units, but here, the computation was originally being done in host units. Doing this, and also recording last_guest_tsc when the TSC is written allow us to remove the tricky logic which depended on last_guest_tsc being zero to indicate a reset of uninitialized value. Instead, we now have the guarantee that the guest TSC offset is always at least something which will get us last_guest_tsc. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 030d495e5c7..2a59f76d96f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1079,6 +1079,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.hv_clock.tsc_timestamp = 0; vcpu->arch.last_tsc_write = data; vcpu->arch.last_tsc_nsec = ns; + vcpu->arch.last_guest_tsc = data; } EXPORT_SYMBOL_GPL(kvm_write_tsc); @@ -1147,7 +1148,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * observed by the guest and ensure the new system time is greater. */ max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + if (vcpu->hv_clock.tsc_timestamp) { max_kernel_ns = vcpu->last_guest_tsc - vcpu->hv_clock.tsc_timestamp; max_kernel_ns = pvclock_scale_delta(max_kernel_ns, @@ -2257,13 +2258,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u64 tsc; tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : - tsc - vcpu->arch.last_guest_tsc; + tsc_delta = tsc - vcpu->arch.last_guest_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + vcpu->arch.last_guest_tsc); + kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); -- cgit v1.2.3-70-g09d2 From 6f526ec5383dcd5fa5ffc7b3ac1d62099a0b46ad Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:54 -0200 Subject: KVM: Add last_host_tsc tracking back to KVM The variable last_host_tsc was removed from upstream code. I am adding it back for two reasons. First, it is unnecessary to use guest TSC computation to conclude information about the host TSC. The guest may set the TSC backwards (this case handled by the previous patch), but the computation of guest TSC (and fetching an MSR) is significanlty more work and complexity than simply reading the hardware counter. In addition, we don't actually need the guest TSC for any part of the computation, by always recomputing the offset, we can eliminate the need to deal with the current offset and any scaling factors that may apply. The second reason is that later on, we are going to be using the host TSC value to restore TSC offsets after a host S4 suspend, so we need to be reading the host values, not the guest values here. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8a34fca6c57..b23682900f4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -422,6 +422,7 @@ struct kvm_vcpu_arch { u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u64 last_host_tsc; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2a59f76d96f..39a57dac884 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2253,13 +2253,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta; - u64 tsc; - - tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = tsc - vcpu->arch.last_guest_tsc; - + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2282,7 +2277,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3-70-g09d2 From f1e2b26003c41e581243c09ceed7567677449468 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 3 Feb 2012 15:43:55 -0200 Subject: KVM: Allow adjust_tsc_offset to be in host or guest cycles Redefine the API to take a parameter indicating whether an adjustment is in host or guest cycles. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 13 ++++++++++++- arch/x86/kvm/svm.c | 6 +++++- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b23682900f4..dd439f13df8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -646,7 +646,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -676,6 +676,17 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e12026e5244..0b7690ee20b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1016,10 +1016,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(adjustment < 0); + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e6bf61fa1c0..575fb742a6f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1856,7 +1856,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); vmcs_write64(TSC_OFFSET, offset + adjustment); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 39a57dac884..3b931302fa5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1116,7 +1116,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } -- cgit v1.2.3-70-g09d2 From 0dd6a6edb0124e6c71931ff575b18e15ed6e8603 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:56 -0200 Subject: KVM: Dont mark TSC unstable due to S4 suspend During a host suspend, TSC may go backwards, which KVM interprets as an unstable TSC. Technically, KVM should not be marking the TSC unstable, which causes the TSC clocksource to go bad, but we need to be adjusting the TSC offsets in such a case. Dealing with this issue is a little tricky as the only place we can reliably do it is before much of the timekeeping infrastructure is up and running. On top of this, we are not in a KVM thread context, so we may not be able to safely access VCPU fields. Instead, we compute our best known hardware offset at power-up and stash it to be applied to all VCPUs when they actually start running. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 93 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 89 insertions(+), 5 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd439f13df8..4fbeb84b181 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -423,6 +423,7 @@ struct kvm_vcpu_arch { u64 last_tsc_nsec; u64 last_tsc_write; u64 last_host_tsc; + u64 tsc_offset_adjustment; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3b931302fa5..4e9bd23d522 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2252,6 +2252,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : native_read_tsc() - vcpu->arch.last_host_tsc; @@ -5964,13 +5972,88 @@ int kvm_arch_hardware_enable(void *garbage) struct kvm *kvm; struct kvm_vcpu *vcpu; int i; + int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(garbage); + if (ret != 0) + return ret; + + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unnreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. + */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } + return 0; } void kvm_arch_hardware_disable(void *garbage) -- cgit v1.2.3-70-g09d2 From e26101b116a6235bcd80b3a4c38c9fe91286cd79 Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Fri, 3 Feb 2012 15:43:57 -0200 Subject: KVM: Track TSC synchronization in generations This allows us to track the original nanosecond and counter values at each phase of TSC writing by the guest. This gets us perfect offset matching for stable TSC systems, and perfect software computed TSC matching for machines with unstable TSC. Signed-off-by: Zachary Amsden Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 10 +++++++--- arch/x86/kvm/x86.c | 41 +++++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 11 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4fbeb84b181..c24125cd0c6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -420,10 +420,11 @@ struct kvm_vcpu_arch { u64 last_guest_tsc; u64 last_kernel_ns; - u64 last_tsc_nsec; - u64 last_tsc_write; u64 last_host_tsc; u64 tsc_offset_adjustment; + u64 this_tsc_nsec; + u64 this_tsc_write; + u8 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -513,9 +514,12 @@ struct kvm_arch { s64 kvmclock_offset; raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; - u64 last_tsc_offset; u64 last_tsc_write; u32 last_tsc_khz; + u64 cur_tsc_nsec; + u64 cur_tsc_write; + u64 cur_tsc_offset; + u8 cur_tsc_generation; struct kvm_xen_hvm_config xen_hvm_config; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4e9bd23d522..e86f9b22eac 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1013,10 +1013,10 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, vcpu->arch.virtual_tsc_mult, vcpu->arch.virtual_tsc_shift); - tsc += vcpu->arch.last_tsc_write; + tsc += vcpu->arch.this_tsc_write; return tsc; } @@ -1059,7 +1059,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) if (nsdiff < NSEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; + offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); @@ -1067,20 +1067,45 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } + } else { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computaion in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = data; + kvm->arch.cur_tsc_offset = offset; + pr_debug("kvm: new tsc generation %u, clock %llu\n", + kvm->arch.cur_tsc_generation, data); } + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; - kvm_x86_ops->write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; vcpu->arch.last_guest_tsc = data; + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_x86_ops->write_tsc_offset(vcpu, offset); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } + EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_guest_time_update(struct kvm_vcpu *v) -- cgit v1.2.3-70-g09d2 From 6dbf79e7164e9a86c1e466062c48498142ae6128 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sun, 5 Feb 2012 20:42:41 +0900 Subject: KVM: Fix write protection race during dirty logging This patch fixes a race introduced by: commit 95d4c16ce78cb6b7549a09159c409d52ddd18dae KVM: Optimize dirty logging by rmap_write_protect() During protecting pages for dirty logging, other threads may also try to protect a page in mmu_sync_children() or kvm_mmu_get_page(). In such a case, because get_dirty_log releases mmu_lock before flushing TLB's, the following race condition can happen: A (get_dirty_log) B (another thread) lock(mmu_lock) clear pte.w unlock(mmu_lock) lock(mmu_lock) pte.w is already cleared unlock(mmu_lock) skip TLB flush return ... TLB flush Though thread B assumes the page has already been protected when it returns, the remaining TLB entry will break that assumption. This patch fixes this problem by making get_dirty_log hold the mmu_lock until it flushes the TLB's. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e86f9b22eac..3df0b7a140b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3065,6 +3065,8 @@ static void write_protect_slot(struct kvm *kvm, unsigned long *dirty_bitmap, unsigned long nr_dirty_pages) { + spin_lock(&kvm->mmu_lock); + /* Not many dirty pages compared to # of shadow pages. */ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { unsigned long gfn_offset; @@ -3072,16 +3074,13 @@ static void write_protect_slot(struct kvm *kvm, for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { unsigned long gfn = memslot->base_gfn + gfn_offset; - spin_lock(&kvm->mmu_lock); kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - spin_unlock(&kvm->mmu_lock); } kvm_flush_remote_tlbs(kvm); - } else { - spin_lock(&kvm->mmu_lock); + } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); - spin_unlock(&kvm->mmu_lock); - } + + spin_unlock(&kvm->mmu_lock); } /* -- cgit v1.2.3-70-g09d2 From db3fe4eb45f3555d91a7124e18cf3a2f2a30eb90 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 8 Feb 2012 13:02:18 +0900 Subject: KVM: Introduce kvm_memory_slot::arch and move lpage_info into it Some members of kvm_memory_slot are not used by every architecture. This patch is the first step to make this difference clear by introducing kvm_memory_slot::arch; lpage_info is moved into it. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/include/asm/kvm_host.h | 3 ++ arch/ia64/kvm/kvm-ia64.c | 10 ++++++ arch/powerpc/include/asm/kvm_host.h | 3 ++ arch/powerpc/kvm/powerpc.c | 10 ++++++ arch/s390/include/asm/kvm_host.h | 3 ++ arch/s390/kvm/kvm-s390.c | 10 ++++++ arch/x86/include/asm/kvm_host.h | 9 +++++ arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/x86.c | 59 +++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 11 +++--- virt/kvm/kvm_main.c | 70 +++++-------------------------------- 11 files changed, 122 insertions(+), 68 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h index 2689ee54a1c..e35b3a84a40 100644 --- a/arch/ia64/include/asm/kvm_host.h +++ b/arch/ia64/include/asm/kvm_host.h @@ -459,6 +459,9 @@ struct kvm_sal_data { unsigned long boot_gp; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch { spinlock_t dirty_log_lock; diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 8ca7261e7b3..d8ddbba6fe7 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1571,6 +1571,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1843d5d2a3b..52eb9c1f4fe 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -213,6 +213,9 @@ struct revmap_entry { #define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ #define KVMPPC_GOT_PAGE 0x80 +struct kvm_arch_memory_slot { +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 0e21d155eea..00d7e345b3f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -281,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index e6304268ea2..7343872890a 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -245,6 +245,9 @@ struct kvm_vm_stat { u32 remote_tlb_flush; }; +struct kvm_arch_memory_slot { +}; + struct kvm_arch{ struct sca_block *sca; debug_info_t *dbf; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index cf3c0a91d04..17ad69d596f 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -814,6 +814,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c24125cd0c6..74c9edf2bb1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -483,6 +483,15 @@ struct kvm_vcpu_arch { } osvw; }; +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + +struct kvm_arch_memory_slot { + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 37e7f100a0e..ff053ca3230 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -689,7 +689,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->lpage_info[level - 2][idx]; + return &slot->arch.lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3df0b7a140b..ca74c1dadf3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6239,6 +6239,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.ept_identity_pagetable); } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { + vfree(free->arch.lpage_info[i]); + free->arch.lpage_info[i] = NULL; + } + } +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 2; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.lpage_info[i] = + vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + if (!slot->arch.lpage_info[i]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !kvm_largepages_enabled()) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->arch.lpage_info[i][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + vfree(slot->arch.lpage_info[i]); + slot->arch.lpage_info[i] = NULL; + } + return -ENOMEM; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7a08496b974..355e44555c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -171,11 +171,6 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) */ #define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1) -struct kvm_lpage_info { - unsigned long rmap_pde; - int write_count; -}; - struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; @@ -184,7 +179,7 @@ struct kvm_memory_slot { unsigned long *dirty_bitmap; unsigned long *dirty_bitmap_head; unsigned long nr_dirty_pages; - struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; + struct kvm_arch_memory_slot arch; unsigned long userspace_addr; int user_alloc; int id; @@ -376,6 +371,9 @@ int kvm_set_memory_region(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont); +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, @@ -385,6 +383,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old, int user_alloc); +bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); void kvm_arch_flush_shadow(struct kvm *kvm); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a30447c5eb4..8340e0e6203 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -535,21 +535,13 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - int i; - if (!dont || free->rmap != dont->rmap) vfree(free->rmap); if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { - vfree(free->lpage_info[i]); - free->lpage_info[i] = NULL; - } - } + kvm_arch_free_memslot(free, dont); free->npages = 0; free->rmap = NULL; @@ -685,53 +677,6 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) slots->generation++; } -#ifndef CONFIG_S390 -static int create_lpage_info(struct kvm_memory_slot *slot, unsigned long npages) -{ - int i; - - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - unsigned long ugfn; - int lpages; - int level = i + 2; - - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; - - slot->lpage_info[i] = vzalloc(lpages * sizeof(*slot->lpage_info[i])); - if (!slot->lpage_info[i]) - goto out_free; - - if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][0].write_count = 1; - if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->lpage_info[i][lpages - 1].write_count = 1; - ugfn = slot->userspace_addr >> PAGE_SHIFT; - /* - * If the gfn and userspace address are not aligned wrt each - * other, or if explicitly asked to, disable large page - * support for this slot - */ - if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || - !largepages_enabled) { - unsigned long j; - - for (j = 0; j < lpages; ++j) - slot->lpage_info[i][j].write_count = 1; - } - } - - return 0; - -out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->lpage_info[i]); - slot->lpage_info[i] = NULL; - } - return -ENOMEM; -} -#endif /* not defined CONFIG_S390 */ - /* * Allocate some memory and give it an address in the guest physical address * space. @@ -819,10 +764,9 @@ int __kvm_set_memory_region(struct kvm *kvm, new.rmap = vzalloc(npages * sizeof(*new.rmap)); if (!new.rmap) goto out_free; - - if (create_lpage_info(&new, npages)) - goto out_free; #endif /* not defined CONFIG_S390 */ + if (kvm_arch_create_memslot(&new, npages)) + goto out_free; } /* Allocate page dirty bitmap if needed */ @@ -880,8 +824,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (!npages) { new.rmap = NULL; new.dirty_bitmap = NULL; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) - new.lpage_info[i] = NULL; + memset(&new.arch, 0, sizeof(new.arch)); } update_memslots(slots, &new); @@ -968,6 +911,11 @@ out: return r; } +bool kvm_largepages_enabled(void) +{ + return largepages_enabled; +} + void kvm_disable_largepages(void) { largepages_enabled = false; -- cgit v1.2.3-70-g09d2 From 7f3d35fddd173e52886d03bc34b5b5d6f5bea343 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:38 +0100 Subject: KVM: x86 emulator: Fix task switch privilege checks Currently, all task switches check privileges against the DPL of the TSS. This is only correct for jmp/call to a TSS. If a task gate is used, the DPL of this take gate is used for the check instead. Exceptions, external interrupts and iret shouldn't perform any check. [avi: kill kvm-kmod remnants] Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/include/asm/kvm_host.h | 4 +-- arch/x86/kvm/emulate.c | 53 +++++++++++++++++++++++++++++++++----- arch/x86/kvm/svm.c | 5 +++- arch/x86/kvm/vmx.c | 8 +++--- arch/x86/kvm/x86.c | 6 ++--- 6 files changed, 61 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7b9cfc4878a..df437b68f42 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -388,7 +388,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74c9edf2bb1..e216ba066e7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -768,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 71450aca3b8..fa310a48591 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1152,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } +static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, + u16 index, struct desc_struct *desc) +{ + struct desc_ptr dt; + ulong addr; + + ctxt->ops->get_idt(ctxt, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, index << 3 | 0x2); + + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); +} + static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { @@ -2421,7 +2437,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2443,12 +2459,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, /* FIXME: check that next_tss_desc is tss */ - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. jmp/call to TSS: Check agains DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl; + + ret = read_interrupt_descriptor(ctxt, idt_index, + &task_gate_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + dpl = task_gate_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, (idt_index << 3) | 0x2); + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl = next_tss_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, tss_selector); } + desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2501,7 +2540,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; @@ -2509,7 +2548,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0b7690ee20b..95cdeaf9c71 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2799,7 +2799,10 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d2bd719925a..124a0952a04 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4658,9 +4658,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) bool has_error_code = false; u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4698,8 +4699,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca74c1dadf3..490a1b1a255 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5655,15 +5655,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(ctxt, tss_selector, reason, + ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (ret) -- cgit v1.2.3-70-g09d2 From 4cee4798a304ee1ea579423ca048f16ceaccdfb5 Mon Sep 17 00:00:00 2001 From: Kevin Wolf Date: Wed, 8 Feb 2012 14:34:41 +0100 Subject: KVM: x86 emulator: Allow PM/VM86 switch during task switch Task switches can switch between Protected Mode and VM86. The current mode must be updated during the task switch emulation so that the new segment selectors are interpreted correctly. In order to let privilege checks succeed, rflags needs to be updated in the vcpu struct as this causes a CPL update. Signed-off-by: Kevin Wolf Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 20 ++++++++++++++++++++ arch/x86/kvm/svm.c | 4 ++++ arch/x86/kvm/x86.c | 6 ++++++ 4 files changed, 31 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index df437b68f42..c222e1a1b12 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,6 +176,7 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b19e9fffe58..83756223f8a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2344,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; + + /* General purpose registers */ ctxt->regs[VCPU_REGS_RAX] = tss->eax; ctxt->regs[VCPU_REGS_RCX] = tss->ecx; ctxt->regs[VCPU_REGS_RDX] = tss->edx; @@ -2365,6 +2367,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); + /* + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. + * + * Need to get rflags to the vcpu struct immediately because it + * influences the CPL which is checked at least when loading the segment + * descriptors and when pushing an error code to the new kernel stack. + * + * TODO Introduce a separate ctxt->ops->set_cpl callback + */ + if (ctxt->eflags & X86_EFLAGS_VM) + ctxt->mode = X86EMUL_MODE_VM86; + else + ctxt->mode = X86EMUL_MODE_PROT32; + + ctxt->ops->set_rflags(ctxt, ctxt->eflags); + /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ab39d84dee0..53efd597f39 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1354,7 +1354,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; + if ((old_rflags ^ rflags) & X86_EFLAGS_VM) + svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 490a1b1a255..03a1fd47a6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4129,6 +4129,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } +static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) +{ + kvm_set_rflags(emul_to_vcpu(ctxt), val); +} + static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4310,6 +4315,7 @@ static struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, + .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, -- cgit v1.2.3-70-g09d2 From 3e515705a1f46beb1c942bb8043c16f8ac7b1e9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 5 Mar 2012 14:23:29 +0200 Subject: KVM: Ensure all vcpus are consistent with in-kernel irqchip settings If some vcpus are created before KVM_CREATE_IRQCHIP, then irqchip_in_kernel() and vcpu->arch.apic will be inconsistent, leading to potential NULL pointer dereferences. Fix by: - ensuring that no vcpus are installed when KVM_CREATE_IRQCHIP is called - ensuring that a vcpu has an apic if it is installed after KVM_CREATE_IRQCHIP This is somewhat long winded because vcpu->arch.apic is created without kvm->lock held. Based on earlier patch by Michael Ellerman. Signed-off-by: Michael Ellerman Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 5 +++++ arch/x86/kvm/x86.c | 8 ++++++++ include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 4 ++++ 4 files changed, 24 insertions(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index d8ddbba6fe7..f5104b7c52c 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1172,6 +1172,11 @@ out: #define PALE_RESET_ENTRY 0x80000000ffffffb0UL +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kcm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu *v; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 03a1fd47a6d..9477dc6ccca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3199,6 +3199,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EEXIST; if (kvm->arch.vpic) goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); if (vpic) { @@ -6107,6 +6110,11 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 355e44555c3..e42d85ae854 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -805,6 +805,13 @@ static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) { return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id; } + +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu); + +#else + +static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; } + #endif #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e4431ada594..94e148e3871 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1651,6 +1651,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) goto vcpu_destroy; mutex_lock(&kvm->lock); + if (!kvm_vcpu_compatible(vcpu)) { + r = -EINVAL; + goto unlock_vcpu_destroy; + } if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { r = -EINVAL; goto unlock_vcpu_destroy; -- cgit v1.2.3-70-g09d2 From 07700a94b00a4fcbbfb07d1b72dc112a0e036735 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 28 Feb 2012 14:19:54 +0100 Subject: KVM: Allow host IRQ sharing for assigned PCI 2.3 devices PCI 2.3 allows to generically disable IRQ sources at device level. This enables us to share legacy IRQs of such devices with other host devices when passing them to a guest. The new IRQ sharing feature introduced here is optional, user space has to request it explicitly. Moreover, user space can inform us about its view of PCI_COMMAND_INTX_DISABLE so that we can avoid unmasking the interrupt and signaling it if the guest masked it via the virtualized PCI config space. Signed-off-by: Jan Kiszka Acked-by: Alex Williamson Acked-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 41 ++++++++ arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 6 ++ include/linux/kvm_host.h | 2 + virt/kvm/assigned-dev.c | 209 ++++++++++++++++++++++++++++++++------ 5 files changed, 230 insertions(+), 29 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 59a38264a0e..6386f8c0482 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1169,6 +1169,14 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +/* The following two depend on KVM_CAP_PCI_2_3 */ +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) + +If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts +via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other +assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the +guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure isolation of the device. Usages not specifying this flag are deprecated. @@ -1441,6 +1449,39 @@ The "num_dirty" field is a performance hint for KVM to determine whether it should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. +4.60 KVM_ASSIGN_SET_INTX_MASK + +Capability: KVM_CAP_PCI_2_3 +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_assigned_pci_dev (in) +Returns: 0 on success, -1 on error + +Allows userspace to mask PCI INTx interrupts from the assigned device. The +kernel will not deliver INTx interrupts to the guest between setting and +clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of +and emulation of PCI 2.3 INTx disable command register behavior. + +This may be used for both PCI 2.3 devices supporting INTx disable natively and +older devices lacking this support. Userspace is responsible for emulating the +read value of the INTx disable bit in the guest visible PCI command register. +When modifying the INTx disable state, userspace should precede updating the +physical device command register by calling this ioctl to inform the kernel of +the new intended INTx mask state. + +Note that the kernel uses the device INTx disable bit to internally manage the +device interrupt state for PCI 2.3 devices. Reads of this register may +therefore not match the expected value. Writes should always use the guest +intended INTx disable value rather than attempting to read-copy-update the +current physical device state. Races between user and kernel updates to the +INTx disable bit are handled lazily in the kernel. It's possible the device +may generate unintended interrupts, but they will not be injected into the +guest. + +See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified +by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is +evaluated. + 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9477dc6ccca..6866083a48c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2143,6 +2143,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: + case KVM_CAP_PCI_2_3: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index acbe4293908..6c322a90b92 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -588,6 +588,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_TSC_DEADLINE_TIMER 72 #define KVM_CAP_S390_UCONTROL 73 #define KVM_CAP_SYNC_REGS 74 +#define KVM_CAP_PCI_2_3 75 #ifdef KVM_CAP_IRQ_ROUTING @@ -784,6 +785,9 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_TSC_CONTROL */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) +/* Available with KVM_CAP_PCI_2_3 */ +#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ + struct kvm_assigned_pci_dev) /* * ioctls for vcpu fds @@ -857,6 +861,8 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_ONE_REG _IOW(KVMIO, 0xac, struct kvm_one_reg) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) +#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) struct kvm_assigned_pci_dev { __u32 assigned_dev_id; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e42d85ae854..ec171c1d087 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -546,6 +546,7 @@ struct kvm_assigned_dev_kernel { unsigned int entries_nr; int host_irq; bool host_irq_disabled; + bool pci_2_3; struct msix_entry *host_msix_entries; int guest_irq; struct msix_entry *guest_msix_entries; @@ -555,6 +556,7 @@ struct kvm_assigned_dev_kernel { struct pci_dev *dev; struct kvm *kvm; spinlock_t intx_lock; + struct mutex intx_mask_lock; char irq_name[32]; struct pci_saved_state *pci_saved_state; }; diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ece80612b59..08e05715df7 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -55,22 +55,66 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel return index; } -static irqreturn_t kvm_assigned_dev_thread(int irq, void *dev_id) +static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) { struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + int ret; - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_INTX) { - spin_lock(&assigned_dev->intx_lock); + spin_lock(&assigned_dev->intx_lock); + if (pci_check_and_mask_intx(assigned_dev->dev)) { + assigned_dev->host_irq_disabled = true; + ret = IRQ_WAKE_THREAD; + } else + ret = IRQ_NONE; + spin_unlock(&assigned_dev->intx_lock); + + return ret; +} + +static void +kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, + int vector) +{ + if (unlikely(assigned_dev->irq_requested_type & + KVM_DEV_IRQ_GUEST_INTX)) { + mutex_lock(&assigned_dev->intx_mask_lock); + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) + kvm_set_irq(assigned_dev->kvm, + assigned_dev->irq_source_id, vector, 1); + mutex_unlock(&assigned_dev->intx_mask_lock); + } else + kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, + vector, 1); +} + +static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); disable_irq_nosync(irq); assigned_dev->host_irq_disabled = true; - spin_unlock(&assigned_dev->intx_lock); + spin_unlock_irq(&assigned_dev->intx_lock); } - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); + + return IRQ_HANDLED; +} + +#ifdef __KVM_HAVE_MSI +static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; + + kvm_assigned_dev_raise_guest_irq(assigned_dev, + assigned_dev->guest_irq); return IRQ_HANDLED; } +#endif #ifdef __KVM_HAVE_MSIX static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) @@ -81,8 +125,7 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) if (index >= 0) { vector = assigned_dev->guest_msix_entries[index].vector; - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1); + kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); } return IRQ_HANDLED; @@ -98,15 +141,31 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); - /* The guest irq may be shared so this ack may be - * from another device. - */ - spin_lock(&dev->intx_lock); - if (dev->host_irq_disabled) { - enable_irq(dev->host_irq); - dev->host_irq_disabled = false; + mutex_lock(&dev->intx_mask_lock); + + if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { + bool reassert = false; + + spin_lock_irq(&dev->intx_lock); + /* + * The guest IRQ may be shared so this ack can come from an + * IRQ for another guest device. + */ + if (dev->host_irq_disabled) { + if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) + enable_irq(dev->host_irq); + else if (!pci_check_and_unmask_intx(dev->dev)) + reassert = true; + dev->host_irq_disabled = reassert; + } + spin_unlock_irq(&dev->intx_lock); + + if (reassert) + kvm_set_irq(dev->kvm, dev->irq_source_id, + dev->guest_irq, 1); } - spin_unlock(&dev->intx_lock); + + mutex_unlock(&dev->intx_mask_lock); } static void deassign_guest_irq(struct kvm *kvm, @@ -154,7 +213,15 @@ static void deassign_host_irq(struct kvm *kvm, pci_disable_msix(assigned_dev->dev); } else { /* Deal with MSI and INTx */ - disable_irq(assigned_dev->host_irq); + if ((assigned_dev->irq_requested_type & + KVM_DEV_IRQ_HOST_INTX) && + (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + spin_lock_irq(&assigned_dev->intx_lock); + pci_intx(assigned_dev->dev, false); + spin_unlock_irq(&assigned_dev->intx_lock); + synchronize_irq(assigned_dev->host_irq); + } else + disable_irq(assigned_dev->host_irq); free_irq(assigned_dev->host_irq, assigned_dev); @@ -235,15 +302,34 @@ void kvm_free_all_assigned_devices(struct kvm *kvm) static int assigned_device_enable_host_intx(struct kvm *kvm, struct kvm_assigned_dev_kernel *dev) { + irq_handler_t irq_handler; + unsigned long flags; + dev->host_irq = dev->dev->irq; - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. + + /* + * We can only share the IRQ line with other host devices if we are + * able to disable the IRQ source at device-level - independently of + * the guest driver. Otherwise host devices may suffer from unbounded + * IRQ latencies when the guest keeps the line asserted. */ - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - IRQF_ONESHOT, dev->irq_name, dev)) + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + irq_handler = kvm_assigned_dev_intx; + flags = IRQF_SHARED; + } else { + irq_handler = NULL; + flags = IRQF_ONESHOT; + } + if (request_threaded_irq(dev->host_irq, irq_handler, + kvm_assigned_dev_thread_intx, flags, + dev->irq_name, dev)) return -EIO; + + if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { + spin_lock_irq(&dev->intx_lock); + pci_intx(dev->dev, true); + spin_unlock_irq(&dev->intx_lock); + } return 0; } @@ -260,8 +346,9 @@ static int assigned_device_enable_host_msi(struct kvm *kvm, } dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, NULL, kvm_assigned_dev_thread, - 0, dev->irq_name, dev)) { + if (request_threaded_irq(dev->host_irq, NULL, + kvm_assigned_dev_thread_msi, 0, + dev->irq_name, dev)) { pci_disable_msi(dev->dev); return -EIO; } @@ -319,7 +406,6 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; - dev->host_irq_disabled = false; return 0; } #endif @@ -331,7 +417,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm, { dev->guest_irq = irq->guest_irq; dev->ack_notifier.gsi = -1; - dev->host_irq_disabled = false; return 0; } #endif @@ -365,6 +450,7 @@ static int assign_host_irq(struct kvm *kvm, default: r = -EINVAL; } + dev->host_irq_disabled = false; if (!r) dev->irq_requested_type |= host_irq_type; @@ -466,6 +552,7 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, { int r = -ENODEV; struct kvm_assigned_dev_kernel *match; + unsigned long irq_type; mutex_lock(&kvm->lock); @@ -474,7 +561,9 @@ static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, if (!match) goto out; - r = kvm_deassign_irq(kvm, match, assigned_irq->flags); + irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | + KVM_DEV_IRQ_GUEST_MASK); + r = kvm_deassign_irq(kvm, match, irq_type); out: mutex_unlock(&kvm->lock); return r; @@ -607,6 +696,10 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, if (!match->pci_saved_state) printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", __func__, dev_name(&dev->dev)); + + if (!pci_intx_mask_supported(dev)) + assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; + match->assigned_dev_id = assigned_dev->assigned_dev_id; match->host_segnr = assigned_dev->segnr; match->host_busnr = assigned_dev->busnr; @@ -614,6 +707,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, match->flags = assigned_dev->flags; match->dev = dev; spin_lock_init(&match->intx_lock); + mutex_init(&match->intx_mask_lock); match->irq_source_id = -1; match->kvm = kvm; match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; @@ -759,6 +853,55 @@ msix_entry_out: } #endif +static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (!match) { + r = -ENODEV; + goto out; + } + + mutex_lock(&match->intx_mask_lock); + + match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; + match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; + + if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { + if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { + kvm_set_irq(match->kvm, match->irq_source_id, + match->guest_irq, 0); + /* + * Masking at hardware-level is performed on demand, + * i.e. when an IRQ actually arrives at the host. + */ + } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { + /* + * Unmask the IRQ line if required. Unmasking at + * device level will be performed by user space. + */ + spin_lock_irq(&match->intx_lock); + if (match->host_irq_disabled) { + enable_irq(match->host_irq); + match->host_irq_disabled = false; + } + spin_unlock_irq(&match->intx_lock); + } + } + + mutex_unlock(&match->intx_mask_lock); + +out: + mutex_unlock(&kvm->lock); + return r; +} + long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, unsigned long arg) { @@ -866,6 +1009,15 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, break; } #endif + case KVM_ASSIGN_SET_INTX_MASK: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); + break; + } default: r = -ENOTTY; break; @@ -873,4 +1025,3 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, out: return r; } - -- cgit v1.2.3-70-g09d2 From a223c313cb13e9ab71051fc5b70610a2829a4082 Mon Sep 17 00:00:00 2001 From: Nicolae Mogoreanu Date: Tue, 21 Feb 2012 13:44:21 -0800 Subject: KVM: Ignore the writes to MSR_K7_HWCR(3) When CPUID Fn8000_0001_EAX reports 0x00100f22 Windows 7 x64 guest tries to set bit 3 in MSRC001_0015 in nt!KiDisableCacheErrataSource and fails. This patch will ignore this step and allow things to move on without having to fake CPUID value. Signed-off-by: Nicolae Mogoreanu Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6866083a48c..32096cf6c6c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1547,6 +1547,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); -- cgit v1.2.3-70-g09d2 From 02626b6af5d2bc62db3bb85fc2891b2725535d44 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 8 Mar 2012 18:46:57 -0300 Subject: KVM: x86: fix kvm_write_tsc() TSC matching thinko kvm_write_tsc() converts from guest TSC to microseconds, not nanoseconds as intended. The result is that the window for matching is 1000 seconds, not 1 second. Microsecond precision is enough for checking whether the TSC write delta is within the heuristic values, so use it instead of nanoseconds. Noted by Avi Kivity. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/x86.c') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 32096cf6c6c..7287812eeb7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1025,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 nsdiff; + s64 usdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); @@ -1033,18 +1033,19 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) elapsed = ns - kvm->arch.last_tsc_nsec; /* n.b - signed multiplication and division required */ - nsdiff = data - kvm->arch.last_tsc_write; + usdiff = data - kvm->arch.last_tsc_write; #ifdef CONFIG_X86_64 - nsdiff = (nsdiff * 1000) / vcpu->arch.virtual_tsc_khz; + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; #else /* do_div() only does unsigned */ asm("idivl %2; xor %%edx, %%edx" - : "=A"(nsdiff) - : "A"(nsdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); + : "=A"(usdiff) + : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); #endif - nsdiff -= elapsed; - if (nsdiff < 0) - nsdiff = -nsdiff; + do_div(elapsed, 1000); + usdiff -= elapsed; + if (usdiff < 0) + usdiff = -usdiff; /* * Special case: TSC write with a small delta (1 second) of virtual @@ -1056,7 +1057,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ - if (nsdiff < NSEC_PER_SEC && + if (usdiff < USEC_PER_SEC && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; -- cgit v1.2.3-70-g09d2