From d19a748b1c42b133e9263e9023c1d162efa6f4ad Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Mon, 2 Jul 2012 17:54:30 +0900 Subject: KVM: Introduce hva_to_gfn_memslot() for kvm_handle_hva() This restricts hva handling in mmu code and makes it easier to extend kvm_handle_hva() so that it can treat a range of addresses later in this patch series. Signed-off-by: Takuya Yoshikawa Cc: Alexander Graf Cc: Paul Mackerras Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e3c86f8c86c..6f6c18a03c5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -740,6 +740,14 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); } +static inline gfn_t +hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) +{ + gfn_t gfn_offset = (hva - slot->userspace_addr) >> PAGE_SHIFT; + + return slot->base_gfn + gfn_offset; +} + static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { -- cgit v1.2.3-70-g09d2 From 903816fa4d016e20ec71a1a97700cfcdda115580 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:54:11 +0800 Subject: KVM: using get_fault_pfn to get the fault pfn Using get_fault_pfn to cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 6 ++---- include/linux/kvm_host.h | 5 +---- virt/kvm/kvm_main.c | 13 ++++--------- 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 685a4855738..f85cc21ae95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2513,10 +2513,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); - if (!slot) { - get_page(fault_page); - return page_to_pfn(fault_page); - } + if (!slot) + return get_fault_pfn(); hva = gfn_to_hva_memslot(slot, gfn); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6f6c18a03c5..1a7f838d30c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -383,15 +383,11 @@ id_to_memslot(struct kvm_memslots *slots, int id) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } extern struct page *bad_page; -extern struct page *fault_page; - extern pfn_t bad_pfn; -extern pfn_t fault_pfn; int is_error_page(struct page *page); int is_error_pfn(pfn_t pfn); int is_hwpoison_pfn(pfn_t pfn); -int is_fault_pfn(pfn_t pfn); int is_noslot_pfn(pfn_t pfn); int is_invalid_pfn(pfn_t pfn); int kvm_is_error_hva(unsigned long addr); @@ -441,6 +437,7 @@ void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); void kvm_set_pfn_accessed(pfn_t pfn); void kvm_get_pfn(pfn_t pfn); +pfn_t get_fault_pfn(void); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e2b1a159e5d..0fbbf2d2160 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -103,8 +103,8 @@ static bool largepages_enabled = true; static struct page *hwpoison_page; static pfn_t hwpoison_pfn; -struct page *fault_page; -pfn_t fault_pfn; +static struct page *fault_page; +static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { @@ -949,12 +949,6 @@ int is_hwpoison_pfn(pfn_t pfn) } EXPORT_SYMBOL_GPL(is_hwpoison_pfn); -int is_fault_pfn(pfn_t pfn) -{ - return pfn == fault_pfn; -} -EXPORT_SYMBOL_GPL(is_fault_pfn); - int is_noslot_pfn(pfn_t pfn) { return pfn == bad_pfn; @@ -1038,11 +1032,12 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); -static pfn_t get_fault_pfn(void) +pfn_t get_fault_pfn(void) { get_page(fault_page); return fault_pfn; } +EXPORT_SYMBOL_GPL(get_fault_pfn); int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int write, struct page **page) -- cgit v1.2.3-70-g09d2 From ca0565f5736e67af3172d188577b57e303dd082a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:54:52 +0800 Subject: KVM: make bad_pfn static to kvm_main.c bad_pfn is not used out of kvm_main.c, so mark it static, also move it near hwpoison_pfn and fault_pfn Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1a7f838d30c..5f956cde137 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -383,7 +383,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } extern struct page *bad_page; -extern pfn_t bad_pfn; int is_error_page(struct page *page); int is_error_pfn(pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0fbbf2d2160..f955eee92aa 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -100,6 +100,9 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); static bool largepages_enabled = true; +struct page *bad_page; +static pfn_t bad_pfn; + static struct page *hwpoison_page; static pfn_t hwpoison_pfn; @@ -2691,9 +2694,6 @@ static struct syscore_ops kvm_syscore_ops = { .resume = kvm_resume, }; -struct page *bad_page; -pfn_t bad_pfn; - static inline struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) { -- cgit v1.2.3-70-g09d2 From f340a51b7e41abbe92ae3a327c0020974a059f95 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:55:34 +0800 Subject: KVM: remove is_error_hpa Remove them since they are not used anymore Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5f956cde137..e8d13a072d2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -378,10 +378,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) return slot; } -#define HPA_MSB ((sizeof(hpa_t) * 8) - 1) -#define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) -static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } - extern struct page *bad_page; int is_error_page(struct page *page); -- cgit v1.2.3-70-g09d2 From d566104853361cc377c61f70e41c1ad3d44b86c6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 17 Jul 2012 21:56:16 +0800 Subject: KVM: remove the unused parameter of gfn_to_pfn_memslot The parameter, 'kvm', is not used in gfn_to_pfn_memslot, we can happily remove it Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/e500_tlb.c | 2 +- arch/x86/kvm/mmu.c | 2 +- include/linux/kvm_host.h | 5 ++--- virt/kvm/iommu.c | 10 +++++----- virt/kvm/kvm_main.c | 15 +++++++-------- 5 files changed, 16 insertions(+), 18 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index c510fc96130..c8f6c582674 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -520,7 +520,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); + pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", (long)gfn); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f85cc21ae95..4f77f7ac6d2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2518,7 +2518,7 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, hva = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn_atomic(vcpu->kvm, hva); + return hva_to_pfn_atomic(hva); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e8d13a072d2..db9aa917840 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -418,15 +418,14 @@ void kvm_release_page_dirty(struct page *page); void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr); +pfn_t hva_to_pfn_atomic(unsigned long addr); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, bool write_fault, bool *writable); pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); -pfn_t gfn_to_pfn_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn); +pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_release_pfn_dirty(pfn_t); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e9fff9830bf..c03f1fb2670 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -42,13 +42,13 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); -static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, unsigned long size) +static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, + unsigned long size) { gfn_t end_gfn; pfn_t pfn; - pfn = gfn_to_pfn_memslot(kvm, slot, gfn); + pfn = gfn_to_pfn_memslot(slot, gfn); end_gfn = gfn + (size >> PAGE_SHIFT); gfn += 1; @@ -56,7 +56,7 @@ static pfn_t kvm_pin_pages(struct kvm *kvm, struct kvm_memory_slot *slot, return pfn; while (gfn < end_gfn) - gfn_to_pfn_memslot(kvm, slot, gfn++); + gfn_to_pfn_memslot(slot, gfn++); return pfn; } @@ -105,7 +105,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) * Pin all pages we are about to map in memory. This is * important because we unmap and unpin in 4kb steps later. */ - pfn = kvm_pin_pages(kvm, slot, gfn, page_size); + pfn = kvm_pin_pages(slot, gfn, page_size); if (is_error_pfn(pfn)) { gfn += 1; continue; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f955eee92aa..68dda513cd7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1062,8 +1062,8 @@ static inline int check_user_page_hwpoison(unsigned long addr) return rc == -EHWPOISON; } -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, - bool *async, bool write_fault, bool *writable) +static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, + bool write_fault, bool *writable) { struct page *page[1]; int npages = 0; @@ -1143,9 +1143,9 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, return pfn; } -pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr) +pfn_t hva_to_pfn_atomic(unsigned long addr) { - return hva_to_pfn(kvm, addr, true, NULL, true, NULL); + return hva_to_pfn(addr, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); @@ -1163,7 +1163,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, return page_to_pfn(bad_page); } - return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable); + return hva_to_pfn(addr, atomic, async, write_fault, writable); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) @@ -1192,11 +1192,10 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); -pfn_t gfn_to_pfn_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, gfn_t gfn) +pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(kvm, addr, false, NULL, true, NULL); + return hva_to_pfn(addr, false, NULL, true, NULL); } int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, -- cgit v1.2.3-70-g09d2 From 4c088493c8d07e4e27bad53a99dcfdc14cdf45f8 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Wed, 18 Jul 2012 19:07:46 +0530 Subject: KVM: Note down when cpu relax intercepted or pause loop exited Noting pause loop exited vcpu or cpu relax intercepted helps in filtering right candidate to yield. Wrong selection of vcpu; i.e., a vcpu that just did a pl-exit or cpu relax intercepted may contribute to performance degradation. Signed-off-by: Raghavendra K T Reviewed-by: Marcelo Tosatti Reviewed-by: Rik van Riel Tested-by: Christian Borntraeger # on s390x Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 34 ++++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 5 +++++ 2 files changed, 39 insertions(+) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index db9aa917840..361b36fe7ec 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -183,6 +183,18 @@ struct kvm_vcpu { } async_pf; #endif +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT + /* + * Cpu relax intercept or pause loop exit optimization + * in_spin_loop: set when a vcpu does a pause loop exit + * or cpu relax intercepted. + * dy_eligible: indicates whether vcpu is eligible for directed yield. + */ + struct { + bool in_spin_loop; + bool dy_eligible; + } spin_loop; +#endif struct kvm_vcpu_arch arch; }; @@ -898,5 +910,27 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } } +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT + +static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) +{ + vcpu->spin_loop.in_spin_loop = val; +} +static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) +{ + vcpu->spin_loop.dy_eligible = val; +} + +#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ + +static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val) +{ +} + +static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) +{ +} + +#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 68dda513cd7..0892b75eeed 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -239,6 +239,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) } vcpu->run = page_address(page); + kvm_vcpu_set_in_spin_loop(vcpu, false); + kvm_vcpu_set_dy_eligible(vcpu, false); + r = kvm_arch_vcpu_init(vcpu); if (r < 0) goto fail_free_run; @@ -1585,6 +1588,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) int pass; int i; + kvm_vcpu_set_in_spin_loop(me, true); /* * We boost the priority of a VCPU that is runnable but not * currently running, because it got preempted by something @@ -1610,6 +1614,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } } + kvm_vcpu_set_in_spin_loop(me, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); -- cgit v1.2.3-70-g09d2 From 06e48c510aa37f6e791602e6420422ea7071fe94 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 19 Jul 2012 15:17:52 +0530 Subject: KVM: Choose better candidate for directed yield Currently, on a large vcpu guests, there is a high probability of yielding to the same vcpu who had recently done a pause-loop exit or cpu relax intercepted. Such a yield can lead to the vcpu spinning again and hence degrade the performance. The patchset keeps track of the pause loop exit/cpu relax interception and gives chance to a vcpu which: (a) Has not done pause loop exit or cpu relax intercepted at all (probably he is preempted lock-holder) (b) Was skipped in last iteration because it did pause loop exit or cpu relax intercepted, and probably has become eligible now (next eligible lock holder) Signed-off-by: Raghavendra K T Reviewed-by: Marcelo Tosatti Reviewed-by: Rik van Riel Tested-by: Christian Borntraeger # on s390x Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 5 +++++ virt/kvm/kvm_main.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 361b36fe7ec..74a78d09c45 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -931,6 +931,11 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val) { } +static inline bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) +{ + return true; +} + #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */ #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0892b75eeed..1e10ebe1a37 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1579,6 +1579,43 @@ bool kvm_vcpu_yield_to(struct kvm_vcpu *target) } EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); +#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT +/* + * Helper that checks whether a VCPU is eligible for directed yield. + * Most eligible candidate to yield is decided by following heuristics: + * + * (a) VCPU which has not done pl-exit or cpu relax intercepted recently + * (preempted lock holder), indicated by @in_spin_loop. + * Set at the beiginning and cleared at the end of interception/PLE handler. + * + * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get + * chance last time (mostly it has become eligible now since we have probably + * yielded to lockholder in last iteration. This is done by toggling + * @dy_eligible each time a VCPU checked for eligibility.) + * + * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding + * to preempted lock-holder could result in wrong VCPU selection and CPU + * burning. Giving priority for a potential lock-holder increases lock + * progress. + * + * Since algorithm is based on heuristics, accessing another VCPU data without + * locking does not harm. It may result in trying to yield to same VCPU, fail + * and continue with next VCPU and so on. + */ +bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) +{ + bool eligible; + + eligible = !vcpu->spin_loop.in_spin_loop || + (vcpu->spin_loop.in_spin_loop && + vcpu->spin_loop.dy_eligible); + + if (vcpu->spin_loop.in_spin_loop) + kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); + + return eligible; +} +#endif void kvm_vcpu_on_spin(struct kvm_vcpu *me) { struct kvm *kvm = me->kvm; @@ -1607,6 +1644,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) continue; if (waitqueue_active(&vcpu->wq)) continue; + if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) + continue; if (kvm_vcpu_yield_to(vcpu)) { kvm->last_boosted_vcpu = i; yielded = 1; @@ -1615,6 +1654,9 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me) } } kvm_vcpu_set_in_spin_loop(me, false); + + /* Ensure vcpu is not eligible during next spinloop */ + kvm_vcpu_set_dy_eligible(me, false); } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); -- cgit v1.2.3-70-g09d2 From a2766325cf9f9e36d1225145f1ce1b066f001837 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 26 Jul 2012 11:58:59 +0800 Subject: KVM: remove dummy pages Currently, kvm allocates some pages and use them as error indicators, it wastes memory and is not good for scalability Base on Avi's suggestion, we use the error codes instead of these pages to indicate the error conditions Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 3 +- virt/kvm/async_pf.c | 3 +- virt/kvm/kvm_main.c | 121 +++++++++++++++++++---------------------------- 3 files changed, 52 insertions(+), 75 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1993eb1cb2c..4e60d3695e4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -423,6 +423,7 @@ void kvm_arch_flush_shadow(struct kvm *kvm); int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int nr_pages); +struct page *get_bad_page(void); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); @@ -576,7 +577,7 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); -int kvm_is_mmio_pfn(pfn_t pfn); +bool kvm_is_mmio_pfn(pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index ebae24b62c9..79722782d9d 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -203,8 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) if (!work) return -ENOMEM; - work->page = bad_page; - get_page(bad_page); + work->page = get_bad_page(); INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0014ee99dc7..de89497fe4c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -100,17 +100,11 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); static bool largepages_enabled = true; -struct page *bad_page; -static pfn_t bad_pfn; - -static struct page *hwpoison_page; -static pfn_t hwpoison_pfn; - -static struct page *fault_page; -static pfn_t fault_pfn; - -inline int kvm_is_mmio_pfn(pfn_t pfn) +bool kvm_is_mmio_pfn(pfn_t pfn) { + if (is_error_pfn(pfn)) + return false; + if (pfn_valid(pfn)) { int reserved; struct page *tail = pfn_to_page(pfn); @@ -939,34 +933,55 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages); int is_error_page(struct page *page) { - return page == bad_page || page == hwpoison_page || page == fault_page; + return IS_ERR(page); } EXPORT_SYMBOL_GPL(is_error_page); int is_error_pfn(pfn_t pfn) { - return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; + return IS_ERR_VALUE(pfn); } EXPORT_SYMBOL_GPL(is_error_pfn); +static pfn_t get_bad_pfn(void) +{ + return -ENOENT; +} + +pfn_t get_fault_pfn(void) +{ + return -EFAULT; +} +EXPORT_SYMBOL_GPL(get_fault_pfn); + +static pfn_t get_hwpoison_pfn(void) +{ + return -EHWPOISON; +} + int is_hwpoison_pfn(pfn_t pfn) { - return pfn == hwpoison_pfn; + return pfn == -EHWPOISON; } EXPORT_SYMBOL_GPL(is_hwpoison_pfn); int is_noslot_pfn(pfn_t pfn) { - return pfn == bad_pfn; + return pfn == -ENOENT; } EXPORT_SYMBOL_GPL(is_noslot_pfn); int is_invalid_pfn(pfn_t pfn) { - return pfn == hwpoison_pfn || pfn == fault_pfn; + return !is_noslot_pfn(pfn) && is_error_pfn(pfn); } EXPORT_SYMBOL_GPL(is_invalid_pfn); +struct page *get_bad_page(void) +{ + return ERR_PTR(-ENOENT); +} + static inline unsigned long bad_hva(void) { return PAGE_OFFSET; @@ -1038,13 +1053,6 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_hva); -pfn_t get_fault_pfn(void) -{ - get_page(fault_page); - return fault_pfn; -} -EXPORT_SYMBOL_GPL(get_fault_pfn); - int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int write, struct page **page) { @@ -1122,8 +1130,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (npages == -EHWPOISON || (!async && check_user_page_hwpoison(addr))) { up_read(¤t->mm->mmap_sem); - get_page(hwpoison_page); - return page_to_pfn(hwpoison_page); + return get_hwpoison_pfn(); } vma = find_vma_intersection(current->mm, addr, addr+1); @@ -1161,10 +1168,8 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, *async = false; addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) { - get_page(bad_page); - return page_to_pfn(bad_page); - } + if (kvm_is_error_hva(addr)) + return get_bad_pfn(); return hva_to_pfn(addr, atomic, async, write_fault, writable); } @@ -1218,37 +1223,45 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +static struct page *kvm_pfn_to_page(pfn_t pfn) +{ + WARN_ON(kvm_is_mmio_pfn(pfn)); + + if (is_error_pfn(pfn) || kvm_is_mmio_pfn(pfn)) + return get_bad_page(); + + return pfn_to_page(pfn); +} + struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) { pfn_t pfn; pfn = gfn_to_pfn(kvm, gfn); - if (!kvm_is_mmio_pfn(pfn)) - return pfn_to_page(pfn); - - WARN_ON(kvm_is_mmio_pfn(pfn)); - get_page(bad_page); - return bad_page; + return kvm_pfn_to_page(pfn); } EXPORT_SYMBOL_GPL(gfn_to_page); void kvm_release_page_clean(struct page *page) { - kvm_release_pfn_clean(page_to_pfn(page)); + if (!is_error_page(page)) + kvm_release_pfn_clean(page_to_pfn(page)); } EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(pfn_t pfn) { - if (!kvm_is_mmio_pfn(pfn)) + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) put_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); void kvm_release_page_dirty(struct page *page) { + WARN_ON(is_error_page(page)); + kvm_release_pfn_dirty(page_to_pfn(page)); } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); @@ -2771,33 +2784,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, if (r) goto out_fail; - bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - - if (bad_page == NULL) { - r = -ENOMEM; - goto out; - } - - bad_pfn = page_to_pfn(bad_page); - - hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - - if (hwpoison_page == NULL) { - r = -ENOMEM; - goto out_free_0; - } - - hwpoison_pfn = page_to_pfn(hwpoison_page); - - fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - - if (fault_page == NULL) { - r = -ENOMEM; - goto out_free_0; - } - - fault_pfn = page_to_pfn(fault_page); - if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { r = -ENOMEM; goto out_free_0; @@ -2872,12 +2858,6 @@ out_free_1: out_free_0a: free_cpumask_var(cpus_hardware_enabled); out_free_0: - if (fault_page) - __free_page(fault_page); - if (hwpoison_page) - __free_page(hwpoison_page); - __free_page(bad_page); -out: kvm_arch_exit(); out_fail: return r; @@ -2897,8 +2877,5 @@ void kvm_exit(void) kvm_arch_hardware_unsetup(); kvm_arch_exit(); free_cpumask_var(cpus_hardware_enabled); - __free_page(fault_page); - __free_page(hwpoison_page); - __free_page(bad_page); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit v1.2.3-70-g09d2 From 23d43cf998275bc97437931c0cdee1df2c1aa3ca Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Tue, 24 Jul 2012 08:51:20 -0400 Subject: KVM: Move KVM_IRQ_LINE to arch-generic code Handle KVM_IRQ_LINE and KVM_IRQ_LINE_STATUS in the generic kvm_vm_ioctl() function and call into kvm_vm_ioctl_irq_line(). This is even more relevant when KVM/ARM also uses this ioctl. Signed-off-by: Christoffer Dall Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 33 ++++++++++----------------------- arch/x86/kvm/x86.c | 33 ++++++++++----------------------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 23 +++++++++++++++++++++++ 4 files changed, 44 insertions(+), 46 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bd77cb507c1..eac65380bd2 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -924,6 +924,16 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -963,29 +973,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; } break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip chip; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3d9d08edbf2..b6379e55ee2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3165,6 +3165,16 @@ out: return r; } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level); + return 0; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -3271,29 +3281,6 @@ long kvm_arch_vm_ioctl(struct file *filp, create_pit_unlock: mutex_unlock(&kvm->slots_lock); break; - case KVM_IRQ_LINE_STATUS: - case KVM_IRQ_LINE: { - struct kvm_irq_level irq_event; - - r = -EFAULT; - if (copy_from_user(&irq_event, argp, sizeof irq_event)) - goto out; - r = -ENXIO; - if (irqchip_in_kernel(kvm)) { - __s32 status; - status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, - irq_event.irq, irq_event.level); - if (ioctl == KVM_IRQ_LINE_STATUS) { - r = -EFAULT; - irq_event.status = status; - if (copy_to_user(argp, &irq_event, - sizeof irq_event)) - goto out; - } - r = 0; - } - break; - } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ struct kvm_irqchip *chip; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4e60d3695e4..dbc65f9d6a2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -498,6 +498,7 @@ int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index de89497fe4c..bcf973ec98f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2148,6 +2148,29 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_send_userspace_msi(kvm, &msi); break; } +#endif +#ifdef __KVM_HAVE_IRQ_LINE + case KVM_IRQ_LINE_STATUS: + case KVM_IRQ_LINE: { + struct kvm_irq_level irq_event; + + r = -EFAULT; + if (copy_from_user(&irq_event, argp, sizeof irq_event)) + goto out; + + r = kvm_vm_ioctl_irq_line(kvm, &irq_event); + if (r) + goto out; + + r = -EFAULT; + if (ioctl == KVM_IRQ_LINE_STATUS) { + if (copy_to_user(argp, &irq_event, sizeof irq_event)) + goto out; + } + + r = 0; + break; + } #endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); -- cgit v1.2.3-70-g09d2 From d89cc617b954aff4030fce178f7d86f59aaf713d Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 1 Aug 2012 18:03:28 +0900 Subject: KVM: Push rmap into kvm_arch_memory_slot Two reasons: - x86 can integrate rmap and rmap_pde and remove heuristics in __gfn_to_rmap(). - Some architectures do not need rmap. Since rmap is one of the most memory consuming stuff in KVM, ppc'd better restrict the allocation to Book3S HV. Signed-off-by: Takuya Yoshikawa Acked-by: Paul Mackerras Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 ++-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 +-- arch/powerpc/kvm/powerpc.c | 8 ++++++ arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 5 +--- arch/x86/kvm/x86.c | 55 +++++++++++++++++++++---------------- include/linux/kvm_host.h | 1 - virt/kvm/kvm_main.c | 11 +------- 9 files changed, 49 insertions(+), 44 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 572ad014126..a29e0918172 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -221,6 +221,7 @@ struct revmap_entry { #define KVMPPC_GOT_PAGE 0x80 struct kvm_arch_memory_slot { + unsigned long *rmap; }; struct kvm_arch { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 3c635c0616b..d95d11322a1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -705,7 +705,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; - rmap = &memslot->rmap[gfn - memslot->base_gfn]; + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; lock_rmap(rmap); /* Check if we might have been invalidated; let the guest retry if so */ @@ -788,7 +788,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, for (; gfn < gfn_end; ++gfn) { gfn_t gfn_offset = gfn - memslot->base_gfn; - ret = handler(kvm, &memslot->rmap[gfn_offset], gfn); + ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); retval |= ret; } } @@ -1036,7 +1036,7 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) unsigned long *rmapp, *map; preempt_disable(); - rmapp = memslot->rmap; + rmapp = memslot->arch.rmap; map = memslot->dirty_bitmap; for (i = 0; i < memslot->npages; ++i) { if (kvm_test_clear_dirty(kvm, rmapp)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5c70d19494f..56ac1a5d991 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -84,7 +84,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return; - rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); + rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; @@ -180,7 +180,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; slot_fn = gfn - memslot->base_gfn; - rmap = &memslot->rmap[slot_fn]; + rmap = &memslot->arch.rmap[slot_fn]; if (!kvm->arch.using_mmu_notifiers) { physp = kvm->arch.slot_phys[memslot->id]; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 87f4dc88607..879b14a6140 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -302,10 +302,18 @@ long kvm_arch_dev_ioctl(struct file *filp, void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; + } } int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + return 0; } diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 48e71318846..1309e69b57f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -504,7 +504,7 @@ struct kvm_lpage_info { }; struct kvm_arch_memory_slot { - unsigned long *rmap_pde[KVM_NR_PAGE_SIZES - 1]; + unsigned long *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ee768bb2367..aa9a987ddef 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -970,11 +970,8 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, { unsigned long idx; - if (likely(level == PT_PAGE_TABLE_LEVEL)) - return &slot->rmap[gfn - slot->base_gfn]; - idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap_pde[level - PT_DIRECTORY_LEVEL][idx]; + return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index abc039d7842..ebf2109318e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6303,14 +6303,18 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, { int i; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - if (!dont || free->arch.rmap_pde[i] != dont->arch.rmap_pde[i]) { - kvm_kvfree(free->arch.rmap_pde[i]); - free->arch.rmap_pde[i] = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) { + kvm_kvfree(free->arch.rmap[i]); + free->arch.rmap[i] = NULL; } - if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - kvm_kvfree(free->arch.lpage_info[i]); - free->arch.lpage_info[i] = NULL; + if (i == 0) + continue; + + if (!dont || free->arch.lpage_info[i - 1] != + dont->arch.lpage_info[i - 1]) { + kvm_kvfree(free->arch.lpage_info[i - 1]); + free->arch.lpage_info[i - 1] = NULL; } } } @@ -6319,28 +6323,30 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { int i; - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { unsigned long ugfn; int lpages; - int level = i + 2; + int level = i + 1; lpages = gfn_to_index(slot->base_gfn + npages - 1, slot->base_gfn, level) + 1; - slot->arch.rmap_pde[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap_pde[i])); - if (!slot->arch.rmap_pde[i]) + slot->arch.rmap[i] = + kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i])); + if (!slot->arch.rmap[i]) goto out_free; + if (i == 0) + continue; - slot->arch.lpage_info[i] = - kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); - if (!slot->arch.lpage_info[i]) + slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages * + sizeof(*slot->arch.lpage_info[i - 1])); + if (!slot->arch.lpage_info[i - 1]) goto out_free; if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][0].write_count = 1; + slot->arch.lpage_info[i - 1][0].write_count = 1; if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) - slot->arch.lpage_info[i][lpages - 1].write_count = 1; + slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1; ugfn = slot->userspace_addr >> PAGE_SHIFT; /* * If the gfn and userspace address are not aligned wrt each @@ -6352,18 +6358,21 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) unsigned long j; for (j = 0; j < lpages; ++j) - slot->arch.lpage_info[i][j].write_count = 1; + slot->arch.lpage_info[i - 1][j].write_count = 1; } } return 0; out_free: - for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - kvm_kvfree(slot->arch.rmap_pde[i]); - kvm_kvfree(slot->arch.lpage_info[i]); - slot->arch.rmap_pde[i] = NULL; - slot->arch.lpage_info[i] = NULL; + for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { + kvm_kvfree(slot->arch.rmap[i]); + slot->arch.rmap[i] = NULL; + if (i == 0) + continue; + + kvm_kvfree(slot->arch.lpage_info[i - 1]); + slot->arch.lpage_info[i - 1] = NULL; } return -ENOMEM; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dbc65f9d6a2..3c16f0f1fe3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -213,7 +213,6 @@ struct kvm_memory_slot { gfn_t base_gfn; unsigned long npages; unsigned long flags; - unsigned long *rmap; unsigned long *dirty_bitmap; struct kvm_arch_memory_slot arch; unsigned long userspace_addr; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bcf973ec98f..14ec567816a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -550,16 +550,12 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) static void kvm_free_physmem_slot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { - if (!dont || free->rmap != dont->rmap) - vfree(free->rmap); - if (!dont || free->dirty_bitmap != dont->dirty_bitmap) kvm_destroy_dirty_bitmap(free); kvm_arch_free_memslot(free, dont); free->npages = 0; - free->rmap = NULL; } void kvm_free_physmem(struct kvm *kvm) @@ -768,11 +764,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (npages && !old.npages) { new.user_alloc = user_alloc; new.userspace_addr = mem->userspace_addr; -#ifndef CONFIG_S390 - new.rmap = vzalloc(npages * sizeof(*new.rmap)); - if (!new.rmap) - goto out_free; -#endif /* not defined CONFIG_S390 */ + if (kvm_arch_create_memslot(&new, npages)) goto out_free; } @@ -831,7 +823,6 @@ int __kvm_set_memory_region(struct kvm *kvm, /* actual memory is freed via old in kvm_free_physmem_slot below */ if (!npages) { - new.rmap = NULL; new.dirty_bitmap = NULL; memset(&new.arch, 0, sizeof(new.arch)); } -- cgit v1.2.3-70-g09d2 From 6c8ee57be9350c5c2cafdd6a99d0462d528676e2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:37:54 +0800 Subject: KVM: introduce KVM_PFN_ERR_FAULT After that, the exported and un-inline function, get_fault_pfn, can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 12 +++--------- 3 files changed, 6 insertions(+), 11 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa9a987ddef..9cf90c8d584 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2512,7 +2512,7 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) - return get_fault_pfn(); + return KVM_PFN_ERR_FAULT; hva = gfn_to_hva_memslot(slot, gfn); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3c16f0f1fe3..ef5554f4748 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -48,6 +48,8 @@ #define KVM_MAX_MMIO_FRAGMENTS \ (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) +#define KVM_PFN_ERR_FAULT (-EFAULT) + /* * vcpu->requests bit members */ @@ -443,7 +445,6 @@ void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); void kvm_set_pfn_accessed(pfn_t pfn); void kvm_get_pfn(pfn_t pfn); -pfn_t get_fault_pfn(void); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 14ec567816a..ef0491645a1 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -939,12 +939,6 @@ static pfn_t get_bad_pfn(void) return -ENOENT; } -pfn_t get_fault_pfn(void) -{ - return -EFAULT; -} -EXPORT_SYMBOL_GPL(get_fault_pfn); - static pfn_t get_hwpoison_pfn(void) { return -EHWPOISON; @@ -1115,7 +1109,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, struct vm_area_struct *vma; if (atomic) - return get_fault_pfn(); + return KVM_PFN_ERR_FAULT; down_read(¤t->mm->mmap_sem); if (npages == -EHWPOISON || @@ -1127,7 +1121,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, vma = find_vma_intersection(current->mm, addr, addr+1); if (vma == NULL) - pfn = get_fault_pfn(); + pfn = KVM_PFN_ERR_FAULT; else if ((vma->vm_flags & VM_PFNMAP)) { pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -1135,7 +1129,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, } else { if (async && (vma->vm_flags & VM_WRITE)) *async = true; - pfn = get_fault_pfn(); + pfn = KVM_PFN_ERR_FAULT; } up_read(¤t->mm->mmap_sem); } else -- cgit v1.2.3-70-g09d2 From e6c1502b3f933ace20c711ce34ab696f5a67086d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:38:36 +0800 Subject: KVM: introduce KVM_PFN_ERR_HWPOISON Then, get_hwpoison_pfn and is_hwpoison_pfn can be removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 13 +------------ 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9cf90c8d584..d3cdf69da51 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2649,7 +2649,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); - if (is_hwpoison_pfn(pfn)) { + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ef5554f4748..840f44a096c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -49,6 +49,7 @@ (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) #define KVM_PFN_ERR_FAULT (-EFAULT) +#define KVM_PFN_ERR_HWPOISON (-EHWPOISON) /* * vcpu->requests bit members @@ -395,7 +396,6 @@ extern struct page *bad_page; int is_error_page(struct page *page); int is_error_pfn(pfn_t pfn); -int is_hwpoison_pfn(pfn_t pfn); int is_noslot_pfn(pfn_t pfn); int is_invalid_pfn(pfn_t pfn); int kvm_is_error_hva(unsigned long addr); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ef0491645a1..7fce2d5787a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -939,17 +939,6 @@ static pfn_t get_bad_pfn(void) return -ENOENT; } -static pfn_t get_hwpoison_pfn(void) -{ - return -EHWPOISON; -} - -int is_hwpoison_pfn(pfn_t pfn) -{ - return pfn == -EHWPOISON; -} -EXPORT_SYMBOL_GPL(is_hwpoison_pfn); - int is_noslot_pfn(pfn_t pfn) { return pfn == -ENOENT; @@ -1115,7 +1104,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, if (npages == -EHWPOISON || (!async && check_user_page_hwpoison(addr))) { up_read(¤t->mm->mmap_sem); - return get_hwpoison_pfn(); + return KVM_PFN_ERR_HWPOISON; } vma = find_vma_intersection(current->mm, addr, addr+1); -- cgit v1.2.3-70-g09d2 From 950e95097b1c6573ef5e21061ccb56964278c45b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:39:19 +0800 Subject: KVM: introduce KVM_PFN_ERR_BAD Then, remove get_bad_pfn Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 840f44a096c..b2cf3109822 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -50,6 +50,7 @@ #define KVM_PFN_ERR_FAULT (-EFAULT) #define KVM_PFN_ERR_HWPOISON (-EHWPOISON) +#define KVM_PFN_ERR_BAD (-ENOENT) /* * vcpu->requests bit members diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fce2d5787a..988b2339d84 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -934,11 +934,6 @@ int is_error_pfn(pfn_t pfn) } EXPORT_SYMBOL_GPL(is_error_pfn); -static pfn_t get_bad_pfn(void) -{ - return -ENOENT; -} - int is_noslot_pfn(pfn_t pfn) { return pfn == -ENOENT; @@ -1143,7 +1138,7 @@ static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return get_bad_pfn(); + return KVM_PFN_ERR_BAD; return hva_to_pfn(addr, atomic, async, write_fault, writable); } -- cgit v1.2.3-70-g09d2 From 83f09228d068911ac8797ae8d6febef886698936 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:39:59 +0800 Subject: KVM: inline is_*_pfn functions These functions are exported and can not inline, move them to kvm_host.h to eliminate the overload of function call Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 19 ++++++++++++++++--- virt/kvm/kvm_main.c | 18 ------------------ 2 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b2cf3109822..19d91ceaf5e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,21 @@ #define KVM_PFN_ERR_HWPOISON (-EHWPOISON) #define KVM_PFN_ERR_BAD (-ENOENT) +static inline int is_error_pfn(pfn_t pfn) +{ + return IS_ERR_VALUE(pfn); +} + +static inline int is_noslot_pfn(pfn_t pfn) +{ + return pfn == -ENOENT; +} + +static inline int is_invalid_pfn(pfn_t pfn) +{ + return !is_noslot_pfn(pfn) && is_error_pfn(pfn); +} + /* * vcpu->requests bit members */ @@ -396,9 +412,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) extern struct page *bad_page; int is_error_page(struct page *page); -int is_error_pfn(pfn_t pfn); -int is_noslot_pfn(pfn_t pfn); -int is_invalid_pfn(pfn_t pfn); int kvm_is_error_hva(unsigned long addr); int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 988b2339d84..eb73e5f1367 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -928,24 +928,6 @@ int is_error_page(struct page *page) } EXPORT_SYMBOL_GPL(is_error_page); -int is_error_pfn(pfn_t pfn) -{ - return IS_ERR_VALUE(pfn); -} -EXPORT_SYMBOL_GPL(is_error_pfn); - -int is_noslot_pfn(pfn_t pfn) -{ - return pfn == -ENOENT; -} -EXPORT_SYMBOL_GPL(is_noslot_pfn); - -int is_invalid_pfn(pfn_t pfn) -{ - return !is_noslot_pfn(pfn) && is_error_pfn(pfn); -} -EXPORT_SYMBOL_GPL(is_invalid_pfn); - struct page *get_bad_page(void) { return ERR_PTR(-ENOENT); -- cgit v1.2.3-70-g09d2 From 9a592a953880fd6981955e69c1476ce541d9bd16 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:40:39 +0800 Subject: KVM: remove the unused declare Remove it since it is not used anymore Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 19d91ceaf5e..e2dcc7cb228 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -409,8 +409,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) return slot; } -extern struct page *bad_page; - int is_error_page(struct page *page); int kvm_is_error_hva(unsigned long addr); int kvm_set_memory_region(struct kvm *kvm, -- cgit v1.2.3-70-g09d2 From 6cede2e6794be6b0649f62d3681e0c4aff5a9270 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:41:22 +0800 Subject: KVM: introduce KVM_ERR_PTR_BAD_PAGE It is used to eliminate the overload of function call and cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 9 +++++++-- virt/kvm/async_pf.c | 2 +- virt/kvm/kvm_main.c | 13 +------------ 3 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index e2dcc7cb228..ce7c32950f4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -68,6 +68,13 @@ static inline int is_invalid_pfn(pfn_t pfn) return !is_noslot_pfn(pfn) && is_error_pfn(pfn); } +#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) + +static inline int is_error_page(struct page *page) +{ + return IS_ERR(page); +} + /* * vcpu->requests bit members */ @@ -409,7 +416,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) return slot; } -int is_error_page(struct page *page); int kvm_is_error_hva(unsigned long addr); int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, @@ -436,7 +442,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm); int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int nr_pages); -struct page *get_bad_page(void); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); void kvm_release_page_clean(struct page *page); diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 79722782d9d..56f55339189 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -203,7 +203,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) if (!work) return -ENOMEM; - work->page = get_bad_page(); + work->page = KVM_ERR_PTR_BAD_PAGE; INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index eb73e5f1367..93d3c6e063c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -922,17 +922,6 @@ void kvm_disable_largepages(void) } EXPORT_SYMBOL_GPL(kvm_disable_largepages); -int is_error_page(struct page *page) -{ - return IS_ERR(page); -} -EXPORT_SYMBOL_GPL(is_error_page); - -struct page *get_bad_page(void) -{ - return ERR_PTR(-ENOENT); -} - static inline unsigned long bad_hva(void) { return PAGE_OFFSET; @@ -1179,7 +1168,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn) WARN_ON(kvm_is_mmio_pfn(pfn)); if (is_error_pfn(pfn) || kvm_is_mmio_pfn(pfn)) - return get_bad_page(); + return KVM_ERR_PTR_BAD_PAGE; return pfn_to_page(pfn); } -- cgit v1.2.3-70-g09d2 From 32cad84f44d186654492f1a50a1424c8906ccbd9 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:42:52 +0800 Subject: KVM: do not release the error page After commit a2766325cf9f9, the error page is replaced by the error code, it need not be released anymore [ The patch has been compiling tested for powerpc ] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/powerpc/kvm/44x_tlb.c | 1 - arch/powerpc/kvm/book3s_pr.c | 4 +--- arch/x86/kvm/svm.c | 1 - arch/x86/kvm/vmx.c | 5 ++--- arch/x86/kvm/x86.c | 9 +++------ include/linux/kvm_host.h | 2 +- virt/kvm/async_pf.c | 4 ++-- virt/kvm/kvm_main.c | 5 +++-- 8 files changed, 12 insertions(+), 19 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c index 33aa715dab2..5dd3ab46997 100644 --- a/arch/powerpc/kvm/44x_tlb.c +++ b/arch/powerpc/kvm/44x_tlb.c @@ -319,7 +319,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, if (is_error_page(new_page)) { printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", (unsigned long long)gfn); - kvm_release_page_clean(new_page); return; } hpaddr = page_to_phys(new_page); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index a1baec340f7..05c28f59f77 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -242,10 +242,8 @@ static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) int i; hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); - if (is_error_page(hpage)) { - kvm_release_page_clean(hpage); + if (is_error_page(hpage)) return; - } hpage_offset = pte->raddr & ~PAGE_MASK; hpage_offset &= ~0xFFFULL; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 687d0c30e55..31be4a55744 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2105,7 +2105,6 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) return kmap(page); error: - kvm_release_page_clean(page); kvm_inject_gp(&svm->vcpu, 0); return NULL; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d6e4cbc42b8..cc8ad983692 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -596,10 +596,9 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) return NULL; - } + return page; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ebf2109318e..7953a9e7cb1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1639,10 +1639,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); + if (is_error_page(vcpu->arch.time_page)) vcpu->arch.time_page = NULL; - } + break; } case MSR_KVM_ASYNC_PF_EN: @@ -3945,10 +3944,8 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, goto emul_write; page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - if (is_error_page(page)) { - kvm_release_page_clean(page); + if (is_error_page(page)) goto emul_write; - } kaddr = kmap_atomic(page); kaddr += offset_in_page(gpa); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ce7c32950f4..07226f820e6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -457,7 +457,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_release_pfn_dirty(pfn_t); +void kvm_release_pfn_dirty(pfn_t pfn); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); void kvm_set_pfn_accessed(pfn_t pfn); diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 56f55339189..ea475cd0351 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -111,7 +111,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu) list_entry(vcpu->async_pf.done.next, typeof(*work), link); list_del(&work->link); - if (work->page) + if (!is_error_page(work->page)) kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } @@ -138,7 +138,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) list_del(&work->queue); vcpu->async_pf.queued--; - if (work->page) + if (!is_error_page(work->page)) kvm_release_page_clean(work->page); kmem_cache_free(async_pf_cache, work); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index eafba99d107..a2e85af847c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1186,8 +1186,9 @@ EXPORT_SYMBOL_GPL(gfn_to_page); void kvm_release_page_clean(struct page *page) { - if (!is_error_page(page)) - kvm_release_pfn_clean(page_to_pfn(page)); + WARN_ON(is_error_page(page)); + + kvm_release_pfn_clean(page_to_pfn(page)); } EXPORT_SYMBOL_GPL(kvm_release_page_clean); -- cgit v1.2.3-70-g09d2 From 9c5b11728344e1085593f494ddc8838497e7ffde Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 3 Aug 2012 15:43:51 +0800 Subject: KVM: let the error pfn not depend on error code Currently, we use the error code as error pfn to indicat the error condition, it is not straightforward and it will not work on PAE 32-bit cpu with huge memory, since the valid physical address can be at most 52 bits For the normal pfn, the highest 12 bits should be zero, so we can mask these bits to indicate the error. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 07226f820e6..d2b897ee3ac 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -49,28 +49,34 @@ #define KVM_MAX_MMIO_FRAGMENTS \ (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) -#define KVM_PFN_ERR_FAULT (-EFAULT) -#define KVM_PFN_ERR_HWPOISON (-EHWPOISON) -#define KVM_PFN_ERR_BAD (-ENOENT) +/* + * For the normal pfn, the highest 12 bits should be zero, + * so we can mask these bits to indicate the error. + */ +#define KVM_PFN_ERR_MASK (0xfffULL << 52) + +#define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) +#define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) +#define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2) -static inline int is_error_pfn(pfn_t pfn) +static inline bool is_error_pfn(pfn_t pfn) { - return IS_ERR_VALUE(pfn); + return !!(pfn & KVM_PFN_ERR_MASK); } -static inline int is_noslot_pfn(pfn_t pfn) +static inline bool is_noslot_pfn(pfn_t pfn) { - return pfn == -ENOENT; + return pfn == KVM_PFN_ERR_BAD; } -static inline int is_invalid_pfn(pfn_t pfn) +static inline bool is_invalid_pfn(pfn_t pfn) { return !is_noslot_pfn(pfn) && is_error_pfn(pfn); } #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) -static inline int is_error_page(struct page *page) +static inline bool is_error_page(struct page *page) { return IS_ERR(page); } -- cgit v1.2.3-70-g09d2 From 67b29204c8c9ecb4b2799a06ab646eeb363a0fe6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 10:58:45 +0800 Subject: KVM: hide KVM_MEMSLOT_INVALID from userspace Quote Avi's comment: | KVM_MEMSLOT_INVALID is actually an internal symbol, not used by | userspace. Please move it to kvm_host.h. Also, we divide the memlsot->flags into two parts, the lower 16 bits are visible for userspace, the higher 16 bits are internally used in kvm Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm.h | 7 +++++-- include/linux/kvm_host.h | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2ce09aa7d3b..2de335d7f63 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -101,9 +101,12 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; -/* for kvm_memory_region::flags */ +/* + * The bit 0 ~ bit 15 of kvm_memory_region::flags are visible for userspace, + * other bits are reserved for kvm internal use which are defined in + * include/linux/kvm_host.h. + */ #define KVM_MEM_LOG_DIRTY_PAGES 1UL -#define KVM_MEMSLOT_INVALID (1UL << 1) /* for KVM_IRQ_LINE */ struct kvm_irq_level { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d2b897ee3ac..d4bd4d41e35 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -35,6 +35,13 @@ #define KVM_MMIO_SIZE 8 #endif +/* + * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used + * in kvm, other bits are visible for userspace which are defined in + * include/linux/kvm_h. + */ +#define KVM_MEMSLOT_INVALID (1UL << 16) + /* * If we support unaligned MMIO, at most one fragment will be split into two: */ -- cgit v1.2.3-70-g09d2 From 037d92dc5d4691ae7cf44699c55ca83b1b441992 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 10:59:12 +0800 Subject: KVM: introduce gfn_to_pfn_memslot_atomic It can instead of hva_to_pfn_atomic Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 +---- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 14 ++++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9651c2cd000..5548971ae80 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2510,15 +2510,12 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; - unsigned long hva; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) return KVM_PFN_ERR_FAULT; - hva = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn_atomic(hva); + return gfn_to_pfn_memslot_atomic(slot, gfn); } static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d4bd4d41e35..52c86e4f6d8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -462,7 +462,6 @@ void kvm_release_page_dirty(struct page *page); void kvm_set_page_dirty(struct page *page); void kvm_set_page_accessed(struct page *page); -pfn_t hva_to_pfn_atomic(unsigned long addr); pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, bool write_fault, bool *writable); @@ -470,6 +469,8 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); +pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); + void kvm_release_pfn_dirty(pfn_t pfn); void kvm_release_pfn_clean(pfn_t pfn); void kvm_set_pfn_dirty(pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7b94d70a323..543f9b7e5aa 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1102,12 +1102,6 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, return pfn; } -pfn_t hva_to_pfn_atomic(unsigned long addr) -{ - return hva_to_pfn(addr, true, NULL, true, NULL); -} -EXPORT_SYMBOL_GPL(hva_to_pfn_atomic); - static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, bool write_fault, bool *writable) { @@ -1155,6 +1149,14 @@ pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) return hva_to_pfn(addr, false, NULL, true, NULL); } +pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) +{ + unsigned long addr = gfn_to_hva_memslot(slot, gfn); + + return hva_to_pfn(addr, true, NULL, true, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); + int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int nr_pages) { -- cgit v1.2.3-70-g09d2 From 69552c295e3df1431801526b5b705a0e130c920d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 11:01:19 +0800 Subject: KVM: introduce KVM_PFN_ERR_RO_FAULT In the later patch, it indicates failure when we try to get a writable pfn from the readonly memslot Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 52c86e4f6d8..4fd33e03d5d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -65,6 +65,7 @@ #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2) +#define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3) static inline bool is_error_pfn(pfn_t pfn) { -- cgit v1.2.3-70-g09d2 From ca3a490c7de8472b514e2d635c052b1e0f8e1c9d Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 11:01:50 +0800 Subject: KVM: introduce KVM_HVA_ERR_BAD Then, remove bad_hva and inline kvm_is_error_hva Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 8 +++++++- virt/kvm/kvm_main.c | 13 +------------ 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4fd33e03d5d..b9bba60c298 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -82,6 +82,13 @@ static inline bool is_invalid_pfn(pfn_t pfn) return !is_noslot_pfn(pfn) && is_error_pfn(pfn); } +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) + +static inline bool kvm_is_error_hva(unsigned long addr) +{ + return addr == PAGE_OFFSET; +} + #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) static inline bool is_error_page(struct page *page) @@ -430,7 +437,6 @@ id_to_memslot(struct kvm_memslots *slots, int id) return slot; } -int kvm_is_error_hva(unsigned long addr); int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c89d1b5129e..e3e1658c491 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -931,17 +931,6 @@ void kvm_disable_largepages(void) } EXPORT_SYMBOL_GPL(kvm_disable_largepages); -static inline unsigned long bad_hva(void) -{ - return PAGE_OFFSET; -} - -int kvm_is_error_hva(unsigned long addr) -{ - return addr == bad_hva(); -} -EXPORT_SYMBOL_GPL(kvm_is_error_hva); - struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); @@ -988,7 +977,7 @@ static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) - return bad_hva(); + return KVM_HVA_ERR_BAD; if (nr_pages) *nr_pages = slot->npages - (gfn - slot->base_gfn); -- cgit v1.2.3-70-g09d2 From 7068d0971524dd47a38f44f6020ba133432871ca Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 11:02:22 +0800 Subject: KVM: introduce KVM_HVA_ERR_RO_BAD In the later patch, it indicates failure when we try to get a writable hva from the readonly memslot Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b9bba60c298..a913ac709a9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -82,11 +82,12 @@ static inline bool is_invalid_pfn(pfn_t pfn) return !is_noslot_pfn(pfn) && is_error_pfn(pfn); } -#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_BAD (PAGE_OFFSET) +#define KVM_HVA_ERR_RO_BAD (PAGE_OFFSET + PAGE_SIZE) static inline bool kvm_is_error_hva(unsigned long addr) { - return addr == PAGE_OFFSET; + return addr >= PAGE_OFFSET; } #define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) -- cgit v1.2.3-70-g09d2 From 4d8b81abc47b83a1939e59df2fdb0e98dfe0eedd Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 21 Aug 2012 11:02:51 +0800 Subject: KVM: introduce readonly memslot In current code, if we map a readonly memory space from host to guest and the page is not currently mapped in the host, we will get a fault pfn and async is not allowed, then the vm will crash We introduce readonly memory region to map ROM/ROMD to the guest, read access is happy for readonly memslot, write access on readonly memslot will cause KVM_EXIT_MMIO exit Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 10 ++-- arch/x86/include/asm/kvm.h | 1 + arch/x86/kvm/mmu.c | 9 ++++ arch/x86/kvm/x86.c | 1 + include/linux/kvm.h | 6 ++- include/linux/kvm_host.h | 7 +-- virt/kvm/kvm_main.c | 96 +++++++++++++++++++++++++++++++-------- 7 files changed, 102 insertions(+), 28 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index bf33aaa4c59..b91bfd43f00 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -857,7 +857,8 @@ struct kvm_userspace_memory_region { }; /* for kvm_memory_region::flags */ -#define KVM_MEM_LOG_DIRTY_PAGES 1UL +#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define KVM_MEM_READONLY (1UL << 1) This ioctl allows the user to create or modify a guest physical memory slot. When changing an existing slot, it may be moved in the guest @@ -873,9 +874,12 @@ It is recommended that the lower 21 bits of guest_phys_addr and userspace_addr be identical. This allows large pages in the guest to be backed by large pages in the host. -The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which +The flags field supports two flag, KVM_MEM_LOG_DIRTY_PAGES, which instructs kvm to keep track of writes to memory within the slot. See -the KVM_GET_DIRTY_LOG ioctl. +the KVM_GET_DIRTY_LOG ioctl. Another flag is KVM_MEM_READONLY when the +KVM_CAP_READONLY_MEM capability, it indicates the guest memory is read-only, +that means, guest is only allowed to read it. Writes will be posted to +userspace as KVM_EXIT_MMIO exits. When the KVM_CAP_SYNC_MMU capability, changes in the backing of the memory region are automatically reflected into the guest. For example, an mmap() diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 246617efd67..521bf252e34 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -25,6 +25,7 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS +#define __KVM_HAVE_READONLY_MEM /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5548971ae80..8e312a2e141 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2647,6 +2647,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) { + /* + * Do not cache the mmio info caused by writing the readonly gfn + * into the spte otherwise read access on readonly gfn also can + * caused mmio page fault and treat it as mmio access. + * Return 1 to tell kvm to emulate it. + */ + if (pfn == KVM_PFN_ERR_RO_FAULT) + return 1; + if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 704680d0fa3..42bbf4187d2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2175,6 +2175,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: + case KVM_CAP_READONLY_MEM: r = 1; break; case KVM_CAP_COALESCED_MMIO: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 2de335d7f63..d808694673f 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -106,7 +106,8 @@ struct kvm_userspace_memory_region { * other bits are reserved for kvm internal use which are defined in * include/linux/kvm_host.h. */ -#define KVM_MEM_LOG_DIRTY_PAGES 1UL +#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define KVM_MEM_READONLY (1UL << 1) /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -621,6 +622,9 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_PPC_GET_SMMU_INFO 78 #define KVM_CAP_S390_COW 79 #define KVM_CAP_PPC_ALLOC_HTAB 80 +#ifdef __KVM_HAVE_READONLY_MEM +#define KVM_CAP_READONLY_MEM 81 +#endif #ifdef KVM_CAP_IRQ_ROUTING diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a913ac709a9..5972c9845dd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -465,6 +465,7 @@ int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); +unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); void kvm_set_page_dirty(struct page *page); @@ -792,12 +793,6 @@ hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot) return slot->base_gfn + gfn_offset; } -static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, - gfn_t gfn) -{ - return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; -} - static inline gpa_t gfn_to_gpa(gfn_t gfn) { return (gpa_t)gfn << PAGE_SHIFT; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e3e1658c491..3416f8a31f6 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -680,7 +680,13 @@ void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new) static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) { - if (mem->flags & ~KVM_MEM_LOG_DIRTY_PAGES) + u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + +#ifdef KVM_CAP_READONLY_MEM + valid_flags |= KVM_MEM_READONLY; +#endif + + if (mem->flags & ~valid_flags) return -EINVAL; return 0; @@ -973,18 +979,45 @@ out: return size; } -static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, - gfn_t *nr_pages) +static bool memslot_is_readonly(struct kvm_memory_slot *slot) +{ + return slot->flags & KVM_MEM_READONLY; +} + +static unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot, + gfn_t gfn) +{ + return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; +} + +static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, + gfn_t *nr_pages, bool write) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) return KVM_HVA_ERR_BAD; + if (memslot_is_readonly(slot) && write) + return KVM_HVA_ERR_RO_BAD; + if (nr_pages) *nr_pages = slot->npages - (gfn - slot->base_gfn); - return gfn_to_hva_memslot(slot, gfn); + return __gfn_to_hva_memslot(slot, gfn); } +static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, + gfn_t *nr_pages) +{ + return __gfn_to_hva_many(slot, gfn, nr_pages, true); +} + +unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, + gfn_t gfn) +{ + return gfn_to_hva_many(slot, gfn, NULL); +} +EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) { return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); @@ -997,7 +1030,7 @@ EXPORT_SYMBOL_GPL(gfn_to_hva); */ static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn) { - return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); + return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false); } static int kvm_read_hva(void *data, void __user *hva, int len) @@ -1106,6 +1139,17 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, return npages; } +static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) +{ + if (unlikely(!(vma->vm_flags & VM_READ))) + return false; + + if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) + return false; + + return true; +} + /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest @@ -1130,8 +1174,6 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, /* we can do it either atomically or asynchronously, not both */ BUG_ON(atomic && async); - BUG_ON(!write_fault && !writable); - if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) return pfn; @@ -1158,7 +1200,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, vma->vm_pgoff; BUG_ON(!kvm_is_mmio_pfn(pfn)); } else { - if (async && (vma->vm_flags & VM_WRITE)) + if (async && vma_is_valid(vma, write_fault)) *async = true; pfn = KVM_PFN_ERR_FAULT; } @@ -1167,19 +1209,40 @@ exit: return pfn; } +static pfn_t +__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, + bool *async, bool write_fault, bool *writable) +{ + unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); + + if (addr == KVM_HVA_ERR_RO_BAD) + return KVM_PFN_ERR_RO_FAULT; + + if (kvm_is_error_hva(addr)) + return KVM_PFN_ERR_BAD; + + /* Do not map writable pfn in the readonly memslot. */ + if (writable && memslot_is_readonly(slot)) { + *writable = false; + writable = NULL; + } + + return hva_to_pfn(addr, atomic, async, write_fault, + writable); +} + static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, bool write_fault, bool *writable) { - unsigned long addr; + struct kvm_memory_slot *slot; if (async) *async = false; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return KVM_PFN_ERR_BAD; + slot = gfn_to_memslot(kvm, gfn); - return hva_to_pfn(addr, atomic, async, write_fault, writable); + return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, + writable); } pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) @@ -1210,15 +1273,12 @@ EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) { - unsigned long addr = gfn_to_hva_memslot(slot, gfn); - return hva_to_pfn(addr, false, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); } pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) { - unsigned long addr = gfn_to_hva_memslot(slot, gfn); - - return hva_to_pfn(addr, true, NULL, true, NULL); + return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); -- cgit v1.2.3-70-g09d2 From 66a03505a7fcc70187319ef2318832f4d3c451a6 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 24 Aug 2012 16:50:28 +0800 Subject: KVM: PPC: book3s: fix build error caused by gfn_to_hva_memslot() The build error was caused by that builtin functions are calling the functions implemented in modules. This error was introduced by commit 4d8b81abc4 ("KVM: introduce readonly memslot"). The patch fixes the build error by moving function __gfn_to_hva_memslot() from kvm_main.c to kvm_host.h and making that "inline" so that the builtin function (kvmppc_h_enter) can use that. Acked-by: Paul Mackerras Signed-off-by: Gavin Shan Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- include/linux/kvm_host.h | 6 ++++++ virt/kvm/kvm_main.c | 6 ------ 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 56ac1a5d991..fb0e821622d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -197,7 +197,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, pa &= PAGE_MASK; } else { /* Translate to host virtual address */ - hva = gfn_to_hva_memslot(memslot, gfn); + hva = __gfn_to_hva_memslot(memslot, gfn); /* Look up the Linux PTE for the backing page */ pte_size = psize; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5972c9845dd..9c0b3c3ae0a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -773,6 +773,12 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) return search_memslots(slots, gfn); } +static inline unsigned long +__gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; +} + static inline int memslot_id(struct kvm *kvm, gfn_t gfn) { return gfn_to_memslot(kvm, gfn)->id; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3416f8a31f6..6425906d7ce 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -984,12 +984,6 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot) return slot->flags & KVM_MEM_READONLY; } -static unsigned long __gfn_to_hva_memslot(struct kvm_memory_slot *slot, - gfn_t gfn) -{ - return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE; -} - static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { -- cgit v1.2.3-70-g09d2 From 2df72e9bc4c505d8357012f2924589f3d16f9d44 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 24 Aug 2012 15:54:57 -0300 Subject: KVM: split kvm_arch_flush_shadow Introducing kvm_arch_flush_shadow_memslot, to invalidate the translations of a single memory slot. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 8 +++++++- arch/powerpc/kvm/powerpc.c | 6 +++++- arch/s390/kvm/kvm-s390.c | 7 ++++++- arch/x86/kvm/x86.c | 8 +++++++- include/linux/kvm_host.h | 6 +++++- virt/kvm/kvm_main.c | 8 ++++---- 6 files changed, 34 insertions(+), 9 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index eac65380bd2..8b3a9c0e771 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1613,11 +1613,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); } +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(); +} + long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 879b14a6140..4d213b8b0fb 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -334,8 +334,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvmppc_core_commit_memory_region(kvm, mem); } +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index e83df7f0fed..ecced9d1898 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -969,7 +969,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) +{ +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) { } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 666da13c34f..37797a090a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6447,12 +6447,18 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); } -void kvm_arch_flush_shadow(struct kvm *kvm) +void kvm_arch_flush_shadow_all(struct kvm *kvm) { kvm_mmu_zap_all(kvm); kvm_reload_remote_mmus(kvm); } +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvm_arch_flush_shadow_all(kvm); +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9c0b3c3ae0a..40791930bc1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -458,7 +458,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc); bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); -void kvm_arch_flush_shadow(struct kvm *kvm); +/* flush all memory translations */ +void kvm_arch_flush_shadow_all(struct kvm *kvm); +/* flush memory translations pointing to 'slot' */ +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot); int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, int nr_pages); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6425906d7ce..a4bf05be5fe 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -408,7 +408,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, int idx; idx = srcu_read_lock(&kvm->srcu); - kvm_arch_flush_shadow(kvm); + kvm_arch_flush_shadow_all(kvm); srcu_read_unlock(&kvm->srcu, idx); } @@ -582,7 +582,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); #else - kvm_arch_flush_shadow(kvm); + kvm_arch_flush_shadow_all(kvm); #endif kvm_arch_destroy_vm(kvm); kvm_free_physmem(kvm); @@ -814,7 +814,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) * - kvm_is_visible_gfn (mmu_check_roots) */ - kvm_arch_flush_shadow(kvm); + kvm_arch_flush_shadow_memslot(kvm, slot); kfree(old_memslots); } @@ -854,7 +854,7 @@ int __kvm_set_memory_region(struct kvm *kvm, * mmio sptes. */ if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) - kvm_arch_flush_shadow(kvm); + kvm_arch_flush_shadow_all(kvm); kvm_free_physmem_slot(&old, &new); kfree(old_memslots); -- cgit v1.2.3-70-g09d2 From 9fc77441e5e1bf80b794cc546d2243ee9f4afb75 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 16 Sep 2012 11:50:30 +0300 Subject: KVM: make processes waiting on vcpu mutex killable vcpu mutex can be held for unlimited time so taking it with mutex_lock on an ioctl is wrong: one process could be passed a vcpu fd and call this ioctl on the vcpu used by another process, it will then be unkillable until the owner exits. Call mutex_lock_killable instead and return status. Note: mutex_lock_interruptible would be even nicer, but I am not sure all users are prepared to handle EINTR from these ioctls. They might misinterpret it as an error. Cleanup paths expect a vcpu that can't be used by any userspace so this will always succeed - catch bugs by calling BUG_ON. Catch callers that don't check return state by adding __must_check. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++++++--- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4d451ed157..19047eafa38 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6016,7 +6016,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.mtrr_state.have_fixed = 1; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; r = kvm_arch_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); @@ -6027,9 +6029,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int r; vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -6275,7 +6279,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); + int r; + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 40791930bc1..80bfc880921 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -408,7 +408,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); -void vcpu_load(struct kvm_vcpu *vcpu); +int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4fe02d90081..cc3f6dc506e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -131,11 +131,12 @@ bool kvm_is_mmio_pfn(pfn_t pfn) /* * Switches to specified vcpu, until a matching vcpu_put() */ -void vcpu_load(struct kvm_vcpu *vcpu) +int vcpu_load(struct kvm_vcpu *vcpu) { int cpu; - mutex_lock(&vcpu->mutex); + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { /* The thread running this VCPU changed. */ struct pid *oldpid = vcpu->pid; @@ -148,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); + return 0; } void vcpu_put(struct kvm_vcpu *vcpu) @@ -1891,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp, #endif - vcpu_load(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; switch (ioctl) { case KVM_RUN: r = -EINVAL; -- cgit v1.2.3-70-g09d2 From 7a84428af7ca6a847f058c9ff244a18a2664fd1b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Sep 2012 11:58:03 -0600 Subject: KVM: Add resampling irqfds for level triggered interrupts To emulate level triggered interrupts, add a resample option to KVM_IRQFD. When specified, a new resamplefd is provided that notifies the user when the irqchip has been resampled by the VM. This may, for instance, indicate an EOI. Also in this mode, posting of an interrupt through an irqfd only asserts the interrupt. On resampling, the interrupt is automatically de-asserted prior to user notification. This enables level triggered interrupts to be posted and re-enabled from vfio with no userspace intervention. All resampling irqfds can make use of a single irq source ID, so we reserve a new one for this interface. Signed-off-by: Alex Williamson Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 13 ++++ arch/x86/kvm/x86.c | 4 + include/linux/kvm.h | 12 ++- include/linux/kvm_host.h | 5 +- virt/kvm/eventfd.c | 150 +++++++++++++++++++++++++++++++++++++- virt/kvm/irq_comm.c | 6 ++ 6 files changed, 184 insertions(+), 6 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 36befa775fd..f6ec3a92e62 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1950,6 +1950,19 @@ the guest using the specified gsi pin. The irqfd is removed using the KVM_IRQFD_FLAG_DEASSIGN flag, specifying both kvm_irqfd.fd and kvm_irqfd.gsi. +With KVM_CAP_IRQFD_RESAMPLE, KVM_IRQFD supports a de-assert and notify +mechanism allowing emulation of level-triggered, irqfd-based +interrupts. When KVM_IRQFD_FLAG_RESAMPLE is set the user must pass an +additional eventfd in the kvm_irqfd.resamplefd field. When operating +in resample mode, posting of an interrupt through kvm_irq.fd asserts +the specified gsi in the irqchip. When the irqchip is resampled, such +as from an EOI, the gsi is de-asserted and the user is notifed via +kvm_irqfd.resamplefd. It is the user's responsibility to re-queue +the interrupt if the device making use of it still requires service. +Note that closing the resamplefd is not sufficient to disable the +irqfd. The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment +and need not be specified with KVM_IRQFD_FLAG_DEASSIGN. + 4.76 KVM_PPC_ALLOCATE_HTAB Capability: KVM_CAP_PPC_ALLOC_HTAB diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fc2a0a132e4..7d44204c604 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2176,6 +2176,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PCI_2_3: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: + case KVM_CAP_IRQFD_RESAMPLE: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -6268,6 +6269,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); + /* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */ + set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + &kvm->arch.irq_sources_bitmap); raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); diff --git a/include/linux/kvm.h b/include/linux/kvm.h index d808694673f..0a6d6ba44c8 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -625,6 +625,7 @@ struct kvm_ppc_smmu_info { #ifdef __KVM_HAVE_READONLY_MEM #define KVM_CAP_READONLY_MEM 81 #endif +#define KVM_CAP_IRQFD_RESAMPLE 82 #ifdef KVM_CAP_IRQ_ROUTING @@ -690,12 +691,21 @@ struct kvm_xen_hvm_config { #endif #define KVM_IRQFD_FLAG_DEASSIGN (1 << 0) +/* + * Available with KVM_CAP_IRQFD_RESAMPLE + * + * KVM_IRQFD_FLAG_RESAMPLE indicates resamplefd is valid and specifies + * the irqfd to operate in resampling mode for level triggered interrupt + * emlation. See Documentation/virtual/kvm/api.txt. + */ +#define KVM_IRQFD_FLAG_RESAMPLE (1 << 1) struct kvm_irqfd { __u32 fd; __u32 gsi; __u32 flags; - __u8 pad[20]; + __u32 resamplefd; + __u8 pad[16]; }; struct kvm_clock_data { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 80bfc880921..2850656e2e9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -119,7 +119,8 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_PMU 16 #define KVM_REQ_PMI 17 -#define KVM_USERSPACE_IRQ_SOURCE_ID 0 +#define KVM_USERSPACE_IRQ_SOURCE_ID 0 +#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 struct kvm; struct kvm_vcpu; @@ -343,6 +344,8 @@ struct kvm { struct { spinlock_t lock; struct list_head items; + struct list_head resampler_list; + struct mutex resampler_lock; } irqfds; struct list_head ioeventfds; #endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 7d7e2aaffec..356965c9d10 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -43,6 +43,31 @@ * -------------------------------------------------------------------- */ +/* + * Resampling irqfds are a special variety of irqfds used to emulate + * level triggered interrupts. The interrupt is asserted on eventfd + * trigger. On acknowledgement through the irq ack notifier, the + * interrupt is de-asserted and userspace is notified through the + * resamplefd. All resamplers on the same gsi are de-asserted + * together, so we don't need to track the state of each individual + * user. We can also therefore share the same irq source ID. + */ +struct _irqfd_resampler { + struct kvm *kvm; + /* + * List of resampling struct _irqfd objects sharing this gsi. + * RCU list modified under kvm->irqfds.resampler_lock + */ + struct list_head list; + struct kvm_irq_ack_notifier notifier; + /* + * Entry in list of kvm->irqfd.resampler_list. Use for sharing + * resamplers among irqfds on the same gsi. + * Accessed and modified under kvm->irqfds.resampler_lock + */ + struct list_head link; +}; + struct _irqfd { /* Used for MSI fast-path */ struct kvm *kvm; @@ -52,6 +77,12 @@ struct _irqfd { /* Used for level IRQ fast-path */ int gsi; struct work_struct inject; + /* The resampler used by this irqfd (resampler-only) */ + struct _irqfd_resampler *resampler; + /* Eventfd notified on resample (resampler-only) */ + struct eventfd_ctx *resamplefd; + /* Entry in list of irqfds for a resampler (resampler-only) */ + struct list_head resampler_link; /* Used for setup/shutdown */ struct eventfd_ctx *eventfd; struct list_head list; @@ -67,8 +98,58 @@ irqfd_inject(struct work_struct *work) struct _irqfd *irqfd = container_of(work, struct _irqfd, inject); struct kvm *kvm = irqfd->kvm; - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); + if (!irqfd->resampler) { + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1); + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0); + } else + kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + irqfd->gsi, 1); +} + +/* + * Since resampler irqfds share an IRQ source ID, we de-assert once + * then notify all of the resampler irqfds using this GSI. We can't + * do multiple de-asserts or we risk racing with incoming re-asserts. + */ +static void +irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian) +{ + struct _irqfd_resampler *resampler; + struct _irqfd *irqfd; + + resampler = container_of(kian, struct _irqfd_resampler, notifier); + + kvm_set_irq(resampler->kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + resampler->notifier.gsi, 0); + + rcu_read_lock(); + + list_for_each_entry_rcu(irqfd, &resampler->list, resampler_link) + eventfd_signal(irqfd->resamplefd, 1); + + rcu_read_unlock(); +} + +static void +irqfd_resampler_shutdown(struct _irqfd *irqfd) +{ + struct _irqfd_resampler *resampler = irqfd->resampler; + struct kvm *kvm = resampler->kvm; + + mutex_lock(&kvm->irqfds.resampler_lock); + + list_del_rcu(&irqfd->resampler_link); + synchronize_rcu(); + + if (list_empty(&resampler->list)) { + list_del(&resampler->link); + kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier); + kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID, + resampler->notifier.gsi, 0); + kfree(resampler); + } + + mutex_unlock(&kvm->irqfds.resampler_lock); } /* @@ -92,6 +173,11 @@ irqfd_shutdown(struct work_struct *work) */ flush_work_sync(&irqfd->inject); + if (irqfd->resampler) { + irqfd_resampler_shutdown(irqfd); + eventfd_ctx_put(irqfd->resamplefd); + } + /* * It is now safe to release the object's resources */ @@ -203,7 +289,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; struct file *file = NULL; - struct eventfd_ctx *eventfd = NULL; + struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; int ret; unsigned int events; @@ -231,6 +317,54 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) irqfd->eventfd = eventfd; + if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) { + struct _irqfd_resampler *resampler; + + resamplefd = eventfd_ctx_fdget(args->resamplefd); + if (IS_ERR(resamplefd)) { + ret = PTR_ERR(resamplefd); + goto fail; + } + + irqfd->resamplefd = resamplefd; + INIT_LIST_HEAD(&irqfd->resampler_link); + + mutex_lock(&kvm->irqfds.resampler_lock); + + list_for_each_entry(resampler, + &kvm->irqfds.resampler_list, list) { + if (resampler->notifier.gsi == irqfd->gsi) { + irqfd->resampler = resampler; + break; + } + } + + if (!irqfd->resampler) { + resampler = kzalloc(sizeof(*resampler), GFP_KERNEL); + if (!resampler) { + ret = -ENOMEM; + mutex_unlock(&kvm->irqfds.resampler_lock); + goto fail; + } + + resampler->kvm = kvm; + INIT_LIST_HEAD(&resampler->list); + resampler->notifier.gsi = irqfd->gsi; + resampler->notifier.irq_acked = irqfd_resampler_ack; + INIT_LIST_HEAD(&resampler->link); + + list_add(&resampler->link, &kvm->irqfds.resampler_list); + kvm_register_irq_ack_notifier(kvm, + &resampler->notifier); + irqfd->resampler = resampler; + } + + list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list); + synchronize_rcu(); + + mutex_unlock(&kvm->irqfds.resampler_lock); + } + /* * Install our own custom wake-up handling so we are notified via * a callback whenever someone signals the underlying eventfd @@ -276,6 +410,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) return 0; fail: + if (irqfd->resampler) + irqfd_resampler_shutdown(irqfd); + + if (resamplefd && !IS_ERR(resamplefd)) + eventfd_ctx_put(resamplefd); + if (eventfd && !IS_ERR(eventfd)) eventfd_ctx_put(eventfd); @@ -291,6 +431,8 @@ kvm_eventfd_init(struct kvm *kvm) { spin_lock_init(&kvm->irqfds.lock); INIT_LIST_HEAD(&kvm->irqfds.items); + INIT_LIST_HEAD(&kvm->irqfds.resampler_list); + mutex_init(&kvm->irqfds.resampler_lock); INIT_LIST_HEAD(&kvm->ioeventfds); } @@ -340,7 +482,7 @@ kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { - if (args->flags & ~KVM_IRQFD_FLAG_DEASSIGN) + if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 3ca89c451d6..2eb58af7ee9 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -228,6 +228,9 @@ int kvm_request_irq_source_id(struct kvm *kvm) } ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); +#ifdef CONFIG_X86 + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); +#endif set_bit(irq_source_id, bitmap); unlock: mutex_unlock(&kvm->irq_lock); @@ -238,6 +241,9 @@ unlock: void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) { ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID); +#ifdef CONFIG_X86 + ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID); +#endif mutex_lock(&kvm->irq_lock); if (irq_source_id < 0 || -- cgit v1.2.3-70-g09d2 From bf9fae9f5e4ca8dce4708812f9ad6281e61df109 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Sep 2012 15:23:11 +0200 Subject: cputime: Use a proper subsystem naming for vtime related APIs Use a naming based on vtime as a prefix for virtual based cputime accounting APIs: - account_system_vtime() -> vtime_account() - account_switch_vtime() -> vtime_task_switch() It makes it easier to allow for further declension such as vtime_account_system(), vtime_account_idle(), ... if we want to find out the context we account to from generic code. This also make it better to know on which subsystem these APIs refer to. Signed-off-by: Frederic Weisbecker Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra --- arch/ia64/kernel/time.c | 6 +++--- arch/powerpc/kernel/time.c | 10 +++++----- arch/s390/kernel/vtime.c | 6 +++--- include/linux/hardirq.h | 8 ++++---- include/linux/kernel_stat.h | 4 ++-- include/linux/kvm_host.h | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 8 ++++---- kernel/softirq.c | 6 +++--- 9 files changed, 27 insertions(+), 27 deletions(-) (limited to 'include/linux/kvm_host.h') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 6247197b987..16bb6eda879 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -88,7 +88,7 @@ extern cputime_t cycle_to_cputime(u64 cyc); * accumulated times to the current process, and to prepare accounting on * the next process. */ -void account_switch_vtime(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { struct thread_info *pi = task_thread_info(prev); struct thread_info *ni = task_thread_info(current); @@ -116,7 +116,7 @@ void account_switch_vtime(struct task_struct *prev) * Account time for a transition between system, hard irq or soft irq state. * Note that this function is called with interrupts enabled. */ -void account_system_vtime(struct task_struct *tsk) +void vtime_account(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); unsigned long flags; @@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk) local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); /* * Called from the timer interrupt handler to charge accumulated user time diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 49da7f06e64..39899d7ebda 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -291,7 +291,7 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * Account time for a transition between system, hard irq * or soft irq state. */ -void account_system_vtime(struct task_struct *tsk) +void vtime_account(struct task_struct *tsk) { u64 now, nowscaled, delta, deltascaled; unsigned long flags; @@ -343,14 +343,14 @@ void account_system_vtime(struct task_struct *tsk) } local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); /* * Transfer the user and system times accumulated in the paca * by the exception entry and exit code to the generic process * user and system time records. * Must be called with interrupts disabled. - * Assumes that account_system_vtime() has been called recently + * Assumes that vtime_account() has been called recently * (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. */ @@ -366,9 +366,9 @@ void account_process_tick(struct task_struct *tsk, int user_tick) account_user_time(tsk, utime, utimescaled); } -void account_switch_vtime(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { - account_system_vtime(prev); + vtime_account(prev); account_process_tick(prev, 0); } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 449ac22cc71..cb5093c26d1 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -99,7 +99,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) return virt_timer_forward(user + system); } -void account_switch_vtime(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { struct thread_info *ti; @@ -122,7 +122,7 @@ void account_process_tick(struct task_struct *tsk, int user_tick) * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void account_system_vtime(struct task_struct *tsk) +void vtime_account(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); u64 timer, system; @@ -138,7 +138,7 @@ void account_system_vtime(struct task_struct *tsk) virt_timer_forward(system); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); void __kprobes vtime_stop_cpu(void) { diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 305f23cd7cf..cab3da3d094 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -132,11 +132,11 @@ extern void synchronize_irq(unsigned int irq); struct task_struct; #if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) -static inline void account_system_vtime(struct task_struct *tsk) +static inline void vtime_account(struct task_struct *tsk) { } #else -extern void account_system_vtime(struct task_struct *tsk); +extern void vtime_account(struct task_struct *tsk); #endif #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) @@ -162,7 +162,7 @@ extern void rcu_nmi_exit(void); */ #define __irq_enter() \ do { \ - account_system_vtime(current); \ + vtime_account(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -178,7 +178,7 @@ extern void irq_enter(void); #define __irq_exit() \ do { \ trace_hardirq_exit(); \ - account_system_vtime(current); \ + vtime_account(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index bbe5d15d659..ca0944b92f4 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -131,9 +131,9 @@ extern void account_steal_ticks(unsigned long ticks); extern void account_idle_ticks(unsigned long ticks); #ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void account_switch_vtime(struct task_struct *prev); +extern void vtime_task_switch(struct task_struct *prev); #else -static inline void account_switch_vtime(struct task_struct *prev) { } +static inline void vtime_task_switch(struct task_struct *prev) { } #endif #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b70b48b0109..8a59e0abe5f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -685,7 +685,7 @@ static inline int kvm_deassign_device(struct kvm *kvm, static inline void kvm_guest_enter(void) { BUG_ON(preemptible()); - account_system_vtime(current); + vtime_account(current); current->flags |= PF_VCPU; /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -699,7 +699,7 @@ static inline void kvm_guest_enter(void) static inline void kvm_guest_exit(void) { - account_system_vtime(current); + vtime_account(current); current->flags &= ~PF_VCPU; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ba144b121f3..21e4dcff18f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1764,7 +1764,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * Manfred Spraul */ prev_state = prev->state; - account_switch_vtime(prev); + vtime_task_switch(prev); finish_arch_switch(prev); perf_event_task_sched_in(prev, current); finish_lock_switch(rq, prev); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 372692bd537..53f5b12f282 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -10,11 +10,11 @@ /* * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU + * They are only modified in vtime_account, on corresponding CPU * with interrupts disabled. So, writes are safe. * They are read and saved off onto struct rq in update_rq_clock(). * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old + * race with irq/vtime_account on this CPU. We would either get old * or new value with a side effect of accounting a slice of irq time to wrong * task when irq is in progress while we read rq->clock. That is a worthy * compromise in place of having locks on each irq in account_system_time. @@ -43,7 +43,7 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void account_system_vtime(struct task_struct *curr) +void vtime_account(struct task_struct *curr) { unsigned long flags; s64 delta; @@ -73,7 +73,7 @@ void account_system_vtime(struct task_struct *curr) irq_time_write_end(); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(account_system_vtime); +EXPORT_SYMBOL_GPL(vtime_account); static int irqtime_account_hi_update(void) { diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09..d55e3159f92 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -220,7 +220,7 @@ asmlinkage void __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_system_vtime(current); + vtime_account(current); __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); @@ -271,7 +271,7 @@ restart: lockdep_softirq_exit(); - account_system_vtime(current); + vtime_account(current); __local_bh_enable(SOFTIRQ_OFFSET); tsk_restore_flags(current, old_flags, PF_MEMALLOC); } @@ -340,7 +340,7 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { - account_system_vtime(current); + vtime_account(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); if (!in_interrupt() && local_softirq_pending()) -- cgit v1.2.3-70-g09d2