From 3f6d8c8a478dd1ab2a4944b0d65474df06ecd882 Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Tue, 22 May 2012 11:23:15 +0800 Subject: KVM: VMX: Use EPT Access bit in response to memory notifiers Signed-off-by: Haitao Shan Signed-off-by: Xudong Hao Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be3cea4407f..d07e436b7a4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1242,7 +1242,8 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int young = 0; /* - * Emulate the accessed bit for EPT, by checking if this page has + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has * an EPT mapping, and clearing it if it does. On the next access, * a new EPT mapping will be established. * This has some overhead, but not as much as the cost of swapping @@ -1253,11 +1254,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); } } @@ -1281,9 +1283,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; break; } -- cgit v1.2.3-70-g09d2 From 1952639665e92481c34c34c3e2a71bf3e66ba362 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 4 Jun 2012 14:53:23 +0300 Subject: KVM: MMU: do not iterate over all VMs in mmu_shrink() mmu_shrink() needlessly iterates over all VMs even though it will not attempt to free mmu pages from more than one on them. Fix that and also check used mmu pages count outside of VM lock to skip inactive VMs faster. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d07e436b7a4..1ca7164a74f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3944,7 +3944,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; - struct kvm *kvm_freed = NULL; int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) @@ -3956,22 +3955,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * n_used_mmu_pages is accessed without holding kvm->mmu_lock + * here. We may skip a VM instance errorneosly, but we do not + * want to shrink a VM that only started to populate its MMU + * anyway. + */ + if (kvm->arch.n_used_mmu_pages > 0) { + if (!nr_to_scan--) + break; + continue; + } + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; - } - nr_to_scan--; + kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + + list_move_tail(&kvm->vm_list, &vm_list); + break; } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); raw_spin_unlock(&kvm_lock); -- cgit v1.2.3-70-g09d2 From 79f702a6d18c75760c68202007265b2310d6f44e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 3 Jun 2012 11:34:08 +0300 Subject: KVM: disable uninitialized var warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I see this in 3.5-rc1: arch/x86/kvm/mmu.c: In function ‘kvm_test_age_rmapp’: arch/x86/kvm/mmu.c:1271: warning: ‘iter.desc’ may be used uninitialized in this function The line in question was introduced by commit 1e3f42f03c38c29c1814199a6f0a2f01b919ea3f static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { - u64 *spte; + u64 *sptep; + struct rmap_iterator iter; <- line 1271 int young = 0; /* The reason I think is that the compiler assumes that the rmap value could be 0, so static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) { if (!rmap) return NULL; if (!(rmap & 1)) { iter->desc = NULL; return (u64 *)rmap; } iter->desc = (struct pte_list_desc *)(rmap & ~1ul); iter->pos = 0; return iter->desc->sptes[iter->pos]; } will not initialize iter.desc, but the compiler isn't smart enough to see that for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { will immediately exit in this case. I checked by adding if (!*rmapp) goto out; on top which is clearly equivalent but disables the warning. This patch uses uninitialized_var to disable the warning without increasing code size. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1ca7164a74f..24dd43d45ae 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1238,7 +1238,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { u64 *sptep; - struct rmap_iterator iter; + struct rmap_iterator uninitialized_var(iter); int young = 0; /* -- cgit v1.2.3-70-g09d2 From 80feb89a0a1381642f1cce9036ef3bb22f13b40a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Tue, 29 May 2012 23:54:26 +0900 Subject: KVM: MMU: Remove unused parameter from mmu_memory_cache_alloc() Size is not needed to return one from pre-allocated objects. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24dd43d45ae..b32a11dc884 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -652,8 +652,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -664,8 +663,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1403,12 +1401,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); -- cgit v1.2.3-70-g09d2 From 00763e41138267a307531397f9745835aecb8c7b Mon Sep 17 00:00:00 2001 From: Xudong Hao Date: Thu, 7 Jun 2012 18:26:07 +0800 Subject: KVM: x86: change PT_FIRST_AVAIL_BITS_SHIFT to avoid conflict with EPT Dirty bit EPT Dirty bit use bit 9 as Intel SDM definition, to avoid conflict, change PT_FIRST_AVAIL_BITS_SHIFT to 10. Signed-off-by: Xudong Hao Signed-off-by: Xiantao Zhang Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b32a11dc884..3b53d9e08bf 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_LEVEL_BITS 9 -- cgit v1.2.3-70-g09d2 From e676505ac96813e8b93170b1f5e5ffe0cf6a2348 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 8 Jul 2012 17:16:30 +0300 Subject: KVM: MMU: Force cr3 reload with two dimensional paging on mov cr3 emulation Currently the MMU's ->new_cr3() callback does nothing when guest paging is disabled or when two-dimentional paging (e.g. EPT on Intel) is active. This means that an emulated write to cr3 can be lost; kvm_set_cr3() will write vcpu-arch.cr3, but the GUEST_CR3 field in the VMCS will retain its old value and this is what the guest sees. This bug did not have any effect until now because: - with unrestricted guest, or with svm, we never emulate a mov cr3 instruction - without unrestricted guest, and with paging enabled, we also never emulate a mov cr3 instruction - without unrestricted guest, but with paging disabled, the guest's cr3 is ignored until the guest enables paging; at this point the value from arch.cr3 is loaded correctly my the mov cr0 instruction which turns on paging However, the patchset that enables big real mode causes us to emulate mov cr3 instructions in protected mode sometimes (when guest state is not virtualizable by vmx); this mov cr3 is effectively ignored and will crash the guest. The fix is to make nonpaging_new_cr3() call mmu_free_roots() to force a cr3 reload. This is awkward because now all the new_cr3 callbacks to the same thing, and because mmu_free_roots() is somewhat of an overkill; but fixing that is more complicated and will be done after this minimal fix. Observed in the Window XP 32-bit installer while bringing up secondary vcpus. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b53d9e08bf..569cd66ba24 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -188,6 +188,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { @@ -2401,6 +2402,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { + mmu_free_roots(vcpu); } static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, -- cgit v1.2.3-70-g09d2 From 2f84569f978cd7d54970d893b4c4e68ef24dc1ec Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:56:53 +0800 Subject: KVM: MMU: return bool in __rmap_write_protect The reture value of __rmap_write_protect is either 1 or 0, use true/false instead of these Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 569cd66ba24..5dd22427251 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1050,11 +1050,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) +static bool +__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { u64 *sptep; struct rmap_iterator iter; - int write_protected = 0; + bool write_protected = false; for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); @@ -1075,7 +1076,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level sptep = rmap_get_first(*rmapp, &iter); } - write_protected = 1; + write_protected = true; } return write_protected; @@ -1106,12 +1107,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, } } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; - int write_protected = 0; + bool write_protected = false; slot = gfn_to_memslot(kvm, gfn); @@ -1700,7 +1701,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { - int protected = 0; + bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu->kvm, sp->gfn); -- cgit v1.2.3-70-g09d2 From d13bc5b5a1f9eafd59331baa1d1d32e1867f57b5 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:57:15 +0800 Subject: KVM: MMU: abstract spte write-protect Introduce a common function to abstract spte write-protect to cleanup the code Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 58 +++++++++++++++++++++++++++++------------------------- 1 file changed, 31 insertions(+), 27 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5dd22427251..d04d6305a72 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1050,36 +1050,48 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } +/* Return true if the spte is dropped. */ +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte)) + return false; + + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + + *flush |= true; + if (is_large_pte(spte)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; + } + + spte = spte & ~PT_WRITABLE_MASK; + mmu_spte_update(sptep, spte); + return false; +} + static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) { u64 *sptep; struct rmap_iterator iter; - bool write_protected = false; + bool flush = false; for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(&iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); - sptep = rmap_get_next(&iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm->stat.lpages; + if (spte_write_protect(kvm, sptep, &flush)) { sptep = rmap_get_first(*rmapp, &iter); + continue; } - write_protected = true; + sptep = rmap_get_next(&iter); } - return write_protected; + return flush; } /** @@ -3886,6 +3898,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + bool flush = false; list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; @@ -3900,16 +3913,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp->role.level)) continue; - if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i]); - --kvm->stat.lpages; - continue; - } - - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(&pt[i], - pt[i] & ~PT_WRITABLE_MASK); + spte_write_protect(kvm, &pt[i], &flush); } } kvm_flush_remote_tlbs(kvm); -- cgit v1.2.3-70-g09d2 From 8e22f955fb65c5930cc4c5a863cce4e27d0e4a3c Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:57:39 +0800 Subject: KVM: MMU: cleanup spte_write_protect Use __drop_large_spte to cleanup this function and comment spte_write_protect Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d04d6305a72..ed9e9680608 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1050,7 +1050,33 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -/* Return true if the spte is dropped. */ + +static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; + } + + return false; +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (__drop_large_spte(vcpu->kvm, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); +} + +/* + * Write-protect on the specified @sptep due to dirty page logging or + * protecting shadow page table. @flush indicates whether tlb need be + * flushed. + * + * Return true if the spte is dropped. + */ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) { u64 spte = *sptep; @@ -1061,13 +1087,9 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); *flush |= true; - if (is_large_pte(spte)) { - WARN_ON(page_header(__pa(sptep))->role.level == - PT_PAGE_TABLE_LEVEL); - drop_spte(kvm, sptep); - --kvm->stat.lpages; + + if (__drop_large_spte(kvm, sptep)) return true; - } spte = spte & ~PT_WRITABLE_MASK; mmu_spte_update(sptep, spte); @@ -1878,15 +1900,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) mmu_spte_set(sptep, spte); } -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep); - --vcpu->kvm->stat.lpages; - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { -- cgit v1.2.3-70-g09d2 From 6e7d035407dc402a313e466c4f7ccb21aaed0da2 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:58:33 +0800 Subject: KVM: MMU: fold tlb flush judgement into mmu_spte_update mmu_spte_update() is the common function, we can easily audit the path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ed9e9680608..a2fc65ba76a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -479,15 +479,24 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changged. + * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB, the return value indicates this + * case. */ -static void mmu_spte_update(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { u64 mask, old_spte = *sptep; + bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); - if (!is_shadow_present_pte(old_spte)) - return mmu_spte_set(sptep, new_spte); + if (!is_shadow_present_pte(old_spte)) { + mmu_spte_set(sptep, new_spte); + return ret; + } new_spte |= old_spte & shadow_dirty_mask; @@ -500,13 +509,18 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte) else old_spte = __update_clear_spte_slow(sptep, new_spte); + if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + ret = true; + if (!shadow_accessed_mask) - return; + return ret; if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + return ret; } /* @@ -2268,7 +2282,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte, entry = *sptep; + u64 spte; int ret = 0; if (set_mmio_spte(sptep, gfn, pfn, pte_access)) @@ -2346,14 +2360,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - mmu_spte_update(sptep, spte); - /* - * If we overwrite a writable spte with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. - */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); done: return ret; -- cgit v1.2.3-70-g09d2 From 49fde3406f3266c5af9430467672c20b63a31e83 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:58:58 +0800 Subject: KVM: MMU: introduce SPTE_MMU_WRITEABLE bit This bit indicates whether the spte can be writable on MMU, that means the corresponding gpte is writable and the corresponding gfn is not protected by shadow page protection In the later path, SPTE_MMU_WRITEABLE will indicates whether the spte can be locklessly updated Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 57 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 19 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a2fc65ba76a..b160652f7ee 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); #define CREATE_TRACE_POINTS #include "mmutrace.h" -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -1084,34 +1085,51 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) kvm_flush_remote_tlbs(vcpu->kvm); } +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + /* - * Write-protect on the specified @sptep due to dirty page logging or - * protecting shadow page table. @flush indicates whether tlb need be - * flushed. + * Write-protect on the specified @sptep, @pt_protect indicates whether + * spte writ-protection is caused by protecting shadow page table. + * @flush indicates whether tlb need be flushed. + * + * Note: write protection is difference between drity logging and spte + * protection: + * - for dirty logging, the spte can be set to writable at anytime if + * its dirty bitmap is properly set. + * - for spte protection, the spte can be writable only after unsync-ing + * shadow page. * * Return true if the spte is dropped. */ -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush) +static bool +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) { u64 spte = *sptep; - if (!is_writable_pte(spte)) + if (!is_writable_pte(spte) && + !(pt_protect && spte_is_locklessly_modifiable(spte))) return false; rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - *flush |= true; - - if (__drop_large_spte(kvm, sptep)) + if (__drop_large_spte(kvm, sptep)) { + *flush |= true; return true; + } + if (pt_protect) + spte &= ~SPTE_MMU_WRITEABLE; spte = spte & ~PT_WRITABLE_MASK; - mmu_spte_update(sptep, spte); + + *flush |= mmu_spte_update(sptep, spte); return false; } -static bool -__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) +static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, + int level, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; @@ -1119,7 +1137,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - if (spte_write_protect(kvm, sptep, &flush)) { + if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { sptep = rmap_get_first(*rmapp, &iter); continue; } @@ -1148,7 +1166,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ mask &= mask - 1; @@ -1167,7 +1185,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn) for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); + write_protected |= __rmap_write_protect(kvm, rmapp, i, true); } return write_protected; @@ -2296,8 +2314,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_x_mask; else spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) @@ -2322,7 +2342,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto done; } - spte |= PT_WRITABLE_MASK; + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; if (!vcpu->arch.mmu.direct_map && !(pte_access & ACC_WRITE_MASK)) { @@ -2351,8 +2371,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte &= ~PT_WRITABLE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } } @@ -3933,7 +3952,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp->role.level)) continue; - spte_write_protect(kvm, &pt[i], &flush); + spte_write_protect(kvm, &pt[i], &flush, false); } } kvm_flush_remote_tlbs(kvm); -- cgit v1.2.3-70-g09d2 From c7ba5b48cc8ddc015a9e0463813ca1e60bc42c59 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:59:18 +0800 Subject: KVM: MMU: fast path of handling guest page fault If the the present bit of page fault error code is set, it indicates the shadow page is populated on all levels, it means what we do is only modify the access bit which can be done out of mmu-lock Currently, in order to simplify the code, we only fix the page fault caused by write-protect on the fast path Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 127 insertions(+), 17 deletions(-) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b160652f7ee..8637bffbdb4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -446,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + static bool spte_has_volatile_bits(u64 spte) { + /* + * Always atomicly update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (spte_is_locklessly_modifiable(spte)) + return true; + if (!shadow_accessed_mask) return false; @@ -489,7 +503,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) */ static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 mask, old_spte = *sptep; + u64 old_spte = *sptep; bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); @@ -499,17 +513,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) return ret; } - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; - - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); + /* + * For the spte updated out of mmu-lock is safe, since + * we always atomicly update it, see the comments in + * spte_has_volatile_bits(). + */ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) ret = true; @@ -1085,11 +1098,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) kvm_flush_remote_tlbs(vcpu->kvm); } -static bool spte_is_locklessly_modifiable(u64 spte) -{ - return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); -} - /* * Write-protect on the specified @sptep, @pt_protect indicates whether * spte writ-protection is caused by protecting shadow page table. @@ -2677,18 +2685,114 @@ exit: return ret; } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +{ + /* + * #PF can be fast only if the shadow page table is present and it + * is caused by write-protect, that means we just need change the + * W bit of the spte which can be done out of mmu-lock. + */ + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + gfn_t gfn; + + WARN_ON(!sp->role.direct); + + /* + * The gfn of direct spte is stable since it is calculated + * by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + mark_page_dirty(vcpu->kvm, gfn); + + return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. + */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, + u32 error_code) +{ + struct kvm_shadow_walk_iterator iterator; + bool ret = false; + u64 spte = 0ull; + + if (!page_fault_can_be_fast(vcpu, error_code)) + return false; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || iterator.level < level) + break; + + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_rmap_spte(spte)) { + ret = true; + goto exit; + } + + if (!is_last_spte(spte, level)) + goto exit; + + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + ret = true; + goto exit; + } + + /* + * Currently, to simplify the code, only the spte write-protected + * by dirty-log can be fast fixed. + */ + if (!spte_is_locklessly_modifiable(spte)) + goto exit; + + /* + * Currently, fast page fault only works for direct mapping since + * the gfn is not stable for indirect shadow page. + * See Documentation/virtual/kvm/locking.txt to get more detail. + */ + ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); +exit: + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + gfn_t gfn, bool prefault) { int r; int level; int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; + bool map_writable, write = error_code & PFERR_WRITE_MASK; force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); if (likely(!force_pt_level)) { @@ -2705,6 +2809,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, v, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3093,7 +3200,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); + error_code, gfn, prefault); } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3173,6 +3280,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, gpa, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); -- cgit v1.2.3-70-g09d2 From a72faf2504dfc12ff9bfb486a42f2761296666ff Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Jun 2012 15:59:41 +0800 Subject: KVM: MMU: trace fast page fault To see what happen on this path and help us to optimize it Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/mmutrace.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) (limited to 'arch/x86/kvm/mmu.c') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8637bffbdb4..28c8fbcc676 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2776,6 +2776,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, */ ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); exit: + trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322..c364abc8d03 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -243,6 +243,44 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); + +#define __spte_satisfied(__spte) \ + (__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gva_t, gva) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gva = gva; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; + __entry->new_spte = *sptep; + __entry->retry = retry; + ), + + TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, + __entry->gva, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3-70-g09d2