diff options
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r-- | arch/x86/kvm/mmu.c | 251 |
1 files changed, 161 insertions, 90 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index fbb04aee830..9cafbb49981 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -18,9 +18,11 @@ * */ +#include "irq.h" #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "x86.h" #include <linux/kvm_host.h> #include <linux/types.h> @@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; -static u64 __read_mostly shadow_base_present_pte; static u64 __read_mostly shadow_nx_mask; static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; @@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) } EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); -void kvm_mmu_set_base_ptes(u64 base_pte) -{ - shadow_base_present_pte = base_pte; -} -EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); - void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, u64 dirty_mask, u64 nx_mask, u64 x_mask) { @@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) } /* - * Return the pointer to the largepage write count for a given - * gfn, handling slots that are not large page aligned. + * Return the pointer to the large page information for a given gfn, + * handling slots that are not large page aligned. */ -static int *slot_largepage_idx(gfn_t gfn, - struct kvm_memory_slot *slot, - int level) +static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, + struct kvm_memory_slot *slot, + int level) { unsigned long idx; idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); - return &slot->lpage_info[level - 2][idx].write_count; + return &slot->lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - int *write_count; + struct kvm_lpage_info *linfo; int i; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - write_count = slot_largepage_idx(gfn, slot, i); - *write_count += 1; + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count += 1; } } static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { struct kvm_memory_slot *slot; - int *write_count; + struct kvm_lpage_info *linfo; int i; slot = gfn_to_memslot(kvm, gfn); for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - write_count = slot_largepage_idx(gfn, slot, i); - *write_count -= 1; - WARN_ON(*write_count < 0); + linfo = lpage_info_slot(gfn, slot, i); + linfo->write_count -= 1; + WARN_ON(linfo->write_count < 0); } } @@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm, int level) { struct kvm_memory_slot *slot; - int *largepage_idx; + struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); if (slot) { - largepage_idx = slot_largepage_idx(gfn, slot, level); - return *largepage_idx; + linfo = lpage_info_slot(gfn, slot, level); + return linfo->write_count; } return 1; @@ -590,16 +585,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) { struct kvm_memory_slot *slot; - unsigned long idx; + struct kvm_lpage_info *linfo; slot = gfn_to_memslot(kvm, gfn); if (likely(level == PT_PAGE_TABLE_LEVEL)) return &slot->rmap[gfn - slot->base_gfn]; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); + linfo = lpage_info_slot(gfn, slot, level); - return &slot->lpage_info[level - 2][idx].rmap_pde; + return &linfo->rmap_pde; } /* @@ -887,19 +881,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + gfn_t gfn = memslot->base_gfn + gfn_offset; ret = handler(kvm, &memslot->rmap[gfn_offset], data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - unsigned long idx; - int sh; - - sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); - idx = ((memslot->base_gfn+gfn_offset) >> sh) - - (memslot->base_gfn >> sh); - ret |= handler(kvm, - &memslot->lpage_info[j][idx].rmap_pde, - data); + struct kvm_lpage_info *linfo; + + linfo = lpage_info_slot(gfn, memslot, + PT_DIRECTORY_LEVEL + j); + ret |= handler(kvm, &linfo->rmap_pde, data); } trace_kvm_age_page(hva, memslot, ret); retval |= ret; @@ -1161,7 +1152,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, } static int nonpaging_sync_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, bool clear_unsync) + struct kvm_mmu_page *sp) { return 1; } @@ -1291,7 +1282,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (clear_unsync) kvm_unlink_unsync_page(vcpu->kvm, sp); - if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); return 1; } @@ -1332,12 +1323,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) continue; WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + kvm_unlink_unsync_page(vcpu->kvm, s); if ((s->role.cr4_pae != !!is_pae(vcpu)) || - (vcpu->arch.mmu.sync_page(vcpu, s, true))) { + (vcpu->arch.mmu.sync_page(vcpu, s))) { kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); continue; } - kvm_unlink_unsync_page(vcpu->kvm, s); flush = true; } @@ -1963,9 +1954,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int user_fault, int write_fault, int dirty, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool can_unsync, bool reset_host_protection) + bool can_unsync, bool host_writable) { - u64 spte; + u64 spte, entry = *sptep; int ret = 0; /* @@ -1973,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * whether the guest actually used the pte (in order to detect * demand paging). */ - spte = shadow_base_present_pte; + spte = PT_PRESENT_MASK; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) @@ -1990,8 +1981,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); - if (reset_host_protection) + if (host_writable) spte |= SPTE_HOST_WRITEABLE; + else + pte_access &= ~ACC_WRITE_MASK; spte |= (u64)pfn << PAGE_SHIFT; @@ -2036,6 +2029,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, set_pte: update_spte(sptep, spte); + /* + * If we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB. + */ + if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); done: return ret; } @@ -2045,7 +2046,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int user_fault, int write_fault, int dirty, int *ptwrite, int level, gfn_t gfn, pfn_t pfn, bool speculative, - bool reset_host_protection) + bool host_writable) { int was_rmapped = 0; int rmap_count; @@ -2080,7 +2081,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, dirty, level, gfn, pfn, speculative, true, - reset_host_protection)) { + host_writable)) { if (write_fault) *ptwrite = 1; kvm_mmu_flush_tlb(vcpu); @@ -2211,7 +2212,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int level, gfn_t gfn, pfn_t pfn) + int map_writable, int level, gfn_t gfn, pfn_t pfn, + bool prefault) { struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; @@ -2220,9 +2222,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { if (iterator.level == level) { - mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, + unsigned pte_access = ACC_ALL; + + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access, 0, write, 1, &pt_write, - level, gfn, pfn, false, true); + level, gfn, pfn, prefault, map_writable); direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; @@ -2277,12 +2281,17 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) return 1; } -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) +static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, + gva_t gva, pfn_t *pfn, bool write, bool *writable); + +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, + bool prefault) { int r; int level; pfn_t pfn; unsigned long mmu_seq; + bool map_writable; level = mapping_level(vcpu, gfn); @@ -2297,7 +2306,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); + + if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) + return 0; /* mmio */ if (is_error_pfn(pfn)) @@ -2307,7 +2318,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, level, gfn, pfn); + r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, + prefault); spin_unlock(&vcpu->kvm->mmu_lock); @@ -2530,6 +2542,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); return; } for (i = 0; i < 4; ++i) { @@ -2552,23 +2565,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, u32 *error) + u32 access, struct x86_exception *exception) { - if (error) - *error = 0; + if (exception) + exception->error_code = 0; return vaddr; } static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, - u32 access, u32 *error) + u32 access, + struct x86_exception *exception) { - if (error) - *error = 0; + if (exception) + exception->error_code = 0; return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); } static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 error_code) + u32 error_code, bool prefault) { gfn_t gfn; int r; @@ -2584,17 +2598,67 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn); + error_code & PFERR_WRITE_MASK, gfn, prefault); +} + +static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) +{ + struct kvm_arch_async_pf arch; + + arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; + arch.gfn = gfn; + arch.direct_map = vcpu->arch.mmu.direct_map; + arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); + + return kvm_setup_async_pf(vcpu, gva, gfn, &arch); } -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, - u32 error_code) +static bool can_do_async_pf(struct kvm_vcpu *vcpu) +{ + if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + kvm_event_needs_reinjection(vcpu))) + return false; + + return kvm_x86_ops->interrupt_allowed(vcpu); +} + +static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, + gva_t gva, pfn_t *pfn, bool write, bool *writable) +{ + bool async; + + *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); + + if (!async) + return false; /* *pfn has correct page already */ + + put_page(pfn_to_page(*pfn)); + + if (!prefault && can_do_async_pf(vcpu)) { + trace_kvm_try_async_get_page(gva, gfn); + if (kvm_find_async_pf_gfn(vcpu, gfn)) { + trace_kvm_async_pf_doublefault(gva, gfn); + kvm_make_request(KVM_REQ_APF_HALT, vcpu); + return true; + } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) + return true; + } + + *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); + + return false; +} + +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, + bool prefault) { pfn_t pfn; int r; int level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; + int write = error_code & PFERR_WRITE_MASK; + bool map_writable; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -2609,15 +2673,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); + + if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) + return 0; + + /* mmio */ if (is_error_pfn(pfn)) return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - level, gfn, pfn); + r = __direct_map(vcpu, gpa, write, map_writable, + level, gfn, pfn, prefault); spin_unlock(&vcpu->kvm->mmu_lock); return r; @@ -2659,18 +2727,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) static void paging_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); + pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu)); mmu_free_roots(vcpu); } static unsigned long get_cr3(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr3; + return kvm_read_cr3(vcpu); } -static void inject_page_fault(struct kvm_vcpu *vcpu) +static void inject_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { - vcpu->arch.mmu.inject_page_fault(vcpu); + vcpu->arch.mmu.inject_page_fault(vcpu, fault); } static void paging_free(struct kvm_vcpu *vcpu) @@ -2816,6 +2885,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { struct kvm_mmu *context = vcpu->arch.walk_mmu; + context->base_role.word = 0; context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; context->free = nonpaging_free; @@ -3008,9 +3078,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } - if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) - return; - ++vcpu->kvm->stat.mmu_pte_updated; if (!sp->role.cr4_pae) paging32_update_pte(vcpu, sp, spte, new); @@ -3264,12 +3331,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) } } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, + void *insn, int insn_len) { int r; enum emulation_result er; - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); if (r < 0) goto out; @@ -3282,7 +3350,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) if (r) goto out; - er = emulate_instruction(vcpu, cr2, error_code, 0); + er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); switch (er) { case EMULATE_DONE: @@ -3377,11 +3445,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (!test_bit(slot, sp->slot_bitmap)) continue; + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + continue; + pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ if (is_writable_pte(pt[i])) - pt[i] &= ~PT_WRITABLE_MASK; + update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); } kvm_flush_remote_tlbs(kvm); } @@ -3463,13 +3534,6 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } -void kvm_mmu_module_exit(void) -{ - mmu_destroy_caches(); - percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); -} - int kvm_mmu_module_init(void) { pte_chain_cache = kmem_cache_create("kvm_pte_chain", @@ -3566,7 +3630,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu)); return 1; } @@ -3662,12 +3726,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) } EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); -#ifdef CONFIG_KVM_MMU_AUDIT -#include "mmu_audit.c" -#else -static void mmu_audit_disable(void) { } -#endif - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { ASSERT(vcpu); @@ -3675,5 +3733,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu) destroy_kvm_mmu(vcpu); free_mmu_pages(vcpu); mmu_free_memory_caches(vcpu); +} + +#ifdef CONFIG_KVM_MMU_AUDIT +#include "mmu_audit.c" +#else +static void mmu_audit_disable(void) { } +#endif + +void kvm_mmu_module_exit(void) +{ + mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); + unregister_shrinker(&mmu_shrinker); mmu_audit_disable(); } |