summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/mmu.c')
-rw-r--r--arch/x86/kvm/mmu.c684
1 files changed, 542 insertions, 142 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bfe2bd305e..99c239c5c0a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
+static int oos_shadow = 1;
+module_param(oos_shadow, bool, 0644);
+
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-struct kvm_pv_mmu_op_buffer {
- void *ptr;
- unsigned len;
- unsigned processed;
- char buf[512] __aligned(sizeof(long));
-};
+#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
};
+struct kvm_shadow_walk {
+ int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
+ u64 addr, u64 *spte, int level);
+};
+
+struct kvm_unsync_walk {
+ int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
+};
+
+typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
{
struct vm_area_struct *vma;
unsigned long addr;
+ int ret = 0;
addr = gfn_to_hva(kvm, gfn);
if (kvm_is_error_hva(addr))
- return 0;
+ return ret;
+ down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, addr);
if (vma && is_vm_hugetlb_page(vma))
- return 1;
+ ret = 1;
+ up_read(&current->mm->mmap_sem);
- return 0;
+ return ret;
}
static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
if (write_protected)
kvm_flush_remote_tlbs(kvm);
-
- account_shadowed(kvm, gfn);
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -711,6 +721,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
u64 *spte;
int young = 0;
+ /* always return old for EPT */
+ if (!shadow_accessed_mask)
+ return 0;
+
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
int _young;
@@ -855,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
BUG();
}
+
+static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ mmu_parent_walk_fn fn)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ struct kvm_mmu_page *parent_sp;
+ int i;
+
+ if (!sp->multimapped && sp->parent_pte) {
+ parent_sp = page_header(__pa(sp->parent_pte));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
+ fn(vcpu, parent_sp);
+ mmu_parent_walk(vcpu, parent_sp, fn);
+ }
+}
+
+static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+{
+ unsigned int index;
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ index = spte - sp->spt;
+ __set_bit(index, sp->unsync_child_bitmap);
+ sp->unsync_children = 1;
+}
+
+static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
+ kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+ return;
+ }
+
+ hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
+ }
+}
+
+static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ sp->unsync_children = 1;
+ kvm_mmu_update_parents_unsync(sp);
+ return 1;
+}
+
+static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ mmu_parent_walk(vcpu, sp, unsync_walk_fn);
+ kvm_mmu_update_parents_unsync(sp);
+}
+
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -864,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
sp->spt[i] = shadow_trap_nonpresent_pte;
}
+static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp)
+{
+ return 1;
+}
+
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+}
+
+#define for_each_unsync_children(bitmap, idx) \
+ for (idx = find_first_bit(bitmap, 512); \
+ idx < 512; \
+ idx = find_next_bit(bitmap, 512, idx+1))
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_unsync_walk *walker)
+{
+ int i, ret;
+
+ if (!sp->unsync_children)
+ return 0;
+
+ for_each_unsync_children(sp->unsync_child_bitmap, i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+ if (child->unsync_children) {
+ ret = mmu_unsync_walk(child, walker);
+ if (ret)
+ return ret;
+ __clear_bit(i, sp->unsync_child_bitmap);
+ }
+
+ if (child->unsync) {
+ ret = walker->entry(child, walker);
+ __clear_bit(i, sp->unsync_child_bitmap);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
+ sp->unsync_children = 0;
+
+ return 0;
+}
+
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{
unsigned index;
@@ -884,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
+static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ WARN_ON(!sp->unsync);
+ sp->unsync = 0;
+ --kvm->stat.mmu_unsync;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ if (sp->role.glevels != vcpu->arch.mmu.root_level) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ rmap_write_protect(vcpu->kvm, sp->gfn);
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ kvm_mmu_zap_page(vcpu->kvm, sp);
+ return 1;
+ }
+
+ kvm_mmu_flush_tlb(vcpu);
+ kvm_unlink_unsync_page(vcpu->kvm, sp);
+ return 0;
+}
+
+struct sync_walker {
+ struct kvm_vcpu *vcpu;
+ struct kvm_unsync_walk walker;
+};
+
+static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
+ walker);
+ struct kvm_vcpu *vcpu = sync_walk->vcpu;
+
+ kvm_sync_page(vcpu, sp);
+ return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ struct sync_walker walker = {
+ .walker = { .entry = mmu_sync_fn, },
+ .vcpu = vcpu,
+ };
+
+ while (mmu_unsync_walk(sp, &walker.walker))
+ cond_resched_lock(&vcpu->kvm->mmu_lock);
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -897,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant;
struct hlist_head *bucket;
struct kvm_mmu_page *sp;
- struct hlist_node *node;
+ struct hlist_node *node, *tmp;
role.word = 0;
role.glevels = vcpu->arch.mmu.root_level;
@@ -913,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word);
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
- hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && sp->role.word == role.word) {
+ hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
+ if (sp->gfn == gfn) {
+ if (sp->unsync)
+ if (kvm_sync_page(vcpu, sp))
+ continue;
+
+ if (sp->role.word != role.word)
+ continue;
+
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+ if (sp->unsync_children) {
+ set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
pgprintk("%s: found\n", __func__);
return sp;
}
@@ -927,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
- if (!metaphysical)
+ if (!metaphysical) {
rmap_write_protect(vcpu->kvm, gfn);
+ account_shadowed(vcpu->kvm, gfn);
+ }
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
vcpu->arch.mmu.prefetch_page(vcpu, sp);
else
@@ -936,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
return sp;
}
+static int walk_shadow(struct kvm_shadow_walk *walker,
+ struct kvm_vcpu *vcpu, u64 addr)
+{
+ hpa_t shadow_addr;
+ int level;
+ int r;
+ u64 *sptep;
+ unsigned index;
+
+ shadow_addr = vcpu->arch.mmu.root_hpa;
+ level = vcpu->arch.mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr &= PT64_BASE_ADDR_MASK;
+ --level;
+ }
+
+ while (level >= PT_PAGE_TABLE_LEVEL) {
+ index = SHADOW_PT_INDEX(addr, level);
+ sptep = ((u64 *)__va(shadow_addr)) + index;
+ r = walker->entry(walker, vcpu, addr, sptep, level);
+ if (r)
+ return r;
+ shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
+ --level;
+ }
+ return 0;
+}
+
static void kvm_mmu_page_unlink_children(struct kvm *kvm,
struct kvm_mmu_page *sp)
{
@@ -951,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
rmap_remove(kvm, &pt[i]);
pt[i] = shadow_trap_nonpresent_pte;
}
- kvm_flush_remote_tlbs(kvm);
return;
}
@@ -970,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
}
pt[i] = shadow_trap_nonpresent_pte;
}
- kvm_flush_remote_tlbs(kvm);
}
static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -987,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
kvm->vcpus[i]->arch.last_pte_updated = NULL;
}
-static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
- ++kvm->stat.mmu_shadow_zapped;
while (sp->multimapped || sp->parent_pte) {
if (!sp->multimapped)
parent_pte = sp->parent_pte;
@@ -1006,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_mmu_put_page(sp, parent_pte);
set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
}
+}
+
+struct zap_walker {
+ struct kvm_unsync_walk walker;
+ struct kvm *kvm;
+ int zapped;
+};
+
+static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+{
+ struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
+ walker);
+ kvm_mmu_zap_page(zap_walk->kvm, sp);
+ zap_walk->zapped = 1;
+ return 0;
+}
+
+static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ struct zap_walker walker = {
+ .walker = { .entry = mmu_zap_fn, },
+ .kvm = kvm,
+ .zapped = 0,
+ };
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ return 0;
+ mmu_unsync_walk(sp, &walker.walker);
+ return walker.zapped;
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ int ret;
+ ++kvm->stat.mmu_shadow_zapped;
+ ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp);
+ kvm_mmu_unlink_parents(kvm, sp);
+ kvm_flush_remote_tlbs(kvm);
+ if (!sp->role.invalid && !sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
+ if (sp->unsync)
+ kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
- if (!sp->role.metaphysical && !sp->role.invalid)
- unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
} else {
- int invalid = sp->role.invalid;
- list_move(&sp->link, &kvm->arch.active_mmu_pages);
sp->role.invalid = 1;
+ list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
- if (!sp->role.metaphysical && !invalid)
- unaccount_shadowed(kvm, sp->gfn);
}
kvm_mmu_reset_last_pte_updated(kvm);
+ return ret;
}
/*
@@ -1073,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
if (sp->gfn == gfn && !sp->role.metaphysical) {
pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
r = 1;
+ if (kvm_mmu_zap_page(kvm, sp))
+ n = bucket->first;
}
return r;
}
@@ -1097,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
__set_bit(slot, &sp->slot_bitmap);
}
+static void mmu_convert_notrap(struct kvm_mmu_page *sp)
+{
+ int i;
+ u64 *pt = sp->spt;
+
+ if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
+ return;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (pt[i] == shadow_notrap_nonpresent_pte)
+ set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
+ }
+}
+
struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
{
struct page *page;
@@ -1106,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
if (gpa == UNMAPPED_GVA)
return NULL;
- down_read(&current->mm->mmap_sem);
page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
- up_read(&current->mm->mmap_sem);
return page;
}
-static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
- unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- u64 spte;
- int was_rmapped = 0;
- int was_writeble = is_writeble_pte(*shadow_pte);
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *s;
+ struct hlist_node *node, *n;
- pgprintk("%s: spte %llx access %x write_fault %d"
- " user_fault %d gfn %lx\n",
- __func__, *shadow_pte, pt_access,
- write_fault, user_fault, gfn);
+ index = kvm_page_table_hashfn(sp->gfn);
+ bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+ /* don't unsync if pagetable is shadowed with multiple roles */
+ hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+ if (s->gfn != sp->gfn || s->role.metaphysical)
+ continue;
+ if (s->role.word != sp->role.word)
+ return 1;
+ }
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ ++vcpu->kvm->stat.mmu_unsync;
+ sp->unsync = 1;
+ mmu_convert_notrap(sp);
+ return 0;
+}
- if (is_rmap_pte(*shadow_pte)) {
- /*
- * If we overwrite a PTE page pointer with a 2MB PMD, unlink
- * the parent of the now unreachable PTE.
- */
- if (largepage && !is_large_pte(*shadow_pte)) {
- struct kvm_mmu_page *child;
- u64 pte = *shadow_pte;
+static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool can_unsync)
+{
+ struct kvm_mmu_page *shadow;
- child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, shadow_pte);
- } else if (pfn != spte_to_pfn(*shadow_pte)) {
- pgprintk("hfn old %lx new %lx\n",
- spte_to_pfn(*shadow_pte), pfn);
- rmap_remove(vcpu->kvm, shadow_pte);
- } else {
- if (largepage)
- was_rmapped = is_large_pte(*shadow_pte);
- else
- was_rmapped = 1;
- }
+ shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
+ if (shadow) {
+ if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+ return 1;
+ if (shadow->unsync)
+ return 0;
+ if (can_unsync && oos_shadow)
+ return kvm_unsync_page(vcpu, shadow);
+ return 1;
}
+ return 0;
+}
+static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pte_access, int user_fault,
+ int write_fault, int dirty, int largepage,
+ gfn_t gfn, pfn_t pfn, bool speculative,
+ bool can_unsync)
+{
+ u64 spte;
+ int ret = 0;
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -1158,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
*/
spte = shadow_base_present_pte | shadow_dirty_mask;
if (!speculative)
- pte_access |= PT_ACCESSED_MASK;
+ spte |= shadow_accessed_mask;
if (!dirty)
pte_access &= ~ACC_WRITE_MASK;
if (pte_access & ACC_EXEC_MASK)
@@ -1174,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
- struct kvm_mmu_page *shadow;
+
+ if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
+ ret = 1;
+ spte = shadow_trap_nonpresent_pte;
+ goto set_pte;
+ }
spte |= PT_WRITABLE_MASK;
- shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow ||
- (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
+ if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
+ ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writeble_pte(spte)) {
+ if (is_writeble_pte(spte))
spte &= ~PT_WRITABLE_MASK;
- kvm_x86_ops->tlb_flush(vcpu);
- }
- if (write_fault)
- *ptwrite = 1;
}
}
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __func__, spte);
- pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
- (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
- (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
+set_pte:
set_shadow_pte(shadow_pte, spte);
- if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
- && (spte & PT_PRESENT_MASK))
+ return ret;
+}
+
+static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
+ unsigned pt_access, unsigned pte_access,
+ int user_fault, int write_fault, int dirty,
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
+{
+ int was_rmapped = 0;
+ int was_writeble = is_writeble_pte(*shadow_pte);
+
+ pgprintk("%s: spte %llx access %x write_fault %d"
+ " user_fault %d gfn %lx\n",
+ __func__, *shadow_pte, pt_access,
+ write_fault, user_fault, gfn);
+
+ if (is_rmap_pte(*shadow_pte)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ } else if (pfn != spte_to_pfn(*shadow_pte)) {
+ pgprintk("hfn old %lx new %lx\n",
+ spte_to_pfn(*shadow_pte), pfn);
+ rmap_remove(vcpu->kvm, shadow_pte);
+ } else {
+ if (largepage)
+ was_rmapped = is_large_pte(*shadow_pte);
+ else
+ was_rmapped = 1;
+ }
+ }
+ if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
+ dirty, largepage, gfn, pfn, speculative, true)) {
+ if (write_fault)
+ *ptwrite = 1;
+ kvm_x86_ops->tlb_flush(vcpu);
+ }
+
+ pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
+ pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
+ is_large_pte(*shadow_pte)? "2MB" : "4kB",
+ is_present_pte(*shadow_pte)?"RW":"R", gfn,
+ *shadow_pte, shadow_pte);
+ if (!was_rmapped && is_large_pte(*shadow_pte))
++vcpu->kvm->stat.lpages;
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1226,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int largepage, gfn_t gfn, pfn_t pfn,
- int level)
-{
- hpa_t table_addr = vcpu->arch.mmu.root_hpa;
- int pt_write = 0;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(v, level);
- u64 *table;
-
- ASSERT(VALID_PAGE(table_addr));
- table = __va(table_addr);
+struct direct_shadow_walk {
+ struct kvm_shadow_walk walker;
+ pfn_t pfn;
+ int write;
+ int largepage;
+ int pt_write;
+};
- if (level == 1) {
- mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, 0, gfn, pfn, false);
- return pt_write;
- }
+static int direct_map_entry(struct kvm_shadow_walk *_walk,
+ struct kvm_vcpu *vcpu,
+ u64 addr, u64 *sptep, int level)
+{
+ struct direct_shadow_walk *walk =
+ container_of(_walk, struct direct_shadow_walk, walker);
+ struct kvm_mmu_page *sp;
+ gfn_t pseudo_gfn;
+ gfn_t gfn = addr >> PAGE_SHIFT;
+
+ if (level == PT_PAGE_TABLE_LEVEL
+ || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
+ mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
+ 0, walk->write, 1, &walk->pt_write,
+ walk->largepage, gfn, walk->pfn, false);
+ ++vcpu->stat.pf_fixed;
+ return 1;
+ }
- if (largepage && level == 2) {
- mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, 1, gfn, pfn, false);
- return pt_write;
+ if (*sptep == shadow_trap_nonpresent_pte) {
+ pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
+ 1, ACC_ALL, sptep);
+ if (!sp) {
+ pgprintk("nonpaging_map: ENOMEM\n");
+ kvm_release_pfn_clean(walk->pfn);
+ return -ENOMEM;
}
- if (table[index] == shadow_trap_nonpresent_pte) {
- struct kvm_mmu_page *new_table;
- gfn_t pseudo_gfn;
-
- pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
- v, level - 1,
- 1, ACC_ALL, &table[index]);
- if (!new_table) {
- pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_pfn_clean(pfn);
- return -ENOMEM;
- }
-
- set_shadow_pte(&table[index],
- __pa(new_table->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask);
- }
- table_addr = table[index] & PT64_BASE_ADDR_MASK;
+ set_shadow_pte(sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask);
}
+ return 0;
+}
+
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, pfn_t pfn)
+{
+ int r;
+ struct direct_shadow_walk walker = {
+ .walker = { .entry = direct_map_entry, },
+ .pfn = pfn,
+ .largepage = largepage,
+ .write = write,
+ .pt_write = 0,
+ };
+
+ r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
+ if (r < 0)
+ return r;
+ return walker.pt_write;
}
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1283,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
pfn_t pfn;
unsigned long mmu_seq;
- down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
/* mmio */
if (is_error_pfn(pfn)) {
@@ -1304,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
- PT32E_ROOT_LEVEL);
+ r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -1401,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
}
+static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ mmu_sync_children(vcpu, sp);
+ }
+ }
+}
+
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_roots(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -1442,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- down_read(&current->mm->mmap_sem);
if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
largepage = 1;
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return 1;
@@ -1460,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- largepage, gfn, pfn, kvm_x86_ops->get_tdp_level());
+ largepage, gfn, pfn);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -1485,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1532,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
context->prefetch_page = paging64_prefetch_page;
+ context->sync_page = paging64_sync_page;
+ context->invlpg = paging64_invlpg;
context->free = paging_free;
context->root_level = level;
context->shadow_root_level = level;
@@ -1553,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->prefetch_page = paging32_prefetch_page;
+ context->sync_page = paging32_sync_page;
+ context->invlpg = paging32_invlpg;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE;
@@ -1572,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
context->prefetch_page = nonpaging_prefetch_page;
+ context->sync_page = nonpaging_sync_page;
+ context->invlpg = nonpaging_invlpg;
context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE;
@@ -1643,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
mmu_alloc_roots(vcpu);
+ mmu_sync_roots(vcpu);
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
kvm_mmu_flush_tlb(vcpu);
@@ -1763,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
return;
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- down_read(&current->mm->mmap_sem);
if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
gfn &= ~(KVM_PAGES_PER_HPAGE-1);
vcpu->arch.update_pte.largepage = 1;
}
vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
- /* implicit mb(), we'll read before PT lock is unlocked */
+ smp_rmb();
pfn = gfn_to_pfn(vcpu->kvm, gfn);
- up_read(&current->mm->mmap_sem);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
@@ -1833,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
- if (sp->gfn != gfn || sp->role.metaphysical)
+ if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
continue;
pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1851,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, sp->role.word);
- kvm_mmu_zap_page(vcpu->kvm, sp);
+ if (kvm_mmu_zap_page(vcpu->kvm, sp))
+ n = bucket->first;
++vcpu->kvm->stat.mmu_flooded;
continue;
}
@@ -1965,6 +2350,16 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ vcpu->arch.mmu.invlpg(vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_flush_tlb(vcpu);
+ ++vcpu->stat.invlpg;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
+
void kvm_enable_tdp(void)
{
tdp_enabled = true;
@@ -2051,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ spin_lock(&kvm->mmu_lock);
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
u64 *pt;
@@ -2064,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (pt[i] & PT_WRITABLE_MASK)
pt[i] &= ~PT_WRITABLE_MASK;
}
+ kvm_flush_remote_tlbs(kvm);
+ spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2072,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
spin_lock(&kvm->mmu_lock);
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ node = container_of(kvm->arch.active_mmu_pages.next,
+ struct kvm_mmu_page, link);
spin_unlock(&kvm->mmu_lock);
kvm_flush_remote_tlbs(kvm);
@@ -2287,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
gpa_t addr, unsigned long *ret)
{
int r;
- struct kvm_pv_mmu_op_buffer buffer;
+ struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
- buffer.ptr = buffer.buf;
- buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
- buffer.processed = 0;
+ buffer->ptr = buffer->buf;
+ buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
+ buffer->processed = 0;
- r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+ r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
if (r)
goto out;
- while (buffer.len) {
- r = kvm_pv_mmu_op_one(vcpu, &buffer);
+ while (buffer->len) {
+ r = kvm_pv_mmu_op_one(vcpu, buffer);
if (r < 0)
goto out;
if (r == 0)
@@ -2307,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
r = 1;
out:
- *ret = buffer.processed;
+ *ret = buffer->processed;
return r;
}