From 697d3899dcb4bcd918d060a92db57b794e56b077 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:36:37 +0000 Subject: KVM: PPC: Implement MMIO emulation support for Book3S HV guests This provides the low-level support for MMIO emulation in Book3S HV guests. When the guest tries to map a page which is not covered by any memslot, that page is taken to be an MMIO emulation page. Instead of inserting a valid HPTE, we insert an HPTE that has the valid bit clear but another hypervisor software-use bit set, which we call HPTE_V_ABSENT, to indicate that this is an absent page. An absent page is treated much like a valid page as far as guest hcalls (H_ENTER, H_REMOVE, H_READ etc.) are concerned, except of course that an absent HPTE doesn't need to be invalidated with tlbie since it was never valid as far as the hardware is concerned. When the guest accesses a page for which there is an absent HPTE, it will take a hypervisor data storage interrupt (HDSI) since we now set the VPM1 bit in the LPCR. Our HDSI handler for HPTE-not-present faults looks up the hash table and if it finds an absent HPTE mapping the requested virtual address, will switch to kernel mode and handle the fault in kvmppc_book3s_hv_page_fault(), which at present just calls kvmppc_hv_emulate_mmio() to set up the MMIO emulation. This is based on an earlier patch by Benjamin Herrenschmidt, but since heavily reworked. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 127 ++++++++++++++++++++++++++------ 1 file changed, 105 insertions(+), 22 deletions(-) (limited to 'arch/powerpc/kvm/book3s_hv_rmhandlers.S') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5c8b26183f5..d07b64d5f37 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -601,6 +601,28 @@ kvmppc_interrupt: stw r12,VCPU_TRAP(r9) + /* Save HEIR (HV emulation assist reg) in last_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,KVM_INST_FETCH_FAILED +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11: stw r3,VCPU_LAST_INST(r9) + + /* these are volatile across C function calls */ + mfctr r3 + mfxer r4 + std r3, VCPU_CTR(r9) + stw r4, VCPU_XER(r9) + +BEGIN_FTR_SECTION + /* If this is a page table miss then see if it's theirs or ours */ + cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + beq kvmppc_hdsi +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -608,7 +630,7 @@ kvmppc_interrupt: cmpwi r3,0 bge ignore_hdec 2: - /* See if this is something we can handle in real mode */ + /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode @@ -624,6 +646,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +nohpte_cont: hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC @@ -632,36 +655,21 @@ hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ add r5,r5,r6 std r5,VCPU_DEC_EXPIRES(r9) - /* Save HEIR (HV emulation assist reg) in last_inst - if this is an HEI (HV emulation interrupt, e40) */ - li r3,-1 -BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST - bne 11f - mfspr r3,SPRN_HEIR -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -11: stw r3,VCPU_LAST_INST(r9) - /* Save more register state */ - mfxer r5 mfdar r6 mfdsisr r7 - mfctr r8 - - stw r5, VCPU_XER(r9) std r6, VCPU_DAR(r9) stw r7, VCPU_DSISR(r9) - std r8, VCPU_CTR(r9) - /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ BEGIN_FTR_SECTION + /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE beq 6f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -7: std r6, VCPU_FAULT_DAR(r9) + std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF +6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1094,9 +1102,84 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_HSRR1, r7 ba 0x500 -6: mfspr r6,SPRN_HDAR - mfspr r7,SPRN_HDSISR - b 7b +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path. In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: + mfspr r4, SPRN_HDAR + mfspr r6, SPRN_HDSISR + /* HPTE not found fault? */ + andis. r0, r6, DSISR_NOHPTE@h + beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f + clrrdi r0, r4, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) + + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + cmpdi r3, -2 /* MMIO emulation; need instr word */ + beq 2f + + /* Synthesize a DSI for the guest */ + ld r4, VCPU_FAULT_DAR(r9) + mr r6, r3 +1: mtspr SPRN_DAR, r4 + mtspr SPRN_DSISR, r6 + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_DATA_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r5) + b 4b + + /* If this is for emulated MMIO, load the instruction word */ +2: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r0, KVM_GUEST_MODE_SKIP + stb r0, HSTATE_IN_GUEST(r13) + + /* Do the access with MSR:DR enabled */ + mfmsr r3 + ori r4, r3, MSR_DR /* Enable paging for data */ + mtmsrd r4 + lwz r8, 0(r10) + mtmsrd r3 + + /* Store the result */ + stw r8, VCPU_LAST_INST(r9) + + /* Unset guest mode. */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + b nohpte_cont /* * Try to handle an hcall in real mode. -- cgit v1.2.3-70-g09d2 From 342d3db763f2621ed4546ebf8f6c61cb29d7fbdb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:38:05 +0000 Subject: KVM: PPC: Implement MMU notifiers for Book3S HV guests This adds the infrastructure to enable us to page out pages underneath a Book3S HV guest, on processors that support virtualized partition memory, that is, POWER7. Instead of pinning all the guest's pages, we now look in the host userspace Linux page tables to find the mapping for a given guest page. Then, if the userspace Linux PTE gets invalidated, kvm_unmap_hva() gets called for that address, and we replace all the guest HPTEs that refer to that page with absent HPTEs, i.e. ones with the valid bit clear and the HPTE_V_ABSENT bit set, which will cause an HDSI when the guest tries to access them. Finally, the page fault handler is extended to reinstantiate the guest HPTE when the guest tries to access a page which has been paged out. Since we can't intercept the guest DSI and ISI interrupts on PPC970, we still have to pin all the guest pages on PPC970. We have a new flag, kvm->arch.using_mmu_notifiers, that indicates whether we can page guest pages out. If it is not set, the MMU notifier callbacks do nothing and everything operates as before. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s.h | 4 + arch/powerpc/include/asm/kvm_book3s_64.h | 31 ++++ arch/powerpc/include/asm/kvm_host.h | 16 ++ arch/powerpc/include/asm/reg.h | 3 + arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/kvm/book3s_64_mmu_hv.c | 290 ++++++++++++++++++++++++++++--- arch/powerpc/kvm/book3s_hv.c | 25 +-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 140 +++++++++++---- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 49 ++++++ arch/powerpc/kvm/powerpc.c | 3 + arch/powerpc/mm/hugetlbpage.c | 2 + 11 files changed, 499 insertions(+), 65 deletions(-) (limited to 'arch/powerpc/kvm/book3s_hv_rmhandlers.S') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 3a9e51f4339..9240cebf8ba 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -143,6 +143,10 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode); +extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index); extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, unsigned long *nb_ret); extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 79dc37fb86b..c21e46da4a3 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -136,6 +136,37 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; } +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. + */ +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) +{ + pte_t pte, tmp; + + /* wait until _PAGE_BUSY is clear then set it atomically */ + __asm__ __volatile__ ( + "1: ldarx %0,0,%3\n" + " andi. %1,%0,%4\n" + " bne- 1b\n" + " ori %1,%0,%4\n" + " stdcx. %1,0,%3\n" + " bne- 1b" + : "=&r" (pte), "=&r" (tmp), "=m" (*p) + : "r" (p), "i" (_PAGE_BUSY) + : "cc"); + + if (pte_present(pte)) { + pte = pte_mkyoung(pte); + if (pte_write(pte)) + pte = pte_mkdirty(pte); + } + + *p = pte; /* clears _PAGE_BUSY */ + + return pte; +} + /* Return HPTE cache control bits corresponding to Linux pte bits */ static inline unsigned long hpte_cache_bits(unsigned long pte_val) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 937cacaaf23..968f3aa61cd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -32,6 +32,7 @@ #include #include #include +#include #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS @@ -44,6 +45,19 @@ #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif +#ifdef CONFIG_KVM_BOOK3S_64_HV +#include + +#define KVM_ARCH_WANT_MMU_NOTIFIER + +struct kvm; +extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + +#endif + /* We don't currently support large pages. */ #define KVM_HPAGE_GFN_SHIFT(x) 0 #define KVM_NR_PAGE_SIZES 1 @@ -212,6 +226,7 @@ struct kvm_arch { struct kvmppc_rma_info *rma; unsigned long vrma_slb_v; int rma_setup_done; + int using_mmu_notifiers; struct list_head spapr_tce_tables; spinlock_t slot_phys_lock; unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; @@ -460,6 +475,7 @@ struct kvm_vcpu_arch { struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; + pgd_t *pgdir; #endif }; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 16efb3151c2..35c9309bf03 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -495,6 +495,9 @@ #define SPRN_SPRG7 0x117 /* Special Purpose Register General 7 */ #define SPRN_SRR0 0x01A /* Save/Restore Register 0 */ #define SPRN_SRR1 0x01B /* Save/Restore Register 1 */ +#define SRR1_ISI_NOPT 0x40000000 /* ISI: Not found in hash */ +#define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */ +#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ #define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ #define SRR1_WAKESYSERR 0x00300000 /* System error */ #define SRR1_WAKEEE 0x00200000 /* External interrupt */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 78133deb4b6..8f64709ae33 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -69,6 +69,7 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + select MMU_NOTIFIER ---help--- Support running unmodified book3s_64 guest kernels in virtual machines on POWER7 and PPC970 processors that have diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2d31519b863..83761dd8a92 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -281,8 +281,9 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, } /* - * We come here on a H_ENTER call from the guest when - * we don't have the requested page pinned already. + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. */ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) @@ -292,6 +293,9 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm_memory_slot *memslot; long ret; + if (kvm->arch.using_mmu_notifiers) + goto do_insert; + psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; @@ -309,9 +313,12 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, return H_PARAMETER; } - preempt_disable(); + do_insert: + /* Protect linux PTE lookup from page table destruction */ + rcu_read_lock_sched(); /* this disables preemption too */ + vcpu->arch.pgdir = current->mm->pgd; ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); - preempt_enable(); + rcu_read_unlock_sched(); if (ret == H_TOO_HARD) { /* this can't happen */ pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); @@ -487,12 +494,16 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long *hptep, hpte[3]; - unsigned long psize; - unsigned long gfn; + unsigned long *hptep, hpte[3], r; + unsigned long mmu_seq, psize, pte_size; + unsigned long gfn, hva, pfn; struct kvm_memory_slot *memslot; + unsigned long *rmap; struct revmap_entry *rev; - long index; + struct page *page, *pages[1]; + long index, ret, npages; + unsigned long is_io; + struct vm_area_struct *vma; /* * Real-mode code has already searched the HPT and found the @@ -510,7 +521,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, cpu_relax(); hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; hpte[1] = hptep[1]; - hpte[2] = rev->guest_rpte; + hpte[2] = r = rev->guest_rpte; asm volatile("lwsync" : : : "memory"); hptep[0] = hpte[0]; preempt_enable(); @@ -520,8 +531,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, return RESUME_GUEST; /* Translate the logical address and get the page */ - psize = hpte_page_size(hpte[0], hpte[1]); - gfn = hpte_rpn(hpte[2], psize); + psize = hpte_page_size(hpte[0], r); + gfn = hpte_rpn(r, psize); memslot = gfn_to_memslot(kvm, gfn); /* No memslot means it's an emulated MMIO region */ @@ -531,8 +542,228 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr & DSISR_ISSTORE); } - /* should never get here otherwise */ - return -EFAULT; + if (!kvm->arch.using_mmu_notifiers) + return -EFAULT; /* should never get here */ + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + is_io = 0; + pfn = 0; + page = NULL; + pte_size = PAGE_SIZE; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pte_size = psize; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + } + pfn = page_to_pfn(page); + } + + ret = -EFAULT; + if (psize > pte_size) + goto out_put; + + /* Check WIMG vs. the actual page we're accessing */ + if (!hpte_cache_flags_ok(r, is_io)) { + if (is_io) + return -EFAULT; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; + } + + /* Set the HPTE to point to pfn */ + r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + ret = RESUME_GUEST; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || + rev->guest_rpte != hpte[2]) + /* HPTE has been changed under us; let the guest retry */ + goto out_unlock; + hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + + rmap = &memslot->rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + + /* Check if we might have been invalidated; let the guest retry if so */ + ret = RESUME_GUEST; + if (mmu_notifier_retry(vcpu, mmu_seq)) { + unlock_rmap(rmap); + goto out_unlock; + } + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + + hptep[1] = r; + eieio(); + hptep[0] = hpte[0]; + asm volatile("ptesync" : : : "memory"); + preempt_enable(); + if (page) + SetPageDirty(page); + + out_put: + if (page) + put_page(page); + return ret; + + out_unlock: + hptep[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + goto out_put; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + int ret; + int retval = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + + ret = handler(kvm, &memslot->rmap[gfn_offset], + memslot->base_gfn + gfn_offset); + retval |= ret; + } + } + + return retval; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long h, i, j; + unsigned long *hptep; + unsigned long ptel, psize; + + for (;;) { + while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) + cpu_relax(); + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); + break; + } + + /* + * To avoid an ABBA deadlock with the HPTE lock bit, + * we have to unlock the rmap chain before locking the HPTE. + * Thus we remove the first entry, unlock the rmap chain, + * lock the HPTE and then check that it is for the + * page we're unmapping before changing it to non-present. + */ + i = *rmapp & KVMPPC_RMAP_INDEX; + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + j = 0; + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + j |= KVMPPC_RMAP_PRESENT; + } + smp_wmb(); + *rmapp = j | (1ul << KVMPPC_RMAP_REF_BIT); + + /* Now lock, check and modify the HPTE */ + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + ptel = rev[i].guest_rpte; + psize = hpte_page_size(hptep[0], ptel); + if ((hptep[0] & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[0] |= HPTE_V_ABSENT; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } + return 0; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return 0; +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + if (!(*rmapp & KVMPPC_RMAP_REFERENCED)) + return 0; + kvm_unmap_rmapp(kvm, rmapp, gfn); + while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmapp)) + cpu_relax(); + __clear_bit(KVMPPC_RMAP_REF_BIT, rmapp); + __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmapp); + return 1; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + return !!(*rmapp & KVMPPC_RMAP_REFERENCED); +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + if (!kvm->arch.using_mmu_notifiers) + return; + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); } void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, @@ -540,31 +771,42 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, { struct kvm_memory_slot *memslot; unsigned long gfn = gpa >> PAGE_SHIFT; - struct page *page; - unsigned long psize, offset; + struct page *page, *pages[1]; + int npages; + unsigned long hva, psize, offset; unsigned long pa; unsigned long *physp; memslot = gfn_to_memslot(kvm, gfn); if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) return NULL; - physp = kvm->arch.slot_phys[memslot->id]; - if (!physp) - return NULL; - physp += gfn - memslot->base_gfn; - pa = *physp; - if (!pa) { - if (kvmppc_get_guest_page(kvm, gfn, memslot, PAGE_SIZE) < 0) + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) return NULL; + physp += gfn - memslot->base_gfn; pa = *physp; + if (!pa) { + if (kvmppc_get_guest_page(kvm, gfn, memslot, + PAGE_SIZE) < 0) + return NULL; + pa = *physp; + } + page = pfn_to_page(pa >> PAGE_SHIFT); + } else { + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) + return NULL; + page = pages[0]; } - page = pfn_to_page(pa >> PAGE_SHIFT); psize = PAGE_SIZE; if (PageHuge(page)) { page = compound_head(page); psize <<= compound_order(page); } - get_page(page); + if (!kvm->arch.using_mmu_notifiers) + get_page(page); offset = gpa & (psize - 1); if (nb_ret) *nb_ret = psize - offset; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 45aabb9a527..86c4191cb75 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -326,19 +326,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } /* - * We get this if the guest accesses a page which it thinks - * it has mapped but which is not actually present, because - * it is for an emulated I/O device. - * Any other HDSI interrupt has been handled already. + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. Any other HDSI/HISI interrupts + * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: r = kvmppc_book3s_hv_page_fault(run, vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, - vcpu->arch.shregs.msr & 0x58000000); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + kvmppc_get_pc(vcpu), 0); break; /* * This occurs if the guest executes an illegal instruction. @@ -867,6 +867,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_altivec_to_thread(current); flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.pgdir = current->mm->pgd; do { r = kvmppc_run_vcpu(run, vcpu); @@ -1090,9 +1091,9 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, unsigned long *phys; /* Allocate a slot_phys array */ - npages = mem->memory_size >> PAGE_SHIFT; phys = kvm->arch.slot_phys[mem->slot]; - if (!phys) { + if (!kvm->arch.using_mmu_notifiers && !phys) { + npages = mem->memory_size >> PAGE_SHIFT; phys = vzalloc(npages * sizeof(unsigned long)); if (!phys) return -ENOMEM; @@ -1298,6 +1299,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) } kvm->arch.lpcr = lpcr; + kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); spin_lock_init(&kvm->arch.slot_phys_lock); return 0; } @@ -1306,8 +1308,9 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) { unsigned long i; - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - unpin_slot(kvm, i); + if (!kvm->arch.using_mmu_notifiers) + for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) + unpin_slot(kvm, i); if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index a5176dc37e7..81d16ed9767 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -58,7 +58,7 @@ static void *real_vmalloc_addr(void *x) * Add this HPTE into the chain for the real page. * Must be called with the chain locked; it unlocks the chain. */ -static void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode) { struct revmap_entry *head, *tail; @@ -83,6 +83,7 @@ static void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, smp_wmb(); *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ } +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, @@ -118,12 +119,33 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, unlock_rmap(rmap); } +static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, + unsigned long *pte_sizep) +{ + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + if (ps > *pte_sizep) + return __pte(0); + if (!pte_present(*ptep)) + return __pte(0); + return kvmppc_read_update_linux_pte(ptep); +} + long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { struct kvm *kvm = vcpu->kvm; unsigned long i, pa, gpa, gfn, psize; - unsigned long slot_fn; + unsigned long slot_fn, hva; unsigned long *hpte; struct revmap_entry *rev; unsigned long g_ptel = ptel; @@ -131,6 +153,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *physp, pte_size; unsigned long is_io; unsigned long *rmap; + pte_t pte; + unsigned long mmu_seq; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); @@ -138,11 +162,16 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, return H_PARAMETER; pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + /* used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + /* Find the memslot (if any) for this address */ gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); gfn = gpa >> PAGE_SHIFT; memslot = builtin_gfn_to_memslot(kvm, gfn); pa = 0; + is_io = ~0ul; rmap = NULL; if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { /* PPC970 can't do emulated MMIO */ @@ -160,19 +189,31 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, slot_fn = gfn - memslot->base_gfn; rmap = &memslot->rmap[slot_fn]; - physp = kvm->arch.slot_phys[memslot->id]; - if (!physp) - return H_PARAMETER; - physp += slot_fn; - if (realmode) - physp = real_vmalloc_addr(physp); - pa = *physp; - if (!pa) - return H_TOO_HARD; - is_io = pa & (HPTE_R_I | HPTE_R_W); - pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); - pa &= PAGE_MASK; - + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return H_PARAMETER; + physp += slot_fn; + if (realmode) + physp = real_vmalloc_addr(physp); + pa = *physp; + if (!pa) + return H_TOO_HARD; + is_io = pa & (HPTE_R_I | HPTE_R_W); + pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); + pa &= PAGE_MASK; + } else { + /* Translate to host virtual address */ + hva = gfn_to_hva_memslot(memslot, gfn); + + /* Look up the Linux PTE for the backing page */ + pte_size = psize; + pte = lookup_linux_pte(vcpu, hva, &pte_size); + if (pte_present(pte)) { + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + } + } if (pte_size < psize) return H_PARAMETER; if (pa && pte_size > psize) @@ -180,10 +221,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; - pteh |= HPTE_V_VALID; + + if (pa) + pteh |= HPTE_V_VALID; + else + pteh |= HPTE_V_ABSENT; /* Check WIMG */ - if (!hpte_cache_flags_ok(ptel, is_io)) { + if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { if (is_io) return H_PARAMETER; /* @@ -194,6 +239,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, ptel |= HPTE_R_M; } + /* Find and lock the HPTEG slot to use */ do_insert: if (pte_index >= HPT_NPTE) return H_PARAMETER; @@ -253,7 +299,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, if (realmode) rmap = real_vmalloc_addr(rmap); lock_rmap(rmap); - kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, realmode); + /* Check for pending invalidations under the rmap chain lock */ + if (kvm->arch.using_mmu_notifiers && + mmu_notifier_retry(vcpu, mmu_seq)) { + /* inval in progress, write a non-present HPTE */ + pteh |= HPTE_V_ABSENT; + pteh &= ~HPTE_V_VALID; + unlock_rmap(rmap); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, + realmode); + } } hpte[1] = ptel; @@ -516,6 +572,23 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } +void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + + hptep[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + static int slb_base_page_shift[4] = { 24, /* 16M */ 16, /* 64k */ @@ -605,15 +678,15 @@ EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); /* * Called in real mode to check whether an HPTE not found fault - * is due to accessing an emulated MMIO page. + * is due to accessing a paged-out page or an emulated MMIO page. * Returns a possibly modified status (DSISR) value if not * (i.e. pass the interrupt to the guest), * -1 to pass the fault up to host kernel mode code, -2 to do that - * and also load the instruction word, + * and also load the instruction word (for MMIO emulation), * or 0 if we should make the guest retry the access. */ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, - unsigned long slb_v, unsigned int status) + unsigned long slb_v, unsigned int status, bool data) { struct kvm *kvm = vcpu->kvm; long int index; @@ -624,6 +697,7 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, unsigned long pp, key; valid = HPTE_V_VALID | HPTE_V_ABSENT; + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); if (index < 0) return status; /* there really was no HPTE */ @@ -645,22 +719,28 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, /* Check access permissions to the page */ pp = gr & (HPTE_R_PP0 | HPTE_R_PP); key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; - if (status & DSISR_ISSTORE) { + status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */ + if (!data) { + if (gr & (HPTE_R_N | HPTE_R_G)) + return status | SRR1_ISI_N_OR_G; + if (!hpte_read_permission(pp, slb_v & key)) + return status | SRR1_ISI_PROT; + } else if (status & DSISR_ISSTORE) { /* check write permission */ if (!hpte_write_permission(pp, slb_v & key)) - goto protfault; + return status | DSISR_PROTFAULT; } else { if (!hpte_read_permission(pp, slb_v & key)) - goto protfault; + return status | DSISR_PROTFAULT; } /* Check storage key, if applicable */ - if (vcpu->arch.shregs.msr & MSR_DR) { + if (data && (vcpu->arch.shregs.msr & MSR_DR)) { unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); if (status & DSISR_ISSTORE) perm >>= 1; if (perm & 1) - return (status & ~DSISR_NOHPTE) | DSISR_KEYFAULT; + return status | DSISR_KEYFAULT; } /* Save HPTE info for virtual-mode handler */ @@ -669,11 +749,11 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, vcpu->arch.pgfault_hpte[0] = v; vcpu->arch.pgfault_hpte[1] = r; - if (vcpu->arch.shregs.msr & MSR_IR) + /* Check the storage key to see if it is possibly emulated MMIO */ + if (data && (vcpu->arch.shregs.msr & MSR_IR) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) return -2; /* MMIO emulation - load instr word */ return -1; /* send fault up to host kernel mode */ - - protfault: - return (status & ~DSISR_NOHPTE) | DSISR_PROTFAULT; } diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index d07b64d5f37..7d4990665d0 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -621,6 +621,8 @@ BEGIN_FTR_SECTION /* If this is a page table miss then see if it's theirs or ours */ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE beq kvmppc_hdsi + cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE + beq kvmppc_hisi END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* See if this is a leftover HDEC interrupt */ @@ -1125,6 +1127,7 @@ kvmppc_hdsi: /* Search the hash table. */ mr r3, r9 /* vcpu pointer */ + li r7, 1 /* data fault */ bl .kvmppc_hpte_hv_fault ld r9, HSTATE_KVM_VCPU(r13) ld r10, VCPU_PC(r9) @@ -1181,6 +1184,52 @@ kvmppc_hdsi: stb r0, HSTATE_IN_GUEST(r13) b nohpte_cont +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: + andis. r0, r11, SRR1_ISI_NOPT@h + beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f + clrrdi r0, r10, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + mr r4, r10 + mr r6, r11 + li r7, 0 /* instruction fault */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_INST_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + + /* Synthesize an ISI for the guest */ + mr r11, r3 +1: mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_INST_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r6) + b 4b + /* * Try to handle an hcall in real mode. * Returns to the guest if we handle it, or continues on up to diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4f85ac32258..06e955b5837 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -245,6 +245,9 @@ int kvm_dev_ioctl_check_extension(long ext) if (cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; + case KVM_CAP_SYNC_MMU: + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + break; #endif default: r = 0; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8b3cc7d90f..f348c3d9040 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -103,6 +104,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift *shift = hugepd_shift(*hpdp); return hugepte_offset(hpdp, ea, pdshift); } +EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { -- cgit v1.2.3-70-g09d2 From 4cf302bc106566c5bad523337296ea8b72df63f5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Dec 2011 12:38:51 +0000 Subject: KVM: PPC: Allow for read-only pages backing a Book3S HV guest With this, if a guest does an H_ENTER with a read/write HPTE on a page which is currently read-only, we make the actual HPTE inserted be a read-only version of the HPTE. We now intercept protection faults as well as HPTE not found faults, and for a protection fault we work out whether it should be reflected to the guest (e.g. because the guest HPTE didn't allow write access to usermode) or handled by switching to kernel context and calling kvmppc_book3s_hv_page_fault, which will then request write access to the page and update the actual HPTE. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/powerpc/include/asm/kvm_book3s_64.h | 20 ++++++++++++++-- arch/powerpc/kvm/book3s_64_mmu_hv.c | 39 +++++++++++++++++++++++++++++--- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 32 ++++++++++++++++++-------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 ++-- 4 files changed, 78 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/kvm/book3s_hv_rmhandlers.S') diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index c21e46da4a3..b0c08b14277 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -121,6 +121,22 @@ static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; } +static inline int hpte_is_writable(unsigned long ptel) +{ + unsigned long pp = ptel & (HPTE_R_PP0 | HPTE_R_PP); + + return pp != PP_RXRX && pp != PP_RXXX; +} + +static inline unsigned long hpte_make_readonly(unsigned long ptel) +{ + if ((ptel & HPTE_R_PP0) || (ptel & HPTE_R_PP) == PP_RWXX) + ptel = (ptel & ~HPTE_R_PP) | PP_RXXX; + else + ptel |= PP_RXRX; + return ptel; +} + static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) { unsigned int wimg = ptel & HPTE_R_WIMG; @@ -140,7 +156,7 @@ static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) * Lock and read a linux PTE. If it's present and writable, atomically * set dirty and referenced bits and return the PTE, otherwise return 0. */ -static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) { pte_t pte, tmp; @@ -158,7 +174,7 @@ static inline pte_t kvmppc_read_update_linux_pte(pte_t *p) if (pte_present(pte)) { pte = pte_mkyoung(pte); - if (pte_write(pte)) + if (writing && pte_write(pte)) pte = pte_mkdirty(pte); } diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 83761dd8a92..66d6452c108 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -503,6 +503,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, struct page *page, *pages[1]; long index, ret, npages; unsigned long is_io; + unsigned int writing, write_ok; struct vm_area_struct *vma; /* @@ -553,8 +554,11 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, pfn = 0; page = NULL; pte_size = PAGE_SIZE; + writing = (dsisr & DSISR_ISSTORE) != 0; + /* If writing != 0, then the HPTE must allow writing, if we get here */ + write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, writing, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -565,6 +569,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ((hva - vma->vm_start) >> PAGE_SHIFT); pte_size = psize; is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + write_ok = vma->vm_flags & VM_WRITE; } up_read(¤t->mm->mmap_sem); if (!pfn) @@ -575,6 +580,24 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, page = compound_head(page); pte_size <<= compound_order(page); } + /* if the guest wants write access, see if that is OK */ + if (!writing && hpte_is_writable(r)) { + pte_t *ptep, pte; + + /* + * We need to protect against page table destruction + * while looking up and updating the pte. + */ + rcu_read_lock_sched(); + ptep = find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL); + if (ptep && pte_present(*ptep)) { + pte = kvmppc_read_update_linux_pte(ptep, 1); + if (pte_write(pte)) + write_ok = 1; + } + rcu_read_unlock_sched(); + } pfn = page_to_pfn(page); } @@ -595,6 +618,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Set the HPTE to point to pfn */ r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + if (hpte_is_writable(r) && !write_ok) + r = hpte_make_readonly(r); ret = RESUME_GUEST; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) @@ -614,14 +639,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unlock_rmap(rmap); goto out_unlock; } - kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + + if (hptep[0] & HPTE_V_VALID) { + /* HPTE was previously valid, so we need to invalidate it */ + unlock_rmap(rmap); + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, index); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + } hptep[1] = r; eieio(); hptep[0] = hpte[0]; asm volatile("ptesync" : : : "memory"); preempt_enable(); - if (page) + if (page && hpte_is_writable(r)) SetPageDirty(page); out_put: diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 81d16ed9767..d3e36fc77e2 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -120,7 +120,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, } static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, - unsigned long *pte_sizep) + int writing, unsigned long *pte_sizep) { pte_t *ptep; unsigned long ps = *pte_sizep; @@ -137,7 +137,7 @@ static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, return __pte(0); if (!pte_present(*ptep)) return __pte(0); - return kvmppc_read_update_linux_pte(ptep); + return kvmppc_read_update_linux_pte(ptep, writing); } long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, @@ -154,12 +154,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long is_io; unsigned long *rmap; pte_t pte; + unsigned int writing; unsigned long mmu_seq; bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; + writing = hpte_is_writable(ptel); pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); /* used later to detect if we might have been invalidated */ @@ -208,8 +210,11 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; - pte = lookup_linux_pte(vcpu, hva, &pte_size); + pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); if (pte_present(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); is_io = hpte_cache_bits(pte_val(pte)); pa = pte_pfn(pte) << PAGE_SHIFT; } @@ -678,7 +683,9 @@ EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); /* * Called in real mode to check whether an HPTE not found fault - * is due to accessing a paged-out page or an emulated MMIO page. + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. * Returns a possibly modified status (DSISR) value if not * (i.e. pass the interrupt to the guest), * -1 to pass the fault up to host kernel mode code, -2 to do that @@ -696,12 +703,17 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, struct revmap_entry *rev; unsigned long pp, key; - valid = HPTE_V_VALID | HPTE_V_ABSENT; + /* For protection fault, expect to find a valid HPTE */ + valid = HPTE_V_VALID; + if (status & DSISR_NOHPTE) + valid |= HPTE_V_ABSENT; index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); - if (index < 0) - return status; /* there really was no HPTE */ - + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); v = hpte[0] & ~HPTE_V_HVLOCK; r = hpte[1]; @@ -712,8 +724,8 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, asm volatile("lwsync" : : : "memory"); hpte[0] = v; - /* If the HPTE is valid by now, retry the instruction */ - if (v & HPTE_V_VALID) + /* For not found, if the HPTE is valid by now, retry the instruction */ + if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) return 0; /* Check access permissions to the page */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 7d4990665d0..b70bf22a3ff 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1114,8 +1114,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) kvmppc_hdsi: mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR - /* HPTE not found fault? */ - andis. r0, r6, DSISR_NOHPTE@h + /* HPTE not found fault or protection fault? */ + andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ andi. r0, r11, MSR_DR /* data relocation enabled? */ beq 3f -- cgit v1.2.3-70-g09d2