From de56a948b9182fbcf92cb8212f114de096c2d574 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2011 00:21:34 +0000 Subject: KVM: PPC: Add support for Book3S processors in hypervisor mode This adds support for KVM running on 64-bit Book 3S processors, specifically POWER7, in hypervisor mode. Using hypervisor mode means that the guest can use the processor's supervisor mode. That means that the guest can execute privileged instructions and access privileged registers itself without trapping to the host. This gives excellent performance, but does mean that KVM cannot emulate a processor architecture other than the one that the hardware implements. This code assumes that the guest is running paravirtualized using the PAPR (Power Architecture Platform Requirements) interface, which is the interface that IBM's PowerVM hypervisor uses. That means that existing Linux distributions that run on IBM pSeries machines will also run under KVM without modification. In order to communicate the PAPR hypercalls to qemu, this adds a new KVM_EXIT_PAPR_HCALL exit code to include/linux/kvm.h. Currently the choice between book3s_hv support and book3s_pr support (i.e. the existing code, which runs the guest in user mode) has to be made at kernel configuration time, so a given kernel binary can only do one or the other. This new book3s_hv code doesn't support MMIO emulation at present. Since we are running paravirtualized guests, this isn't a serious restriction. With the guest running in supervisor mode, most exceptions go straight to the guest. We will never get data or instruction storage or segment interrupts, alignment interrupts, decrementer interrupts, program interrupts, single-step interrupts, etc., coming to the hypervisor from the guest. Therefore this introduces a new KVMTEST_NONHV macro for the exception entry path so that we don't have to do the KVM test on entry to those exception handlers. 
We do however get hypervisor decrementer, hypervisor data storage, hypervisor instruction storage, and hypervisor emulation assist interrupts, so we have to handle those. In hypervisor mode, real-mode accesses can access all of RAM, not just a limited amount. Therefore we put all the guest state in the vcpu.arch and use the shadow_vcpu in the PACA only for temporary scratch space. We allocate the vcpu with kzalloc rather than vzalloc, and we don't use anything in the kvmppc_vcpu_book3s struct, so we don't allocate it. We don't have a shared page with the guest, but we still need a kvm_vcpu_arch_shared struct to store the values of various registers, so we include one in the vcpu_arch struct. The POWER7 processor has a restriction that all threads in a core have to be in the same partition. MMU-on kernel code counts as a partition (partition 0), so we have to do a partition switch on every entry to and exit from the guest. At present we require the host and guest to run in single-thread mode because of this hardware restriction. This code allocates a hashed page table for the guest and initializes it with HPTEs for the guest's Virtual Real Memory Area (VRMA). We require that the guest memory is allocated using 16MB huge pages, in order to simplify the low-level memory management. This also means that we can get away without tracking paging activity in the host for now, since huge pages can't be paged or swapped. This also adds a few new exports needed by the book3s_hv code. 
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv.c | 445 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 445 insertions(+) create mode 100644 arch/powerpc/kvm/book3s_hv.c (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c new file mode 100644 index 00000000000..60b7300568c --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv.c @@ -0,0 +1,445 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Paul Mackerras + * Alexander Graf + * Kevin Wolf + * + * Description: KVM functions specific to running on Book 3S + * processors in hypervisor mode (specifically POWER7 and later). + * + * This file is derived from arch/powerpc/kvm/book3s.c, + * by Alexander Graf . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* #define EXIT_DEBUG */ +/* #define EXIT_DEBUG_SIMPLE */ +/* #define EXIT_DEBUG_INT */ + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + local_paca->kvm_hstate.kvm_vcpu = vcpu; +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_vcpu_block(struct kvm_vcpu *vcpu) +{ + u64 now; + unsigned long dec_nsec; + + now = get_tb(); + if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu)) + kvmppc_core_queue_dec(vcpu); + if (vcpu->arch.pending_exceptions) + return; + if (vcpu->arch.dec_expires != ~(u64)0) { + dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC / + tb_ticks_per_sec; + hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), + HRTIMER_MODE_REL); + } + + kvm_vcpu_block(vcpu); + vcpu->stat.halt_wakeup++; + + if (vcpu->arch.dec_expires != ~(u64)0) + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->arch.shregs.msr = msr; +} + +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +{ + vcpu->arch.pvr = pvr; +} + +void kvmppc_dump_regs(struct kvm_vcpu *vcpu) +{ + int r; + + pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); + pr_err("pc = %.16lx msr = %.16llx trap = %x\n", + vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); + for (r = 0; r < 16; ++r) + pr_err("r%2d = %.16lx r%d = %.16lx\n", + r, kvmppc_get_gpr(vcpu, r), + r+16, kvmppc_get_gpr(vcpu, r+16)); + pr_err("ctr = %.16lx lr = %.16lx\n", + vcpu->arch.ctr, vcpu->arch.lr); + pr_err("srr0 = %.16llx srr1 = %.16llx\n", + vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); + pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", + vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); + pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", + vcpu->arch.shregs.sprg2, 
vcpu->arch.shregs.sprg3); + pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", + vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); + pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); + pr_err("fault dar = %.16lx dsisr = %.8x\n", + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); + for (r = 0; r < vcpu->arch.slb_max; ++r) + pr_err(" ESID = %.16llx VSID = %.16llx\n", + vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); + pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", + vcpu->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.last_inst); +} + +static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) +{ + int r = RESUME_HOST; + + vcpu->stat.sum_exits++; + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PERFMON: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PROGRAM: + { + ulong flags; + /* + * Normally program interrupts are delivered directly + * to the guest by the hardware, but we can get here + * as a result of a hypervisor emulation interrupt + * (e40) getting turned into a 700 by BML RTAS. 
+ */ + flags = vcpu->arch.shregs.msr & 0x1f0000ull; + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + } + case BOOK3S_INTERRUPT_SYSCALL: + { + /* hcall - punt to userspace */ + int i; + + if (vcpu->arch.shregs.msr & MSR_PR) { + /* sc 1 from userspace - reflect to guest syscall */ + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_SYSCALL); + r = RESUME_GUEST; + break; + } + run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); + for (i = 0; i < 9; ++i) + run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + r = RESUME_HOST; + break; + } + /* + * We get these next two if the guest does a bad real-mode access, + * as we have enabled VRMA (virtualized real mode area) mode in the + * LPCR. We just generate an appropriate DSI/ISI to the guest. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; + vcpu->arch.shregs.dar = vcpu->arch.fault_dar; + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, + 0x08000000); + r = RESUME_GUEST; + break; + /* + * This occurs if the guest executes an illegal instruction. + * We just generate a program interrupt to the guest, since + * we don't emulate any guest instructions at this stage. + */ + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + kvmppc_core_queue_program(vcpu, 0x80000); + r = RESUME_GUEST; + break; + default: + kvmppc_dump_regs(vcpu); + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + r = RESUME_HOST; + BUG(); + break; + } + + + if (!(r & RESUME_HOST)) { + /* To avoid clobbering exit_reason, only check for signals if + * we aren't already exiting to userspace for some other + * reason. 
*/ + if (signal_pending(tsk)) { + vcpu->stat.signal_exits++; + run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + } else { + kvmppc_core_deliver_interrupts(vcpu); + } + } + + return r; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int i; + + /* Zero the whole struct first; clearing it after storing pvr would wipe pvr. */ + memset(sregs, 0, sizeof(struct kvm_sregs)); + sregs->pvr = vcpu->arch.pvr; + for (i = 0; i < vcpu->arch.slb_max; i++) { + sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; + sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; + } + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int i, j; + + kvmppc_set_pvr(vcpu, sregs->pvr); + + j = 0; + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { + vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; + vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; + ++j; + } + } + vcpu->arch.slb_max = j; + + return 0; +} + +int kvmppc_core_check_processor_compat(void) +{ + if (cpu_has_feature(CPU_FTR_HVMODE_206)) + return 0; + return -EIO; +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvm_vcpu *vcpu; + int err = -ENOMEM; + unsigned long lpcr; + + vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + if (!vcpu) + goto out; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + vcpu->arch.shared = &vcpu->arch.shregs; + vcpu->arch.last_cpu = -1; + vcpu->arch.mmcr[0] = MMCR0_FC; + vcpu->arch.ctrl = CTRL_RUNLATCH; + /* default to host PVR, since we can't spoof it */ + vcpu->arch.pvr = mfspr(SPRN_PVR); + kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + + lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES); + lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE; + vcpu->arch.lpcr = lpcr; + + kvmppc_mmu_book3s_hv_init(vcpu); + + return vcpu; + +free_vcpu: + kfree(vcpu); +out: + return ERR_PTR(err); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + 
kvm_vcpu_uninit(vcpu); + kfree(vcpu); +} + +extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); + +int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + u64 now; + + if (signal_pending(current)) { + run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_vsx_to_thread(current); + preempt_disable(); + + /* + * Make sure we are running on thread 0, and that + * secondary threads are offline. + * XXX we should also block attempts to bring any + * secondary threads online. + */ + if (threads_per_core > 1) { + int cpu = smp_processor_id(); + int thr = cpu_thread_in_core(cpu); + + if (thr) + goto out; + while (++thr < threads_per_core) + if (cpu_online(cpu + thr)) + goto out; + } + + kvm_guest_enter(); + + __kvmppc_vcore_entry(NULL, vcpu); + + kvm_guest_exit(); + + preempt_enable(); + kvm_resched(vcpu); + + now = get_tb(); + /* cancel pending dec exception if dec is positive */ + if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + return kvmppc_handle_exit(run, vcpu, current); + + out: + preempt_enable(); + return -EBUSY; +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ + if (mem->guest_phys_addr == 0 && mem->memory_size != 0) + return kvmppc_prepare_vrma(kvm, mem); + return 0; +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ + if (mem->guest_phys_addr == 0 && mem->memory_size != 0) + kvmppc_map_vrma(kvm, mem); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + long r; + + /* Allocate hashed page table */ + r = kvmppc_alloc_hpt(kvm); + + return r; +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvmppc_free_hpt(kvm); +} + +/* These are stubs for now */ +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +{ +} + +/* We don't need to emulate any 
privileged instructions or dcbz */ +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + return EMULATE_FAIL; +} + +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + return EMULATE_FAIL; +} + +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + return EMULATE_FAIL; +} + +static int kvmppc_book3s_hv_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + + if (r) + return r; + + r = kvmppc_mmu_hv_init(); + + return r; +} + +static void kvmppc_book3s_hv_exit(void) +{ + kvm_exit(); +} + +module_init(kvmppc_book3s_hv_init); +module_exit(kvmppc_book3s_hv_exit); -- cgit v1.2.3-70-g09d2 From a8606e20e41a8149456bafdf76ad29d47672027c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2011 00:22:05 +0000 Subject: KVM: PPC: Handle some PAPR hcalls in the kernel This adds the infrastructure for handling PAPR hcalls in the kernel, either early in the guest exit path while we are still in real mode, or later once the MMU has been turned back on and we are in the full kernel context. The advantage of handling hcalls in real mode if possible is that we avoid two partition switches -- and this will become more important when we support SMT4 guests, since a partition switch means we have to pull all of the threads in the core out of the guest. The disadvantage is that we can only access the kernel linear mapping, not anything vmalloced or ioremapped, since the MMU is off. 
This also adds code to handle the following hcalls in real mode: H_ENTER Add an HPTE to the hashed page table H_REMOVE Remove an HPTE from the hashed page table H_READ Read HPTEs from the hashed page table H_PROTECT Change the protection bits in an HPTE H_BULK_REMOVE Remove up to 4 HPTEs from the hashed page table H_SET_DABR Set the data address breakpoint register Plus code to handle the following hcalls in the kernel: H_CEDE Idle the vcpu until an interrupt or H_PROD hcall arrives H_PROD Wake up a ceded vcpu H_REGISTER_VPA Register a virtual processor area (VPA) The code that runs in real mode has to be in the base kernel, not in the module, if KVM is compiled as a module. The real-mode code can only access the kernel linear mapping, not vmalloc or ioremap space. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/hvcall.h | 5 + arch/powerpc/include/asm/kvm_host.h | 11 + arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/kernel/asm-offsets.c | 2 + arch/powerpc/kvm/Makefile | 8 +- arch/powerpc/kvm/book3s_hv.c | 170 ++++++++++++++- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 368 ++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 158 +++++++++++++- arch/powerpc/kvm/powerpc.c | 2 +- 9 files changed, 718 insertions(+), 7 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv_rm_mmu.c (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index fd8201dddd4..1c324ff55ea 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -29,6 +29,10 @@ #define H_LONG_BUSY_ORDER_100_SEC 9905 /* Long busy, hint that 100sec \ is a good time to retry */ #define H_LONG_BUSY_END_RANGE 9905 /* End of long busy range */ + +/* Internal value used in book3s_hv kvm support; not returned to guests */ +#define H_TOO_HARD 9999 + #define H_HARDWARE -1 /* Hardware error */ #define H_FUNCTION -2 /* Function not supported */ #define 
H_PRIVILEGE -3 /* Caller not privileged */ @@ -100,6 +104,7 @@ #define H_PAGE_SET_ACTIVE H_PAGE_STATE_CHANGE #define H_AVPN (1UL<<(63-32)) /* An avpn is provided as a sanity test */ #define H_ANDCOND (1UL<<(63-33)) +#define H_LOCAL (1UL<<(63-35)) #define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ #define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */ #define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4a3f790d5fc..6ebf1721680 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -59,6 +59,10 @@ struct kvm; struct kvm_run; struct kvm_vcpu; +struct lppaca; +struct slb_shadow; +struct dtl; + struct kvm_vm_stat { u32 remote_tlb_flush; }; @@ -344,7 +348,14 @@ struct kvm_vcpu_arch { u64 dec_expires; unsigned long pending_exceptions; u16 last_cpu; + u8 ceded; + u8 prodded; u32 last_inst; + + struct lppaca *vpa; + struct slb_shadow *slb_shadow; + struct dtl *dtl; + struct dtl *dtl_end; int trap; struct kvm_vcpu_arch_shared *shared; unsigned long magic_page_pa; /* phys addr to map the magic page to */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 0dafd53c30e..2afe92e6f62 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -118,6 +118,7 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); +extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 936267462ca..c70d106bf1a 100644 --- 
a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -189,6 +189,7 @@ int main(void) DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); DEFINE(LPPACA_PMCINUSE, offsetof(struct lppaca, pmcregs_in_use)); DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); + DEFINE(LPPACA_YIELDCOUNT, offsetof(struct lppaca, yield_count)); DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); #endif /* CONFIG_PPC_STD_MMU_64 */ DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); @@ -459,6 +460,7 @@ int main(void) DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr)); + DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 8a435a6da66..2ecffc0dc1b 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -54,14 +54,17 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv.o \ book3s_hv_interrupts.o \ book3s_64_mmu_hv.o +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ + book3s_hv_rm_mmu.o -kvm-book3s_64-objs := \ +kvm-book3s_64-module-objs := \ ../../../virt/kvm/kvm_main.o \ powerpc.o \ emulate.o \ book3s.o \ $(kvm-book3s_64-objs-y) -kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs) + +kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) kvm-book3s_32-objs := \ $(common-objs-y) \ @@ -83,3 +86,4 @@ obj-$(CONFIG_KVM_E500) += kvm.o obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o +obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 60b7300568c..af862c30b70 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ 
b/arch/powerpc/kvm/book3s_hv.c @@ -124,6 +124,158 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) vcpu->arch.last_inst); } +struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) +{ + int r; + struct kvm_vcpu *v, *ret = NULL; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(r, v, kvm) { + if (v->vcpu_id == id) { + ret = v; + break; + } + } + mutex_unlock(&kvm->lock); + return ret; +} + +static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) +{ + vpa->shared_proc = 1; + vpa->yield_count = 1; +} + +static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, + unsigned long flags, + unsigned long vcpuid, unsigned long vpa) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long pg_index, ra, len; + unsigned long pg_offset; + void *va; + struct kvm_vcpu *tvcpu; + + tvcpu = kvmppc_find_vcpu(kvm, vcpuid); + if (!tvcpu) + return H_PARAMETER; + + flags >>= 63 - 18; + flags &= 7; + if (flags == 0 || flags == 4) + return H_PARAMETER; + if (flags < 4) { + if (vpa & 0x7f) + return H_PARAMETER; + /* registering new area; convert logical addr to real */ + pg_index = vpa >> kvm->arch.ram_porder; + pg_offset = vpa & (kvm->arch.ram_psize - 1); + if (pg_index >= kvm->arch.ram_npages) + return H_PARAMETER; + if (kvm->arch.ram_pginfo[pg_index].pfn == 0) + return H_PARAMETER; + ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; + ra |= pg_offset; + va = __va(ra); + if (flags <= 1) + len = *(unsigned short *)(va + 4); + else + len = *(unsigned int *)(va + 4); + if (pg_offset + len > kvm->arch.ram_psize) + return H_PARAMETER; + switch (flags) { + case 1: /* register VPA */ + if (len < 640) + return H_PARAMETER; + tvcpu->arch.vpa = va; + init_vpa(vcpu, va); + break; + case 2: /* register DTL */ + if (len < 48) + return H_PARAMETER; + if (!tvcpu->arch.vpa) + return H_RESOURCE; + len -= len % 48; + tvcpu->arch.dtl = va; + tvcpu->arch.dtl_end = va + len; + break; + case 3: /* register SLB shadow buffer */ + if (len < 8) + return H_PARAMETER; + if (!tvcpu->arch.vpa) + return 
H_RESOURCE; + tvcpu->arch.slb_shadow = va; + len = (len - 16) / 16; + tvcpu->arch.slb_shadow = va; + break; + } + } else { + switch (flags) { + case 5: /* unregister VPA */ + if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) + return H_RESOURCE; + tvcpu->arch.vpa = NULL; + break; + case 6: /* unregister DTL */ + tvcpu->arch.dtl = NULL; + break; + case 7: /* unregister SLB shadow buffer */ + tvcpu->arch.slb_shadow = NULL; + break; + } + } + return H_SUCCESS; +} + +int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) +{ + unsigned long req = kvmppc_get_gpr(vcpu, 3); /* hcall number is passed in r3 */ + unsigned long target, ret = H_SUCCESS; + struct kvm_vcpu *tvcpu; + + switch (req) { + case H_CEDE: + vcpu->arch.shregs.msr |= MSR_EE; + vcpu->arch.ceded = 1; + smp_mb(); /* pairs with the barrier in H_PROD: set ceded, then test prodded */ + if (!vcpu->arch.prodded) + kvmppc_vcpu_block(vcpu); + else + vcpu->arch.prodded = 0; + smp_mb(); + vcpu->arch.ceded = 0; + break; + case H_PROD: + target = kvmppc_get_gpr(vcpu, 4); + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + tvcpu->arch.prodded = 1; + smp_mb(); /* set prodded, then test the target's ceded flag */ + if (tvcpu->arch.ceded) { /* wake the target vcpu, not the caller (which is running) */ + if (waitqueue_active(&tvcpu->wq)) { + wake_up_interruptible(&tvcpu->wq); + tvcpu->stat.halt_wakeup++; + } + } + break; + case H_CONFER: + break; + case H_REGISTER_VPA: + ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + break; + default: + return RESUME_HOST; /* not handled here; the caller exits to userspace */ + } + kvmppc_set_gpr(vcpu, 3, ret); + vcpu->arch.hcall_needed = 0; + return RESUME_GUEST; +} + static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { @@ -318,7 +470,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); -int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu) { u64 now; @@ -370,6 +522,22 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EBUSY; 
}
+int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	int r;
+
+	do {
+		r = kvmppc_run_vcpu(run, vcpu);
+
+		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
+		    !(vcpu->arch.shregs.msr & MSR_PR)) {
+			r = kvmppc_pseries_do_hcall(vcpu);
+			kvmppc_core_deliver_interrupts(vcpu);
+		}
+	} while (r == RESUME_GUEST);
+	return r;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
new file mode 100644
index 00000000000..edb0aae901a
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -0,0 +1,422 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2010-2011 Paul Mackerras, IBM Corp.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* For now use fixed-size 16MB page table */
+#define HPT_ORDER	24
+#define HPT_NPTEG	(1ul << (HPT_ORDER - 7))	/* 128B per pteg */
+#define HPT_HASH_MASK	(HPT_NPTEG - 1)
+
+#define HPTE_V_HVLOCK	0x40UL
+
+/*
+ * Try to set HPTE_V_HVLOCK in the first doubleword of an HPTE with a
+ * ldarx/stdcx. sequence.  Nothing is stored if any bit of @bits is
+ * already set in *hpte.  If the stdcx. loses its reservation, @bits
+ * is copied into the result with "mr" (it is a register operand, not
+ * an immediate) so that the attempt is reported as failed.
+ * Returns 1 if the lock bit was set by this call, 0 otherwise.
+ */
+static inline long lock_hpte(unsigned long *hpte, unsigned long bits)
+{
+	unsigned long tmp, old;
+
+	asm volatile(" ldarx %0,0,%2\n"
+		     " and. %1,%0,%3\n"
+		     " bne 2f\n"
+		     " ori %0,%0,%4\n"
+		     " stdcx. %0,0,%2\n"
+		     " beq+ 2f\n"
+		     " mr %1,%3\n"
+		     "2: isync"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+/*
+ * Real-mode handler for the H_ENTER hcall: insert a guest HPTE
+ * (@pteh, @ptel) into the hashed page table.  The guest real page
+ * number in @ptel is translated to a host physical address via
+ * kvm->arch.ram_pginfo.  With H_EXACT the slot @pte_index is used,
+ * otherwise the first free slot in that PTEG.  On success the slot
+ * index is returned to the guest in GPR4.
+ */
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+		    long pte_index, unsigned long pteh, unsigned long ptel)
+{
+	unsigned long porder;
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long i, lpn, pa;
+	unsigned long *hpte;
+
+	/* only handle 4k, 64k and 16M pages for now */
+	porder = 12;
+	if (pteh & HPTE_V_LARGE) {
+		if ((ptel & 0xf000) == 0x1000) {
+			/* 64k page */
+			porder = 16;
+		} else if ((ptel & 0xff000) == 0) {
+			/* 16M page */
+			porder = 24;
+			/* lowest AVA bit must be 0 for 16M pages */
+			if (pteh & 0x80)
+				return H_PARAMETER;
+		} else
+			return H_PARAMETER;
+	}
+	lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder;
+	if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder)
+		return H_PARAMETER;
+	pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT;
+	if (!pa)
+		return H_PARAMETER;
+	/* Check WIMG */
+	if ((ptel & HPTE_R_WIMG) != HPTE_R_M &&
+	    (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M))
+		return H_PARAMETER;
+	pteh &= ~0x60UL;
+	ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize);
+	ptel |= pa;
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (likely((flags & H_EXACT) == 0)) {
+		pte_index &= ~7UL;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		for (i = 0; ; ++i) {
+			if (i == 8)
+				return H_PTEG_FULL;
+			if ((*hpte & HPTE_V_VALID) == 0 &&
+			    lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+				break;
+			hpte += 2;
+		}
+	} else {
+		i = 0;
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID))
+			return H_PTEG_FULL;
+	}
+	hpte[1] = ptel;
+	eieio();
+	hpte[0] = pteh;
+	asm volatile("ptesync" : : : "memory");
+	atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt);
+	vcpu->arch.gpr[4] = pte_index + i;
+	return H_SUCCESS;
+}
+
+/*
+ * Build the RB operand for tlbie/tlbiel from an HPTE (@v, @r) and its
+ * index in the hash table.
+ */
+static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
+				      unsigned long pte_index)
+{
+	unsigned long rb, va_low;
+
+	rb = (v & ~0x7fUL) << 16;		/* AVA field */
+	va_low = pte_index >> 3;
+	if (v & HPTE_V_SECONDARY)
+		va_low = ~va_low;
+	/* xor vsid from AVA */
+	if (!(v & HPTE_V_1TB_SEG))
+		va_low ^= v >> 12;
+	else
+		va_low ^= v >> 24;
+	va_low &= 0x7ff;
+	if (v & HPTE_V_LARGE) {
+		rb |= 1;			/* L field */
+		if (r & 0xff000) {
+			/* non-16MB large page, must be 64k */
+			/* (masks depend on page size) */
+			rb |= 0x1000;		/* page encoding in LP field */
+			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
+			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+		}
+	} else {
+		/* 4kB page */
+		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
+	}
+	rb |= (v >> 54) & 0x300;		/* B field */
+	return rb;
+}
+
+#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
+
+/*
+ * Take the tlbie lock by storing this CPU's lock token into *lock if
+ * the lock word is currently 0.  Returns 1 if acquired, 0 if held.
+ */
+static inline int try_lock_tlbie(unsigned int *lock)
+{
+	unsigned int tmp, old;
+	unsigned int token = LOCK_TOKEN;
+
+	asm volatile("1:lwarx %1,0,%2\n"
+		     " cmpwi cr0,%1,0\n"
+		     " bne 2f\n"
+		     " stwcx. %3,0,%2\n"
+		     " bne- 1b\n"
+		     " isync\n"
+		     "2:"
+		     : "=&r" (tmp), "=&r" (old)
+		     : "r" (lock), "r" (token)
+		     : "cc", "memory");
+	return old == 0;
+}
+
+/*
+ * Real-mode handler for H_REMOVE: invalidate the HPTE at @pte_index
+ * (subject to the H_AVPN / H_ANDCOND match on @avpn) and flush it from
+ * the TLB.  The old HPTE doublewords are returned in GPR4 and GPR5.
+ */
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+		     unsigned long pte_index, unsigned long avpn,
+		     unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
+	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK;
+	vcpu->arch.gpr[5] = r = hpte[1];
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = 0;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return H_SUCCESS;
+}
+
+/*
+ * Real-mode handler for H_BULK_REMOVE: process up to four
+ * (control/index, avpn) request pairs held in GPR4..GPR11, writing a
+ * per-request status byte back into the control word, then flush all
+ * removed entries from the TLB in one batch.
+ */
+long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *args = &vcpu->arch.gpr[4];
+	unsigned long *hp, tlbrb[4];
+	long int i, found;
+	long int n_inval = 0;
+	unsigned long flags, req, pte_index;
+	long int local = 0;
+	long int ret = H_SUCCESS;
+
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		local = 1;
+	for (i = 0; i < 4; ++i) {
+		pte_index = args[i * 2];
+		flags = pte_index >> 56;
+		pte_index &= ((1ul << 56) - 1);
+		req = flags >> 6;
+		flags &= 3;
+		if (req == 3)
+			break;
+		if (req != 1 || flags == 3 ||
+		    pte_index >= (HPT_NPTEG << 3)) {
+			/* parameter error */
+			args[i * 2] = ((0xa0 | flags) << 56) + pte_index;
+			ret = H_PARAMETER;
+			break;
+		}
+		hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		while (!lock_hpte(hp, HPTE_V_HVLOCK))
+			cpu_relax();
+		found = 0;
+		if (hp[0] & HPTE_V_VALID) {
+			switch (flags & 3) {
+			case 0:		/* absolute */
+				found = 1;
+				break;
+			case 1:		/* andcond */
+				if (!(hp[0] & args[i * 2 + 1]))
+					found = 1;
+				break;
+			case 2:		/* AVPN */
+				if ((hp[0] & ~0x7fUL) == args[i * 2 + 1])
+					found = 1;
+				break;
+			}
+		}
+		if (!found) {
+			hp[0] &= ~HPTE_V_HVLOCK;
+			args[i * 2] = ((0x90 | flags) << 56) + pte_index;
+			continue;
+		}
+		/* insert R and C bits from PTE */
+		flags |= (hp[1] >> 5) & 0x0c;
+		args[i * 2] = ((0x80 | flags) << 56) + pte_index;
+		tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index);
+		hp[0] = 0;
+	}
+	if (n_inval == 0)
+		return ret;
+
+	if (!local) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile(PPC_TLBIE(%1,%0)
+				     : : "r" (tlbrb[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < n_inval; ++i)
+			asm volatile("tlbiel %0" : : "r" (tlbrb[i]));
+		asm volatile("ptesync" : : : "memory");
+	}
+	return ret;
+}
+
+/*
+ * Real-mode handler for H_PROTECT: replace the PP, N and key bits of
+ * the HPTE at @pte_index with those given in @flags.  The entry is
+ * made temporarily invalid and flushed from the TLB before the second
+ * doubleword is rewritten.
+ */
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+		      unsigned long pte_index, unsigned long avpn,
+		      unsigned long va)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte;
+	unsigned long v, r, rb;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	if ((hpte[0] & HPTE_V_VALID) == 0 ||
+	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
+		hpte[0] &= ~HPTE_V_HVLOCK;
+		return H_NOT_FOUND;
+	}
+	if (atomic_read(&kvm->online_vcpus) == 1)
+		flags |= H_LOCAL;
+	v = hpte[0];
+	r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
+			HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+	r |= (flags << 55) & HPTE_R_PP0;
+	r |= (flags << 48) & HPTE_R_KEY_HI;
+	r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);
+	rb = compute_tlbie_rb(v, r, pte_index);
+	hpte[0] = v & ~HPTE_V_VALID;
+	if (!(flags & H_LOCAL)) {
+		while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		asm volatile("ptesync" : : : "memory");
+		asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
+			     : : "r" (rb), "r" (kvm->arch.lpid));
+		asm volatile("ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		asm volatile("ptesync" : : : "memory");
+		asm volatile("tlbiel %0" : : "r" (rb));
+		asm volatile("ptesync" : : : "memory");
+	}
+	hpte[1] = r;
+	eieio();
+	hpte[0] = v & ~HPTE_V_HVLOCK;
+	asm volatile("ptesync" : : : "memory");
+	return H_SUCCESS;
+}
+
+/*
+ * Translate a host real address back to a guest real address by
+ * searching ram_pginfo[].  Each ram_pginfo[] entry describes one guest
+ * page of size ram_psize (see the lpn computation in kvmppc_h_enter),
+ * so entry i corresponds to guest address i << ram_porder.
+ * Returns HPTE_R_RPN (all 1s in the RPN field) if no page matches.
+ */
+static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr)
+{
+	long int i;
+	unsigned long offset, rpn;
+
+	offset = realaddr & (kvm->arch.ram_psize - 1);
+	rpn = (realaddr - offset) >> PAGE_SHIFT;
+	for (i = 0; i < kvm->arch.ram_npages; ++i)
+		if (rpn == kvm->arch.ram_pginfo[i].pfn)
+			return (i << kvm->arch.ram_porder) + offset;
+	return HPTE_R_RPN;	/* all 1s in the RPN field */
+}
+
+/*
+ * Real-mode handler for H_READ: return the HPTE at @pte_index (or the
+ * aligned group of four containing it if H_READ_4 is set) in GPR4
+ * onwards.  With H_R_XLATE, valid entries have their RPN converted
+ * back to a guest real address.
+ */
+long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
+		   unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long *hpte, r;
+	int i, n = 1;
+
+	if (pte_index >= (HPT_NPTEG << 3))
+		return H_PARAMETER;
+	if (flags & H_READ_4) {
+		pte_index &= ~3;
+		n = 4;
+	}
+	for (i = 0; i < n; ++i, ++pte_index) {
+		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
+		r = hpte[1];
+		if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID))
+			r = reverse_xlate(kvm, r & HPTE_R_RPN) |
+				(r & ~HPTE_R_RPN);
+		vcpu->arch.gpr[4 + i * 2] = hpte[0];
+		vcpu->arch.gpr[5 + i * 2] = r;
+	}
+	return H_SUCCESS;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9af264840b9..319ff63b1f3 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -166,6 +166,14 @@ kvmppc_hv_entry: /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + lwz r5, LPPACA_YIELDCOUNT(r3) + addi r5, r5, 1 + stw r5, LPPACA_YIELDCOUNT(r3) +25: /* Load up DAR and DSISR */ ld r5, VCPU_DAR(r4) lwz r6, VCPU_DSISR(r4) @@ -401,6 +409,10 @@ kvmppc_interrupt: cmpwi r3,0 bge ignore_hdec 2: + /* See if this is something we can handle in real mode */ + cmpwi r12,BOOK3S_INTERRUPT_SYSCALL + beq hcall_try_real_mode +hcall_real_cont: /* Check for mediated interrupts (could be done earlier really ...) */ cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL @@ -579,13 +591,28 @@ hdec_soon: std r5, VCPU_SPRG2(r9) std r6, VCPU_SPRG3(r9) - /* Save PMU registers */ + /* Increment yield count if they have a VPA */ + ld r8, VCPU_VPA(r9) /* do they have a VPA? */ + cmpdi r8, 0 + beq 25f + lwz r3, LPPACA_YIELDCOUNT(r8) + addi r3, r3, 1 + stw r3, LPPACA_YIELDCOUNT(r8) +25: + /* Save PMU registers if requested */ + /* r8 and cr0.eq are live here */ li r3, 1 sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ mfspr r4, SPRN_MMCR0 /* save MMCR0 */ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ isync - mfspr r5, SPRN_MMCR1 + beq 21f /* if no VPA, save PMU stuff anyway */ + lbz r7, LPPACA_PMCINUSE(r8) + cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 mfspr r6, SPRN_MMCRA std r4, VCPU_MMCR(r9) std r5, VCPU_MMCR + 8(r9) @@ -676,6 +703,125 @@ hdec_soon: mfspr r7,SPRN_HDSISR b 7b +/* + * Try to handle an hcall in real mode. + * Returns to the guest if we handle it, or continues on up to + * the kernel if we can't (i.e. if we don't have a handler for + * it, or if the handler returns H_TOO_HARD). 
+ */ + .globl hcall_try_real_mode +hcall_try_real_mode: + ld r3,VCPU_GPR(r3)(r9) + andi. r0,r11,MSR_PR + bne hcall_real_cont + clrrdi r3,r3,2 + cmpldi r3,hcall_real_table_end - hcall_real_table + bge hcall_real_cont + LOAD_REG_ADDR(r4, hcall_real_table) + lwzx r3,r3,r4 + cmpwi r3,0 + beq hcall_real_cont + add r3,r3,r4 + mtctr r3 + mr r3,r9 /* get vcpu pointer */ + ld r4,VCPU_GPR(r4)(r9) + bctrl + cmpdi r3,H_TOO_HARD + beq hcall_real_fallback + ld r4,HSTATE_KVM_VCPU(r13) + std r3,VCPU_GPR(r3)(r4) + ld r10,VCPU_PC(r4) + ld r11,VCPU_MSR(r4) + b fast_guest_return + + /* We've attempted a real mode hcall, but it's punted it back + * to userspace. We need to restore some clobbered volatiles + * before resuming the pass-it-to-qemu path */ +hcall_real_fallback: + li r12,BOOK3S_INTERRUPT_SYSCALL + ld r9, HSTATE_KVM_VCPU(r13) + ld r11, VCPU_MSR(r9) + + b hcall_real_cont + + .globl hcall_real_table +hcall_real_table: + .long 0 /* 0 - unused */ + .long .kvmppc_h_remove - hcall_real_table + .long .kvmppc_h_enter - hcall_real_table + .long .kvmppc_h_read - hcall_real_table + .long 0 /* 0x10 - H_CLEAR_MOD */ + .long 0 /* 0x14 - H_CLEAR_REF */ + .long .kvmppc_h_protect - hcall_real_table + .long 0 /* 0x1c - H_GET_TCE */ + .long 0 /* 0x20 - H_SET_TCE */ + .long 0 /* 0x24 - H_SET_SPRG0 */ + .long .kvmppc_h_set_dabr - hcall_real_table + .long 0 /* 0x2c */ + .long 0 /* 0x30 */ + .long 0 /* 0x34 */ + .long 0 /* 0x38 */ + .long 0 /* 0x3c */ + .long 0 /* 0x40 */ + .long 0 /* 0x44 */ + .long 0 /* 0x48 */ + .long 0 /* 0x4c */ + .long 0 /* 0x50 */ + .long 0 /* 0x54 */ + .long 0 /* 0x58 */ + .long 0 /* 0x5c */ + .long 0 /* 0x60 */ + .long 0 /* 0x64 */ + .long 0 /* 0x68 */ + .long 0 /* 0x6c */ + .long 0 /* 0x70 */ + .long 0 /* 0x74 */ + .long 0 /* 0x78 */ + .long 0 /* 0x7c */ + .long 0 /* 0x80 */ + .long 0 /* 0x84 */ + .long 0 /* 0x88 */ + .long 0 /* 0x8c */ + .long 0 /* 0x90 */ + .long 0 /* 0x94 */ + .long 0 /* 0x98 */ + .long 0 /* 0x9c */ + .long 0 /* 0xa0 */ + .long 0 /* 0xa4 */ + .long 0 
/* 0xa8 */ + .long 0 /* 0xac */ + .long 0 /* 0xb0 */ + .long 0 /* 0xb4 */ + .long 0 /* 0xb8 */ + .long 0 /* 0xbc */ + .long 0 /* 0xc0 */ + .long 0 /* 0xc4 */ + .long 0 /* 0xc8 */ + .long 0 /* 0xcc */ + .long 0 /* 0xd0 */ + .long 0 /* 0xd4 */ + .long 0 /* 0xd8 */ + .long 0 /* 0xdc */ + .long 0 /* 0xe0 */ + .long 0 /* 0xe4 */ + .long 0 /* 0xe8 */ + .long 0 /* 0xec */ + .long 0 /* 0xf0 */ + .long 0 /* 0xf4 */ + .long 0 /* 0xf8 */ + .long 0 /* 0xfc */ + .long 0 /* 0x100 */ + .long 0 /* 0x104 */ + .long 0 /* 0x108 */ + .long 0 /* 0x10c */ + .long 0 /* 0x110 */ + .long 0 /* 0x114 */ + .long 0 /* 0x118 */ + .long 0 /* 0x11c */ + .long 0 /* 0x120 */ + .long .kvmppc_h_bulk_remove - hcall_real_table +hcall_real_table_end: + ignore_hdec: mr r4,r9 b fast_guest_return @@ -688,10 +834,16 @@ bounce_ext_interrupt: LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME); b fast_guest_return +_GLOBAL(kvmppc_h_set_dabr) + std r4,VCPU_DABR(r3) + mtspr SPRN_DABR,r4 + li r3,0 + blr + /* * Save away FP, VMX and VSX registers. * r3 = vcpu pointer -*/ + */ _GLOBAL(kvmppc_save_fp) mfmsr r9 ori r8,r9,MSR_FP diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 3a4f379ee70..6fc9ee499b6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -42,7 +42,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) return !(v->arch.shared->msr & MSR_WE) || !!(v->arch.pending_exceptions); #else - return 1; + return !(v->arch.ceded) || !!(v->arch.pending_exceptions); #endif } -- cgit v1.2.3-70-g09d2 From 54738c097163c3f01e67ccc85462b78d4d4f495f Mon Sep 17 00:00:00 2001 From: David Gibson Date: Wed, 29 Jun 2011 00:22:41 +0000 Subject: KVM: PPC: Accelerate H_PUT_TCE by implementing it in real mode This improves I/O performance for guests using the PAPR paravirtualization interface by making the H_PUT_TCE hcall faster, by implementing it in real mode. H_PUT_TCE is used for updating virtual IOMMU tables, and is used both for virtual I/O and for real I/O in the PAPR interface. 
Since this moves the IOMMU tables into the kernel, we define a new KVM_CREATE_SPAPR_TCE ioctl to allow qemu to create the tables. The ioctl returns a file descriptor which can be used to mmap the newly created table. The qemu driver models use them in the same way as userspace managed tables, but they can be updated directly by the guest with a real-mode H_PUT_TCE implementation, reducing the number of host/guest context switches during guest IO. There are certain circumstances where it is useful for userland qemu to write to the TCE table even if the kernel H_PUT_TCE path is used most of the time. Specifically, allowing this will avoid awkwardness when we need to reset the table. More importantly, we will in the future need to write the table in order to restore its state after a checkpoint resume or migration. Signed-off-by: David Gibson Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 35 ++++++++++ arch/powerpc/include/asm/kvm.h | 9 +++ arch/powerpc/include/asm/kvm_book3s_64.h | 2 + arch/powerpc/include/asm/kvm_host.h | 9 +++ arch/powerpc/include/asm/kvm_ppc.h | 2 + arch/powerpc/kvm/Makefile | 3 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 73 +++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 116 ++++++++++++++++++++++++++++++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +- arch/powerpc/kvm/powerpc.c | 18 +++++ include/linux/kvm.h | 2 + 11 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_64_vio_hv.c (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index e8875fef3eb..a1d344d5ff4 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1350,6 +1350,41 @@ The following flags are defined: If datamatch flag is set, the event will be signaled only if the written value to the registered address is equal to datamatch in struct kvm_ioeventfd. 
+4.62 KVM_CREATE_SPAPR_TCE + +Capability: KVM_CAP_SPAPR_TCE +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce (in) +Returns: file descriptor for manipulating the created TCE table + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE */ +struct kvm_create_spapr_tce { + __u64 liobn; + __u32 window_size; +}; + +The liobn field gives the logical IO bus number for which to create a +TCE table. The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every 4kiB of the DMA window. + +When the guest issues an H_PUT_TCE hcall on a liobn for which a TCE +table has been created using this ioctl(), the kernel will handle it +in real mode, updating the TCE table. H_PUT_TCE calls for other +liobns will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + 5. 
The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index d2ca5ed3877..c3ec990daf4 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -22,6 +22,9 @@ #include +/* Select powerpc specific features in */ +#define __KVM_HAVE_SPAPR_TCE + struct kvm_regs { __u64 pc; __u64 cr; @@ -272,4 +275,10 @@ struct kvm_guest_debug_arch { #define KVM_INTERRUPT_UNSET -2U #define KVM_INTERRUPT_SET_LEVEL -3U +/* for KVM_CAP_SPAPR_TCE */ +struct kvm_create_spapr_tce { + __u64 liobn; + __u32 window_size; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 5f73388ea0a..e43fe42b987 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -27,4 +27,6 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) } #endif +#define SPAPR_TCE_SHIFT 12 + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 6ebf1721680..5616e39a7fa 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -144,6 +144,14 @@ struct kvmppc_pginfo { atomic_t refcnt; }; +struct kvmppc_spapr_tce_table { + struct list_head list; + struct kvm *kvm; + u64 liobn; + u32 window_size; + struct page *pages[0]; +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; @@ -157,6 +165,7 @@ struct kvm_arch { unsigned long sdr1; unsigned long host_sdr1; int tlbie_lock; + struct list_head spapr_tce_tables; unsigned short last_vcpu[NR_CPUS]; #endif /* CONFIG_KVM_BOOK3S_64_HV */ }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2afe92e6f62..99f6fcf4cf8 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -119,6 +119,8 @@ 
extern long kvmppc_prepare_vrma(struct kvm *kvm,
 extern void kvmppc_map_vrma(struct kvm *kvm,
 			struct kvm_userspace_memory_region *mem);
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
+extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				struct kvm_create_spapr_tce *args);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2ecffc0dc1b..1de3d54901d 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -55,7 +55,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
 	book3s_hv_interrupts.o \
 	book3s_64_mmu_hv.o
 kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
-	book3s_hv_rm_mmu.o
+	book3s_hv_rm_mmu.o \
+	book3s_64_vio_hv.o
 
 kvm-book3s_64-module-objs := \
 	../../../virt/kvm/kvm_main.o \
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 00000000000..ea0f8c537c2
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,74 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp.
+ * Copyright 2011 David Gibson, IBM Corporation
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+/*
+ * Real-mode handler for H_PUT_TCE: store @tce into the in-kernel TCE
+ * table registered for @liobn, at the entry covering I/O bus address
+ * @ioba.  Returns H_PARAMETER if @ioba is outside the table's DMA
+ * window, or H_TOO_HARD if no table exists for @liobn so that the
+ * hcall is passed up to userspace.
+ */
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvmppc_spapr_tce_table *stt;
+
+	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn == liobn) {
+			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
+			struct page *page;
+			u64 *tbl;
+
+			if (ioba >= stt->window_size)
+				return H_PARAMETER;
+
+			page = stt->pages[idx / TCES_PER_PAGE];
+			tbl = (u64 *)page_address(page);
+
+			/* FIXME: Need to validate the TCE itself */
+			tbl[idx % TCES_PER_PAGE] = tce;
+			return H_SUCCESS;
+		}
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index af862c30b70..6fe469eabce 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -538,6 +538,148 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	return r;
 }
 
+/* Number of pages needed to hold one 64-bit TCE per 4kB of @window_size. */
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+/*
+ * Unlink a TCE table from its VM, free its pages and the table itself,
+ * and drop the VM reference taken when the table was created.
+ */
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+/* Fault handler for the TCE table mapping: supply page vmf->pgoff. */
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+/* mmap handler for the TCE table fd: pages are supplied on fault. */
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+/* Called when the TCE table file is released: tear the table down. */
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static const struct file_operations kvm_spapr_tce_fops = {
+	.mmap = kvm_spapr_tce_mmap,
+	.release = kvm_spapr_tce_release,
+};
+
+/*
+ * Handler for the KVM_CREATE_SPAPR_TCE ioctl: allocate a zeroed TCE
+ * table covering args->window_size for args->liobn, and return a file
+ * descriptor that userspace can mmap() to access it.
+ *
+ * The duplicate-LIOBN check, fd creation and insertion into the VM's
+ * table list are all done under kvm->lock, and the table is only put
+ * on the list (and the VM reference taken) once the fd exists.  That
+ * way a failure of anon_inode_getfd() frees everything, and two
+ * concurrent callers cannot both register the same LIOBN.
+ *
+ * Returns the new fd, -EBUSY if the LIOBN already has a table, or
+ * another negative errno on failure.
+ */
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	struct kvmppc_spapr_tce_table *siter;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		return ret;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	mutex_lock(&kvm->lock);
+
+	/* Check this LIOBN hasn't been previously allocated */
+	ret = 0;
+	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+		if (siter->liobn == args->liobn) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+
+	if (!ret)
+		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				       stt, O_RDWR);
+
+	if (ret >= 0) {
+		/* The fd's release method drops this reference */
+		kvm_get_kvm(kvm);
+		list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+	}
+
+	mutex_unlock(&kvm->lock);
+
+	if (ret >= 0)
+		return ret;
+
+fail:
+	/* pages[] is zero-initialised, so unallocated slots are NULL */
+	for (i = 0; i < npages; i++)
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
+
+	kfree(stt);
+	return ret;
+}
+
 int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem)
 {
@@ -559,13 +701,17 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 
 	/* Allocate hashed page table */
 	r = kvmppc_alloc_hpt(kvm);
+	if (r)
+		return r;
 
-	return r;
+	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	return 0;
 }
 
 void kvmppc_core_destroy_vm(struct kvm *kvm)
 {
 	kvmppc_free_hpt(kvm);
+	WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
 }
 
 /* These are stubs for now */
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 319ff63b1f3..e6adaadcdff 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -754,7 +754,7 @@ hcall_real_table:
 	.long	0		/* 0x14 - H_CLEAR_REF */
 	.long	.kvmppc_h_protect - hcall_real_table
 	.long	0		/* 0x1c - H_GET_TCE */
-	.long	0		/* 0x20 - H_SET_TCE */
+	.long	.kvmppc_h_put_tce - hcall_real_table
 	.long	0		/* 0x24 - H_SET_SPRG0 */
 	.long	.kvmppc_h_set_dabr - hcall_real_table
 	.long	0		/* 0x2c */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6fc9ee499b6..c78ceb9d560 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -202,6 +202,11 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CAP_SPAPR_TCE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -653,6 +658,19 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+	case KVM_CREATE_SPAPR_TCE: {
+		struct kvm_create_spapr_tce create_tce;
+		struct kvm *kvm = filp->private_data;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
+			goto out;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+		goto out;
+	}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
 	default:
 		r = -ENOTTY;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index a156294fc22..61f56502732 100644
---
a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -550,6 +550,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_TSC_CONTROL 60 #define KVM_CAP_GET_TSC_KHZ 61 #define KVM_CAP_PPC_BOOKE_SREGS 62 +#define KVM_CAP_SPAPR_TCE 63 #ifdef KVM_CAP_IRQ_ROUTING @@ -752,6 +753,7 @@ struct kvm_clock_data { /* Available with KVM_CAP_XCRS */ #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) +#define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -- cgit v1.2.3-70-g09d2 From 371fefd6f2dc46668e00871930dde613b88d4bde Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2011 00:23:08 +0000 Subject: KVM: PPC: Allow book3s_hv guests to use SMT processor modes This lifts the restriction that book3s_hv guests can only run one hardware thread per core, and allows them to use up to 4 threads per core on POWER7. The host still has to run single-threaded. This capability is advertised to qemu through a new KVM_CAP_PPC_SMT capability. The return value of the ioctl querying this capability is the number of vcpus per virtual CPU core (vcore), currently 4. To use this, the host kernel should be booted with all threads active, and then all the secondary threads should be offlined. This will put the secondary threads into nap mode. KVM will then wake them from nap mode and use them for running guest code (while they are still offline). To wake the secondary threads, we send them an IPI using a new xics_wake_cpu() function, implemented in arch/powerpc/sysdev/xics/icp-native.c. In other words, at this stage we assume that the platform has a XICS interrupt controller and we are using icp-native.c to drive it. Since the woken thread will need to acknowledge and clear the IPI, we also export the base physical address of the XICS registers using kvmppc_set_xics_phys() for use in the low-level KVM book3s code. When a vcpu is created, it is assigned to a virtual CPU core. 
The vcore number is obtained by dividing the vcpu number by the number of threads per core in the host. This number is exported to userspace via the KVM_CAP_PPC_SMT capability. If qemu wishes to run the guest in single-threaded mode, it should make all vcpu numbers be multiples of the number of threads per core. We distinguish three states of a vcpu: runnable (i.e., ready to execute the guest), blocked (that is, idle), and busy in host. We currently implement a policy that the vcore can run only when all its threads are runnable or blocked. This way, if a vcpu needs to execute elsewhere in the kernel or in qemu, it can do so without being starved of CPU by the other vcpus. When a vcore starts to run, it executes in the context of one of the vcpu threads. The other vcpu threads all go to sleep and stay asleep until something happens requiring the vcpu thread to return to qemu, or to wake up to run the vcore (this can happen when another vcpu thread goes from busy in host state to blocked). It can happen that a vcpu goes from blocked to runnable state (e.g. because of an interrupt), and the vcore it belongs to is already running. In that case it can start to run immediately as long as none of the vcpus in the vcore have started to exit the guest. We send the next free thread in the vcore an IPI to get it to start to execute the guest. It synchronizes with the other threads via the vcore->entry_exit_count field to make sure that it doesn't go into the guest if the other vcpus are exiting by the time that it is ready to actually enter the guest. Note that there is no fixed relationship between the hardware thread number and the vcpu number. Hardware threads are assigned to vcpus as they become runnable, so we will always use the lower-numbered hardware threads in preference to higher-numbered threads if not all the vcpus in the vcore are runnable, regardless of which vcpus are runnable.
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 13 ++ arch/powerpc/include/asm/kvm.h | 1 + arch/powerpc/include/asm/kvm_book3s_asm.h | 2 + arch/powerpc/include/asm/kvm_host.h | 46 ++++- arch/powerpc/include/asm/kvm_ppc.h | 13 ++ arch/powerpc/kernel/asm-offsets.c | 6 + arch/powerpc/kernel/exceptions-64s.S | 31 ++- arch/powerpc/kernel/idle_power7.S | 2 - arch/powerpc/kvm/book3s_hv.c | 316 +++++++++++++++++++++++++++--- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 168 +++++++++++++++- arch/powerpc/kvm/powerpc.c | 4 + arch/powerpc/sysdev/xics/icp-native.c | 9 + include/linux/kvm.h | 1 + 13 files changed, 567 insertions(+), 45 deletions(-) (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a1d344d5ff4..681871311d3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -180,6 +180,19 @@ KVM_CHECK_EXTENSION ioctl() to determine the value for max_vcpus at run-time. If the KVM_CAP_NR_VCPUS does not exist, you should assume that max_vcpus is 4 cpus max. +On powerpc using book3s_hv mode, the vcpus are mapped onto virtual +threads in one or more virtual CPU cores. (This is because the +hardware requires all the hardware threads in a CPU core to be in the +same partition.) The KVM_CAP_PPC_SMT capability indicates the number +of vcpus per virtual core (vcore). The vcore id is obtained by +dividing the vcpu id by the number of vcpus per vcore. The vcpus in a +given vcore will always be in the same physical core as each other +(though that might be a different physical core from time to time). +Userspace can control the threading (SMT) mode of the guest by its +allocation of vcpu ids. For example, if userspace wants +single-threaded guest vcpus, it should make all vcpu ids be a multiple +of the number of vcpus per vcore. 
+ 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index c3ec990daf4..471bb3d85e0 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -24,6 +24,7 @@ /* Select powerpc specific features in */ #define __KVM_HAVE_SPAPR_TCE +#define __KVM_HAVE_PPC_SMT struct kvm_regs { __u64 pc; diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index b7b039532fb..9cfd5436782 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -78,6 +78,8 @@ struct kvmppc_host_state { #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu *kvm_vcpu; + struct kvmppc_vcore *kvm_vcore; + unsigned long xics_phys; u64 dabr; u64 host_mmcr[3]; u32 host_pmc[6]; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 5616e39a7fa..0d6d569e19c 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -25,10 +25,14 @@ #include #include #include +#include +#include #include #include +#include -#define KVM_MAX_VCPUS 1 +#define KVM_MAX_VCPUS NR_CPUS +#define KVM_MAX_VCORES NR_CPUS #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -167,9 +171,34 @@ struct kvm_arch { int tlbie_lock; struct list_head spapr_tce_tables; unsigned short last_vcpu[NR_CPUS]; + struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; #endif /* CONFIG_KVM_BOOK3S_64_HV */ }; +/* + * Struct for a virtual core. + * Note: entry_exit_count combines an entry count in the bottom 8 bits + * and an exit count in the next 8 bits. This is so that we can + * atomically increment the entry count iff the exit count is 0 + * without taking the lock. 
+ */ +struct kvmppc_vcore { + int n_runnable; + int n_blocked; + int num_threads; + int entry_exit_count; + int n_woken; + int nap_count; + u16 pcpu; + u8 vcore_running; + u8 in_guest; + struct list_head runnable_threads; + spinlock_t lock; +}; + +#define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) +#define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) + struct kvmppc_pte { ulong eaddr; u64 vpage; @@ -365,14 +394,29 @@ struct kvm_vcpu_arch { struct slb_shadow *slb_shadow; struct dtl *dtl; struct dtl *dtl_end; + + struct kvmppc_vcore *vcore; + int ret; int trap; + int state; + int ptid; + wait_queue_head_t cpu_run; + struct kvm_vcpu_arch_shared *shared; unsigned long magic_page_pa; /* phys addr to map the magic page to */ unsigned long magic_page_ea; /* effect. addr to map the magic page to */ #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu_arch_shared shregs; + + struct list_head run_list; + struct task_struct *run_task; + struct kvm_run *kvm_run; #endif }; +#define KVMPPC_VCPU_BUSY_IN_HOST 0 +#define KVMPPC_VCPU_BLOCKED 1 +#define KVMPPC_VCPU_RUNNABLE 2 + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 99f6fcf4cf8..6ef73442863 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -33,6 +33,9 @@ #else #include #endif +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#include +#endif enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -169,4 +172,14 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); +#ifdef CONFIG_KVM_BOOK3S_64_HV +static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) +{ + paca[cpu].kvm_hstate.xics_phys = addr; +} +#else +static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) +{} +#endif + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c 
index c70d106bf1a..d0f2387fd79 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -471,6 +471,10 @@ int main(void) DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); + DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); + DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); + DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); + DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - offsetof(struct kvmppc_vcpu_book3s, vcpu)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); @@ -530,6 +534,8 @@ int main(void) #ifdef CONFIG_KVM_BOOK3S_64_HV HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); + HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore); + HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys); HSTATE_FIELD(HSTATE_MMCR, host_mmcr); HSTATE_FIELD(HSTATE_PMC, host_pmc); HSTATE_FIELD(HSTATE_PURR, host_purr); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 163c041cec2..5bc06fdfa6c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -49,19 +49,32 @@ BEGIN_FTR_SECTION * state loss at this time. */ mfspr r13,SPRN_SRR1 - rlwinm r13,r13,47-31,30,31 - cmpwi cr0,r13,1 - bne 1f - b .power7_wakeup_noloss -1: cmpwi cr0,r13,2 - bne 1f - b .power7_wakeup_loss + rlwinm. r13,r13,47-31,30,31 + beq 9f + + /* waking up from powersave (nap) state */ + cmpwi cr1,r13,2 /* Total loss of HV state is fatal, we could try to use the * PIR to locate a PACA, then use an emergency stack etc... * but for now, let's just stay stuck here */ -1: cmpwi cr0,r13,3 - beq . + bgt cr1,. 
+ GET_PACA(r13) + +#ifdef CONFIG_KVM_BOOK3S_64_HV + lbz r0,PACAPROCSTART(r13) + cmpwi r0,0x80 + bne 1f + li r0,0 + stb r0,PACAPROCSTART(r13) + b kvm_start_guest +1: +#endif + + beq cr1,2f + b .power7_wakeup_noloss +2: b .power7_wakeup_loss +9: END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206) #endif /* CONFIG_PPC_P7_NAP */ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index f8f0bc7f1d4..3a70845a51c 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -73,7 +73,6 @@ _GLOBAL(power7_idle) b . _GLOBAL(power7_wakeup_loss) - GET_PACA(r13) ld r1,PACAR1(r13) REST_NVGPRS(r1) REST_GPR(2, r1) @@ -87,7 +86,6 @@ _GLOBAL(power7_wakeup_loss) rfid _GLOBAL(power7_wakeup_noloss) - GET_PACA(r13) ld r1,PACAR1(r13) ld r4,_MSR(r1) ld r5,_NIP(r1) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 6fe469eabce..36b6d98f119 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -51,12 +52,16 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { local_paca->kvm_hstate.kvm_vcpu = vcpu; + local_paca->kvm_hstate.kvm_vcore = vcpu->arch.vcore; } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { } +static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu); +static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu); + void kvmppc_vcpu_block(struct kvm_vcpu *vcpu) { u64 now; @@ -74,11 +79,15 @@ void kvmppc_vcpu_block(struct kvm_vcpu *vcpu) HRTIMER_MODE_REL); } + kvmppc_vcpu_blocked(vcpu); + kvm_vcpu_block(vcpu); vcpu->stat.halt_wakeup++; if (vcpu->arch.dec_expires != ~(u64)0) hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + + kvmppc_vcpu_unblocked(vcpu); } void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) @@ -429,9 +438,16 @@ int kvmppc_core_check_processor_compat(void) struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, 
unsigned int id) { struct kvm_vcpu *vcpu; - int err = -ENOMEM; + int err = -EINVAL; + int core; + struct kvmppc_vcore *vcore; unsigned long lpcr; + core = id / threads_per_core; + if (core >= KVM_MAX_VCORES) + goto out; + + err = -ENOMEM; vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); if (!vcpu) goto out; @@ -454,6 +470,38 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_mmu_book3s_hv_init(vcpu); + /* + * Some vcpus may start out in stopped state. If we initialize + * them to busy-in-host state they will stop other vcpus in the + * vcore from running. Instead we initialize them to blocked + * state, effectively considering them to be stopped until we + * see the first run ioctl for them. + */ + vcpu->arch.state = KVMPPC_VCPU_BLOCKED; + + init_waitqueue_head(&vcpu->arch.cpu_run); + + mutex_lock(&kvm->lock); + vcore = kvm->arch.vcores[core]; + if (!vcore) { + vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); + if (vcore) { + INIT_LIST_HEAD(&vcore->runnable_threads); + spin_lock_init(&vcore->lock); + } + kvm->arch.vcores[core] = vcore; + } + mutex_unlock(&kvm->lock); + + if (!vcore) + goto free_vcpu; + + spin_lock(&vcore->lock); + ++vcore->num_threads; + ++vcore->n_blocked; + spin_unlock(&vcore->lock); + vcpu->arch.vcore = vcore; + return vcpu; free_vcpu: @@ -468,21 +516,121 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kfree(vcpu); } +static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + spin_lock(&vc->lock); + vcpu->arch.state = KVMPPC_VCPU_BLOCKED; + ++vc->n_blocked; + if (vc->n_runnable > 0 && + vc->n_runnable + vc->n_blocked == vc->num_threads) { + vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, + arch.run_list); + wake_up(&vcpu->arch.cpu_run); + } + spin_unlock(&vc->lock); +} + +static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + spin_lock(&vc->lock); + vcpu->arch.state = 
KVMPPC_VCPU_BUSY_IN_HOST; + --vc->n_blocked; + spin_unlock(&vc->lock); +} + extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); +extern void xics_wake_cpu(int cpu); -static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu) +static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, + struct kvm_vcpu *vcpu) { - u64 now; + struct kvm_vcpu *v; - if (signal_pending(current)) { - run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + return; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + --vc->n_runnable; + /* decrement the physical thread id of each following vcpu */ + v = vcpu; + list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) + --v->arch.ptid; + list_del(&vcpu->arch.run_list); +} + +static void kvmppc_start_thread(struct kvm_vcpu *vcpu) +{ + int cpu; + struct paca_struct *tpaca; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + cpu = vc->pcpu + vcpu->arch.ptid; + tpaca = &paca[cpu]; + tpaca->kvm_hstate.kvm_vcpu = vcpu; + tpaca->kvm_hstate.kvm_vcore = vc; + smp_wmb(); +#ifdef CONFIG_PPC_ICP_NATIVE + if (vcpu->arch.ptid) { + tpaca->cpu_start = 0x80; + tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST; + wmb(); + xics_wake_cpu(cpu); + ++vc->n_woken; } +#endif +} - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - flush_vsx_to_thread(current); - preempt_disable(); +static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) +{ + int i; + + HMT_low(); + i = 0; + while (vc->nap_count < vc->n_woken) { + if (++i >= 1000000) { + pr_err("kvmppc_wait_for_nap timeout %d %d\n", + vc->nap_count, vc->n_woken); + break; + } + cpu_relax(); + } + HMT_medium(); +} + +/* + * Check that we are on thread 0 and that any other threads in + * this core are off-line. 
+ */ +static int on_primary_thread(void) +{ + int cpu = smp_processor_id(); + int thr = cpu_thread_in_core(cpu); + + if (thr) + return 0; + while (++thr < threads_per_core) + if (cpu_online(cpu + thr)) + return 0; + return 1; +} + +/* + * Run a set of guest threads on a physical core. + * Called with vc->lock held. + */ +static int kvmppc_run_core(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu, *vnext; + long ret; + u64 now; + + /* don't start if any threads have a signal pending */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + if (signal_pending(vcpu->arch.run_task)) + return 0; /* * Make sure we are running on thread 0, and that @@ -490,36 +638,150 @@ static int kvmppc_run_vcpu(struct kvm_run *run, struct kvm_vcpu *vcpu) * XXX we should also block attempts to bring any * secondary threads online. */ - if (threads_per_core > 1) { - int cpu = smp_processor_id(); - int thr = cpu_thread_in_core(cpu); - - if (thr) - goto out; - while (++thr < threads_per_core) - if (cpu_online(cpu + thr)) - goto out; + if (threads_per_core > 1 && !on_primary_thread()) { + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->arch.ret = -EBUSY; + goto out; } - kvm_guest_enter(); + vc->n_woken = 0; + vc->nap_count = 0; + vc->entry_exit_count = 0; + vc->vcore_running = 1; + vc->in_guest = 0; + vc->pcpu = smp_processor_id(); + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + kvmppc_start_thread(vcpu); + vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, + arch.run_list); + + spin_unlock(&vc->lock); + preempt_disable(); + kvm_guest_enter(); __kvmppc_vcore_entry(NULL, vcpu); + /* wait for secondary threads to finish writing their state to memory */ + spin_lock(&vc->lock); + if (vc->nap_count < vc->n_woken) + kvmppc_wait_for_nap(vc); + /* prevent other vcpu threads from doing kvmppc_start_thread() now */ + vc->vcore_running = 2; + spin_unlock(&vc->lock); + + /* make sure updates to secondary vcpu structs are visible 
now */ + smp_mb(); kvm_guest_exit(); preempt_enable(); kvm_resched(vcpu); now = get_tb(); - /* cancel pending dec exception if dec is positive */ - if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) - kvmppc_core_dequeue_dec(vcpu); - - return kvmppc_handle_exit(run, vcpu, current); + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + /* cancel pending dec exception if dec is positive */ + if (now < vcpu->arch.dec_expires && + kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + if (!vcpu->arch.trap) { + if (signal_pending(vcpu->arch.run_task)) { + vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + continue; /* didn't get to run */ + } + ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + vcpu->arch.ret = ret; + vcpu->arch.trap = 0; + } + spin_lock(&vc->lock); out: - preempt_enable(); - return -EBUSY; + vc->vcore_running = 0; + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + if (vcpu->arch.ret != RESUME_GUEST) { + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } + } + + return 1; +} + +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int ptid; + int wait_state; + struct kvmppc_vcore *vc; + DEFINE_WAIT(wait); + + /* No need to go into the guest when all we do is going out */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_vsx_to_thread(current); + + /* + * Synchronize with other threads in this virtual core + */ + vc = vcpu->arch.vcore; + spin_lock(&vc->lock); + /* This happens the first time this is called for a vcpu */ + if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED) + --vc->n_blocked; + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + ptid = vc->n_runnable; + vcpu->arch.run_task = 
current; + vcpu->arch.kvm_run = kvm_run; + vcpu->arch.ptid = ptid; + list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); + ++vc->n_runnable; + + wait_state = TASK_INTERRUPTIBLE; + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + if (signal_pending(current)) { + if (!vc->vcore_running) { + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + break; + } + /* have to wait for vcore to stop executing guest */ + wait_state = TASK_UNINTERRUPTIBLE; + smp_send_reschedule(vc->pcpu); + } + + if (!vc->vcore_running && + vc->n_runnable + vc->n_blocked == vc->num_threads) { + /* we can run now */ + if (kvmppc_run_core(vc)) + continue; + } + + if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0) + kvmppc_start_thread(vcpu); + + /* wait for other threads to come in, or wait for vcore */ + prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); + spin_unlock(&vc->lock); + schedule(); + finish_wait(&vcpu->arch.cpu_run, &wait); + spin_lock(&vc->lock); + } + + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) + kvmppc_remove_runnable(vc, vcpu); + spin_unlock(&vc->lock); + + return vcpu->arch.ret; } int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index e6adaadcdff..c9bf177b7cf 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -30,8 +30,6 @@ * * ****************************************************************************/ -#define SHADOW_VCPU_OFF PACA_KVM_SVCPU - .globl kvmppc_skip_interrupt kvmppc_skip_interrupt: mfspr r13,SPRN_SRR0 @@ -79,6 +77,32 @@ _GLOBAL(kvmppc_hv_entry_trampoline) * * *****************************************************************************/ +#define XICS_XIRR 4 +#define XICS_QIRR 0xc + +/* + * We come in here when wakened from nap mode on a secondary hw thread. + * Relocation is off and most register values are lost. + * r13 points to the PACA. 
+ */ + .globl kvm_start_guest +kvm_start_guest: + ld r1,PACAEMERGSP(r13) + subi r1,r1,STACK_FRAME_OVERHEAD + + /* get vcpu pointer */ + ld r4, HSTATE_KVM_VCPU(r13) + + /* We got here with an IPI; clear it */ + ld r5, HSTATE_XICS_PHYS(r13) + li r0, 0xff + li r6, XICS_QIRR + li r7, XICS_XIRR + lwzcix r8, r5, r7 /* ack the interrupt */ + sync + stbcix r0, r5, r6 /* clear it */ + stwcix r8, r5, r7 /* EOI it */ + .global kvmppc_hv_entry kvmppc_hv_entry: @@ -200,7 +224,20 @@ kvmppc_hv_entry: slbia ptesync - /* Switch to guest partition. */ + /* Increment entry count iff exit count is zero. */ + ld r5,HSTATE_KVM_VCORE(r13) + addi r9,r5,VCORE_ENTRY_EXIT +21: lwarx r3,0,r9 + cmpwi r3,0x100 /* any threads starting to exit? */ + bge secondary_too_late /* if so we're too late to the party */ + addi r3,r3,1 + stwcx. r3,0,r9 + bne 21b + + /* Primary thread switches to guest partition. */ + lwz r6,VCPU_PTID(r4) + cmpwi r6,0 + bne 20f ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) @@ -210,7 +247,15 @@ kvmppc_hv_entry: mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync - ld r8,VCPU_LPCR(r4) + li r0,1 + stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ + b 10f + + /* Secondary threads wait for primary to have done partition switch */ +20: lbz r0,VCORE_IN_GUEST(r5) + cmpwi r0,0 + beq 20b +10: ld r8,VCPU_LPCR(r4) mtspr SPRN_LPCR,r8 isync @@ -225,10 +270,12 @@ kvmppc_hv_entry: * Invalidate the TLB if we could possibly have stale TLB * entries for this partition on this core due to the use * of tlbiel. + * XXX maybe only need this on primary thread? 
*/ ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ lwz r5,VCPU_VCPUID(r4) lhz r6,PACAPACAINDEX(r13) + rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ lhz r8,VCPU_LAST_CPU(r4) sldi r7,r6,1 /* see if this is the same vcpu */ add r7,r7,r9 /* as last ran on this pcpu */ @@ -512,8 +559,60 @@ hcall_real_cont: ptesync hdec_soon: - /* Switch back to host partition */ + /* Increment the threads-exiting-guest count in the 0xff00 + bits of vcore->entry_exit_count */ + lwsync + ld r5,HSTATE_KVM_VCORE(r13) + addi r6,r5,VCORE_ENTRY_EXIT +41: lwarx r3,0,r6 + addi r0,r3,0x100 + stwcx. r0,0,r6 + bne 41b + + /* + * At this point we have an interrupt that we have to pass + * up to the kernel or qemu; we can't handle it in real mode. + * Thus we have to do a partition switch, so we have to + * collect the other threads, if we are the first thread + * to take an interrupt. To do this, we set the HDEC to 0, + * which causes an HDEC interrupt in all threads within 2ns + * because the HDEC register is shared between all 4 threads. + * However, we don't need to bother if this is an HDEC + * interrupt, since the other threads will already be on their + * way here in that case. + */ + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER + beq 40f + cmpwi r3,0x100 /* Are we the first here? 
*/ + bge 40f + cmpwi r3,1 + ble 40f + li r0,0 + mtspr SPRN_HDEC,r0 +40: + + /* Secondary threads wait for primary to do partition switch */ ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ + ld r5,HSTATE_KVM_VCORE(r13) + lwz r3,VCPU_PTID(r9) + cmpwi r3,0 + beq 15f + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + b 16f + + /* Primary thread waits for all the secondaries to exit guest */ +15: lwz r3,VCORE_ENTRY_EXIT(r5) + srwi r0,r3,8 + clrldi r3,r3,56 + cmpw r3,r0 + bne 15b + isync + + /* Primary thread switches back to host partition */ ld r6,KVM_HOST_SDR1(r4) lwz r7,KVM_HOST_LPID(r4) li r8,LPID_RSVD /* switch to reserved LPID */ @@ -522,10 +621,12 @@ hdec_soon: mtspr SPRN_SDR1,r6 /* switch to partition page table */ mtspr SPRN_LPID,r7 isync + li r0,0 + stb r0,VCORE_IN_GUEST(r5) lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 - ld r8,KVM_HOST_LPCR(r4) +16: ld r8,KVM_HOST_LPCR(r4) mtspr SPRN_LPCR,r8 isync @@ -634,6 +735,11 @@ hdec_soon: mr r3, r9 bl .kvmppc_save_fp + /* Secondary threads go off to take a nap */ + lwz r0,VCPU_PTID(r3) + cmpwi r0,0 + bne secondary_nap + /* * Reload DEC. HDEC interrupts were disabled when * we reloaded the host's LPCR value. @@ -840,6 +946,56 @@ _GLOBAL(kvmppc_h_set_dabr) li r3,0 blr +secondary_too_late: + ld r5,HSTATE_KVM_VCORE(r13) + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + ld r11,PACA_SLBSHADOWPTR(r13) + + .rept SLB_NUM_BOLTED + ld r5,SLBSHADOW_SAVEAREA(r11) + ld r6,SLBSHADOW_SAVEAREA+8(r11) + andis. r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r11,r11,16 + .endr + b 50f + +secondary_nap: + /* Clear any pending IPI */ +50: ld r5, HSTATE_XICS_PHYS(r13) + li r0, 0xff + li r6, XICS_QIRR + stbcix r0, r5, r6 + + /* increment the nap count and then go to nap mode */ + ld r4, HSTATE_KVM_VCORE(r13) + addi r4, r4, VCORE_NAP_COUNT + lwsync /* make previous updates visible */ +51: lwarx r3, 0, r4 + addi r3, r3, 1 + stwcx. 
r3, 0, r4 + bne 51b + isync + + mfspr r4, SPRN_LPCR + li r0, LPCR_PECE + andc r4, r4, r0 + ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */ + mtspr SPRN_LPCR, r4 + li r0, 0 + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + /* * Save away FP, VMX and VSX registers. * r3 = vcpu pointer diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index c78ceb9d560..4c549664c98 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "timing.h" #include "../mm/mmu_decl.h" @@ -207,6 +208,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_SPAPR_TCE: r = 1; break; + case KVM_CAP_PPC_SMT: + r = threads_per_core; + break; #endif default: r = 0; diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c index 1f15ad43614..ba382b59b92 100644 --- a/arch/powerpc/sysdev/xics/icp-native.c +++ b/arch/powerpc/sysdev/xics/icp-native.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -24,6 +25,7 @@ #include #include #include +#include struct icp_ipl { union { @@ -139,6 +141,12 @@ static void icp_native_cause_ipi(int cpu, unsigned long data) icp_native_set_qirr(cpu, IPI_PRIORITY); } +void xics_wake_cpu(int cpu) +{ + icp_native_set_qirr(cpu, IPI_PRIORITY); +} +EXPORT_SYMBOL_GPL(xics_wake_cpu); + static irqreturn_t icp_native_ipi_action(int irq, void *dev_id) { int cpu = smp_processor_id(); @@ -185,6 +193,7 @@ static int __init icp_native_map_one_cpu(int hw_id, unsigned long addr, } icp_native_regs[cpu] = ioremap(addr, size); + kvmppc_set_xics_phys(cpu, addr); if (!icp_native_regs[cpu]) { pr_warning("icp_native: Failed ioremap for CPU %d, " "interrupt server #0x%x, addr %#lx\n", diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 61f56502732..e2a378d9716 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -551,6 +551,7 @@ struct kvm_ppc_pvinfo { 
#define KVM_CAP_GET_TSC_KHZ 61 #define KVM_CAP_PPC_BOOKE_SREGS 62 #define KVM_CAP_SPAPR_TCE 63 +#define KVM_CAP_PPC_SMT 64 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-70-g09d2 From aa04b4cc5be64b4fb9ef4e0fdf2418e2f4737fb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2011 00:25:44 +0000 Subject: KVM: PPC: Allocate RMAs (Real Mode Areas) at boot for use by guests This adds infrastructure which will be needed to allow book3s_hv KVM to run on older POWER processors, including PPC970, which don't support the Virtual Real Mode Area (VRMA) facility, but only the Real Mode Offset (RMO) facility. These processors require a physically contiguous, aligned area of memory for each guest. When the guest does an access in real mode (MMU off), the address is compared against a limit value, and if it is lower, the address is ORed with an offset value (from the Real Mode Offset Register (RMOR)) and the result becomes the real address for the access. The size of the RMA has to be one of a set of supported values, which usually includes 64MB, 128MB, 256MB and some larger powers of 2. Since we are unlikely to be able to allocate 64MB or more of physically contiguous memory after the kernel has been running for a while, we allocate a pool of RMAs at boot time using the bootmem allocator. The size and number of the RMAs can be set using the kvm_rma_size=xx and kvm_rma_count=xx kernel command line options. KVM exports a new capability, KVM_CAP_PPC_RMA, to signal the availability of the pool of preallocated RMAs. The capability value is 1 if the processor can use an RMA but doesn't require one (because it supports the VRMA facility), or 2 if the processor requires an RMA for each guest. This adds a new ioctl, KVM_ALLOCATE_RMA, which allocates an RMA from the pool and returns a file descriptor which can be used to map the RMA. It also returns the size of the RMA in the argument structure. 
Having an RMA means we will get multiple KVM_SET_USER_MEMORY_REGION ioctl calls from userspace. To cope with this, we now preallocate the kvm->arch.ram_pginfo array when the VM is created with a size sufficient for up to 64GB of guest memory. Subsequently we will get rid of this array and use memory associated with each memslot instead. This moves most of the code that translates the user addresses into host pfns (page frame numbers) out of kvmppc_prepare_vrma up one level to kvmppc_core_prepare_memory_region. Also, instead of having to look up the VMA for each page in order to check the page size, we now check that the pages we get are compound pages of 16MB. However, if we are adding memory that is mapped to an RMA, we don't bother with calling get_user_pages_fast and instead just offset from the base pfn for the RMA. Typically the RMA gets added after vcpus are created, which makes it inconvenient to have the LPCR (logical partition control register) value in the vcpu->arch struct, since the LPCR controls whether the processor uses RMA or VRMA for the guest. This moves the LPCR value into the kvm->arch struct and arranges for the MER (mediated external request) bit, which is the only bit that varies between vcpus, to be set in assembly code when going into the guest if there is a pending external interrupt request. 
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 32 ++++ arch/powerpc/include/asm/kvm.h | 5 + arch/powerpc/include/asm/kvm_book3s.h | 8 - arch/powerpc/include/asm/kvm_host.h | 15 +- arch/powerpc/include/asm/kvm_ppc.h | 10 ++ arch/powerpc/include/asm/reg.h | 1 + arch/powerpc/kernel/asm-offsets.c | 4 +- arch/powerpc/kernel/setup_64.c | 3 + arch/powerpc/kvm/Makefile | 3 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 97 +----------- arch/powerpc/kvm/book3s_hv.c | 259 ++++++++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_hv_builtin.c | 152 +++++++++++++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 19 ++- arch/powerpc/kvm/powerpc.c | 13 ++ include/linux/kvm.h | 3 + 15 files changed, 505 insertions(+), 119 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv_builtin.c (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 681871311d3..b0e4b9cd6a6 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1398,6 +1398,38 @@ the entries written by kernel-handled H_PUT_TCE calls, and also lets userspace update the TCE table directly which is useful in some circumstances. +4.63 KVM_ALLOCATE_RMA + +Capability: KVM_CAP_PPC_RMA +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_allocate_rma (out) +Returns: file descriptor for mapping the allocated RMA + +This allocates a Real Mode Area (RMA) from the pool allocated at boot +time by the kernel. An RMA is a physically-contiguous, aligned region +of memory used on older POWER processors to provide the memory which +will be accessed by real-mode (MMU off) accesses in a KVM guest. +POWER processors support a set of sizes for the RMA that usually +includes 64MB, 128MB, 256MB and some larger powers of two. 
+ +/* for KVM_ALLOCATE_RMA */ +struct kvm_allocate_rma { + __u64 rma_size; +}; + +The return value is a file descriptor which can be passed to mmap(2) +to map the allocated RMA into userspace. The mapped area can then be +passed to the KVM_SET_USER_MEMORY_REGION ioctl to establish it as the +RMA for a virtual machine. The size of the RMA in bytes (which is +fixed at host kernel boot time) is returned in the rma_size field of +the argument structure. + +The KVM_CAP_PPC_RMA capability is 1 or 2 if the KVM_ALLOCATE_RMA ioctl +is supported; 2 if the processor requires all virtual machines to have +an RMA, or 1 if the processor can use an RMA but doesn't require it, +because it supports the Virtual RMA (VRMA) facility. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index 471bb3d85e0..a4f6c85431f 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -282,4 +282,9 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_ALLOCATE_RMA */ +struct kvm_allocate_rma { + __u64 rma_size; +}; + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 5537c45d626..3f91ebd4ae4 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -298,14 +298,6 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, unsigned long pending_now, unsigned long old_pending) { - /* Recalculate LPCR:MER based on the presence of - * a pending external interrupt - */ - if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &pending_now) || - test_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &pending_now)) - vcpu->arch.lpcr |= LPCR_MER; - else - vcpu->arch.lpcr &= ~((u64)LPCR_MER); } static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) diff --git 
a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0d6d569e19c..f572d9cc31b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -156,6 +158,14 @@ struct kvmppc_spapr_tce_table { struct page *pages[0]; }; +struct kvmppc_rma_info { + void *base_virt; + unsigned long base_pfn; + unsigned long npages; + struct list_head list; + atomic_t use_count; +}; + struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; @@ -169,6 +179,10 @@ struct kvm_arch { unsigned long sdr1; unsigned long host_sdr1; int tlbie_lock; + int n_rma_pages; + unsigned long lpcr; + unsigned long rmor; + struct kvmppc_rma_info *rma; struct list_head spapr_tce_tables; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; @@ -295,7 +309,6 @@ struct kvm_vcpu_arch { ulong guest_owned_ext; ulong purr; ulong spurr; - ulong lpcr; ulong dscr; ulong amr; ulong uamor; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 6ef73442863..d121f49d62b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -124,6 +124,10 @@ extern void kvmppc_map_vrma(struct kvm *kvm, extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); +extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, + struct kvm_allocate_rma *rma); +extern struct kvmppc_rma_info *kvm_alloc_rma(void); +extern void kvm_release_rma(struct kvmppc_rma_info *ri); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, @@ -177,9 +181,15 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) { paca[cpu].kvm_hstate.xics_phys = addr; } + +extern void kvm_rma_init(void); + #else 
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) {} + +static inline void kvm_rma_init(void) +{} #endif #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 36a611b398c..20a053c1427 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -242,6 +242,7 @@ #define LPCR_VRMA_LP1 (1ul << (63-16)) #define LPCR_VRMASD_SH (63-16) #define LPCR_RMLS 0x1C000000 /* impl dependent rmo limit sel */ +#define LPCR_RMLS_SH (63-37) #define LPCR_ILE 0x02000000 /* !HV irqs set MSR:LE */ #define LPCR_PECE 0x00007000 /* powersave exit cause enable */ #define LPCR_PECE0 0x00004000 /* ext. exceptions can cause exit */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index d0f2387fd79..f4aba938166 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -437,6 +437,8 @@ int main(void) DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); + DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); + DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); #endif @@ -459,7 +461,7 @@ int main(void) DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); - DEFINE(VCPU_LPCR, offsetof(struct kvm_vcpu, arch.lpcr)); + DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 
a88bf2713d4..532054f24ec 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "setup.h" @@ -580,6 +581,8 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff */ mmu_context_init(); + kvm_rma_init(); + ppc64_boot_msg(0x15, "Setup Done"); } diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 1de3d54901d..08428e2c188 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -56,7 +56,8 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_64_mmu_hv.o kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \ book3s_hv_rm_mmu.o \ - book3s_64_vio_hv.o + book3s_64_vio_hv.o \ + book3s_hv_builtin.o kvm-book3s_64-module-objs := \ ../../../virt/kvm/kvm_main.o \ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 4a4fbec61a1..96ba96a16ab 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -79,103 +79,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm) void kvmppc_free_hpt(struct kvm *kvm) { - unsigned long i; - struct kvmppc_pginfo *pginfo; - clear_bit(kvm->arch.lpid, lpid_inuse); free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); - - if (kvm->arch.ram_pginfo) { - pginfo = kvm->arch.ram_pginfo; - kvm->arch.ram_pginfo = NULL; - for (i = 0; i < kvm->arch.ram_npages; ++i) - put_page(pfn_to_page(pginfo[i].pfn)); - kfree(pginfo); - } -} - -static unsigned long user_page_size(unsigned long addr) -{ - struct vm_area_struct *vma; - unsigned long size = PAGE_SIZE; - - down_read(&current->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (vma) - size = vma_kernel_pagesize(vma); - up_read(&current->mm->mmap_sem); - return size; -} - -static pfn_t hva_to_pfn(unsigned long addr) -{ - struct page *page[1]; - int npages; - - might_sleep(); - - npages = get_user_pages_fast(addr, 1, 1, page); - - if (unlikely(npages != 1)) - return 0; - - return 
page_to_pfn(page[0]); -} - -long kvmppc_prepare_vrma(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) -{ - unsigned long psize, porder; - unsigned long i, npages; - struct kvmppc_pginfo *pginfo; - pfn_t pfn; - unsigned long hva; - - /* First see what page size we have */ - psize = user_page_size(mem->userspace_addr); - /* For now, only allow 16MB pages */ - if (psize != 1ul << VRMA_PAGE_ORDER || (mem->memory_size & (psize - 1))) { - pr_err("bad psize=%lx memory_size=%llx @ %llx\n", - psize, mem->memory_size, mem->userspace_addr); - return -EINVAL; - } - porder = __ilog2(psize); - - npages = mem->memory_size >> porder; - pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), GFP_KERNEL); - if (!pginfo) { - pr_err("kvmppc_prepare_vrma: couldn't alloc %lu bytes\n", - npages * sizeof(struct kvmppc_pginfo)); - return -ENOMEM; - } - - for (i = 0; i < npages; ++i) { - hva = mem->userspace_addr + (i << porder); - if (user_page_size(hva) != psize) - goto err; - pfn = hva_to_pfn(hva); - if (pfn == 0) { - pr_err("oops, no pfn for hva %lx\n", hva); - goto err; - } - if (pfn & ((1ul << (porder - PAGE_SHIFT)) - 1)) { - pr_err("oops, unaligned pfn %llx\n", pfn); - put_page(pfn_to_page(pfn)); - goto err; - } - pginfo[i].pfn = pfn; - } - - kvm->arch.ram_npages = npages; - kvm->arch.ram_psize = psize; - kvm->arch.ram_porder = porder; - kvm->arch.ram_pginfo = pginfo; - - return 0; - - err: - kfree(pginfo); - return -EINVAL; } void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) @@ -199,6 +104,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) for (i = 0; i < npages; ++i) { pfn = pginfo[i].pfn; + if (!pfn) + break; /* can't use hpt_hash since va > 64 bits */ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; /* diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 36b6d98f119..04da135cae6 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -27,6 +27,8 
@@ #include #include #include +#include +#include #include #include @@ -40,11 +42,22 @@ #include #include #include +#include #include #include #include #include +/* + * For now, limit memory to 64GB and require it to be large pages. + * This value is chosen because it makes the ram_pginfo array be + * 64kB in size, which is about as large as we want to be trying + * to allocate with kmalloc. + */ +#define MAX_MEM_ORDER 36 + +#define LARGE_PAGE_ORDER 24 /* 16MB pages */ + /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ @@ -129,7 +142,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) pr_err(" ESID = %.16llx VSID = %.16llx\n", vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", - vcpu->arch.lpcr, vcpu->kvm->arch.sdr1, + vcpu->kvm->arch.lpcr, vcpu->kvm->arch.sdr1, vcpu->arch.last_inst); } @@ -441,7 +454,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) int err = -EINVAL; int core; struct kvmppc_vcore *vcore; - unsigned long lpcr; core = id / threads_per_core; if (core >= KVM_MAX_VCORES) @@ -464,10 +476,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.pvr = mfspr(SPRN_PVR); kvmppc_set_pvr(vcpu, vcpu->arch.pvr); - lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES); - lpcr |= LPCR_VPM0 | LPCR_VRMA_L | (4UL << LPCR_DPFD_SH) | LPCR_HDICE; - vcpu->arch.lpcr = lpcr; - kvmppc_mmu_book3s_hv_init(vcpu); /* @@ -910,24 +918,216 @@ fail: return ret; } +/* Work out RMLS (real mode limit selector) field value for a given RMA size. + Assumes POWER7. 
*/ +static inline int lpcr_rmls(unsigned long rma_size) +{ + switch (rma_size) { + case 32ul << 20: /* 32 MB */ + return 8; + case 64ul << 20: /* 64 MB */ + return 3; + case 128ul << 20: /* 128 MB */ + return 7; + case 256ul << 20: /* 256 MB */ + return 4; + case 1ul << 30: /* 1 GB */ + return 2; + case 16ul << 30: /* 16 GB */ + return 1; + case 256ul << 30: /* 256 GB */ + return 0; + default: + return -1; + } +} + +static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct kvmppc_rma_info *ri = vma->vm_file->private_data; + struct page *page; + + if (vmf->pgoff >= ri->npages) + return VM_FAULT_SIGBUS; + + page = pfn_to_page(ri->base_pfn + vmf->pgoff); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct kvm_rma_vm_ops = { + .fault = kvm_rma_fault, +}; + +static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &kvm_rma_vm_ops; + return 0; +} + +static int kvm_rma_release(struct inode *inode, struct file *filp) +{ + struct kvmppc_rma_info *ri = filp->private_data; + + kvm_release_rma(ri); + return 0; +} + +static struct file_operations kvm_rma_fops = { + .mmap = kvm_rma_mmap, + .release = kvm_rma_release, +}; + +long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) +{ + struct kvmppc_rma_info *ri; + long fd; + + ri = kvm_alloc_rma(); + if (!ri) + return -ENOMEM; + + fd = anon_inode_getfd("kvm-rma", &kvm_rma_fops, ri, O_RDWR); + if (fd < 0) + kvm_release_rma(ri); + + ret->rma_size = ri->npages << PAGE_SHIFT; + return fd; +} + +static struct page *hva_to_page(unsigned long addr) +{ + struct page *page[1]; + int npages; + + might_sleep(); + + npages = get_user_pages_fast(addr, 1, 1, page); + + if (unlikely(npages != 1)) + return 0; + + return page[0]; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - if (mem->guest_phys_addr == 0 && mem->memory_size != 0) 
- return kvmppc_prepare_vrma(kvm, mem); + unsigned long psize, porder; + unsigned long i, npages, totalpages; + unsigned long pg_ix; + struct kvmppc_pginfo *pginfo; + unsigned long hva; + struct kvmppc_rma_info *ri = NULL; + struct page *page; + + /* For now, only allow 16MB pages */ + porder = LARGE_PAGE_ORDER; + psize = 1ul << porder; + if ((mem->memory_size & (psize - 1)) || + (mem->guest_phys_addr & (psize - 1))) { + pr_err("bad memory_size=%llx @ %llx\n", + mem->memory_size, mem->guest_phys_addr); + return -EINVAL; + } + + npages = mem->memory_size >> porder; + totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; + + /* More memory than we have space to track? */ + if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) + return -EINVAL; + + /* Do we already have an RMA registered? */ + if (mem->guest_phys_addr == 0 && kvm->arch.rma) + return -EINVAL; + + if (totalpages > kvm->arch.ram_npages) + kvm->arch.ram_npages = totalpages; + + /* Is this one of our preallocated RMAs? 
*/ + if (mem->guest_phys_addr == 0) { + struct vm_area_struct *vma; + + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, mem->userspace_addr); + if (vma && vma->vm_file && + vma->vm_file->f_op == &kvm_rma_fops && + mem->userspace_addr == vma->vm_start) + ri = vma->vm_file->private_data; + up_read(&current->mm->mmap_sem); + } + + if (ri) { + unsigned long rma_size; + unsigned long lpcr; + long rmls; + + rma_size = ri->npages << PAGE_SHIFT; + if (rma_size > mem->memory_size) + rma_size = mem->memory_size; + rmls = lpcr_rmls(rma_size); + if (rmls < 0) { + pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); + return -EINVAL; + } + atomic_inc(&ri->use_count); + kvm->arch.rma = ri; + kvm->arch.n_rma_pages = rma_size >> porder; + lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L); + lpcr |= rmls << LPCR_RMLS_SH; + kvm->arch.lpcr = lpcr; + kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; + pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", + ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); + } + + pg_ix = mem->guest_phys_addr >> porder; + pginfo = kvm->arch.ram_pginfo + pg_ix; + for (i = 0; i < npages; ++i, ++pg_ix) { + if (ri && pg_ix < kvm->arch.n_rma_pages) { + pginfo[i].pfn = ri->base_pfn + + (pg_ix << (porder - PAGE_SHIFT)); + continue; + } + hva = mem->userspace_addr + (i << porder); + page = hva_to_page(hva); + if (!page) { + pr_err("oops, no pfn for hva %lx\n", hva); + goto err; + } + /* Check it's a 16MB page */ + if (!PageHead(page) || + compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { + pr_err("page at %lx isn't 16MB (o=%d)\n", + hva, compound_order(page)); + goto err; + } + pginfo[i].pfn = page_to_pfn(page); + } + return 0; + + err: + return -EINVAL; } void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - if (mem->guest_phys_addr == 0 && mem->memory_size != 0) + if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && + !kvm->arch.rma) kvmppc_map_vrma(kvm, mem); } int kvmppc_core_init_vm(struct 
kvm *kvm) { long r; + unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); + long err = -ENOMEM; + unsigned long lpcr; /* Allocate hashed page table */ r = kvmppc_alloc_hpt(kvm); @@ -935,11 +1135,52 @@ int kvmppc_core_init_vm(struct kvm *kvm) return r; INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + + kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), + GFP_KERNEL); + if (!kvm->arch.ram_pginfo) { + pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", + npages * sizeof(struct kvmppc_pginfo)); + goto out_free; + } + + kvm->arch.ram_npages = 0; + kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; + kvm->arch.ram_porder = LARGE_PAGE_ORDER; + kvm->arch.rma = NULL; + kvm->arch.n_rma_pages = 0; + + lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES); + lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | + LPCR_VPM0 | LPCR_VRMA_L; + kvm->arch.lpcr = lpcr; + + return 0; + + out_free: + kvmppc_free_hpt(kvm); + return err; } void kvmppc_core_destroy_vm(struct kvm *kvm) { + struct kvmppc_pginfo *pginfo; + unsigned long i; + + if (kvm->arch.ram_pginfo) { + pginfo = kvm->arch.ram_pginfo; + kvm->arch.ram_pginfo = NULL; + for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) + if (pginfo[i].pfn) + put_page(pfn_to_page(pginfo[i].pfn)); + kfree(pginfo); + } + if (kvm->arch.rma) { + kvm_release_rma(kvm->arch.rma); + kvm->arch.rma = NULL; + } + kvmppc_free_hpt(kvm); WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c new file mode 100644 index 00000000000..736df3cbbc5 --- /dev/null +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -0,0 +1,152 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * This maintains a list of RMAs (real mode areas) for KVM guests to use. + * Each RMA has to be physically contiguous and of a size that the + * hardware supports. PPC970 and POWER7 support 64MB, 128MB and 256MB, + * and other larger sizes. Since we are unlikely to be able to allocate that + * much physically contiguous memory after the system is up and running, + * we preallocate a set of RMAs in early boot for KVM to use. + */ +static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ +static unsigned long kvm_rma_count; + +static int __init early_parse_rma_size(char *p) +{ + if (!p) + return 1; + + kvm_rma_size = memparse(p, &p); + + return 0; +} +early_param("kvm_rma_size", early_parse_rma_size); + +static int __init early_parse_rma_count(char *p) +{ + if (!p) + return 1; + + kvm_rma_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_rma_count", early_parse_rma_count); + +static struct kvmppc_rma_info *rma_info; +static LIST_HEAD(free_rmas); +static DEFINE_SPINLOCK(rma_lock); + +/* Work out RMLS (real mode limit selector) field value for a given RMA size. + Assumes POWER7. */ +static inline int lpcr_rmls(unsigned long rma_size) +{ + switch (rma_size) { + case 32ul << 20: /* 32 MB */ + return 8; + case 64ul << 20: /* 64 MB */ + return 3; + case 128ul << 20: /* 128 MB */ + return 7; + case 256ul << 20: /* 256 MB */ + return 4; + case 1ul << 30: /* 1 GB */ + return 2; + case 16ul << 30: /* 16 GB */ + return 1; + case 256ul << 30: /* 256 GB */ + return 0; + default: + return -1; + } +} + +/* + * Called at boot time while the bootmem allocator is active, + * to allocate contiguous physical memory for the real memory + * areas for guests. 
+ */ +void kvm_rma_init(void) +{ + unsigned long i; + unsigned long j, npages; + void *rma; + struct page *pg; + + /* Only do this on POWER7 in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE_206)) + return; + + if (!kvm_rma_size || !kvm_rma_count) + return; + + /* Check that the requested size is one supported in hardware */ + if (lpcr_rmls(kvm_rma_size) < 0) { + pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); + return; + } + + npages = kvm_rma_size >> PAGE_SHIFT; + rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); + for (i = 0; i < kvm_rma_count; ++i) { + rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); + pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, + kvm_rma_size >> 20); + rma_info[i].base_virt = rma; + rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; + rma_info[i].npages = npages; + list_add_tail(&rma_info[i].list, &free_rmas); + atomic_set(&rma_info[i].use_count, 0); + + pg = pfn_to_page(rma_info[i].base_pfn); + for (j = 0; j < npages; ++j) { + atomic_inc(&pg->_count); + ++pg; + } + } +} + +struct kvmppc_rma_info *kvm_alloc_rma(void) +{ + struct kvmppc_rma_info *ri; + + ri = NULL; + spin_lock(&rma_lock); + if (!list_empty(&free_rmas)) { + ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); + list_del(&ri->list); + atomic_inc(&ri->use_count); + } + spin_unlock(&rma_lock); + return ri; +} +EXPORT_SYMBOL_GPL(kvm_alloc_rma); + +void kvm_release_rma(struct kvmppc_rma_info *ri) +{ + if (atomic_dec_and_test(&ri->use_count)) { + spin_lock(&rma_lock); + list_add_tail(&ri->list, &free_rmas); + spin_unlock(&rma_lock); + + } +} +EXPORT_SYMBOL_GPL(kvm_release_rma); + diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c9bf177b7cf..9ee223c3528 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -235,10 +235,10 @@ kvmppc_hv_entry: bne 21b /* Primary thread switches to guest partition. 
*/ + ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ lwz r6,VCPU_PTID(r4) cmpwi r6,0 bne 20f - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) li r0,LPID_RSVD /* switch to reserved LPID */ @@ -255,8 +255,18 @@ kvmppc_hv_entry: 20: lbz r0,VCORE_IN_GUEST(r5) cmpwi r0,0 beq 20b -10: ld r8,VCPU_LPCR(r4) - mtspr SPRN_LPCR,r8 + + /* Set LPCR. Set the MER bit if there is a pending external irq. */ +10: ld r8,KVM_LPCR(r9) + ld r0,VCPU_PENDING_EXC(r4) + li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL) + oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + and. r0,r0,r7 + beq 11f + ori r8,r8,LPCR_MER +11: mtspr SPRN_LPCR,r8 + ld r8,KVM_RMOR(r9) + mtspr SPRN_RMOR,r8 isync /* Check if HDEC expires soon */ @@ -464,7 +474,8 @@ hcall_real_cont: /* Check for mediated interrupts (could be done earlier really ...) */ cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL bne+ 1f - ld r5,VCPU_LPCR(r9) + ld r5,VCPU_KVM(r9) + ld r5,KVM_LPCR(r5) andi. r0,r11,MSR_EE beq 1f andi. r0,r5,LPCR_MER diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4c549664c98..72c506505fa 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -211,6 +211,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_SMT: r = threads_per_core; break; + case KVM_CAP_PPC_RMA: + r = 1; + break; #endif default: r = 0; @@ -673,6 +676,16 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); goto out; } + + case KVM_ALLOCATE_RMA: { + struct kvm *kvm = filp->private_data; + struct kvm_allocate_rma rma; + + r = kvm_vm_ioctl_allocate_rma(kvm, &rma); + if (r >= 0 && copy_to_user(argp, &rma, sizeof(rma))) + r = -EFAULT; + break; + } #endif /* CONFIG_KVM_BOOK3S_64_HV */ default: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index e2a378d9716..2c366b52f50 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -552,6 +552,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_BOOKE_SREGS 62 #define KVM_CAP_SPAPR_TCE 63 
#define KVM_CAP_PPC_SMT 64 +#define KVM_CAP_PPC_RMA 65 #ifdef KVM_CAP_IRQ_ROUTING @@ -755,6 +756,8 @@ struct kvm_clock_data { #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) +/* Available with KVM_CAP_PPC_RMA */ +#define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -- cgit v1.2.3-70-g09d2 From 969391c58a4efb8411d6881179945f425ad9cbb5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2011 00:26:11 +0000 Subject: powerpc, KVM: Split HVMODE_206 cpu feature bit into separate HV and architecture bits This replaces the single CPU_FTR_HVMODE_206 bit with two bits, one to indicate that we have a usable hypervisor mode, and another to indicate that the processor conforms to PowerISA version 2.06. We also add another bit to indicate that the processor conforms to ISA version 2.01 and set that for PPC970 and derivatives. Some PPC970 chips (specifically those in Apple machines) have a hypervisor mode in that MSR[HV] is always 1, but the hypervisor mode is not useful in the sense that there is no way to run any code in supervisor mode (HV=0 PR=0). On these processors, the LPES0 and LPES1 bits in HID4 are always 0, and we use that as a way of detecting that hypervisor mode is not useful. Where we have a feature section in assembly code around code that only applies on POWER7 in hypervisor mode, we use a construct like END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) The definition of END_FTR_SECTION_IFSET is such that the code will be enabled (not overwritten with nops) only if all bits in the provided mask are set. Note that the CPU feature check in __tlbie() only needs to check the ARCH_206 bit, not the HVMODE bit, because __tlbie() can only get called if we are running bare-metal, i.e. in hypervisor mode. 
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/cputable.h | 14 ++++++++------ arch/powerpc/include/asm/reg.h | 16 ++++++++++++---- arch/powerpc/kernel/cpu_setup_power7.S | 4 ++-- arch/powerpc/kernel/cpu_setup_ppc970.S | 26 ++++++++++++++++++++++---- arch/powerpc/kernel/exceptions-64s.S | 4 ++-- arch/powerpc/kernel/paca.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 3 ++- arch/powerpc/kvm/book3s_hv.c | 3 ++- arch/powerpc/kvm/book3s_hv_builtin.c | 4 ++-- arch/powerpc/kvm/book3s_segment.S | 2 +- arch/powerpc/mm/hash_native_64.c | 4 ++-- 11 files changed, 56 insertions(+), 26 deletions(-) (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index c0d842cfd01..e30442c539c 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -179,8 +179,9 @@ extern const char *powerpc_base_platform; #define LONG_ASM_CONST(x) 0 #endif - -#define CPU_FTR_HVMODE_206 LONG_ASM_CONST(0x0000000800000000) +#define CPU_FTR_HVMODE LONG_ASM_CONST(0x0000000200000000) +#define CPU_FTR_ARCH_201 LONG_ASM_CONST(0x0000000400000000) +#define CPU_FTR_ARCH_206 LONG_ASM_CONST(0x0000000800000000) #define CPU_FTR_CFAR LONG_ASM_CONST(0x0000001000000000) #define CPU_FTR_IABR LONG_ASM_CONST(0x0000002000000000) #define CPU_FTR_MMCRA LONG_ASM_CONST(0x0000004000000000) @@ -401,9 +402,10 @@ extern const char *powerpc_base_platform; CPU_FTR_MMCRA | CPU_FTR_CP_USE_DCBTZ | \ CPU_FTR_STCX_CHECKS_ADDRESS) #define CPU_FTRS_PPC970 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_201 | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP | CPU_FTR_MMCRA | \ - CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS) + CPU_FTR_CP_USE_DCBTZ | CPU_FTR_STCX_CHECKS_ADDRESS | \ + CPU_FTR_HVMODE) #define CPU_FTRS_POWER5 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ 
CPU_FTR_MMCRA | CPU_FTR_SMT | \ @@ -417,13 +419,13 @@ extern const char *powerpc_base_platform; CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR) #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_HVMODE_206 |\ + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\ CPU_FTR_MMCRA | CPU_FTR_SMT | \ CPU_FTR_COHERENT_ICACHE | \ CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | \ CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \ - CPU_FTR_ICSWX | CPU_FTR_CFAR) + CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 20a053c1427..ddbe57ae858 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -307,6 +307,7 @@ #define SPRN_HASH1 0x3D2 /* Primary Hash Address Register */ #define SPRN_HASH2 0x3D3 /* Secondary Hash Address Resgister */ #define SPRN_HID0 0x3F0 /* Hardware Implementation Register 0 */ +#define HID0_HDICE_SH (63 - 23) /* 970 HDEC interrupt enable */ #define HID0_EMCP (1<<31) /* Enable Machine Check pin */ #define HID0_EBA (1<<29) /* Enable Bus Address Parity */ #define HID0_EBD (1<<28) /* Enable Bus Data Parity */ @@ -362,6 +363,13 @@ #define SPRN_IABR2 0x3FA /* 83xx */ #define SPRN_IBCR 0x135 /* 83xx Insn Breakpoint Control Reg */ #define SPRN_HID4 0x3F4 /* 970 HID4 */ +#define HID4_LPES0 (1ul << (63-0)) /* LPAR env. sel. bit 0 */ +#define HID4_RMLS2_SH (63 - 2) /* Real mode limit bottom 2 bits */ +#define HID4_LPID5_SH (63 - 6) /* partition ID bottom 4 bits */ +#define HID4_RMOR_SH (63 - 22) /* real mode offset (16 bits) */ +#define HID4_LPES1 (1 << (63-57)) /* LPAR env. sel. 
bit 1 */ +#define HID4_RMLS0_SH (63 - 58) /* Real mode limit top bit */ +#define HID4_LPID1_SH 0 /* partition ID top 2 bits */ #define SPRN_HID4_GEKKO 0x3F3 /* Gekko HID4 */ #define SPRN_HID5 0x3F6 /* 970 HID5 */ #define SPRN_HID6 0x3F9 /* BE HID 6 */ @@ -811,28 +819,28 @@ mfspr rX,SPRN_SPRG_PACA; \ FTR_SECTION_ELSE_NESTED(66); \ mfspr rX,SPRN_SPRG_HPACA; \ - ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) #define SET_PACA(rX) \ BEGIN_FTR_SECTION_NESTED(66); \ mtspr SPRN_SPRG_PACA,rX; \ FTR_SECTION_ELSE_NESTED(66); \ mtspr SPRN_SPRG_HPACA,rX; \ - ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) #define GET_SCRATCH0(rX) \ BEGIN_FTR_SECTION_NESTED(66); \ mfspr rX,SPRN_SPRG_SCRATCH0; \ FTR_SECTION_ELSE_NESTED(66); \ mfspr rX,SPRN_SPRG_HSCRATCH0; \ - ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) #define SET_SCRATCH0(rX) \ BEGIN_FTR_SECTION_NESTED(66); \ mtspr SPRN_SPRG_SCRATCH0,rX; \ FTR_SECTION_ELSE_NESTED(66); \ mtspr SPRN_SPRG_HSCRATCH0,rX; \ - ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE_206, 66) + ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_HVMODE, 66) #else /* CONFIG_PPC_BOOK3S_64 */ #define GET_SCRATCH0(rX) mfspr rX,SPRN_SPRG_SCRATCH0 diff --git a/arch/powerpc/kernel/cpu_setup_power7.S b/arch/powerpc/kernel/cpu_setup_power7.S index 2ef6749688e..76797c5105d 100644 --- a/arch/powerpc/kernel/cpu_setup_power7.S +++ b/arch/powerpc/kernel/cpu_setup_power7.S @@ -45,12 +45,12 @@ _GLOBAL(__restore_cpu_power7) blr __init_hvmode_206: - /* Disable CPU_FTR_HVMODE_206 and exit if MSR:HV is not set */ + /* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */ mfmsr r3 rldicl. 
r0,r3,4,63 bnelr ld r5,CPU_SPEC_FEATURES(r4) - LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE_206) + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) xor r5,r5,r6 std r5,CPU_SPEC_FEATURES(r4) blr diff --git a/arch/powerpc/kernel/cpu_setup_ppc970.S b/arch/powerpc/kernel/cpu_setup_ppc970.S index 27f2507279d..12fac8df01c 100644 --- a/arch/powerpc/kernel/cpu_setup_ppc970.S +++ b/arch/powerpc/kernel/cpu_setup_ppc970.S @@ -76,7 +76,7 @@ _GLOBAL(__setup_cpu_ppc970) /* Do nothing if not running in HV mode */ mfmsr r0 rldicl. r0,r0,4,63 - beqlr + beq no_hv_mode mfspr r0,SPRN_HID0 li r11,5 /* clear DOZE and SLEEP */ @@ -90,7 +90,7 @@ _GLOBAL(__setup_cpu_ppc970MP) /* Do nothing if not running in HV mode */ mfmsr r0 rldicl. r0,r0,4,63 - beqlr + beq no_hv_mode mfspr r0,SPRN_HID0 li r11,0x15 /* clear DOZE and SLEEP */ @@ -109,6 +109,14 @@ load_hids: sync isync + /* Try to set LPES = 01 in HID4 */ + mfspr r0,SPRN_HID4 + clrldi r0,r0,1 /* clear LPES0 */ + ori r0,r0,HID4_LPES1 /* set LPES1 */ + sync + mtspr SPRN_HID4,r0 + isync + /* Save away cpu state */ LOAD_REG_ADDR(r5,cpu_state_storage) @@ -117,11 +125,21 @@ load_hids: std r3,CS_HID0(r5) mfspr r3,SPRN_HID1 std r3,CS_HID1(r5) - mfspr r3,SPRN_HID4 - std r3,CS_HID4(r5) + mfspr r4,SPRN_HID4 + std r4,CS_HID4(r5) mfspr r3,SPRN_HID5 std r3,CS_HID5(r5) + /* See if we successfully set LPES1 to 1; if not we are in Apple mode */ + andi. 
r4,r4,HID4_LPES1 + bnelr + +no_hv_mode: + /* Disable CPU_FTR_HVMODE and exit, since we don't have HV mode */ + ld r5,CPU_SPEC_FEATURES(r4) + LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) + andc r5,r5,r6 + std r5,CPU_SPEC_FEATURES(r4) blr /* Called with no MMU context (typically MSR:IR/DR off) to diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5bc06fdfa6c..a5345380bef 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -75,7 +75,7 @@ BEGIN_FTR_SECTION b .power7_wakeup_noloss 2: b .power7_wakeup_loss 9: -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206) +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #endif /* CONFIG_PPC_P7_NAP */ EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD, NOTEST, 0x100) @@ -173,7 +173,7 @@ hardware_interrupt_hv: _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD, SOFTEN_TEST_PR) KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE_206) + ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) STD_EXCEPTION_PSERIES(0x600, 0x600, alignment) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x600) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index efeb8818418..0a5a899846b 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -167,7 +167,7 @@ void setup_paca(struct paca_struct *new_paca) * if we do a GET_PACA() before the feature fixups have been * applied */ - if (cpu_has_feature(CPU_FTR_HVMODE_206)) + if (cpu_has_feature(CPU_FTR_HVMODE)) mtspr(SPRN_SPRG_HPACA, local_paca); #endif mtspr(SPRN_SPRG_PACA, local_paca); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 96ba96a16ab..212dcd8fc50 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -128,7 +128,8 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) int kvmppc_mmu_hv_init(void) { - if 
(!cpu_has_feature(CPU_FTR_HVMODE_206)) + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_206)) return -EINVAL; memset(lpid_inuse, 0, sizeof(lpid_inuse)); set_bit(mfspr(SPRN_LPID), lpid_inuse); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 04da135cae6..dc70e7745ab 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -443,7 +443,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvmppc_core_check_processor_compat(void) { - if (cpu_has_feature(CPU_FTR_HVMODE_206)) + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_206)) return 0; return -EIO; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 736df3cbbc5..7315ec6e817 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -90,8 +90,8 @@ void kvm_rma_init(void) void *rma; struct page *pg; - /* Only do this on POWER7 in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE_206)) + /* Only do this in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE)) return; if (!kvm_rma_size || !kvm_rma_count) diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 134501691ad..aed32e51721 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -170,7 +170,7 @@ BEGIN_FTR_SECTION mfspr r4,SPRN_HSRR1 andi. 
r12,r12,0x3ffd b 2f -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE_206) +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) #endif 1: mfsrr0 r3 mfsrr1 r4 diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index dfd764896db..b44f5f80305 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -51,7 +51,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize) va &= ~0xffful; va |= ssize << 8; asm volatile(ASM_FTR_IFCLR("tlbie %0,0", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) : "memory"); break; default: @@ -61,7 +61,7 @@ static inline void __tlbie(unsigned long va, int psize, int ssize) va |= ssize << 8; va |= 1; /* L */ asm volatile(ASM_FTR_IFCLR("tlbie %0,1", PPC_TLBIE(%1,%0), %2) - : : "r" (va), "r"(0), "i" (CPU_FTR_HVMODE_206) + : : "r" (va), "r"(0), "i" (CPU_FTR_ARCH_206) : "memory"); break; } -- cgit v1.2.3-70-g09d2 From 9e368f2915601cd5bc7f5fd638b58435b018bbd7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Jun 2011 00:40:08 +0000 Subject: KVM: PPC: book3s_hv: Add support for PPC970-family processors This adds support for running KVM guests in supervisor mode on those PPC970 processors that have a usable hypervisor mode. Unfortunately, Apple G5 machines have supervisor mode disabled (MSR[HV] is forced to 1), but the YDL PowerStation does have a usable hypervisor mode. There are several differences between the PPC970 and POWER7 in how guests are managed. These differences are accommodated using the CPU_FTR_ARCH_201 (PPC970) and CPU_FTR_ARCH_206 (POWER7) CPU feature bits. Notably, on PPC970: * The LPCR, LPID or RMOR registers don't exist, and the functions of those registers are provided by bits in HID4 and one bit in HID0. * External interrupts can be directed to the hypervisor, but unlike POWER7 they are masked by MSR[EE] in non-hypervisor modes and use SRR0/1 not HSRR0/1. 
* There is no virtual RMA (VRMA) mode; the guest must use an RMO (real mode offset) area. * The TLB entries are not tagged with the LPID, so it is necessary to flush the whole TLB on partition switch. Furthermore, when switching partitions we have to ensure that no other CPU is executing the tlbie or tlbsync instructions in either the old or the new partition, otherwise undefined behaviour can occur. * The PMU has 8 counters (PMC registers) rather than 6. * The DSCR, PURR, SPURR, AMR, AMOR, UAMOR registers don't exist. * The SLB has 64 entries rather than 32. * There is no mediated external interrupt facility, so if we switch to a guest that has a virtual external interrupt pending but the guest has MSR[EE] = 0, we have to arrange to have an interrupt pending for it so that we can get control back once it re-enables interrupts. We do that by sending ourselves an IPI with smp_send_reschedule after hard-disabling interrupts. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/exception-64s.h | 4 + arch/powerpc/include/asm/kvm_book3s_asm.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 +- arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/kvm/Kconfig | 13 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 30 ++-- arch/powerpc/kvm/book3s_hv.c | 60 ++++++-- arch/powerpc/kvm/book3s_hv_builtin.c | 11 +- arch/powerpc/kvm/book3s_hv_interrupts.S | 30 ++++ arch/powerpc/kvm/book3s_hv_rm_mmu.c | 6 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 230 +++++++++++++++++++++++++++++- arch/powerpc/kvm/powerpc.c | 3 + arch/powerpc/mm/hash_native_64.c | 2 +- 14 files changed, 354 insertions(+), 42 deletions(-) (limited to 'arch/powerpc/kvm/book3s_hv.c') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 69435da8f2b..8057f4f6980 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -246,6 +246,10 @@ label##_hv: \ 
KVMTEST(vec); \ _SOFTEN_TEST(EXC_HV) +#define SOFTEN_TEST_HV_201(vec) \ + KVMTEST(vec); \ + _SOFTEN_TEST(EXC_STD) + #define __MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra) \ HMT_MEDIUM; \ SET_SCRATCH0(r13); /* save r13 */ \ diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 9cfd5436782..ef7b3688c3b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -82,7 +82,7 @@ struct kvmppc_host_state { unsigned long xics_phys; u64 dabr; u64 host_mmcr[3]; - u32 host_pmc[6]; + u32 host_pmc[8]; u64 host_purr; u64 host_spurr; u64 host_dscr; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index f572d9cc31b..cc22b282d75 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -353,7 +353,7 @@ struct kvm_vcpu_arch { u32 dbsr; u64 mmcr[3]; - u32 pmc[6]; + u32 pmc[8]; #ifdef CONFIG_KVM_EXIT_TIMING struct mutex exit_timing_lock; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f4aba938166..54b935f2f5d 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -128,6 +128,7 @@ int main(void) DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); /* paca */ DEFINE(PACA_SIZE, sizeof(struct paca_struct)); + DEFINE(PACA_LOCK_TOKEN, offsetof(struct paca_struct, lock_token)); DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, paca_index)); DEFINE(PACAPROCSTART, offsetof(struct paca_struct, cpu_start)); DEFINE(PACAKSAVE, offsetof(struct paca_struct, kstack)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a5345380bef..41b02c792aa 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -171,7 +171,7 @@ hardware_interrupt_hv: KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) FTR_SECTION_ELSE _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt, 
- EXC_STD, SOFTEN_TEST_PR) + EXC_STD, SOFTEN_TEST_HV_201) KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 5d9b78ebbaa..eeb42e06f2d 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -67,23 +67,20 @@ config KVM_BOOK3S_64 If unsure, say N. config KVM_BOOK3S_64_HV - bool "KVM support for POWER7 using hypervisor mode in host" + bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 ---help--- Support running unmodified book3s_64 guest kernels in - virtual machines on POWER7 processors that have hypervisor - mode available to the host. + virtual machines on POWER7 and PPC970 processors that have + hypervisor mode available to the host. If you say Y here, KVM will use the hardware virtualization facilities of POWER7 (and later) processors, meaning that guest operating systems will run at full hardware speed using supervisor and user modes. However, this also means that KVM is not usable under PowerVM (pHyp), is only usable - on POWER7 (or later) processors, and can only emulate - POWER5+, POWER6 and POWER7 processors. - - This module provides access to the hardware capabilities through - a character device node named /dev/kvm. + on POWER7 (or later) processors and PPC970-family processors, + and cannot emulate a different processor from the host processor. If unsure, say N. 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 212dcd8fc50..bc3a2ea9421 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -42,6 +42,8 @@ #define VRMA_PAGE_ORDER 24 #define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ +/* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ +#define MAX_LPID_970 63 #define NR_LPIDS (LPID_RSVD + 1) unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)]; @@ -69,9 +71,6 @@ long kvmppc_alloc_hpt(struct kvm *kvm) kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18); kvm->arch.lpid = lpid; - kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); - kvm->arch.host_lpid = mfspr(SPRN_LPID); - kvm->arch.host_lpcr = mfspr(SPRN_LPCR); pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); return 0; @@ -128,12 +127,24 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) int kvmppc_mmu_hv_init(void) { - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_206)) + unsigned long host_lpid, rsvd_lpid; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) return -EINVAL; + memset(lpid_inuse, 0, sizeof(lpid_inuse)); - set_bit(mfspr(SPRN_LPID), lpid_inuse); - set_bit(LPID_RSVD, lpid_inuse); + + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + host_lpid = mfspr(SPRN_LPID); /* POWER7 */ + rsvd_lpid = LPID_RSVD; + } else { + host_lpid = 0; /* PPC970 */ + rsvd_lpid = MAX_LPID_970; + } + + set_bit(host_lpid, lpid_inuse); + /* rsvd_lpid is reserved for use in partition switching */ + set_bit(rsvd_lpid, lpid_inuse); return 0; } @@ -157,7 +168,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) { struct kvmppc_mmu *mmu = &vcpu->arch.mmu; - vcpu->arch.slb_nr = 32; /* Assume POWER7 for now */ + if (cpu_has_feature(CPU_FTR_ARCH_206)) + vcpu->arch.slb_nr = 32; /* POWER7 */ + else + vcpu->arch.slb_nr = 64; mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; diff --git a/arch/powerpc/kvm/book3s_hv.c 
b/arch/powerpc/kvm/book3s_hv.c index dc70e7745ab..cc0d7f1b19a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -443,8 +443,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvmppc_core_check_processor_compat(void) { - if (cpu_has_feature(CPU_FTR_HVMODE) && - cpu_has_feature(CPU_FTR_ARCH_206)) + if (cpu_has_feature(CPU_FTR_HVMODE)) return 0; return -EIO; } @@ -731,6 +730,10 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return -EINTR; } + /* On PPC970, check that we have an RMA region */ + if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) + return -EPERM; + kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; @@ -920,12 +923,14 @@ fail: } /* Work out RMLS (real mode limit selector) field value for a given RMA size. - Assumes POWER7. */ + Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) { switch (rma_size) { case 32ul << 20: /* 32 MB */ - return 8; + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return 8; /* only supported on POWER7 */ + return -1; case 64ul << 20: /* 64 MB */ return 3; case 128ul << 20: /* 128 MB */ @@ -1059,6 +1064,10 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, mem->userspace_addr == vma->vm_start) ri = vma->vm_file->private_data; up_read(&current->mm->mmap_sem); + if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { + pr_err("CPU requires an RMO\n"); + return -EINVAL; + } } if (ri) { @@ -1077,10 +1086,25 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, atomic_inc(&ri->use_count); kvm->arch.rma = ri; kvm->arch.n_rma_pages = rma_size >> porder; - lpcr = kvm->arch.lpcr & ~(LPCR_VPM0 | LPCR_VRMA_L); - lpcr |= rmls << LPCR_RMLS_SH; + + /* Update LPCR and RMOR */ + lpcr = kvm->arch.lpcr; + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + /* PPC970; insert RMLS value (split field) in HID4 */ + lpcr &= ~((1ul << HID4_RMLS0_SH) | + (3ul << HID4_RMLS2_SH)); + lpcr |= ((rmls >> 2) << HID4_RMLS0_SH) | + ((rmls &
3) << HID4_RMLS2_SH); + /* RMOR is also in HID4 */ + lpcr |= ((ri->base_pfn >> (26 - PAGE_SHIFT)) & 0xffff) + << HID4_RMOR_SH; + } else { + /* POWER7 */ + lpcr &= ~(LPCR_VPM0 | LPCR_VRMA_L); + lpcr |= rmls << LPCR_RMLS_SH; + kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; + } kvm->arch.lpcr = lpcr; - kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); } @@ -1151,11 +1175,25 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.rma = NULL; kvm->arch.n_rma_pages = 0; - lpcr = kvm->arch.host_lpcr & (LPCR_PECE | LPCR_LPES); - lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | - LPCR_VPM0 | LPCR_VRMA_L; - kvm->arch.lpcr = lpcr; + kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + /* PPC970; HID4 is effectively the LPCR */ + unsigned long lpid = kvm->arch.lpid; + kvm->arch.host_lpid = 0; + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_HID4); + lpcr &= ~((3 << HID4_LPID1_SH) | (0xful << HID4_LPID5_SH)); + lpcr |= ((lpid >> 4) << HID4_LPID1_SH) | + ((lpid & 0xf) << HID4_LPID5_SH); + } else { + /* POWER7; init LPCR for virtual RMA mode */ + kvm->arch.host_lpid = mfspr(SPRN_LPID); + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); + lpcr &= LPCR_PECE | LPCR_LPES; + lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | + LPCR_VPM0 | LPCR_VRMA_L; + } + kvm->arch.lpcr = lpcr; return 0; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 7315ec6e817..d43120355ee 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -55,12 +55,14 @@ static LIST_HEAD(free_rmas); static DEFINE_SPINLOCK(rma_lock); /* Work out RMLS (real mode limit selector) field value for a given RMA size. - Assumes POWER7. */ + Assumes POWER7 or PPC970. 
*/ static inline int lpcr_rmls(unsigned long rma_size) { switch (rma_size) { case 32ul << 20: /* 32 MB */ - return 8; + if (cpu_has_feature(CPU_FTR_ARCH_206)) + return 8; /* only supported on POWER7 */ + return -1; case 64ul << 20: /* 64 MB */ return 3; case 128ul << 20: /* 128 MB */ @@ -90,8 +92,9 @@ void kvm_rma_init(void) void *rma; struct page *pg; - /* Only do this in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE)) + /* Only do this on PPC970 in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) return; if (!kvm_rma_size || !kvm_rma_count) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 532afaf1984..3f7b674dd4b 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -50,8 +50,10 @@ _GLOBAL(__kvmppc_vcore_entry) SAVE_NVGPRS(r1) /* Save host DSCR */ +BEGIN_FTR_SECTION mfspr r3, SPRN_DSCR std r3, HSTATE_DSCR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Save host DABR */ mfspr r3, SPRN_DABR @@ -86,12 +88,20 @@ _GLOBAL(__kvmppc_vcore_entry) mfspr r7, SPRN_PMC4 mfspr r8, SPRN_PMC5 mfspr r9, SPRN_PMC6 +BEGIN_FTR_SECTION + mfspr r10, SPRN_PMC7 + mfspr r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) stw r3, HSTATE_PMC(r13) stw r5, HSTATE_PMC + 4(r13) stw r6, HSTATE_PMC + 8(r13) stw r7, HSTATE_PMC + 12(r13) stw r8, HSTATE_PMC + 16(r13) stw r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + stw r10, HSTATE_PMC + 24(r13) + stw r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 31: /* @@ -105,6 +115,26 @@ _GLOBAL(__kvmppc_vcore_entry) add r8,r8,r7 std r8,HSTATE_DECEXP(r13) + /* + * On PPC970, if the guest vcpu has an external interrupt pending, + * send ourselves an IPI so as to interrupt the guest once it + * enables interrupts. (It must have interrupts disabled, + * otherwise we would already have delivered the interrupt.) 
+ */ +BEGIN_FTR_SECTION + ld r0, VCPU_PENDING_EXC(r4) + li r7, (1 << BOOK3S_IRQPRIO_EXTERNAL) + oris r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + and. r0, r0, r7 + beq 32f + mr r31, r4 + lhz r3, PACAPACAINDEX(r13) + bl smp_send_reschedule + nop + mr r4, r31 +32: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + /* Jump to partition switch code */ bl .kvmppc_hv_entry_trampoline nop diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index edb0aae901a..fcfe6b05555 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -56,7 +56,8 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, /* only handle 4k, 64k and 16M pages for now */ porder = 12; if (pteh & HPTE_V_LARGE) { - if ((ptel & 0xf000) == 0x1000) { + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (ptel & 0xf000) == 0x1000) { /* 64k page */ porder = 16; } else if ((ptel & 0xff000) == 0) { @@ -126,7 +127,8 @@ static unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, va_low &= 0x7ff; if (v & HPTE_V_LARGE) { rb |= 1; /* L field */ - if (r & 0xff000) { + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { /* non-16MB large page, must be 64k */ /* (masks depend on page size) */ rb |= 0x1000; /* page encoding in LP field */ diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9ee223c3528..6dd33581a22 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -148,12 +148,20 @@ kvmppc_hv_entry: lwz r7, VCPU_PMC + 12(r4) lwz r8, VCPU_PMC + 16(r4) lwz r9, VCPU_PMC + 20(r4) +BEGIN_FTR_SECTION + lwz r10, VCPU_PMC + 24(r4) + lwz r11, VCPU_PMC + 28(r4) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_PMC1, r3 mtspr SPRN_PMC2, r5 mtspr SPRN_PMC3, r6 mtspr SPRN_PMC4, r7 mtspr SPRN_PMC5, r8 mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) ld r3, VCPU_MMCR(r4) 
ld r5, VCPU_MMCR + 8(r4) ld r6, VCPU_MMCR + 16(r4) @@ -165,9 +173,11 @@ kvmppc_hv_entry: /* Load up FP, VMX and VSX registers */ bl kvmppc_load_fp +BEGIN_FTR_SECTION /* Switch DSCR to guest value */ ld r5, VCPU_DSCR(r4) mtspr SPRN_DSCR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* * Set the decrementer to the guest decrementer. @@ -210,6 +220,7 @@ kvmppc_hv_entry: mtspr SPRN_DABRX,r5 mtspr SPRN_DABR,r6 +BEGIN_FTR_SECTION /* Restore AMR and UAMOR, set AMOR to all 1s */ ld r5,VCPU_AMR(r4) ld r6,VCPU_UAMOR(r4) @@ -217,6 +228,7 @@ kvmppc_hv_entry: mtspr SPRN_AMR,r5 mtspr SPRN_UAMOR,r6 mtspr SPRN_AMOR,r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Clear out SLB */ li r6,0 @@ -224,6 +236,14 @@ kvmppc_hv_entry: slbia ptesync +BEGIN_FTR_SECTION + b 30f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + /* + * POWER7 host -> guest partition switch code. + * We don't have to lock against concurrent tlbies, + * but we do have to coordinate across hardware threads. + */ /* Increment entry count iff exit count is zero. */ ld r5,HSTATE_KVM_VCORE(r13) addi r9,r5,VCORE_ENTRY_EXIT @@ -315,9 +335,94 @@ kvmppc_hv_entry: ld r8,VCPU_SPURR(r4) mtspr SPRN_PURR,r7 mtspr SPRN_SPURR,r8 + b 31f + + /* + * PPC970 host -> guest partition switch code. + * We have to lock against concurrent tlbies, + * using native_tlbie_lock to lock against host tlbies + * and kvm->arch.tlbie_lock to lock against guest tlbies. + * We also have to invalidate the TLB since its + * entries aren't tagged with the LPID. + */ +30: ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ + + /* first take native_tlbie_lock */ + .section ".toc","aw" +toc_tlbie_lock: + .tc native_tlbie_lock[TC],native_tlbie_lock + .previous + ld r3,toc_tlbie_lock@toc(2) + lwz r8,PACA_LOCK_TOKEN(r13) +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. 
r8,0,r3 + bne 24b + isync + + ld r7,KVM_LPCR(r9) /* use kvm->arch.lpcr to store HID4 */ + li r0,0x18f + rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ + or r0,r7,r0 + ptesync + sync + mtspr SPRN_HID4,r0 /* switch to reserved LPID */ + isync + li r0,0 + stw r0,0(r3) /* drop native_tlbie_lock */ + + /* invalidate the whole TLB */ + li r0,256 + mtctr r0 + li r6,0 +25: tlbiel r6 + addi r6,r6,0x1000 + bdnz 25b + ptesync + + /* Take the guest's tlbie_lock */ + addi r3,r9,KVM_TLBIE_LOCK +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. r8,0,r3 + bne 24b + isync + ld r6,KVM_SDR1(r9) + mtspr SPRN_SDR1,r6 /* switch to partition page table */ + + /* Set up HID4 with the guest's LPID etc. */ + sync + mtspr SPRN_HID4,r7 + isync + + /* drop the guest's tlbie_lock */ + li r0,0 + stw r0,0(r3) + + /* Check if HDEC expires soon */ + mfspr r3,SPRN_HDEC + cmpwi r3,10 + li r12,BOOK3S_INTERRUPT_HV_DECREMENTER + mr r9,r4 + blt hdec_soon + + /* Enable HDEC interrupts */ + mfspr r0,SPRN_HID0 + li r3,1 + rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 + sync + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 /* Load up guest SLB entries */ - lwz r5,VCPU_SLB_MAX(r4) +31: lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f mtctr r5 @@ -472,6 +577,7 @@ kvmppc_interrupt: hcall_real_cont: /* Check for mediated interrupts (could be done earlier really ...) */ +BEGIN_FTR_SECTION cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL bne+ 1f ld r5,VCPU_KVM(r9) @@ -481,6 +587,7 @@ hcall_real_cont: andi. 
r0,r5,LPCR_MER bne bounce_ext_interrupt 1: +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Save DEC */ mfspr r5,SPRN_DEC @@ -492,9 +599,11 @@ hcall_real_cont: /* Save HEIR (HV emulation assist reg) in last_inst if this is an HEI (HV emulation interrupt, e40) */ li r3,-1 +BEGIN_FTR_SECTION cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST bne 11f mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 11: stw r3,VCPU_LAST_INST(r9) /* Save more register state */ @@ -508,8 +617,10 @@ hcall_real_cont: stw r7, VCPU_DSISR(r9) std r8, VCPU_CTR(r9) /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ +BEGIN_FTR_SECTION cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE beq 6f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 7: std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) @@ -543,6 +654,7 @@ hcall_real_cont: /* * Save the guest PURR/SPURR */ +BEGIN_FTR_SECTION mfspr r5,SPRN_PURR mfspr r6,SPRN_SPURR ld r7,VCPU_PURR(r9) @@ -562,6 +674,7 @@ hcall_real_cont: add r4,r4,r6 mtspr SPRN_PURR,r3 mtspr SPRN_SPURR,r4 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) /* Clear out SLB */ li r5,0 @@ -570,6 +683,14 @@ hcall_real_cont: ptesync hdec_soon: +BEGIN_FTR_SECTION + b 32f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + /* + * POWER7 guest -> host partition switch code. + * We don't have to lock against tlbies but we do + * have to coordinate the hardware threads. + */ /* Increment the threads-exiting-guest count in the 0xff00 bits of vcore->entry_exit_count */ lwsync @@ -640,9 +761,82 @@ hdec_soon: 16: ld r8,KVM_HOST_LPCR(r4) mtspr SPRN_LPCR,r8 isync + b 33f + + /* + * PPC970 guest -> host partition switch code. + * We have to lock against concurrent tlbies, and + * we have to flush the whole TLB. + */ +32: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ + + /* Take the guest's tlbie_lock */ + lwz r8,PACA_LOCK_TOKEN(r13) + addi r3,r4,KVM_TLBIE_LOCK +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. 
r8,0,r3 + bne 24b + isync + + ld r7,KVM_HOST_LPCR(r4) /* use kvm->arch.host_lpcr for HID4 */ + li r0,0x18f + rotldi r0,r0,HID4_LPID5_SH /* all lpid bits in HID4 = 1 */ + or r0,r7,r0 + ptesync + sync + mtspr SPRN_HID4,r0 /* switch to reserved LPID */ + isync + li r0,0 + stw r0,0(r3) /* drop guest tlbie_lock */ + + /* invalidate the whole TLB */ + li r0,256 + mtctr r0 + li r6,0 +25: tlbiel r6 + addi r6,r6,0x1000 + bdnz 25b + ptesync + + /* take native_tlbie_lock */ + ld r3,toc_tlbie_lock@toc(2) +24: lwarx r0,0,r3 + cmpwi r0,0 + bne 24b + stwcx. r8,0,r3 + bne 24b + isync + + ld r6,KVM_HOST_SDR1(r4) + mtspr SPRN_SDR1,r6 /* switch to host page table */ + + /* Set up host HID4 value */ + sync + mtspr SPRN_HID4,r7 + isync + li r0,0 + stw r0,0(r3) /* drop native_tlbie_lock */ + + lis r8,0x7fff /* MAX_INT@h */ + mtspr SPRN_HDEC,r8 + + /* Disable HDEC interrupts */ + mfspr r0,SPRN_HID0 + li r3,0 + rldimi r0,r3, HID0_HDICE_SH, 64-HID0_HDICE_SH-1 + sync + mtspr SPRN_HID0,r0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 + mfspr r0,SPRN_HID0 /* load host SLB entries */ - ld r8,PACA_SLBSHADOWPTR(r13) +33: ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED ld r5,SLBSHADOW_SAVEAREA(r8) @@ -654,12 +848,14 @@ hdec_soon: .endr /* Save and reset AMR and UAMOR before turning on the MMU */ +BEGIN_FTR_SECTION mfspr r5,SPRN_AMR mfspr r6,SPRN_UAMOR std r5,VCPU_AMR(r9) std r6,VCPU_UAMOR(r9) li r6,0 mtspr SPRN_AMR,r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Restore host DABR and DABRX */ ld r5,HSTATE_DABR(r13) @@ -668,10 +864,12 @@ hdec_soon: mtspr SPRN_DABRX,r6 /* Switch DSCR back to host value */ +BEGIN_FTR_SECTION mfspr r8, SPRN_DSCR ld r7, HSTATE_DSCR(r13) std r8, VCPU_DSCR(r7) mtspr SPRN_DSCR, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* Save non-volatile GPRs */ std r14, VCPU_GPR(r14)(r9) @@ -735,21 +933,31 @@ hdec_soon: mfspr r6, SPRN_PMC4 mfspr r7, SPRN_PMC5 mfspr r8, SPRN_PMC6 +BEGIN_FTR_SECTION + mfspr r10, SPRN_PMC7 + 
mfspr r11, SPRN_PMC8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) stw r3, VCPU_PMC(r9) stw r4, VCPU_PMC + 4(r9) stw r5, VCPU_PMC + 8(r9) stw r6, VCPU_PMC + 12(r9) stw r7, VCPU_PMC + 16(r9) stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + stw r10, VCPU_PMC + 24(r9) + stw r11, VCPU_PMC + 28(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 22: /* save FP state */ mr r3, r9 bl .kvmppc_save_fp - /* Secondary threads go off to take a nap */ + /* Secondary threads go off to take a nap on POWER7 */ +BEGIN_FTR_SECTION lwz r0,VCPU_PTID(r3) cmpwi r0,0 bne secondary_nap +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) /* * Reload DEC. HDEC interrupts were disabled when @@ -771,12 +979,20 @@ hdec_soon: lwz r6, HSTATE_PMC + 12(r13) lwz r8, HSTATE_PMC + 16(r13) lwz r9, HSTATE_PMC + 20(r13) +BEGIN_FTR_SECTION + lwz r10, HSTATE_PMC + 24(r13) + lwz r11, HSTATE_PMC + 28(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_PMC1, r3 mtspr SPRN_PMC2, r4 mtspr SPRN_PMC3, r5 mtspr SPRN_PMC4, r6 mtspr SPRN_PMC5, r8 mtspr SPRN_PMC6, r9 +BEGIN_FTR_SECTION + mtspr SPRN_PMC7, r10 + mtspr SPRN_PMC8, r11 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) ld r3, HSTATE_MMCR(r13) ld r4, HSTATE_MMCR + 8(r13) ld r5, HSTATE_MMCR + 16(r13) @@ -802,7 +1018,7 @@ hdec_soon: cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* RFI into the highmem handler, or branch to interrupt handler */ - mfmsr r6 +12: mfmsr r6 mtctr r12 li r0, MSR_RI andc r6, r6, r0 @@ -812,7 +1028,11 @@ hdec_soon: beqctr RFI -11: mtspr SPRN_HSRR0, r8 +11: +BEGIN_FTR_SECTION + b 12b +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) + mtspr SPRN_HSRR0, r8 mtspr SPRN_HSRR1, r7 ba 0x500 diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 72c506505fa..a107c9be0fb 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -213,6 +213,9 @@ int kvm_dev_ioctl_check_extension(long ext) break; case KVM_CAP_PPC_RMA: r = 1; + /* PPC970 requires an RMA */ + if (cpu_has_feature(CPU_FTR_ARCH_201)) + r = 2; break; #endif default: diff --git 
a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index b44f5f80305..90039bc6411 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -37,7 +37,7 @@ #define HPTE_LOCK_BIT 3 -static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void __tlbie(unsigned long va, int psize, int ssize) { -- cgit v1.2.3-70-g09d2