From db507c300ed6ce6e9fc71d4e19975d5abe01a7de Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Fri, 8 Jul 2011 13:40:10 +0200 Subject: KVM: PPC: move compute_tlbie_rb to book3s common header We need the compute_tlbie_rb in _pr and _hv implementations for papr soon, so let's move it over to a common header file that both implementations can leverage. Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 98da010252a..37dd7486627 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -382,6 +382,39 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) } #endif +static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, + unsigned long pte_index) +{ + unsigned long rb, va_low; + + rb = (v & ~0x7fUL) << 16; /* AVA field */ + va_low = pte_index >> 3; + if (v & HPTE_V_SECONDARY) + va_low = ~va_low; + /* xor vsid from AVA */ + if (!(v & HPTE_V_1TB_SEG)) + va_low ^= v >> 12; + else + va_low ^= v >> 24; + va_low &= 0x7ff; + if (v & HPTE_V_LARGE) { + rb |= 1; /* L field */ + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { + /* non-16MB large page, must be 64k */ + /* (masks depend on page size) */ + rb |= 0x1000; /* page encoding in LP field */ + rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ + rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ + } + } else { + /* 4kB page */ + rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ + } + rb |= (v >> 54) & 0x300; /* B field */ + return rb; +} + /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA -- cgit v1.2.3-70-g09d2 From 9432ba6015371f186926cd62e2395718217a17a1 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 16:08:55 +0200 Subject: KVM: PPC: Add papr_enabled flag When running a PAPR guest, some things change. The privilege level drops from hypervisor to supervisor, SDR1 gets treated differently and we interpret hypercalls. For bisectability sake, add the flag now, but only enable it when all the support code is there. Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index cc22b282d75..e6813021126 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -389,6 +389,7 @@ struct kvm_vcpu_arch { u8 dcr_is_write; u8 osi_needed; u8 osi_enabled; + u8 papr_enabled; u8 hcall_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ -- cgit v1.2.3-70-g09d2 From a15bd354f083f20f257db450488db52ac27df439 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:17:09 +0200 Subject: KVM: PPC: Add support for explicit HIOR setting Until now, we always set HIOR based on the PVR, but this is just wrong. Instead, we should be setting HIOR explicitly, so user space can decide what the initial HIOR value is - just like on real hardware. We keep the old PVR based way around for backwards compatibility, but once user space uses the SREGS based method, we drop the PVR logic. Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm.h | 8 ++++++++ arch/powerpc/include/asm/kvm_book3s.h | 2 ++ arch/powerpc/kvm/book3s_pr.c | 14 ++++++++++++-- arch/powerpc/kvm/powerpc.c | 1 + include/linux/kvm.h | 1 + 5 files changed, 24 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index a4f6c85431f..a6a253ee81b 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -148,6 +148,12 @@ struct kvm_regs { #define KVM_SREGS_E_UPDATE_DEC (1 << 2) #define KVM_SREGS_E_UPDATE_DBSR (1 << 3) +/* + * Book3S special bits to indicate contents in the struct by maintaining + * backwards compatibility with older structs. If adding a new field, + * please make sure to add a flag for that new field */ +#define KVM_SREGS_S_HIOR (1 << 0) + /* * In KVM_SET_SREGS, reserved/pad fields must be left untouched from a * previous KVM_GET_REGS. @@ -173,6 +179,8 @@ struct kvm_sregs { __u64 ibat[8]; __u64 dbat[8]; } ppc32; + __u64 flags; /* KVM_SREGS_S_ */ + __u64 hior; } s; struct { union { diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 37dd7486627..472437b7b85 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; + bool hior_sregs; /* HIOR is set by SREGS, not PVR */ + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 0c0d3f27443..78dcf659e12 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -150,13 +150,15 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; + if (!to_book3s(vcpu)->hior_sregs) + to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; + if (!to_book3s(vcpu)->hior_sregs) + to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; } @@ -770,6 +772,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, } } + if (sregs->u.s.flags & KVM_SREGS_S_HIOR) + sregs->u.s.hior = to_book3s(vcpu)->hior; + return 0; } @@ -806,6 +811,11 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, /* Flush the MMU after messing with the segments */ kvmppc_mmu_pte_flush(vcpu, 0, 0); + if (sregs->u.s.flags & KVM_SREGS_S_HIOR) { + to_book3s(vcpu)->hior_sregs = true; + to_book3s(vcpu)->hior = sregs->u.s.hior; + } + return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a107c9be0fb..17a5c83e1cc 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -188,6 +188,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: #endif case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: diff --git a/include/linux/kvm.h b/include/linux/kvm.h index 20697987788..490b041aba4 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -554,6 +554,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_SMT 64 #define KVM_CAP_PPC_RMA 65 #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ +#define KVM_CAP_PPC_HIOR 67 #define KVM_CAP_S390_GMAP 71 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3-70-g09d2 From 0254f0742998dc61fcf68a3488e2d93636031263 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Mon, 8 Aug 2011 17:21:15 +0200 Subject: KVM: PPC: Add PAPR hypercall code for PR mode When running a PAPR guest, we need to handle a few hypercalls in kernel space, most prominently the page table invalidation (to sync the shadows). So this patch adds handling for a few PAPR hypercalls to PR mode KVM. I tried to share the code with HV mode, but it ended up being a lot easier this way around, as the two differ too much in those details. Signed-off-by: Alexander Graf --- v1 -> v2: - whitespace fix --- arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/kvm/Makefile | 1 + arch/powerpc/kvm/book3s_pr_papr.c | 158 ++++++++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 arch/powerpc/kvm/book3s_pr_papr.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 472437b7b85..91d41fabc5b 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -150,6 +150,7 @@ extern void kvmppc_load_up_altivec(void); extern void kvmppc_load_up_vsx(void); extern u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst); extern ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst); +extern int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd); static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 08428e2c188..4c66d51dbd3 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -43,6 +43,7 @@ kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \ fpu.o \ book3s_paired_singles.o \ book3s_pr.o \ + book3s_pr_papr.o \ book3s_emulate.o \ book3s_interrupts.o \ book3s_mmu_hpte.o \ diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c new file mode 100644 index 00000000000..b9589324797 --- /dev/null +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2011. Freescale Inc. All rights reserved. + * + * Authors: + * Alexander Graf + * Paul Mackerras + * + * Description: + * + * Hypercall handling for running PAPR guests in PR KVM on Book 3S + * processors. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + unsigned long pteg_addr; + + pte_index <<= 4; + pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70; + pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; + pteg_addr |= pte_index; + + return pteg_addr; +} + +static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) +{ + long flags = kvmppc_get_gpr(vcpu, 4); + long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long pteg[2 * 8]; + unsigned long pteg_addr, i, *hpte; + + pte_index &= ~7UL; + pteg_addr = get_pteg_addr(vcpu, pte_index); + + copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); + hpte = pteg; + + if (likely((flags & H_EXACT) == 0)) { + pte_index &= ~7UL; + for (i = 0; ; ++i) { + if (i == 8) + return H_PTEG_FULL; + if ((*hpte & HPTE_V_VALID) == 0) + break; + hpte += 2; + } + } else { + i = kvmppc_get_gpr(vcpu, 5) & 7UL; + hpte += i * 2; + } + + hpte[0] = kvmppc_get_gpr(vcpu, 6); + hpte[1] = kvmppc_get_gpr(vcpu, 7); + copy_to_user((void __user *)pteg_addr, pteg, sizeof(pteg)); + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + kvmppc_set_gpr(vcpu, 4, pte_index | i); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) +{ + unsigned long flags= kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long v = 0, pteg, rb; + unsigned long pte[2]; + + pteg = get_pteg_addr(vcpu, pte_index); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) { + kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); + return EMULATE_DONE; + } + + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + kvmppc_set_gpr(vcpu, 4, pte[0]); + kvmppc_set_gpr(vcpu, 5, pte[1]); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) +{ + unsigned long flags = kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long rb, pteg, r, v; + unsigned long pte[2]; + + pteg = get_pteg_addr(vcpu, pte_index); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) { + kvmppc_set_gpr(vcpu, 3, H_NOT_FOUND); + return EMULATE_DONE; + } + + v = pte[0]; + r = pte[1]; + r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | + HPTE_R_KEY_LO); + r |= (flags << 55) & HPTE_R_PP0; + r |= (flags << 48) & HPTE_R_KEY_HI; + r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + pte[1] = r; + + rb = compute_tlbie_rb(v, r, pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + copy_to_user((void __user *)pteg, pte, sizeof(pte)); + + kvmppc_set_gpr(vcpu, 3, H_SUCCESS); + + return EMULATE_DONE; +} + +int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) +{ + switch (cmd) { + case H_ENTER: + return kvmppc_h_pr_enter(vcpu); + case H_REMOVE: + return kvmppc_h_pr_remove(vcpu); + case H_PROTECT: + return kvmppc_h_pr_protect(vcpu); + case H_BULK_REMOVE: + /* We just flush all PTEs, so user space can + handle the HPT modifications */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + break; + case H_CEDE: + kvm_vcpu_block(vcpu); + vcpu->stat.halt_wakeup++; + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} -- cgit v1.2.3-70-g09d2 From af8f38b3499f0d4a3c354df2435f0fb2dded250a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Wed, 10 Aug 2011 13:57:08 +0200 Subject: KVM: PPC: Add sanity checking to vcpu_run There are multiple features in PowerPC KVM that can now be enabled depending on the user's wishes. Some of the combinations don't make sense or don't work though. So this patch adds a way to check if the executing environment would actually be able to run the guest properly. It also adds sanity checks if PVR is set (should always be true given the current code flow), if PAPR is only used with book3s_64 where it works and that HV KVM is only used in PAPR mode. Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm.h | 5 +++++ arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/include/asm/kvm_ppc.h | 1 + arch/powerpc/kvm/44x.c | 2 ++ arch/powerpc/kvm/book3s_hv.c | 8 ++++++++ arch/powerpc/kvm/book3s_pr.c | 10 ++++++++++ arch/powerpc/kvm/booke.c | 10 +++++++++- arch/powerpc/kvm/e500.c | 2 ++ arch/powerpc/kvm/powerpc.c | 28 ++++++++++++++++++++++++++++ 9 files changed, 67 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index a6a253ee81b..08fe69edcd1 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -284,6 +284,11 @@ struct kvm_guest_debug_arch { #define KVM_INTERRUPT_UNSET -2U #define KVM_INTERRUPT_SET_LEVEL -3U +#define KVM_CPU_440 1 +#define KVM_CPU_E500V2 2 +#define KVM_CPU_3S_32 3 +#define KVM_CPU_3S_64 4 + /* for KVM_CAP_SPAPR_TCE */ struct kvm_create_spapr_tce { __u64 liobn; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e6813021126..2b8284f4b4b 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -390,6 +390,8 @@ struct kvm_vcpu_arch { u8 osi_needed; u8 osi_enabled; u8 papr_enabled; + u8 sane; + u8 cpu_type; u8 hcall_needed; u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d121f49d62b..46efd1a265c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); /* Core-specific hooks */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index da3a1225c0a..ca1f88b3dc5 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -78,6 +78,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) vcpu_44x->shadow_refs[i].gtlb_index = -1; + vcpu->arch.cpu_type = KVM_CPU_440; + return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cc0d7f1b19a..bf66ec731e8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -510,6 +510,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; + vcpu->arch.cpu_type = KVM_CPU_3S_64; + kvmppc_sanity_check(vcpu); + return vcpu; free_vcpu: @@ -800,6 +803,11 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; + if (!vcpu->arch.sane) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + do { r = kvmppc_run_vcpu(run, vcpu); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 48558f6176e..6e3488b0951 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -153,6 +153,7 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!to_book3s(vcpu)->hior_sregs) to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_64; } else #endif { @@ -160,8 +161,11 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) if (!to_book3s(vcpu)->hior_sregs) to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_32; } + kvmppc_sanity_check(vcpu); + /* If we are in hypervisor level on 970, we can tell the CPU to * treat DCBZ as 32 bytes store */ vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; @@ -938,6 +942,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; + /* Check if we can run the vcpu at all */ + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { kvm_run->exit_reason = KVM_EXIT_INTR; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index ee45fa01220..bb6c988f010 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -316,6 +316,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int ret; + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + local_irq_disable(); kvm_guest_enter(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); @@ -618,6 +623,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int i; + int r; vcpu->arch.pc = 0; vcpu->arch.shared->msr = 0; @@ -634,7 +640,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvmppc_init_timing_stats(vcpu); - return kvmppc_core_vcpu_setup(vcpu); + r = kvmppc_core_vcpu_setup(vcpu); + kvmppc_sanity_check(vcpu); + return r; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 797a7447c26..26d20903f2b 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -73,6 +73,8 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ vcpu->vcpu_id = 0; + vcpu->arch.cpu_type = KVM_CPU_E500V2; + return 0; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 13bc798a444..a8000ce562b 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -95,6 +95,31 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) return r; } +int kvmppc_sanity_check(struct kvm_vcpu *vcpu) +{ + int r = false; + + /* We have to know what CPU to virtualize */ + if (!vcpu->arch.pvr) + goto out; + + /* PAPR only works with book3s_64 */ + if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && vcpu->arch.papr_enabled) + goto out; + +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* HV KVM can only do PAPR mode for now */ + if (!vcpu->arch.papr_enabled) + goto out; +#endif + + r = true; + +out: + vcpu->arch.sane = r; + return r ? 0 : -EINVAL; +} + int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er; @@ -582,6 +607,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, break; } + if (!r) + r = kvmppc_sanity_check(vcpu); + return r; } -- cgit v1.2.3-70-g09d2 From 02143947603fe90237a0423d34dd8943de229f78 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 23 Jul 2011 17:41:44 +1000 Subject: KVM: PPC: book3s_pr: Simplify transitions between virtual and real mode This simplifies the way that the book3s_pr makes the transition to real mode when entering the guest. We now call kvmppc_entry_trampoline (renamed from kvmppc_rmcall) in the base kernel using a normal function call instead of doing an indirect call through a pointer in the vcpu. If kvm is a module, the module loader takes care of generating a trampoline as it does for other calls to functions outside the module. kvmppc_entry_trampoline then disables interrupts and jumps to kvmppc_handler_trampoline_enter in real mode using an rfi[d]. That then uses the link register as the address to return to (potentially in module space) when the guest exits. This also simplifies the way that we call the Linux interrupt handler when we exit the guest due to an external, decrementer or performance monitor interrupt. Instead of turning on the MMU, then deciding that we need to call the Linux handler and turning the MMU back off again, we now go straight to the handler at the point where we would turn the MMU on. The handler will then return to the virtual-mode code (potentially in the module). Along the way, this moves the setting and clearing of the HID5 DCBZ32 bit into real-mode interrupts-off code, and also makes sure that we clear the MSR[RI] bit before loading values into SRR0/1. The net result is that we no longer need any code addresses to be stored in vcpu->arch. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 4 +- arch/powerpc/include/asm/kvm_book3s_asm.h | 1 + arch/powerpc/include/asm/kvm_host.h | 8 -- arch/powerpc/kernel/asm-offsets.c | 7 +- arch/powerpc/kvm/book3s_32_sr.S | 2 +- arch/powerpc/kvm/book3s_64_slb.S | 2 +- arch/powerpc/kvm/book3s_exports.c | 4 +- arch/powerpc/kvm/book3s_interrupts.S | 129 ++---------------------------- arch/powerpc/kvm/book3s_pr.c | 12 --- arch/powerpc/kvm/book3s_rmhandlers.S | 51 ++++-------- arch/powerpc/kvm/book3s_segment.S | 112 +++++++++++++++++++++----- 11 files changed, 120 insertions(+), 212 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 91d41fabc5b..a384ffdf33d 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -141,9 +141,7 @@ extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); -extern void kvmppc_handler_lowmem_trampoline(void); -extern void kvmppc_handler_trampoline_enter(void); -extern void kvmppc_rmcall(ulong srr0, ulong srr1); +extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); extern void kvmppc_load_up_fpu(void); extern void kvmppc_load_up_altivec(void); diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index ef7b3688c3b..af73469530e 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -75,6 +75,7 @@ struct kvmppc_host_state { ulong scratch0; ulong scratch1; u8 in_guest; + u8 restore_hid5; #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu *kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 2b8284f4b4b..dec3054f6ad 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -258,14 +258,6 @@ struct kvm_vcpu_arch { ulong host_stack; u32 host_pid; #ifdef CONFIG_PPC_BOOK3S - ulong host_msr; - ulong host_r2; - void *host_retip; - ulong trampoline_lowmem; - ulong trampoline_enter; - ulong highmem_handler; - ulong rmcall; - ulong host_paca_phys; struct kvmppc_slb slb[64]; int slb_max; /* 1 + index of last valid entry in slb[] */ int slb_nr; /* total number of entries in SLB */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 5f078bc2063..e069c766695 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -449,8 +449,6 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); - DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip)); - DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); DEFINE(VCPU_DSCR, offsetof(struct kvm_vcpu, arch.dscr)); @@ -458,10 +456,6 @@ int main(void) DEFINE(VCPU_UAMOR, offsetof(struct kvm_vcpu, arch.uamor)); DEFINE(VCPU_CTRL, offsetof(struct kvm_vcpu, arch.ctrl)); DEFINE(VCPU_DABR, offsetof(struct kvm_vcpu, arch.dabr)); - DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); - DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); - DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); - DEFINE(VCPU_RMCALL, offsetof(struct kvm_vcpu, arch.rmcall)); DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); @@ -537,6 +531,7 @@ int main(void) HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); + HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); #ifdef CONFIG_KVM_BOOK3S_64_HV HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S index 3608471ad2d..7e06a6fc8d0 100644 --- a/arch/powerpc/kvm/book3s_32_sr.S +++ b/arch/powerpc/kvm/book3s_32_sr.S @@ -31,7 +31,7 @@ * R1 = host R1 * R2 = host R2 * R3 = shadow vcpu - * all other volatile GPRS = free + * all other volatile GPRS = free except R4, R6 * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S index 04e7d3bbfe8..f2e6e48ea46 100644 --- a/arch/powerpc/kvm/book3s_64_slb.S +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -53,7 +53,7 @@ slb_exit_skip_ ## num: * R1 = host R1 * R2 = host R2 * R3 = shadow vcpu - * all other volatile GPRS = free + * all other volatile GPRS = free except R4, R6 * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER * SVCPU[CTR] = guest CTR diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c index 88c8f26add0..f7f63a00ab1 100644 --- a/arch/powerpc/kvm/book3s_exports.c +++ b/arch/powerpc/kvm/book3s_exports.c @@ -23,9 +23,7 @@ #ifdef CONFIG_KVM_BOOK3S_64_HV EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); #else -EXPORT_SYMBOL_GPL(kvmppc_handler_trampoline_enter); -EXPORT_SYMBOL_GPL(kvmppc_handler_lowmem_trampoline); -EXPORT_SYMBOL_GPL(kvmppc_rmcall); +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu); #ifdef CONFIG_ALTIVEC EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index c54b0e30cf3..0a8515a5c04 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -29,27 +29,11 @@ #define ULONG_SIZE 8 #define FUNC(name) GLUE(.,name) -#define GET_SHADOW_VCPU_R13 - -#define DISABLE_INTERRUPTS \ - mfmsr r0; \ - rldicl r0,r0,48,1; \ - rotldi r0,r0,16; \ - mtmsrd r0,1; \ - #elif defined(CONFIG_PPC_BOOK3S_32) #define ULONG_SIZE 4 #define FUNC(name) name -#define GET_SHADOW_VCPU_R13 \ - lwz r13, (THREAD + THREAD_KVM_SVCPU)(r2) - -#define DISABLE_INTERRUPTS \ - mfmsr r0; \ - rlwinm r0,r0,0,17,15; \ - mtmsr r0; \ - #endif /* CONFIG_PPC_BOOK3S_XX */ @@ -108,44 +92,17 @@ kvm_start_entry: kvm_start_lightweight: - GET_SHADOW_VCPU_R13 - PPC_LL r3, VCPU_HIGHMEM_HANDLER(r4) - PPC_STL r3, HSTATE_VMHANDLER(r13) - - PPC_LL r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ - - DISABLE_INTERRUPTS - #ifdef CONFIG_PPC_BOOK3S_64 - /* Some guests may need to have dcbz set to 32 byte length. - * - * Usually we ensure that by patching the guest's instructions - * to trap on dcbz and emulate it in the hypervisor. - * - * If we can, we should tell the CPU to use 32 byte dcbz though, - * because that's a lot faster. - */ - PPC_LL r3, VCPU_HFLAGS(r4) - rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ - beq no_dcbz32_on - - mfspr r3,SPRN_HID5 - ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */ - mtspr SPRN_HID5,r3 - -no_dcbz32_on: - + rldicl r3, r3, 0, 63 /* r3 &= 1 */ + stb r3, HSTATE_RESTORE_HID5(r13) #endif /* CONFIG_PPC_BOOK3S_64 */ - PPC_LL r6, VCPU_RMCALL(r4) - mtctr r6 - - PPC_LL r3, VCPU_TRAMPOLINE_ENTER(r4) - LOAD_REG_IMMEDIATE(r4, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ /* Jump to segment patching handler and into our guest */ - bctr + bl FUNC(kvmppc_entry_trampoline) + nop /* * This is the handler in module memory. It gets jumped at from the @@ -170,21 +127,6 @@ kvmppc_handler_highmem: /* R7 = vcpu */ PPC_LL r7, GPR4(r1) -#ifdef CONFIG_PPC_BOOK3S_64 - - PPC_LL r5, VCPU_HFLAGS(r7) - rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ - beq no_dcbz32_off - - li r4, 0 - mfspr r5,SPRN_HID5 - rldimi r5,r4,6,56 - mtspr SPRN_HID5,r5 - -no_dcbz32_off: - -#endif /* CONFIG_PPC_BOOK3S_64 */ - PPC_STL r14, VCPU_GPR(r14)(r7) PPC_STL r15, VCPU_GPR(r15)(r7) PPC_STL r16, VCPU_GPR(r16)(r7) @@ -204,67 +146,6 @@ no_dcbz32_off: PPC_STL r30, VCPU_GPR(r30)(r7) PPC_STL r31, VCPU_GPR(r31)(r7) - /* Restore host msr -> SRR1 */ - PPC_LL r6, VCPU_HOST_MSR(r7) - - /* - * For some interrupts, we need to call the real Linux - * handler, so it can do work for us. This has to happen - * as if the interrupt arrived from the kernel though, - * so let's fake it here where most state is restored. - * - * Call Linux for hardware interrupts/decrementer - * r3 = address of interrupt handler (exit reason) - */ - - cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL - beq call_linux_handler - cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER - beq call_linux_handler - cmpwi r12, BOOK3S_INTERRUPT_PERFMON - beq call_linux_handler - - /* Back to EE=1 */ - mtmsr r6 - sync - b kvm_return_point - -call_linux_handler: - - /* - * If we land here we need to jump back to the handler we - * came from. - * - * We have a page that we can access from real mode, so let's - * jump back to that and use it as a trampoline to get back into the - * interrupt handler! - * - * R3 still contains the exit code, - * R5 VCPU_HOST_RETIP and - * R6 VCPU_HOST_MSR - */ - - /* Restore host IP -> SRR0 */ - PPC_LL r5, VCPU_HOST_RETIP(r7) - - /* XXX Better move to a safe function? - * What if we get an HTAB flush in between mtsrr0 and mtsrr1? */ - - mtlr r12 - - PPC_LL r4, VCPU_TRAMPOLINE_LOWMEM(r7) - mtsrr0 r4 - LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) - mtsrr1 r3 - - RFI - -.global kvm_return_point -kvm_return_point: - - /* Jump back to lightweight entry if we're supposed to */ - /* go back into the guest */ - /* Pass the exit number as 3rd argument to kvmppc_handle_exit */ mr r5, r12 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 6e3488b0951..d417511abfb 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -875,8 +875,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (!p) goto uninit_vcpu; - vcpu->arch.host_retip = kvm_return_point; - vcpu->arch.host_msr = mfmsr(); #ifdef CONFIG_PPC_BOOK3S_64 /* default to book3s_64 (970fx) */ vcpu->arch.pvr = 0x3C0301; @@ -887,16 +885,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_set_pvr(vcpu, vcpu->arch.pvr); vcpu->arch.slb_nr = 64; - /* remember where some real-mode handlers are */ - vcpu->arch.trampoline_lowmem = __pa(kvmppc_handler_lowmem_trampoline); - vcpu->arch.trampoline_enter = __pa(kvmppc_handler_trampoline_enter); - vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; -#ifdef CONFIG_PPC_BOOK3S_64 - vcpu->arch.rmcall = *(ulong*)kvmppc_rmcall; -#else - vcpu->arch.rmcall = (ulong)kvmppc_rmcall; -#endif - vcpu->arch.shadow_msr = MSR_USER64; err = kvmppc_mmu_init(vcpu); diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 5ee66edd749..34187585c50 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -36,9 +36,8 @@ #if defined(CONFIG_PPC_BOOK3S_64) -#define LOAD_SHADOW_VCPU(reg) GET_PACA(reg) -#define MSR_NOIRQ MSR_KERNEL & ~(MSR_IR | MSR_DR) #define FUNC(name) GLUE(.,name) +#define MTMSR_EERI(reg) mtmsrd (reg),1 .globl kvmppc_skip_interrupt kvmppc_skip_interrupt: @@ -68,8 +67,8 @@ kvmppc_skip_Hinterrupt: #elif defined(CONFIG_PPC_BOOK3S_32) -#define MSR_NOIRQ MSR_KERNEL #define FUNC(name) name +#define MTMSR_EERI(reg) mtmsr (reg) .macro INTERRUPT_TRAMPOLINE intno @@ -170,40 +169,24 @@ kvmppc_handler_skip_ins: #endif /* - * This trampoline brings us back to a real mode handler - * - * Input Registers: - * - * R5 = SRR0 - * R6 = SRR1 - * LR = real-mode IP + * Call kvmppc_handler_trampoline_enter in real mode * + * On entry, r4 contains the guest shadow MSR */ -.global kvmppc_handler_lowmem_trampoline -kvmppc_handler_lowmem_trampoline: - - mtsrr0 r5 +_GLOBAL(kvmppc_entry_trampoline) + mfmsr r5 + LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) + toreal(r7) + + li r9, MSR_RI + ori r9, r9, MSR_EE + andc r9, r5, r9 /* Clear EE and RI in MSR value */ + li r6, MSR_IR | MSR_DR + ori r6, r6, MSR_EE + andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ + MTMSR_EERI(r9) /* Clear EE and RI in MSR */ + mtsrr0 r7 /* before we set srr0/1 */ mtsrr1 r6 - blr -kvmppc_handler_lowmem_trampoline_end: - -/* - * Call a function in real mode - * - * Input Registers: - * - * R3 = function - * R4 = MSR - * R5 = scratch register - * - */ -_GLOBAL(kvmppc_rmcall) - LOAD_REG_IMMEDIATE(r5, MSR_NOIRQ) - mtmsr r5 /* Disable relocation and interrupts, so mtsrr - doesn't get interrupted */ - sync - mtsrr0 r3 - mtsrr1 r4 RFI #if defined(CONFIG_PPC_BOOK3S_32) diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 678b6be3169..0676ae249b9 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -23,6 +23,7 @@ #define GET_SHADOW_VCPU(reg) \ mr reg, r13 +#define MTMSR_EERI(reg) mtmsrd (reg),1 #elif defined(CONFIG_PPC_BOOK3S_32) @@ -30,6 +31,7 @@ tophys(reg, r2); \ lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ tophys(reg, reg) +#define MTMSR_EERI(reg) mtmsr (reg) #endif @@ -57,10 +59,12 @@ kvmppc_handler_trampoline_enter: /* Required state: * * MSR = ~IR|DR - * R13 = PACA * R1 = host R1 * R2 = host R2 - * R10 = guest MSR + * R4 = guest shadow MSR + * R5 = normal host MSR + * R6 = current host MSR (EE, IR, DR off) + * LR = highmem guest exit code * all other volatile GPRS = free * SVCPU[CR] = guest CR * SVCPU[XER] = guest XER @@ -71,15 +75,15 @@ kvmppc_handler_trampoline_enter: /* r3 = shadow vcpu */ GET_SHADOW_VCPU(r3) + /* Save guest exit handler address and MSR */ + mflr r0 + PPC_STL r0, HSTATE_VMHANDLER(r3) + PPC_STL r5, HSTATE_HOST_MSR(r3) + /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ PPC_STL r1, HSTATE_HOST_R1(r3) PPC_STL r2, HSTATE_HOST_R2(r3) - /* Move SRR0 and SRR1 into the respective regs */ - PPC_LL r9, SVCPU_PC(r3) - mtsrr0 r9 - mtsrr1 r10 - /* Activate guest mode, so faults get handled by KVM */ li r11, KVM_GUEST_MODE_GUEST stb r11, HSTATE_IN_GUEST(r3) @@ -87,17 +91,46 @@ kvmppc_handler_trampoline_enter: /* Switch to guest segment. This is subarch specific. */ LOAD_GUEST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + /* Some guests may need to have dcbz set to 32 byte length. + * + * Usually we ensure that by patching the guest's instructions + * to trap on dcbz and emulate it in the hypervisor. + * + * If we can, we should tell the CPU to use 32 byte dcbz though, + * because that's a lot faster. + */ + lbz r0, HSTATE_RESTORE_HID5(r3) + cmpwi r0, 0 + beq no_dcbz32_on + + mfspr r0,SPRN_HID5 + ori r0, r0, 0x80 /* XXX HID5_dcbz32 = 0x80 */ + mtspr SPRN_HID5,r0 +no_dcbz32_on: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + /* Enter guest */ - PPC_LL r4, SVCPU_CTR(r3) - PPC_LL r5, SVCPU_LR(r3) - lwz r6, SVCPU_CR(r3) - lwz r7, SVCPU_XER(r3) + PPC_LL r8, SVCPU_CTR(r3) + PPC_LL r9, SVCPU_LR(r3) + lwz r10, SVCPU_CR(r3) + lwz r11, SVCPU_XER(r3) + + mtctr r8 + mtlr r9 + mtcr r10 + mtxer r11 - mtctr r4 - mtlr r5 - mtcr r6 - mtxer r7 + /* Move SRR0 and SRR1 into the respective regs */ + PPC_LL r9, SVCPU_PC(r3) + /* First clear RI in our current MSR value */ + li r0, MSR_RI + andc r6, r6, r0 + MTMSR_EERI(r6) + mtsrr0 r9 + mtsrr1 r4 PPC_LL r0, SVCPU_R0(r3) PPC_LL r1, SVCPU_R1(r3) @@ -259,6 +292,43 @@ no_ld_last_inst: /* Switch back to host MMU */ LOAD_HOST_SEGMENTS +#ifdef CONFIG_PPC_BOOK3S_64 + + lbz r5, HSTATE_RESTORE_HID5(r13) + cmpwi r5, 0 + beq no_dcbz32_off + + li r4, 0 + mfspr r5,SPRN_HID5 + rldimi r5,r4,6,56 + mtspr SPRN_HID5,r5 + +no_dcbz32_off: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* + * For some interrupts, we need to call the real Linux + * handler, so it can do work for us. This has to happen + * as if the interrupt arrived from the kernel though, + * so let's fake it here where most state is restored. + * + * Having set up SRR0/1 with the address where we want + * to continue with relocation on (potentially in module + * space), we either just go straight there with rfi[d], + * or we jump to an interrupt handler with bctr if there + * is an interrupt to be handled first. In the latter + * case, the rfi[d] at the end of the interrupt handler + * will get us back to where we want to continue. + */ + + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 1f + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER + beq 1f + cmpwi r12, BOOK3S_INTERRUPT_PERFMON +1: mtctr r12 + /* Register usage at this point: * * R1 = host R1 @@ -269,13 +339,15 @@ no_ld_last_inst: * */ - /* RFI into the highmem handler */ - mfmsr r7 - ori r7, r7, MSR_IR|MSR_DR|MSR_RI|MSR_ME /* Enable paging */ - mtsrr1 r7 - /* Load highmem handler address */ + PPC_LL r6, HSTATE_HOST_MSR(r13) PPC_LL r8, HSTATE_VMHANDLER(r13) + + /* Restore host msr -> SRR1 */ + mtsrr1 r6 + /* Load highmem handler address */ mtsrr0 r8 + /* RFI into the highmem handler, or jump to interrupt handler */ + beqctr RFI kvmppc_handler_trampoline_exit_end: -- cgit v1.2.3-70-g09d2 From 19ccb76a1938ab364a412253daec64613acbf3df Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 23 Jul 2011 17:42:46 +1000 Subject: KVM: PPC: Implement H_CEDE hcall for book3s_hv in real-mode code With a KVM guest operating in SMT4 mode (i.e. 4 hardware threads per core), whenever a CPU goes idle, we have to pull all the other hardware threads in the core out of the guest, because the H_CEDE hcall is handled in the kernel. This is inefficient. This adds code to book3s_hv_rmhandlers.S to handle the H_CEDE hcall in real mode. When a guest vcpu does an H_CEDE hcall, we now only exit to the kernel if all the other vcpus in the same core are also idle. Otherwise we mark this vcpu as napping, save state that could be lost in nap mode (mainly GPRs and FPRs), and execute the nap instruction. When the thread wakes up, because of a decrementer or external interrupt, we come back in at kvm_start_guest (from the system reset interrupt vector), find the `napping' flag set in the paca, and go to the resume path. This has some other ramifications. First, when starting a core, we now start all the threads, both those that are immediately runnable and those that are idle. This is so that we don't have to pull all the threads out of the guest when an idle thread gets a decrementer interrupt and wants to start running. In fact the idle threads will all start with the H_CEDE hcall returning; being idle they will just do another H_CEDE immediately and go to nap mode. This required some changes to kvmppc_run_core() and kvmppc_run_vcpu(). These functions have been restructured to make them simpler and clearer. We introduce a level of indirection in the wait queue that gets woken when external and decrementer interrupts get generated for a vcpu, so that we can have the 4 vcpus in a vcore using the same wait queue. We need this because the 4 vcpus are being handled by one thread. Secondly, when we need to exit from the guest to the kernel, we now have to generate an IPI for any napping threads, because an HDEC interrupt doesn't wake up a napping thread. Thirdly, we now need to be able to handle virtual external interrupts and decrementer interrupts becoming pending while a thread is napping, and deliver those interrupts to the guest when the thread wakes. This is done in kvmppc_cede_reentry, just before fast_guest_return. Finally, since we are not using the generic kvm_vcpu_block for book3s_hv, and hence not calling kvm_arch_vcpu_runnable, we can remove the #ifdef from kvm_arch_vcpu_runnable. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_asm.h | 1 + arch/powerpc/include/asm/kvm_host.h | 19 +- arch/powerpc/kernel/asm-offsets.c | 6 + arch/powerpc/kvm/book3s_hv.c | 335 +++++++++++++++++------------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 297 +++++++++++++++++++++++--- arch/powerpc/kvm/powerpc.c | 21 +- 6 files changed, 483 insertions(+), 196 deletions(-) (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index af73469530e..1f2f5b6156b 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -76,6 +76,7 @@ struct kvmppc_host_state { ulong scratch1; u8 in_guest; u8 restore_hid5; + u8 napping; #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu *kvm_vcpu; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index dec3054f6ad..bf8af5d5d5d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -198,21 +198,29 @@ struct kvm_arch { */ struct kvmppc_vcore { int n_runnable; - int n_blocked; + int n_busy; int num_threads; int entry_exit_count; int n_woken; int nap_count; + int napping_threads; u16 pcpu; - u8 vcore_running; + u8 vcore_state; u8 in_guest; struct list_head runnable_threads; spinlock_t lock; + wait_queue_head_t wq; }; #define VCORE_ENTRY_COUNT(vc) ((vc)->entry_exit_count & 0xff) #define VCORE_EXIT_COUNT(vc) ((vc)->entry_exit_count >> 8) +/* Values for vcore_state */ +#define VCORE_INACTIVE 0 +#define VCORE_RUNNING 1 +#define VCORE_EXITING 2 +#define VCORE_SLEEPING 3 + struct kvmppc_pte { ulong eaddr; u64 vpage; @@ -403,11 +411,13 @@ struct kvm_vcpu_arch { struct dtl *dtl; struct dtl *dtl_end; + wait_queue_head_t *wqp; struct kvmppc_vcore *vcore; int ret; int trap; int state; int ptid; + bool timer_running; wait_queue_head_t cpu_run; struct kvm_vcpu_arch_shared *shared; @@ -423,8 +433,9 @@ struct kvm_vcpu_arch { #endif }; -#define KVMPPC_VCPU_BUSY_IN_HOST 0 -#define KVMPPC_VCPU_BLOCKED 1 +/* Values for vcpu->arch.state */ +#define KVMPPC_VCPU_STOPPED 0 +#define KVMPPC_VCPU_BUSY_IN_HOST 1 #define KVMPPC_VCPU_RUNNABLE 2 #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e069c766695..69f7ffe7f67 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -44,6 +44,7 @@ #include #include #include +#include #endif #ifdef CONFIG_PPC_ISERIES #include @@ -460,6 +461,8 @@ int main(void) DEFINE(VCPU_DEC, offsetof(struct kvm_vcpu, arch.dec)); DEFINE(VCPU_DEC_EXPIRES, offsetof(struct kvm_vcpu, arch.dec_expires)); DEFINE(VCPU_PENDING_EXC, offsetof(struct kvm_vcpu, arch.pending_exceptions)); + DEFINE(VCPU_CEDED, offsetof(struct kvm_vcpu, arch.ceded)); + DEFINE(VCPU_PRODDED, offsetof(struct kvm_vcpu, arch.prodded)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa)); DEFINE(VCPU_MMCR, offsetof(struct kvm_vcpu, arch.mmcr)); DEFINE(VCPU_PMC, offsetof(struct kvm_vcpu, arch.pmc)); @@ -475,6 +478,7 @@ int main(void) DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count)); DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); + DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); DEFINE(VCPU_SVCPU, offsetof(struct kvmppc_vcpu_book3s, shadow_vcpu) - offsetof(struct kvmppc_vcpu_book3s, vcpu)); DEFINE(VCPU_SLB_E, offsetof(struct kvmppc_slb, orige)); @@ -532,6 +536,7 @@ int main(void) HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); + HSTATE_FIELD(HSTATE_NAPPING, napping); #ifdef CONFIG_KVM_BOOK3S_64_HV HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu); @@ -544,6 +549,7 @@ int main(void) HSTATE_FIELD(HSTATE_DSCR, host_dscr); HSTATE_FIELD(HSTATE_DABR, dabr); HSTATE_FIELD(HSTATE_DECEXP, dec_expires); + DEFINE(IPI_PRIORITY, IPI_PRIORITY); #endif /* CONFIG_KVM_BOOK3S_64_HV */ #else /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bf66ec731e8..4644c7986d8 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -62,6 +62,8 @@ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ +static void kvmppc_end_cede(struct kvm_vcpu *vcpu); + void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { local_paca->kvm_hstate.kvm_vcpu = vcpu; @@ -72,40 +74,10 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { } -static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu); -static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu); - -void kvmppc_vcpu_block(struct kvm_vcpu *vcpu) -{ - u64 now; - unsigned long dec_nsec; - - now = get_tb(); - if (now >= vcpu->arch.dec_expires && !kvmppc_core_pending_dec(vcpu)) - kvmppc_core_queue_dec(vcpu); - if (vcpu->arch.pending_exceptions) - return; - if (vcpu->arch.dec_expires != ~(u64)0) { - dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC / - tb_ticks_per_sec; - hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), - HRTIMER_MODE_REL); - } - - kvmppc_vcpu_blocked(vcpu); - - kvm_vcpu_block(vcpu); - vcpu->stat.halt_wakeup++; - - if (vcpu->arch.dec_expires != ~(u64)0) - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - - kvmppc_vcpu_unblocked(vcpu); -} - void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) { vcpu->arch.shregs.msr = msr; + kvmppc_end_cede(vcpu); } void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) @@ -257,15 +229,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) switch (req) { case H_CEDE: - vcpu->arch.shregs.msr |= MSR_EE; - vcpu->arch.ceded = 1; - smp_mb(); - if (!vcpu->arch.prodded) - kvmppc_vcpu_block(vcpu); - else - vcpu->arch.prodded = 0; - smp_mb(); - vcpu->arch.ceded = 0; break; case H_PROD: target = kvmppc_get_gpr(vcpu, 4); @@ -388,20 +351,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - - if (!(r & RESUME_HOST)) { - /* To avoid clobbering exit_reason, only check for signals if - * we aren't already exiting to userspace for some other - * reason. */ - if (signal_pending(tsk)) { - vcpu->stat.signal_exits++; - run->exit_reason = KVM_EXIT_INTR; - r = -EINTR; - } else { - kvmppc_core_deliver_interrupts(vcpu); - } - } - return r; } @@ -479,13 +428,9 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_mmu_book3s_hv_init(vcpu); /* - * Some vcpus may start out in stopped state. If we initialize - * them to busy-in-host state they will stop other vcpus in the - * vcore from running. Instead we initialize them to blocked - * state, effectively considering them to be stopped until we - * see the first run ioctl for them. + * We consider the vcpu stopped until we see the first run ioctl for it. */ - vcpu->arch.state = KVMPPC_VCPU_BLOCKED; + vcpu->arch.state = KVMPPC_VCPU_STOPPED; init_waitqueue_head(&vcpu->arch.cpu_run); @@ -496,6 +441,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) if (vcore) { INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); + init_waitqueue_head(&vcore->wq); } kvm->arch.vcores[core] = vcore; } @@ -506,7 +452,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) spin_lock(&vcore->lock); ++vcore->num_threads; - ++vcore->n_blocked; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; @@ -527,30 +472,31 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kfree(vcpu); } -static void kvmppc_vcpu_blocked(struct kvm_vcpu *vcpu) +static void kvmppc_set_timer(struct kvm_vcpu *vcpu) { - struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long dec_nsec, now; - spin_lock(&vc->lock); - vcpu->arch.state = KVMPPC_VCPU_BLOCKED; - ++vc->n_blocked; - if (vc->n_runnable > 0 && - vc->n_runnable + vc->n_blocked == vc->num_threads) { - vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, - arch.run_list); - wake_up(&vcpu->arch.cpu_run); + now = get_tb(); + if (now > vcpu->arch.dec_expires) { + /* decrementer has already gone negative */ + kvmppc_core_queue_dec(vcpu); + kvmppc_core_deliver_interrupts(vcpu); + return; } - spin_unlock(&vc->lock); + dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC + / tb_ticks_per_sec; + hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), + HRTIMER_MODE_REL); + vcpu->arch.timer_running = 1; } -static void kvmppc_vcpu_unblocked(struct kvm_vcpu *vcpu) +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) { - struct kvmppc_vcore *vc = vcpu->arch.vcore; - - spin_lock(&vc->lock); - vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; - --vc->n_blocked; - spin_unlock(&vc->lock); + vcpu->arch.ceded = 0; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } } extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); @@ -565,6 +511,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, return; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; --vc->n_runnable; + ++vc->n_busy; /* decrement the physical thread id of each following vcpu */ v = vcpu; list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) @@ -578,15 +525,20 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) struct paca_struct *tpaca; struct kvmppc_vcore *vc = vcpu->arch.vcore; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } cpu = vc->pcpu + vcpu->arch.ptid; tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; tpaca->kvm_hstate.kvm_vcore = vc; + tpaca->kvm_hstate.napping = 0; + vcpu->cpu = vc->pcpu; smp_wmb(); #ifdef CONFIG_PPC_ICP_NATIVE if (vcpu->arch.ptid) { tpaca->cpu_start = 0x80; - tpaca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST; wmb(); xics_wake_cpu(cpu); ++vc->n_woken; @@ -634,9 +586,10 @@ static int on_primary_thread(void) */ static int kvmppc_run_core(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu *vcpu, *vcpu0, *vnext; long ret; u64 now; + int ptid; /* don't start if any threads have a signal pending */ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) @@ -655,29 +608,50 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) goto out; } + /* + * Assign physical thread IDs, first to non-ceded vcpus + * and then to ceded ones. + */ + ptid = 0; + vcpu0 = NULL; + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + if (!vcpu->arch.ceded) { + if (!ptid) + vcpu0 = vcpu; + vcpu->arch.ptid = ptid++; + } + } + if (!vcpu0) + return 0; /* nothing to run */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + if (vcpu->arch.ceded) + vcpu->arch.ptid = ptid++; + vc->n_woken = 0; vc->nap_count = 0; vc->entry_exit_count = 0; - vc->vcore_running = 1; + vc->vcore_state = VCORE_RUNNING; vc->in_guest = 0; vc->pcpu = smp_processor_id(); + vc->napping_threads = 0; list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) kvmppc_start_thread(vcpu); - vcpu = list_first_entry(&vc->runnable_threads, struct kvm_vcpu, - arch.run_list); + preempt_disable(); spin_unlock(&vc->lock); - preempt_disable(); kvm_guest_enter(); - __kvmppc_vcore_entry(NULL, vcpu); + __kvmppc_vcore_entry(NULL, vcpu0); - /* wait for secondary threads to finish writing their state to memory */ spin_lock(&vc->lock); + /* disable sending of IPIs on virtual external irqs */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->cpu = -1; + /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ - vc->vcore_running = 2; + vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ @@ -693,22 +667,26 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) if (now < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu)) kvmppc_core_dequeue_dec(vcpu); - if (!vcpu->arch.trap) { - if (signal_pending(vcpu->arch.run_task)) { - vcpu->arch.kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - } - continue; /* didn't get to run */ - } - ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, - vcpu->arch.run_task); + + ret = RESUME_GUEST; + if (vcpu->arch.trap) + ret = kvmppc_handle_exit(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + vcpu->arch.ret = ret; vcpu->arch.trap = 0; + + if (vcpu->arch.ceded) { + if (ret != RESUME_GUEST) + kvmppc_end_cede(vcpu); + else + kvmppc_set_timer(vcpu); + } } spin_lock(&vc->lock); out: - vc->vcore_running = 0; + vc->vcore_state = VCORE_INACTIVE; list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { if (vcpu->arch.ret != RESUME_GUEST) { @@ -720,82 +698,130 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) return 1; } -static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +/* + * Wait for some other vcpu thread to execute us, and + * wake us up when we need to handle something in the host. + */ +static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) { - int ptid; - int wait_state; - struct kvmppc_vcore *vc; DEFINE_WAIT(wait); - /* No need to go into the guest when all we do is going out */ - if (signal_pending(current)) { - kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) + schedule(); + finish_wait(&vcpu->arch.cpu_run, &wait); +} + +/* + * All the vcpus in this vcore are idle, so wait for a decrementer + * or external interrupt to one of the vcpus. vc->lock is held. + */ +static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) +{ + DEFINE_WAIT(wait); + struct kvm_vcpu *v; + int all_idle = 1; + + prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); + vc->vcore_state = VCORE_SLEEPING; + spin_unlock(&vc->lock); + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + if (!v->arch.ceded || v->arch.pending_exceptions) { + all_idle = 0; + break; + } } + if (all_idle) + schedule(); + finish_wait(&vc->wq, &wait); + spin_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; +} - /* On PPC970, check that we have an RMA region */ - if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) - return -EPERM; +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int n_ceded; + int prev_state; + struct kvmppc_vcore *vc; + struct kvm_vcpu *v, *vn; kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; - flush_fp_to_thread(current); - flush_altivec_to_thread(current); - flush_vsx_to_thread(current); - /* * Synchronize with other threads in this virtual core */ vc = vcpu->arch.vcore; spin_lock(&vc->lock); - /* This happens the first time this is called for a vcpu */ - if (vcpu->arch.state == KVMPPC_VCPU_BLOCKED) - --vc->n_blocked; - vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; - ptid = vc->n_runnable; + vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; - vcpu->arch.ptid = ptid; + prev_state = vcpu->arch.state; + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; - wait_state = TASK_INTERRUPTIBLE; - while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - if (signal_pending(current)) { - if (!vc->vcore_running) { - kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - break; - } - /* have to wait for vcore to stop executing guest */ - wait_state = TASK_UNINTERRUPTIBLE; - smp_send_reschedule(vc->pcpu); + /* + * This happens the first time this is called for a vcpu. + * If the vcore is already running, we may be able to start + * this thread straight away and have it join in. + */ + if (prev_state == KVMPPC_VCPU_STOPPED) { + if (vc->vcore_state == VCORE_RUNNING && + VCORE_EXIT_COUNT(vc) == 0) { + vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_start_thread(vcpu); } - if (!vc->vcore_running && - vc->n_runnable + vc->n_blocked == vc->num_threads) { - /* we can run now */ - if (kvmppc_run_core(vc)) - continue; - } + } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) + --vc->n_busy; - if (vc->vcore_running == 1 && VCORE_EXIT_COUNT(vc) == 0) - kvmppc_start_thread(vcpu); + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + !signal_pending(current)) { + if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); + spin_lock(&vc->lock); + continue; + } + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + n_ceded += v->arch.ceded; + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); + + list_for_each_entry_safe(v, vn, &vc->runnable_threads, + arch.run_list) { + kvmppc_core_deliver_interrupts(v); + if (signal_pending(v->arch.run_task)) { + kvmppc_remove_runnable(vc, v); + v->stat.signal_exits++; + v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; + v->arch.ret = -EINTR; + wake_up(&v->arch.cpu_run); + } + } + } - /* wait for other threads to come in, or wait for vcore */ - prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); - spin_unlock(&vc->lock); - schedule(); - finish_wait(&vcpu->arch.cpu_run, &wait); - spin_lock(&vc->lock); + if (signal_pending(current)) { + if (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } } - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) - kvmppc_remove_runnable(vc, vcpu); spin_unlock(&vc->lock); - return vcpu->arch.ret; } @@ -808,6 +834,21 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINVAL; } + /* No need to go into the guest when all we'll do is come back out */ + if (signal_pending(current)) { + run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + /* On PPC970, check that we have an RMA region */ + if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) + return -EPERM; + + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_vsx_to_thread(current); + vcpu->arch.wqp = &vcpu->arch.vcore->wq; + do { r = kvmppc_run_vcpu(run, vcpu); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bc6ade93308..f422231d923 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -52,7 +52,7 @@ kvmppc_skip_Hinterrupt: b . /* - * Call kvmppc_handler_trampoline_enter in real mode. + * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. * * Input Registers: @@ -92,6 +92,12 @@ _GLOBAL(kvmppc_hv_entry_trampoline) kvm_start_guest: ld r1,PACAEMERGSP(r13) subi r1,r1,STACK_FRAME_OVERHEAD + ld r2,PACATOC(r13) + + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,0 + bne kvm_end_cede /* get vcpu pointer */ ld r4, HSTATE_KVM_VCPU(r13) @@ -279,15 +285,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) cmpwi r0,0 beq 20b - /* Set LPCR. Set the MER bit if there is a pending external irq. */ + /* Set LPCR and RMOR. */ 10: ld r8,KVM_LPCR(r9) - ld r0,VCPU_PENDING_EXC(r4) - li r7,(1 << BOOK3S_IRQPRIO_EXTERNAL) - oris r7,r7,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h - and. r0,r0,r7 - beq 11f - ori r8,r8,LPCR_MER -11: mtspr SPRN_LPCR,r8 + mtspr SPRN_LPCR,r8 ld r8,KVM_RMOR(r9) mtspr SPRN_RMOR,r8 isync @@ -451,19 +451,50 @@ toc_tlbie_lock: mtctr r6 mtxer r7 - /* Move SRR0 and SRR1 into the respective regs */ +kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ ld r6, VCPU_SRR0(r4) ld r7, VCPU_SRR1(r4) - mtspr SPRN_SRR0, r6 - mtspr SPRN_SRR1, r7 - ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) /* r11 = vcpu->arch.msr & ~MSR_HV */ - ld r11, VCPU_MSR(r4) /* r10 = vcpu->arch.msr & ~MSR_HV */ rldicl r11, r11, 63 - MSR_HV_LG, 1 rotldi r11, r11, 1 + MSR_HV_LG ori r11, r11, MSR_ME + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0,VCPU_PENDING_EXC(r4) + li r8,(1 << BOOK3S_IRQPRIO_EXTERNAL) + oris r8,r8,(1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h + and r0,r0,r8 + cmpdi cr1,r0,0 + andi. r0,r11,MSR_EE + beq cr1,11f +BEGIN_FTR_SECTION + mfspr r8,SPRN_LPCR + ori r8,r8,LPCR_MER + mtspr SPRN_LPCR,r8 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + beq 5f + li r0,BOOK3S_INTERRUPT_EXTERNAL +12: mr r6,r10 + mr r10,r0 + mr r7,r11 + li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11,r11,63 + b 5f +11: beq 5f + mfspr r0,SPRN_DEC + cmpwi r0,0 + li r0,BOOK3S_INTERRUPT_DECREMENTER + blt 12b + + /* Move SRR0 and SRR1 into the respective regs */ +5: mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r7 + li r0,0 + stb r0,VCPU_CEDED(r4) /* cancel cede */ + fast_guest_return: mtspr SPRN_HSRR0,r10 mtspr SPRN_HSRR1,r11 @@ -577,21 +608,20 @@ kvmppc_interrupt: /* See if this is something we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode -hcall_real_cont: /* Check for mediated interrupts (could be done earlier really ...) */ BEGIN_FTR_SECTION cmpwi r12,BOOK3S_INTERRUPT_EXTERNAL bne+ 1f - ld r5,VCPU_KVM(r9) - ld r5,KVM_LPCR(r5) andi. r0,r11,MSR_EE beq 1f + mfspr r5,SPRN_LPCR andi. r0,r5,LPCR_MER bne bounce_ext_interrupt 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC mftb r6 @@ -685,7 +715,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_201) slbia ptesync -hdec_soon: +hdec_soon: /* r9 = vcpu, r12 = trap, r13 = paca */ BEGIN_FTR_SECTION b 32f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) @@ -703,6 +733,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) addi r0,r3,0x100 stwcx. r0,0,r6 bne 41b + lwsync /* * At this point we have an interrupt that we have to pass @@ -716,18 +747,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) * interrupt, since the other threads will already be on their * way here in that case. */ + cmpwi r3,0x100 /* Are we the first here? */ + bge 43f + cmpwi r3,1 /* Are any other threads in the guest? */ + ble 43f cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER beq 40f - cmpwi r3,0x100 /* Are we the first here? */ - bge 40f - cmpwi r3,1 - ble 40f li r0,0 mtspr SPRN_HDEC,r0 40: + /* + * Send an IPI to any napping threads, since an HDEC interrupt + * doesn't wake CPUs up from nap. + */ + lwz r3,VCORE_NAPPING_THREADS(r5) + lwz r4,VCPU_PTID(r9) + li r0,1 + sldi r0,r0,r4 + andc. r3,r3,r0 /* no sense IPI'ing ourselves */ + beq 43f + mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ + subf r6,r4,r13 +42: andi. r0,r3,1 + beq 44f + ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ + li r0,IPI_PRIORITY + li r7,XICS_QIRR + stbcix r0,r7,r8 /* trigger the IPI */ +44: srdi. r3,r3,1 + addi r6,r6,PACA_SIZE + bne 42b /* Secondary threads wait for primary to do partition switch */ - ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ +43: ld r4,VCPU_KVM(r9) /* pointer to struct kvm */ ld r5,HSTATE_KVM_VCORE(r13) lwz r3,VCPU_PTID(r9) cmpwi r3,0 @@ -1080,7 +1132,6 @@ hcall_try_real_mode: hcall_real_fallback: li r12,BOOK3S_INTERRUPT_SYSCALL ld r9, HSTATE_KVM_VCPU(r13) - ld r11, VCPU_MSR(r9) b hcall_real_cont @@ -1142,7 +1193,7 @@ hcall_real_table: .long 0 /* 0xd4 */ .long 0 /* 0xd8 */ .long 0 /* 0xdc */ - .long 0 /* 0xe0 */ + .long .kvmppc_h_cede - hcall_real_table .long 0 /* 0xe4 */ .long 0 /* 0xe8 */ .long 0 /* 0xec */ @@ -1171,7 +1222,8 @@ bounce_ext_interrupt: mtspr SPRN_SRR0,r10 mtspr SPRN_SRR1,r11 li r10,BOOK3S_INTERRUPT_EXTERNAL - LOAD_REG_IMMEDIATE(r11,MSR_SF | MSR_ME); + li r11,(MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11,r11,63 b fast_guest_return _GLOBAL(kvmppc_h_set_dabr) @@ -1180,6 +1232,178 @@ _GLOBAL(kvmppc_h_set_dabr) li r3,0 blr +_GLOBAL(kvmppc_h_cede) + ori r11,r11,MSR_EE + std r11,VCPU_MSR(r3) + li r0,1 + stb r0,VCPU_CEDED(r3) + sync /* order setting ceded vs. testing prodded */ + lbz r5,VCPU_PRODDED(r3) + cmpwi r5,0 + bne 1f + li r0,0 /* set trap to 0 to say hcall is handled */ + stw r0,VCPU_TRAP(r3) + li r0,H_SUCCESS + std r0,VCPU_GPR(r3)(r3) +BEGIN_FTR_SECTION + b 2f /* just send it up to host on 970 */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_206) + + /* + * Set our bit in the bitmask of napping threads unless all the + * other threads are already napping, in which case we send this + * up to the host. + */ + ld r5,HSTATE_KVM_VCORE(r13) + lwz r6,VCPU_PTID(r3) + lwz r8,VCORE_ENTRY_EXIT(r5) + clrldi r8,r8,56 + li r0,1 + sld r0,r0,r6 + addi r6,r5,VCORE_NAPPING_THREADS +31: lwarx r4,0,r6 + or r4,r4,r0 + popcntw r7,r4 + cmpw r7,r8 + bge 2f + stwcx. r4,0,r6 + bne 31b + li r0,1 + stb r0,HSTATE_NAPPING(r13) + /* order napping_threads update vs testing entry_exit_count */ + lwsync + mr r4,r3 + lwz r7,VCORE_ENTRY_EXIT(r5) + cmpwi r7,0x100 + bge 33f /* another thread already exiting */ + +/* + * Although not specifically required by the architecture, POWER7 + * preserves the following registers in nap mode, even if an SMT mode + * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3, + * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR. + */ + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(r14)(r3) + std r15, VCPU_GPR(r15)(r3) + std r16, VCPU_GPR(r16)(r3) + std r17, VCPU_GPR(r17)(r3) + std r18, VCPU_GPR(r18)(r3) + std r19, VCPU_GPR(r19)(r3) + std r20, VCPU_GPR(r20)(r3) + std r21, VCPU_GPR(r21)(r3) + std r22, VCPU_GPR(r22)(r3) + std r23, VCPU_GPR(r23)(r3) + std r24, VCPU_GPR(r24)(r3) + std r25, VCPU_GPR(r25)(r3) + std r26, VCPU_GPR(r26)(r3) + std r27, VCPU_GPR(r27)(r3) + std r28, VCPU_GPR(r28)(r3) + std r29, VCPU_GPR(r29)(r3) + std r30, VCPU_GPR(r30)(r3) + std r31, VCPU_GPR(r31)(r3) + + /* save FP state */ + bl .kvmppc_save_fp + + /* + * Take a nap until a decrementer or external interrupt occurs, + * with PECE1 (wake on decr) and PECE0 (wake on external) set in LPCR + */ + li r0,0x80 + stb r0,PACAPROCSTART(r13) + mfspr r5,SPRN_LPCR + ori r5,r5,LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR,r5 + isync + li r0, 0 + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +kvm_end_cede: + /* Woken by external or decrementer interrupt */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* If we're a secondary thread and we got here by an IPI, ack it */ + ld r4,HSTATE_KVM_VCPU(r13) + lwz r3,VCPU_PTID(r4) + cmpwi r3,0 + beq 27f + mfspr r3,SPRN_SRR1 + rlwinm r3,r3,44-31,0x7 /* extract wake reason field */ + cmpwi r3,4 /* was it an external interrupt? */ + bne 27f + ld r5, HSTATE_XICS_PHYS(r13) + li r0,0xff + li r6,XICS_QIRR + li r7,XICS_XIRR + lwzcix r8,r5,r7 /* ack the interrupt */ + sync + stbcix r0,r5,r6 /* clear it */ + stwcix r8,r5,r7 /* EOI it */ +27: + /* load up FP state */ + bl kvmppc_load_fp + + /* Load NV GPRS */ + ld r14, VCPU_GPR(r14)(r4) + ld r15, VCPU_GPR(r15)(r4) + ld r16, VCPU_GPR(r16)(r4) + ld r17, VCPU_GPR(r17)(r4) + ld r18, VCPU_GPR(r18)(r4) + ld r19, VCPU_GPR(r19)(r4) + ld r20, VCPU_GPR(r20)(r4) + ld r21, VCPU_GPR(r21)(r4) + ld r22, VCPU_GPR(r22)(r4) + ld r23, VCPU_GPR(r23)(r4) + ld r24, VCPU_GPR(r24)(r4) + ld r25, VCPU_GPR(r25)(r4) + ld r26, VCPU_GPR(r26)(r4) + ld r27, VCPU_GPR(r27)(r4) + ld r28, VCPU_GPR(r28)(r4) + ld r29, VCPU_GPR(r29)(r4) + ld r30, VCPU_GPR(r30)(r4) + ld r31, VCPU_GPR(r31)(r4) + + /* clear our bit in vcore->napping_threads */ +33: ld r5,HSTATE_KVM_VCORE(r13) + lwz r3,VCPU_PTID(r4) + li r0,1 + sld r0,r0,r3 + addi r6,r5,VCORE_NAPPING_THREADS +32: lwarx r7,0,r6 + andc r7,r7,r0 + stwcx. r7,0,r6 + bne 32b + li r0,0 + stb r0,HSTATE_NAPPING(r13) + + /* see if any other thread is already exiting */ + lwz r0,VCORE_ENTRY_EXIT(r5) + cmpwi r0,0x100 + blt kvmppc_cede_reentry /* if not go back to guest */ + + /* some threads are exiting, so go to the guest exit path */ + b hcall_real_fallback + + /* cede when already previously prodded case */ +1: li r0,0 + stb r0,VCPU_PRODDED(r3) + sync /* order testing prodded vs. clearing ceded */ + stb r0,VCPU_CEDED(r3) + li r3,H_SUCCESS + blr + + /* we've ceded but we want to give control to the host */ +2: li r3,H_TOO_HARD + blr + secondary_too_late: ld r5,HSTATE_KVM_VCORE(r13) HMT_LOW @@ -1197,14 +1421,20 @@ secondary_too_late: slbmte r6,r5 1: addi r11,r11,16 .endr - b 50f secondary_nap: - /* Clear any pending IPI */ -50: ld r5, HSTATE_XICS_PHYS(r13) + /* Clear any pending IPI - assume we're a secondary thread */ + ld r5, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + lwzcix r3, r5, r7 /* ack any pending interrupt */ + rlwinm. r0, r3, 0, 0xffffff /* any pending? */ + beq 37f + sync li r0, 0xff li r6, XICS_QIRR - stbcix r0, r5, r6 + stbcix r0, r5, r6 /* clear the IPI */ + stwcix r3, r5, r7 /* EOI it */ +37: sync /* increment the nap count and then go to nap mode */ ld r4, HSTATE_KVM_VCORE(r13) @@ -1214,13 +1444,12 @@ secondary_nap: addi r3, r3, 1 stwcx. r3, 0, r4 bne 51b - isync + li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR - li r0, LPCR_PECE - andc r4, r4, r0 - ori r4, r4, LPCR_PECE0 /* exit nap on interrupt */ + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 + isync li r0, 0 std r0, HSTATE_SCRATCH0(r13) ptesync diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index a8000ce562b..0d843c6ba31 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,12 +39,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { -#ifndef CONFIG_KVM_BOOK3S_64_HV return !(v->arch.shared->msr & MSR_WE) || !!(v->arch.pending_exceptions); -#else - return !(v->arch.ceded) || !!(v->arch.pending_exceptions); -#endif } int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) @@ -285,6 +281,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); + vcpu->arch.wqp = &vcpu->wq; if (!IS_ERR(vcpu)) kvmppc_create_vcpu_debugfs(vcpu, id); return vcpu; @@ -316,8 +313,8 @@ static void kvmppc_decrementer_func(unsigned long data) kvmppc_core_queue_dec(vcpu); - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; } } @@ -570,13 +567,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { - if (irq->irq == KVM_INTERRUPT_UNSET) + if (irq->irq == KVM_INTERRUPT_UNSET) { kvmppc_core_dequeue_external(vcpu, irq); - else - kvmppc_core_queue_external(vcpu, irq); + return 0; + } + + kvmppc_core_queue_external(vcpu, irq); - if (waitqueue_active(&vcpu->wq)) { - wake_up_interruptible(&vcpu->wq); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); vcpu->stat.halt_wakeup++; } else if (vcpu->cpu != -1) { smp_send_reschedule(vcpu->cpu); -- cgit v1.2.3-70-g09d2 From fcf634098c00dd9cd247447368495f0b79be12d1 Mon Sep 17 00:00:00 2001 From: Christopher Yeoh Date: Mon, 31 Oct 2011 17:06:39 -0700 Subject: Cross Memory Attach The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - Does not allow for specifying iovecs for both src and dest, assuming preadv or pwritev was implemented either the area read from or written to would need to be contiguous. - Currently mem_read allows only processes who are currently ptrace'ing the target and are still able to ptrace the target to read from the target. This check could possibly be moved to the open call, but its not clear exactly what race this restriction is stopping (reason appears to have been lost) - Having to send the fd of /proc/self/mem via SCM_RIGHTS on unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other - Doesn't allow for some future use of the interface we would like to consider adding in the future (see below) - Interestingly reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily) As mentioned previously use of vmsplice instead was considered, but has problems. Since you need the reader and writer working co-operatively if the pipe is not drained then you block. Which requires some wrapping to do non blocking on the send side or polling on the receive. In all to all communication it requires ordering otherwise you can deadlock. And in the example of many MPI tasks writing to one MPI task vmsplice serialises the copying. There are some cases of MPI collectives where even a single copy interface does not get us the performance gain we could. For example in an MPI_Reduce rather than copy the data from the source we would like to instead use it directly in a mathops (say the reduce is doing a sum) as this would save us doing a copy. We don't need to keep a copy of the data from the source. I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - eg could specify the math operation and type and the kernel rather than just copying the data would apply the specified operation between the source and destination and store it in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra process messaging which is not MPI). This interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so in addition to this being useful for OpenMPI it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc, other architecture should mainly (I think) just need to add syscall numbers for the process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: David Howells Cc: James Morris Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/systbl.h | 2 + arch/powerpc/include/asm/unistd.h | 4 +- arch/x86/ia32/ia32entry.S | 2 + arch/x86/include/asm/unistd_32.h | 4 +- arch/x86/include/asm/unistd_64.h | 4 + arch/x86/kernel/syscall_table_32.S | 2 + fs/aio.c | 4 +- fs/compat.c | 7 +- fs/read_write.c | 8 +- include/linux/compat.h | 3 +- include/linux/fs.h | 7 +- include/linux/syscalls.h | 13 + kernel/sys_ni.c | 4 + mm/Makefile | 3 +- mm/process_vm_access.c | 496 +++++++++++++++++++++++++++++++++++++ security/keys/compat.c | 2 +- security/keys/keyctl.c | 2 +- 17 files changed, 550 insertions(+), 17 deletions(-) create mode 100644 mm/process_vm_access.c (limited to 'arch/powerpc/include') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index fa0d27a400d..559ae1ee670 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -354,3 +354,5 @@ COMPAT_SYS_SPU(clock_adjtime) SYSCALL_SPU(syncfs) COMPAT_SYS_SPU(sendmmsg) SYSCALL_SPU(setns) +COMPAT_SYS(process_vm_readv) +COMPAT_SYS(process_vm_writev) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b8b3f599362..d3d1b5efd7e 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -373,10 +373,12 @@ #define __NR_syncfs 348 #define __NR_sendmmsg 349 #define __NR_setns 350 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 #ifdef __KERNEL__ -#define __NR_syscalls 351 +#define __NR_syscalls 353 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 54edb207ff3..a6253ec1b28 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -850,4 +850,6 @@ ia32_sys_call_table: .quad sys_syncfs .quad compat_sys_sendmmsg /* 345 */ .quad sys_setns + .quad compat_sys_process_vm_readv + .quad compat_sys_process_vm_writev ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 593485b38ab..599c77d38f3 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -352,10 +352,12 @@ #define __NR_syncfs 344 #define __NR_sendmmsg 345 #define __NR_setns 346 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 #ifdef __KERNEL__ -#define NR_syscalls 347 +#define NR_syscalls 349 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 0a6ba337a2e..0431f193c3f 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg) __SYSCALL(__NR_setns, sys_setns) #define __NR_getcpu 309 __SYSCALL(__NR_getcpu, sys_getcpu) +#define __NR_process_vm_readv 310 +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) +#define __NR_process_vm_writev 311 +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index bc19be332bc..9a0e3129392 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -346,3 +346,5 @@ ENTRY(sys_call_table) .long sys_syncfs .long sys_sendmmsg /* 345 */ .long sys_setns + .long sys_process_vm_readv + .long sys_process_vm_writev diff --git a/fs/aio.c b/fs/aio.c index e29ec485af2..632b235f4fb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1387,13 +1387,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat) ret = compat_rw_copy_check_uvector(type, (struct compat_iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); else #endif ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + &kiocb->ki_iovec, 1); if (ret < 0) goto out; diff --git a/fs/compat.c b/fs/compat.c index 302e761bd0a..c98787536bb 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -546,7 +546,7 @@ out: ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, int check_access) { compat_ssize_t tot_len; struct iovec *iov = *ret_pointer = fast_pointer; @@ -593,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type, } if (len < 0) /* size_t not fitting in compat_ssize_t .. */ goto out; - if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + if (check_access && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { ret = -EFAULT; goto out; } @@ -1107,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, goto out; tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); + UIO_FASTIOV, iovstack, &iov, 1); if (tot_len == 0) { ret = 0; goto out; diff --git a/fs/read_write.c b/fs/read_write.c index dfd12579879..5ad4248b0cd 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -633,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) + struct iovec **ret_pointer, + int check_access) { unsigned long seg; ssize_t ret; @@ -689,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, ret = -EINVAL; goto out; } - if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { + if (check_access + && unlikely(!access_ok(vrfy_dir(type), buf, len))) { ret = -EFAULT; goto out; } @@ -721,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file, } ret = rw_copy_check_uvector(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret <= 0) goto out; diff --git a/include/linux/compat.h b/include/linux/compat.h index c6e7523bf76..154bf568301 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -547,7 +547,8 @@ extern ssize_t compat_rw_copy_check_uvector(int type, const struct compat_iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer); + struct iovec **ret_pointer, + int check_access); extern void __user *compat_alloc_user_space(unsigned long len); diff --git a/include/linux/fs.h b/include/linux/fs.h index 14493a2d5a0..87b4c6b9692 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1633,9 +1633,10 @@ struct inode_operations { struct seq_file; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, - unsigned long nr_segs, unsigned long fast_segs, - struct iovec *fast_pointer, - struct iovec **ret_pointer); + unsigned long nr_segs, unsigned long fast_segs, + struct iovec *fast_pointer, + struct iovec **ret_pointer, + int check_access); extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 1ff0ec2a5e8..86a24b1166d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -844,4 +844,17 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd, struct file_handle __user *handle, int flags); asmlinkage long sys_setns(int fd, int nstype); +asmlinkage long sys_process_vm_readv(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags); +asmlinkage long sys_process_vm_writev(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags); + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a9a5de07c4f..47bfa16430d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -145,6 +145,10 @@ cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); cond_syscall(sys_syslog); +cond_syscall(sys_process_vm_readv); +cond_syscall(sys_process_vm_writev); +cond_syscall(compat_sys_process_vm_readv); +cond_syscall(compat_sys_process_vm_writev); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/mm/Makefile b/mm/Makefile index 836e4163c1b..50ec00ef2a0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,8 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + vmalloc.o pagewalk.o pgtable-generic.o \ + process_vm_access.o obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 00000000000..e920aa3ce10 --- /dev/null +++ b/mm/process_vm_access.c @@ -0,0 +1,496 @@ +/* + * linux/mm/process_vm_access.c + * + * Copyright (C) 2010-2011 Christopher Yeoh , IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_COMPAT +#include +#endif + +/** + * process_vm_rw_pages - read/write pages from task specified + * @task: task to read/write from + * @mm: mm for task + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @pa: address of page in task to start copying from/to + * @start_offset: offset in page to start copying from/to + * @len: number of bytes to copy + * @lvec: iovec array specifying where to copy to/from + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @vm_write: 0 means copy from, 1 means copy to + * @nr_pages_to_copy: number of pages to copy + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success, error code otherwise + */ +static int process_vm_rw_pages(struct task_struct *task, + struct mm_struct *mm, + struct page **process_pages, + unsigned long pa, + unsigned long start_offset, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + int vm_write, + unsigned int nr_pages_to_copy, + ssize_t *bytes_copied) +{ + int pages_pinned; + void *target_kaddr; + int pgs_copied = 0; + int j; + int ret; + ssize_t bytes_to_copy; + ssize_t rc = 0; + + *bytes_copied = 0; + + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages_pinned = get_user_pages(task, mm, pa, + nr_pages_to_copy, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); + + if (pages_pinned != nr_pages_to_copy) { + rc = -EFAULT; + goto end; + } + + /* Do the copy for each page */ + for (pgs_copied = 0; + (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); + pgs_copied++) { + /* Make sure we have a non zero length iovec */ + while (*lvec_current < lvec_cnt + && lvec[*lvec_current].iov_len == 0) + (*lvec_current)++; + if (*lvec_current == lvec_cnt) + break; + + /* + * Will copy smallest of: + * - bytes remaining in page + * - bytes remaining in destination iovec + */ + bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, + len - *bytes_copied); + bytes_to_copy = min_t(ssize_t, bytes_to_copy, + lvec[*lvec_current].iov_len + - *lvec_offset); + + target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; + + if (vm_write) + ret = copy_from_user(target_kaddr, + lvec[*lvec_current].iov_base + + *lvec_offset, + bytes_to_copy); + else + ret = copy_to_user(lvec[*lvec_current].iov_base + + *lvec_offset, + target_kaddr, bytes_to_copy); + kunmap(process_pages[pgs_copied]); + if (ret) { + *bytes_copied += bytes_to_copy - ret; + pgs_copied++; + rc = -EFAULT; + goto end; + } + *bytes_copied += bytes_to_copy; + *lvec_offset += bytes_to_copy; + if (*lvec_offset == lvec[*lvec_current].iov_len) { + /* + * Need to copy remaining part of page into the + * next iovec if there are any bytes left in page + */ + (*lvec_current)++; + *lvec_offset = 0; + start_offset = (start_offset + bytes_to_copy) + % PAGE_SIZE; + if (start_offset) + pgs_copied--; + } else { + start_offset = 0; + } + } + +end: + if (vm_write) { + for (j = 0; j < pages_pinned; j++) { + if (j < pgs_copied) + set_page_dirty_lock(process_pages[j]); + put_page(process_pages[j]); + } + } else { + for (j = 0; j < pages_pinned; j++) + put_page(process_pages[j]); + } + + return rc; +} + +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ +#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) + +/** + * process_vm_rw_single_vec - read/write pages from task specified + * @addr: start memory address of target process + * @len: size of area to copy to/from + * @lvec: iovec array specifying where to copy to/from locally + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @mm: mm for task + * @task: task to read/write from + * @vm_write: 0 means copy from, 1 means copy to + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success or on failure error code + */ +static int process_vm_rw_single_vec(unsigned long addr, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + struct page **process_pages, + struct mm_struct *mm, + struct task_struct *task, + int vm_write, + ssize_t *bytes_copied) +{ + unsigned long pa = addr & PAGE_MASK; + unsigned long start_offset = addr - pa; + unsigned long nr_pages; + ssize_t bytes_copied_loop; + ssize_t rc = 0; + unsigned long nr_pages_copied = 0; + unsigned long nr_pages_to_copy; + unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES + / sizeof(struct pages *); + + *bytes_copied = 0; + + /* Work out address and page range required */ + if (len == 0) + return 0; + nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + + while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { + nr_pages_to_copy = min(nr_pages - nr_pages_copied, + max_pages_per_loop); + + rc = process_vm_rw_pages(task, mm, process_pages, pa, + start_offset, len, + lvec, lvec_cnt, + lvec_current, lvec_offset, + vm_write, nr_pages_to_copy, + &bytes_copied_loop); + start_offset = 0; + *bytes_copied += bytes_copied_loop; + + if (rc < 0) { + return rc; + } else { + len -= bytes_copied_loop; + nr_pages_copied += nr_pages_to_copy; + pa += nr_pages_to_copy * PAGE_SIZE; + } + } + + return rc; +} + +/* Maximum number of entries for process pages array + which lives on stack */ +#define PVM_MAX_PP_ARRAY_COUNT 16 + +/** + * process_vm_rw_core - core of reading/writing pages from task specified + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct task_struct *task; + struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; + struct page **process_pages = pp_stack; + struct mm_struct *mm; + unsigned long i; + ssize_t rc = 0; + ssize_t bytes_copied_loop; + ssize_t bytes_copied = 0; + unsigned long nr_pages = 0; + unsigned long nr_pages_iov; + unsigned long iov_l_curr_idx = 0; + size_t iov_l_curr_offset = 0; + ssize_t iov_len; + + /* + * Work out how many pages of struct pages we're going to need + * when eventually calling get_user_pages + */ + for (i = 0; i < riovcnt; i++) { + iov_len = rvec[i].iov_len; + if (iov_len > 0) { + nr_pages_iov = ((unsigned long)rvec[i].iov_base + + iov_len) + / PAGE_SIZE - (unsigned long)rvec[i].iov_base + / PAGE_SIZE + 1; + nr_pages = max(nr_pages, nr_pages_iov); + } + } + + if (nr_pages == 0) + return 0; + + if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { + /* For reliability don't try to kmalloc more than + 2 pages worth */ + process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, + sizeof(struct pages *)*nr_pages), + GFP_KERNEL); + + if (!process_pages) + return -ENOMEM; + } + + /* Get process information */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) { + rc = -ESRCH; + goto free_proc_pages; + } + + task_lock(task); + if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + task_unlock(task); + rc = -EPERM; + goto put_task_struct; + } + mm = task->mm; + + if (!mm || (task->flags & PF_KTHREAD)) { + task_unlock(task); + rc = -EINVAL; + goto put_task_struct; + } + + atomic_inc(&mm->mm_users); + task_unlock(task); + + for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + rc = process_vm_rw_single_vec( + (unsigned long)rvec[i].iov_base, rvec[i].iov_len, + lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, + process_pages, mm, task, vm_write, &bytes_copied_loop); + bytes_copied += bytes_copied_loop; + if (rc != 0) { + /* If we have managed to copy any data at all then + we return the number of bytes copied. Otherwise + we return the error code */ + if (bytes_copied) + rc = bytes_copied; + goto put_mm; + } + } + + rc = bytes_copied; +put_mm: + mmput(mm); + +put_task_struct: + put_task_struct(task); + +free_proc_pages: + if (process_pages != pp_stack) + kfree(process_pages); + return rc; +} + +/** + * process_vm_rw - check iovecs before calling core routine + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc; + + if (flags != 0) + return -EINVAL; + + /* Check iovecs */ + if (vm_write) + rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + else + rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + + rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, + iovstack_r, &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + + return rc; +} + +SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); +} + +SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, + const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); +} + +#ifdef CONFIG_COMPAT + +asmlinkage ssize_t +compat_process_vm_rw(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc = -EFAULT; + + if (flags != 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) + goto out; + + if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) + goto out; + + if (vm_write) + rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + else + rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, + UIO_FASTIOV, iovstack_r, + &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + +out: + return rc; +} + +asmlinkage ssize_t +compat_sys_process_vm_readv(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 0); +} + +asmlinkage ssize_t +compat_sys_process_vm_writev(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 1); +} + +#endif diff --git a/security/keys/compat.c b/security/keys/compat.c index 338b510e902..4c48e13448f 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -38,7 +38,7 @@ long compat_keyctl_instantiate_key_iov( ret = compat_rw_copy_check_uvector(WRITE, _payload_iov, ioc, ARRAY_SIZE(iovstack), - iovstack, &iov); + iovstack, &iov, 1); if (ret < 0) return ret; if (ret == 0) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index eca51918c95..0b3f5d72af1 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1065,7 +1065,7 @@ long keyctl_instantiate_key_iov(key_serial_t id, goto no_payload; ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, - ARRAY_SIZE(iovstack), iovstack, &iov); + ARRAY_SIZE(iovstack), iovstack, &iov, 1); if (ret < 0) return ret; if (ret == 0) -- cgit v1.2.3-70-g09d2