diff options
Diffstat (limited to 'arch/powerpc')
84 files changed, 4499 insertions, 1523 deletions
diff --git a/arch/powerpc/boot/.gitignore b/arch/powerpc/boot/.gitignore index 12da77ec022..1c1aadc8c48 100644 --- a/arch/powerpc/boot/.gitignore +++ b/arch/powerpc/boot/.gitignore @@ -27,7 +27,6 @@ zImage.bin.* zImage.chrp zImage.coff zImage.holly -zImage.iseries zImage.*lds zImage.miboot zImage.pmac diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index edfc9803ec9..957a83f4364 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -112,7 +112,6 @@ extern void iommu_unmap_page(struct iommu_table *tbl, dma_addr_t dma_handle, struct dma_attrs *attrs); extern void iommu_init_early_pSeries(void); -extern void iommu_init_early_iSeries(void); extern void iommu_init_early_dart(void); extern void iommu_init_early_pasemi(void); diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index fe0b09dceb7..cf417e51073 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -27,12 +27,6 @@ extern atomic_t ppc_n_lost_interrupts; /* This number is used when no interrupt has been assigned */ #define NO_IRQ (0) -/* This is a special irq number to return from get_irq() to tell that - * no interrupt happened _and_ ignore it (don't count it as bad). Some - * platforms like iSeries rely on that. - */ -#define NO_IRQ_IGNORE ((unsigned int)-1) - /* Total number of virq in the platform */ #define NR_IRQS CONFIG_NR_IRQS diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index f7727d91ac6..b921c3f4892 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -265,12 +265,9 @@ struct kvm_debug_exit_arch { struct kvm_guest_debug_arch { }; -#define KVM_REG_MASK 0x001f -#define KVM_REG_EXT_MASK 0xffe0 -#define KVM_REG_GPR 0x0000 -#define KVM_REG_FPR 0x0020 -#define KVM_REG_QPR 0x0040 -#define KVM_REG_FQPR 0x0060 +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; #define KVM_INTERRUPT_SET -1U #define KVM_INTERRUPT_UNSET -2U @@ -292,4 +289,41 @@ struct kvm_allocate_rma { __u64 rma_size; }; +struct kvm_book3e_206_tlb_entry { + __u32 mas8; + __u32 mas1; + __u64 mas2; + __u64 mas7_3; +}; + +struct kvm_book3e_206_tlb_params { + /* + * For mmu types KVM_MMU_FSL_BOOKE_NOHV and KVM_MMU_FSL_BOOKE_HV: + * + * - The number of ways of TLB0 must be a power of two between 2 and + * 16. + * - TLB1 must be fully associative. + * - The size of TLB0 must be a multiple of the number of ways, and + * the number of sets must be a power of two. + * - The size of TLB1 may not exceed 64 entries. + * - TLB0 supports 4 KiB pages. + * - The page sizes supported by TLB1 are as indicated by + * TLB1CFG (if MMUCFG[MAVN] = 0) or TLB1PS (if MMUCFG[MAVN] = 1) + * as returned by KVM_GET_SREGS. + * - TLB2 and TLB3 are reserved, and their entries in tlb_sizes[] + * and tlb_ways[] must be zero. + * + * tlb_ways[n] = tlb_sizes[n] means the array is fully associative. + * + * KVM will adjust TLBnCFG based on the sizes configured here, + * though arrays greater than 2048 entries will have TLBnCFG[NENTRY] + * set to zero. + */ + __u32 tlb_sizes[4]; + __u32 tlb_ways[4]; + __u32 reserved[8]; +}; + +#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 69c7377d207..aa795ccef29 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -90,6 +90,8 @@ struct kvmppc_vcpu_book3s { #endif int context_id[SID_CONTEXTS]; + bool hior_explicit; /* HIOR is set by ioctl, not PVR */ + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; struct hlist_head hpte_hash_pte_long[HPTEG_HASH_NUM_PTE_LONG]; struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; @@ -119,6 +121,11 @@ extern void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu); extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); +extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long status); +extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, + unsigned long slb_v, unsigned long valid); extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); @@ -138,6 +145,21 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); extern pfn_t kvmppc_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); +extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode); +extern void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index); +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index); +extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr, + unsigned long *nb_ret); +extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); +extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel); +extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel); +extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, + struct kvm_memory_slot *memslot); extern void kvmppc_entry_trampoline(void); extern void kvmppc_hv_entry_trampoline(void); @@ -183,7 +205,9 @@ static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) { if ( num < 14 ) { - to_svcpu(vcpu)->gpr[num] = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->gpr[num] = val; + svcpu_put(svcpu); to_book3s(vcpu)->shadow_vcpu->gpr[num] = val; } else vcpu->arch.gpr[num] = val; @@ -191,80 +215,120 @@ static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val) static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num) { - if ( num < 14 ) - return to_svcpu(vcpu)->gpr[num]; - else + if ( num < 14 ) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r = svcpu->gpr[num]; + svcpu_put(svcpu); + return r; + } else return vcpu->arch.gpr[num]; } static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val) { - to_svcpu(vcpu)->cr = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->cr = val; + svcpu_put(svcpu); to_book3s(vcpu)->shadow_vcpu->cr = val; } static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->cr; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 r; + r = svcpu->cr; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val) { - to_svcpu(vcpu)->xer = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->xer = val; to_book3s(vcpu)->shadow_vcpu->xer = val; + svcpu_put(svcpu); } static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->xer; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 r; + r = svcpu->xer; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_ctr(struct kvm_vcpu *vcpu, ulong val) { - to_svcpu(vcpu)->ctr = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->ctr = val; + svcpu_put(svcpu); } static inline ulong kvmppc_get_ctr(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->ctr; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->ctr; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_lr(struct kvm_vcpu *vcpu, ulong val) { - to_svcpu(vcpu)->lr = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->lr = val; + svcpu_put(svcpu); } static inline ulong kvmppc_get_lr(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->lr; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->lr; + svcpu_put(svcpu); + return r; } static inline void kvmppc_set_pc(struct kvm_vcpu *vcpu, ulong val) { - to_svcpu(vcpu)->pc = val; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->pc = val; + svcpu_put(svcpu); } static inline ulong kvmppc_get_pc(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->pc; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->pc; + svcpu_put(svcpu); + return r; } static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu) { ulong pc = kvmppc_get_pc(vcpu); - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 r; /* Load the instruction manually if it failed to do so in the * exit path */ if (svcpu->last_inst == KVM_INST_FETCH_FAILED) kvmppc_ld(vcpu, &pc, sizeof(u32), &svcpu->last_inst, false); - return svcpu->last_inst; + r = svcpu->last_inst; + svcpu_put(svcpu); + return r; } static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) { - return to_svcpu(vcpu)->fault_dar; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong r; + r = svcpu->fault_dar; + svcpu_put(svcpu); + return r; } static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/include/asm/kvm_book3s_32.h b/arch/powerpc/include/asm/kvm_book3s_32.h index de604db135f..38040ff8206 100644 --- a/arch/powerpc/include/asm/kvm_book3s_32.h +++ b/arch/powerpc/include/asm/kvm_book3s_32.h @@ -20,11 +20,15 @@ #ifndef __ASM_KVM_BOOK3S_32_H__ #define __ASM_KVM_BOOK3S_32_H__ -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) +static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { return to_book3s(vcpu)->shadow_vcpu; } +static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) +{ +} + #define PTE_SIZE 12 #define VSID_ALL 0 #define SR_INVALID 0x00000001 /* VSID 1 should always be unused */ diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index d0ac94f98f9..b0c08b14277 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -21,14 +21,56 @@ #define __ASM_KVM_BOOK3S_64_H__ #ifdef CONFIG_KVM_BOOK3S_PR -static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) +static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu) { + preempt_disable(); return &get_paca()->shadow_vcpu; } + +static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) +{ + preempt_enable(); +} #endif #define SPAPR_TCE_SHIFT 12 +#ifdef CONFIG_KVM_BOOK3S_64_HV +/* For now use fixed-size 16MB page table */ +#define HPT_ORDER 24 +#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ +#define HPT_NPTE (HPT_NPTEG << 3) /* 8 PTEs per PTEG */ +#define HPT_HASH_MASK (HPT_NPTEG - 1) +#endif + +#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ + +/* + * We use a lock bit in HPTE dword 0 to synchronize updates and + * accesses to each HPTE, and another bit to indicate non-present + * HPTEs. + */ +#define HPTE_V_HVLOCK 0x40UL +#define HPTE_V_ABSENT 0x20UL + +static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) +{ + unsigned long tmp, old; + + asm volatile(" ldarx %0,0,%2\n" + " and. %1,%0,%3\n" + " bne 2f\n" + " ori %0,%0,%4\n" + " stdcx. %0,0,%2\n" + " beq+ 2f\n" + " li %1,%3\n" + "2: isync" + : "=&r" (tmp), "=&r" (old) + : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) + : "cc", "memory"); + return old == 0; +} + static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, unsigned long pte_index) { @@ -62,4 +104,140 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, return rb; } +static inline unsigned long hpte_page_size(unsigned long h, unsigned long l) +{ + /* only handle 4k, 64k and 16M pages for now */ + if (!(h & HPTE_V_LARGE)) + return 1ul << 12; /* 4k page */ + if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206)) + return 1ul << 16; /* 64k page */ + if ((l & 0xff000) == 0) + return 1ul << 24; /* 16M page */ + return 0; /* error */ +} + +static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize) +{ + return ((ptel & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; +} + +static inline int hpte_is_writable(unsigned long ptel) +{ + unsigned long pp = ptel & (HPTE_R_PP0 | HPTE_R_PP); + + return pp != PP_RXRX && pp != PP_RXXX; +} + +static inline unsigned long hpte_make_readonly(unsigned long ptel) +{ + if ((ptel & HPTE_R_PP0) || (ptel & HPTE_R_PP) == PP_RWXX) + ptel = (ptel & ~HPTE_R_PP) | PP_RXXX; + else + ptel |= PP_RXRX; + return ptel; +} + +static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) +{ + unsigned int wimg = ptel & HPTE_R_WIMG; + + /* Handle SAO */ + if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) && + cpu_has_feature(CPU_FTR_ARCH_206)) + wimg = HPTE_R_M; + + if (!io_type) + return wimg == HPTE_R_M; + + return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; +} + +/* + * Lock and read a linux PTE. If it's present and writable, atomically + * set dirty and referenced bits and return the PTE, otherwise return 0. + */ +static inline pte_t kvmppc_read_update_linux_pte(pte_t *p, int writing) +{ + pte_t pte, tmp; + + /* wait until _PAGE_BUSY is clear then set it atomically */ + __asm__ __volatile__ ( + "1: ldarx %0,0,%3\n" + " andi. %1,%0,%4\n" + " bne- 1b\n" + " ori %1,%0,%4\n" + " stdcx. %1,0,%3\n" + " bne- 1b" + : "=&r" (pte), "=&r" (tmp), "=m" (*p) + : "r" (p), "i" (_PAGE_BUSY) + : "cc"); + + if (pte_present(pte)) { + pte = pte_mkyoung(pte); + if (writing && pte_write(pte)) + pte = pte_mkdirty(pte); + } + + *p = pte; /* clears _PAGE_BUSY */ + + return pte; +} + +/* Return HPTE cache control bits corresponding to Linux pte bits */ +static inline unsigned long hpte_cache_bits(unsigned long pte_val) +{ +#if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W + return pte_val & (HPTE_R_W | HPTE_R_I); +#else + return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) + + ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0); +#endif +} + +static inline bool hpte_read_permission(unsigned long pp, unsigned long key) +{ + if (key) + return PP_RWRX <= pp && pp <= PP_RXRX; + return 1; +} + +static inline bool hpte_write_permission(unsigned long pp, unsigned long key) +{ + if (key) + return pp == PP_RWRW; + return pp <= PP_RWRW; +} + +static inline int hpte_get_skey_perm(unsigned long hpte_r, unsigned long amr) +{ + unsigned long skey; + + skey = ((hpte_r & HPTE_R_KEY_HI) >> 57) | + ((hpte_r & HPTE_R_KEY_LO) >> 9); + return (amr >> (62 - 2 * skey)) & 3; +} + +static inline void lock_rmap(unsigned long *rmap) +{ + do { + while (test_bit(KVMPPC_RMAP_LOCK_BIT, rmap)) + cpu_relax(); + } while (test_and_set_bit_lock(KVMPPC_RMAP_LOCK_BIT, rmap)); +} + +static inline void unlock_rmap(unsigned long *rmap) +{ + __clear_bit_unlock(KVMPPC_RMAP_LOCK_BIT, rmap); +} + +static inline bool slot_is_aligned(struct kvm_memory_slot *memslot, + unsigned long pagesize) +{ + unsigned long mask = (pagesize >> PAGE_SHIFT) - 1; + + if (pagesize <= PAGE_SIZE) + return 1; + return !(memslot->base_gfn & mask) && !(memslot->npages & mask); +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ diff --git a/arch/powerpc/include/asm/kvm_e500.h b/arch/powerpc/include/asm/kvm_e500.h index adbfca9dd10..8cd50a51427 100644 --- a/arch/powerpc/include/asm/kvm_e500.h +++ b/arch/powerpc/include/asm/kvm_e500.h @@ -22,46 +22,55 @@ #define E500_PID_NUM 3 #define E500_TLB_NUM 2 -struct tlbe{ - u32 mas1; - u32 mas2; - u32 mas3; - u32 mas7; -}; - #define E500_TLB_VALID 1 #define E500_TLB_DIRTY 2 -struct tlbe_priv { +struct tlbe_ref { pfn_t pfn; unsigned int flags; /* E500_TLB_* */ }; +struct tlbe_priv { + struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */ +}; + struct vcpu_id_table; +struct kvmppc_e500_tlb_params { + int entries, ways, sets; +}; + struct kvmppc_vcpu_e500 { - /* Unmodified copy of the guest's TLB. */ - struct tlbe *gtlb_arch[E500_TLB_NUM]; + /* Unmodified copy of the guest's TLB -- shared with host userspace. */ + struct kvm_book3e_206_tlb_entry *gtlb_arch; + + /* Starting entry number in gtlb_arch[] */ + int gtlb_offset[E500_TLB_NUM]; /* KVM internal information associated with each guest TLB entry */ struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; - unsigned int gtlb_size[E500_TLB_NUM]; + struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; + unsigned int gtlb_nv[E500_TLB_NUM]; + /* + * information associated with each host TLB entry -- + * TLB1 only for now. If/when guest TLB1 entries can be + * mapped with host TLB0, this will be used for that too. + * + * We don't want to use this for guest TLB0 because then we'd + * have the overhead of doing the translation again even if + * the entry is still in the guest TLB (e.g. we swapped out + * and back, and our host TLB entries got evicted). + */ + struct tlbe_ref *tlb_refs[E500_TLB_NUM]; + unsigned int host_tlb1_nv; + u32 host_pid[E500_PID_NUM]; u32 pid[E500_PID_NUM]; u32 svr; - u32 mas0; - u32 mas1; - u32 mas2; - u32 mas3; - u32 mas4; - u32 mas5; - u32 mas6; - u32 mas7; - /* vcpu id table */ struct vcpu_id_table *idt; @@ -73,6 +82,9 @@ struct kvmppc_vcpu_e500 { u32 tlb1cfg; u64 mcar; + struct page **shared_tlb_pages; + int num_shared_tlb_pages; + struct kvm_vcpu vcpu; }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bf8af5d5d5d..52eb9c1f4fe 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -32,17 +32,32 @@ #include <linux/atomic.h> #include <asm/kvm_asm.h> #include <asm/processor.h> +#include <asm/page.h> #define KVM_MAX_VCPUS NR_CPUS #define KVM_MAX_VCORES NR_CPUS #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MEM_SLOTS_NUM (KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif +#ifdef CONFIG_KVM_BOOK3S_64_HV +#include <linux/mmu_notifier.h> + +#define KVM_ARCH_WANT_MMU_NOTIFIER + +struct kvm; +extern int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_age_hva(struct kvm *kvm, unsigned long hva); +extern int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); + +#endif + /* We don't currently support large pages. */ #define KVM_HPAGE_GFN_SHIFT(x) 0 #define KVM_NR_PAGE_SIZES 1 @@ -158,34 +173,72 @@ struct kvmppc_spapr_tce_table { struct page *pages[0]; }; -struct kvmppc_rma_info { +struct kvmppc_linear_info { void *base_virt; unsigned long base_pfn; unsigned long npages; struct list_head list; - atomic_t use_count; + atomic_t use_count; + int type; +}; + +/* + * The reverse mapping array has one entry for each HPTE, + * which stores the guest's view of the second word of the HPTE + * (including the guest physical address of the mapping), + * plus forward and backward pointers in a doubly-linked ring + * of HPTEs that map the same host page. The pointers in this + * ring are 32-bit HPTE indexes, to save space. + */ +struct revmap_entry { + unsigned long guest_rpte; + unsigned int forw, back; +}; + +/* + * We use the top bit of each memslot->rmap entry as a lock bit, + * and bit 32 as a present flag. The bottom 32 bits are the + * index in the guest HPT of a HPTE that points to the page. + */ +#define KVMPPC_RMAP_LOCK_BIT 63 +#define KVMPPC_RMAP_RC_SHIFT 32 +#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT) +#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT) +#define KVMPPC_RMAP_PRESENT 0x100000000ul +#define KVMPPC_RMAP_INDEX 0xfffffffful + +/* Low-order bits in kvm->arch.slot_phys[][] */ +#define KVMPPC_PAGE_ORDER_MASK 0x1f +#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ +#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ +#define KVMPPC_GOT_PAGE 0x80 + +struct kvm_arch_memory_slot { }; struct kvm_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV unsigned long hpt_virt; - unsigned long ram_npages; - unsigned long ram_psize; - unsigned long ram_porder; - struct kvmppc_pginfo *ram_pginfo; + struct revmap_entry *revmap; unsigned int lpid; unsigned int host_lpid; unsigned long host_lpcr; unsigned long sdr1; unsigned long host_sdr1; int tlbie_lock; - int n_rma_pages; unsigned long lpcr; unsigned long rmor; - struct kvmppc_rma_info *rma; + struct kvmppc_linear_info *rma; + unsigned long vrma_slb_v; + int rma_setup_done; + int using_mmu_notifiers; struct list_head spapr_tce_tables; + spinlock_t slot_phys_lock; + unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; + int slot_npages[KVM_MEM_SLOTS_NUM]; unsigned short last_vcpu[NR_CPUS]; struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; + struct kvmppc_linear_info *hpt_li; #endif /* CONFIG_KVM_BOOK3S_64_HV */ }; @@ -318,10 +371,6 @@ struct kvm_vcpu_arch { u32 vrsave; /* also USPRG0 */ u32 mmucr; ulong shadow_msr; - ulong sprg4; - ulong sprg5; - ulong sprg6; - ulong sprg7; ulong csrr0; ulong csrr1; ulong dsrr0; @@ -329,16 +378,14 @@ struct kvm_vcpu_arch { ulong mcsrr0; ulong mcsrr1; ulong mcsr; - ulong esr; u32 dec; u32 decar; u32 tbl; u32 tbu; u32 tcr; - u32 tsr; + ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; ulong ivpr; - u32 pir; u32 pvr; u32 shadow_pid; @@ -427,9 +474,14 @@ struct kvm_vcpu_arch { #ifdef CONFIG_KVM_BOOK3S_64_HV struct kvm_vcpu_arch_shared shregs; + unsigned long pgfault_addr; + long pgfault_index; + unsigned long pgfault_hpte[2]; + struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; + pgd_t *pgdir; #endif }; @@ -438,4 +490,12 @@ struct kvm_vcpu_arch { #define KVMPPC_VCPU_BUSY_IN_HOST 1 #define KVMPPC_VCPU_RUNNABLE 2 +/* Values for vcpu->arch.io_gpr */ +#define KVM_MMIO_REG_MASK 0x001f +#define KVM_MMIO_REG_EXT_MASK 0xffe0 +#define KVM_MMIO_REG_GPR 0x0000 +#define KVM_MMIO_REG_FPR 0x0020 +#define KVM_MMIO_REG_QPR 0x0040 +#define KVM_MMIO_REG_FQPR 0x0060 + #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h index 50533f9adf4..7b754e74300 100644 --- a/arch/powerpc/include/asm/kvm_para.h +++ b/arch/powerpc/include/asm/kvm_para.h @@ -22,6 +22,16 @@ #include <linux/types.h> +/* + * Additions to this struct must only occur at the end, and should be + * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present + * (albeit not necessarily relevant to the current target hardware platform). + * + * Struct fields are always 32 or 64 bit aligned, depending on them being 32 + * or 64 bit wide respectively. + * + * See Documentation/virtual/kvm/ppc-pv.txt + */ struct kvm_vcpu_arch_shared { __u64 scratch1; __u64 scratch2; @@ -33,11 +43,35 @@ struct kvm_vcpu_arch_shared { __u64 sprg3; __u64 srr0; __u64 srr1; - __u64 dar; + __u64 dar; /* dear on BookE */ __u64 msr; __u32 dsisr; __u32 int_pending; /* Tells the guest if we have an interrupt */ __u32 sr[16]; + __u32 mas0; + __u32 mas1; + __u64 mas7_3; + __u64 mas2; + __u32 mas4; + __u32 mas6; + __u32 esr; + __u32 pir; + + /* + * SPRG4-7 are user-readable, so we can only keep these consistent + * between the shared area and the real registers when there's an + * intervening exit to KVM. This also applies to SPRG3 on some + * chips. + * + * This suffices for access by guest userspace, since in PR-mode + * KVM, an exit must occur when changing the guest's MSR[PR]. + * If the guest kernel writes to SPRG3-7 via the shared area, it + * must also use the shared area for reading while in kernel space. + */ + __u64 sprg4; + __u64 sprg5; + __u64 sprg6; + __u64 sprg7; }; #define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ @@ -47,7 +81,10 @@ struct kvm_vcpu_arch_shared { #define KVM_FEATURE_MAGIC_PAGE 1 -#define KVM_MAGIC_FEAT_SR (1 << 0) +#define KVM_MAGIC_FEAT_SR (1 << 0) + +/* MASn, ESR, PIR, and high SPRGs */ +#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1) #ifdef __KERNEL__ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 46efd1a265c..9d6dee0f7d4 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -66,6 +66,7 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run, extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); +extern void kvmppc_decrementer_func(unsigned long data); extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); /* Core-specific hooks */ @@ -94,7 +95,7 @@ extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu); extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); -extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu); +extern void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); @@ -120,15 +121,17 @@ extern long kvmppc_alloc_hpt(struct kvm *kvm); extern void kvmppc_free_hpt(struct kvm *kvm); extern long kvmppc_prepare_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem); -extern void kvmppc_map_vrma(struct kvm *kvm, - struct kvm_userspace_memory_region *mem); +extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args); extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *rma); -extern struct kvmppc_rma_info *kvm_alloc_rma(void); -extern void kvm_release_rma(struct kvmppc_rma_info *ri); +extern struct kvmppc_linear_info *kvm_alloc_rma(void); +extern void kvm_release_rma(struct kvmppc_linear_info *ri); +extern struct kvmppc_linear_info *kvm_alloc_hpt(void); +extern void kvm_release_hpt(struct kvmppc_linear_info *li); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, @@ -175,6 +178,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); + void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); #ifdef CONFIG_KVM_BOOK3S_64_HV @@ -183,14 +189,19 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) paca[cpu].kvm_hstate.xics_phys = addr; } -extern void kvm_rma_init(void); +extern void kvm_linear_init(void); #else static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) {} -static inline void kvm_rma_init(void) +static inline void kvm_linear_init(void) {} #endif +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg); +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *cfg); + #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index bf37931d1ad..42ce570812c 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -99,9 +99,7 @@ struct machdep_calls { void (*init_IRQ)(void); - /* Return an irq, or NO_IRQ to indicate there are none pending. - * If for some reason there is no irq, but the interrupt - * shouldn't be counted as spurious, return NO_IRQ_IGNORE. */ + /* Return an irq, or NO_IRQ to indicate there are none pending. */ unsigned int (*get_irq)(void); /* PCI stuff */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index f5f89cafebd..cdb5421877e 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -41,9 +41,10 @@ /* MAS registers bit definitions */ #define MAS0_TLBSEL(x) (((x) << 28) & 0x30000000) -#define MAS0_ESEL(x) (((x) << 16) & 0x0FFF0000) -#define MAS0_NV(x) ((x) & 0x00000FFF) #define MAS0_ESEL_MASK 0x0FFF0000 +#define MAS0_ESEL_SHIFT 16 +#define MAS0_ESEL(x) (((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK) +#define MAS0_NV(x) ((x) & 0x00000FFF) #define MAS0_HES 0x00004000 #define MAS0_WQ_ALLWAYS 0x00000000 #define MAS0_WQ_COND 0x00001000 @@ -167,6 +168,7 @@ #define TLBnCFG_MAXSIZE 0x000f0000 /* Maximum Page Size (v1.0) */ #define TLBnCFG_MAXSIZE_SHIFT 16 #define TLBnCFG_ASSOC 0xff000000 /* Associativity */ +#define TLBnCFG_ASSOC_SHIFT 24 /* TLBnPS encoding */ #define TLBnPS_4K 0x00000004 diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 412ba493cb9..1c65a59881e 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -108,11 +108,11 @@ extern char initial_stab[]; #define HPTE_V_VRMA_MASK ASM_CONST(0x4001ffffff000000) /* Values for PP (assumes Ks=0, Kp=1) */ -/* pp0 will always be 0 for linux */ #define PP_RWXX 0 /* Supervisor read/write, User none */ #define PP_RWRX 1 /* Supervisor read/write, User read */ #define PP_RWRW 2 /* Supervisor read/write, User read/write */ #define PP_RXRX 3 /* Supervisor read, User read */ +#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ #ifndef __ASSEMBLY__ @@ -267,7 +267,6 @@ extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); extern void hpte_init_native(void); extern void hpte_init_lpar(void); -extern void hpte_init_iSeries(void); extern void hpte_init_beat(void); extern void hpte_init_beat_v3(void); @@ -325,9 +324,6 @@ extern void slb_set_size(u16 size); * WARNING - If you change these you must make sure the asm * implementations in slb_allocate (slb_low.S), do_stab_bolted * (head.S) and ASM_VSID_SCRAMBLE (below) are changed accordingly. - * - * You'll also need to change the precomputed VSID values in head.S - * which are used by the iSeries firmware. */ #define VSID_MULTIPLIER_256M ASM_CONST(200730139) /* 28-bit prime */ @@ -484,14 +480,6 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, | (ea >> SID_SHIFT_1T), 1T); } -/* - * This is only used on legacy iSeries in lparmap.c, - * hence the 256MB segment assumption. - */ -#define VSID_SCRAMBLE(pvsid) (((pvsid) * VSID_MULTIPLIER_256M) % \ - VSID_MODULUS_256M) -#define KERNEL_VSID(ea) VSID_SCRAMBLE(GET_ESID(ea)) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_MMU_HASH64_H_ */ diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 5d487657322..ac39e6a3b25 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -155,14 +155,7 @@ struct pci_dn { struct pci_dev *pcidev; /* back-pointer to the pci device */ #ifdef CONFIG_EEH - int class_code; /* pci device class */ - int eeh_mode; /* See eeh.h for possible EEH_MODEs */ - int eeh_config_addr; - int eeh_pe_config_addr; /* new-style partition endpoint address */ - int eeh_check_count; /* # times driver ignored error */ - int eeh_freeze_count; /* # times this device froze up. */ - int eeh_false_positives; /* # times this device reported #ff's */ - u32 config_space[16]; /* saved PCI config space */ + struct eeh_dev *edev; /* eeh device */ #endif #define IODA_INVALID_PE (-1) #ifdef CONFIG_PPC_POWERNV @@ -185,6 +178,13 @@ static inline int pci_device_from_OF_node(struct device_node *np, return 0; } +#if defined(CONFIG_EEH) +static inline struct eeh_dev *of_node_to_eeh_dev(struct device_node *dn) +{ + return PCI_DN(dn)->edev; +} +#endif + /** Find the bus corresponding to the indicated device node */ extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h index f54b3d26ce9..6653f2743c4 100644 --- a/arch/powerpc/include/asm/pci.h +++ b/arch/powerpc/include/asm/pci.h @@ -154,14 +154,6 @@ extern int pci_mmap_legacy_page_range(struct pci_bus *bus, #endif /* CONFIG_PPC64 */ -extern void pcibios_resource_to_bus(struct pci_dev *dev, - struct pci_bus_region *region, - struct resource *res); - -extern void pcibios_bus_to_resource(struct pci_dev *dev, - struct resource *res, - struct pci_bus_region *region); - extern void pcibios_claim_one_bus(struct pci_bus *b); extern void pcibios_finish_adding_to_bus(struct pci_bus *bus); @@ -190,6 +182,7 @@ extern void pci_resource_to_user(const struct pci_dev *dev, int bar, const struct resource *rsrc, resource_size_t *start, resource_size_t *end); +extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose); extern void pcibios_setup_bus_devices(struct pci_bus *bus); extern void pcibios_setup_bus_self(struct pci_bus *bus); extern void pcibios_setup_phb_io_space(struct pci_controller *hose); diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 1a8093fa8f7..078019b5b35 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -47,6 +47,8 @@ struct power_pmu { */ #define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ #define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ +#define PPMU_NO_SIPR 4 /* no SIPR/HV in MMCRA at all */ +#define PPMU_NO_CONT_SAMPLING 8 /* no continuous sampling */ /* * Values for flags to get_alternatives() diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index e980faae422..d81f99430fe 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -45,6 +45,7 @@ #define PPC_INST_MFSPR_DSCR_MASK 0xfc1fffff #define PPC_INST_MTSPR_DSCR 0x7c1103a6 #define PPC_INST_MTSPR_DSCR_MASK 0xfc1fffff +#define PPC_INST_SLBFEE 0x7c0007a7 #define PPC_INST_STRING 0x7c00042a #define PPC_INST_STRING_MASK 0xfc0007fe @@ -183,7 +184,8 @@ __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b)) #define PPC_ERATSX_DOT(t, a, w) stringify_in_c(.long PPC_INST_ERATSX_DOT | \ __PPC_RS(t) | __PPC_RA(a) | __PPC_RB(b)) - +#define PPC_SLBFEE_DOT(t, b) stringify_in_c(.long PPC_INST_SLBFEE | \ + __PPC_RT(t) | __PPC_RB(b)) /* * Define what the VSX XX1 form instructions will look like, then add diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index e660b37aa7d..80fa704d410 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -45,8 +45,6 @@ extern void init_pci_config_tokens (void); extern unsigned long get_phb_buid (struct device_node *); extern int rtas_setup_phb(struct pci_controller *phb); -extern unsigned long pci_probe_only; - #ifdef CONFIG_EEH void pci_addr_cache_build(void); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index b1a215eabef..9d7f0fb6902 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -216,6 +216,7 @@ #define DSISR_ISSTORE 0x02000000 /* access was a store */ #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ #define DSISR_NOSEGMENT 0x00200000 /* STAB/SLB miss */ +#define DSISR_KEYFAULT 0x00200000 /* Key fault */ #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_TBWL 0x11C /* Time Base Lower Register (super, R/W) */ @@ -237,6 +238,7 @@ #define LPCR_ISL (1ul << (63-2)) #define LPCR_VC_SH (63-2) #define LPCR_DPFD_SH (63-11) +#define LPCR_VRMASD (0x1ful << (63-16)) #define LPCR_VRMA_L (1ul << (63-12)) #define LPCR_VRMA_LP0 (1ul << (63-15)) #define LPCR_VRMA_LP1 (1ul << (63-16)) @@ -493,6 +495,9 @@ #define SPRN_SPRG7 0x117 /* Special Purpose Register General 7 */ #define SPRN_SRR0 0x01A /* Save/Restore Register 0 */ #define SPRN_SRR1 0x01B /* Save/Restore Register 1 */ +#define SRR1_ISI_NOPT 0x40000000 /* ISI: Not found in hash */ +#define SRR1_ISI_N_OR_G 0x10000000 /* ISI: Access is no-exec or G */ +#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ #define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ #define SRR1_WAKESYSERR 0x00300000 /* System error */ #define SRR1_WAKEEE 0x00200000 /* External interrupt */ diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 9d6c37a11ee..557cff845de 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -74,7 +74,6 @@ struct rtas_suspend_me_data { /* RTAS event classes */ #define RTAS_INTERNAL_ERROR 0x80000000 /* set bit 0 */ #define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */ -#define RTAS_POWERMGM_EVENTS 0x20000000 /* set bit 2 */ #define RTAS_HOTPLUG_EVENTS 0x10000000 /* set bit 3 */ #define RTAS_IO_EVENTS 0x08000000 /* set bit 4 */ #define RTAS_EVENT_SCAN_ALL_EVENTS 0xffffffff @@ -204,6 +203,39 @@ struct rtas_ext_event_log_v6 { /* Variable length. */ }; +/* pSeries event log format */ + +/* Two bytes ASCII section IDs */ +#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T') +#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W') +#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R') +#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') +#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') +#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') +#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') + +/* Vendor specific Platform Event Log Format, Version 6, section header */ +struct pseries_errorlog { + uint16_t id; /* 0x00 2-byte ASCII section ID */ + uint16_t length; /* 0x02 Section length in bytes */ + uint8_t version; /* 0x04 Section version */ + uint8_t subtype; /* 0x05 Section subtype */ + uint16_t creator_component; /* 0x06 Creator component ID */ + uint8_t data[]; /* 0x08 Start of section data */ +}; + +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id); + /* * This can be set by the rtas_flash module so that it can get called * as the absolutely last thing before the kernel terminates. diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index adba970ce91..ebc24dc5b1a 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -122,7 +122,6 @@ extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); extern void smp_muxed_ipi_message_pass(int cpu, int msg); extern irqreturn_t smp_ipi_demux(void); -void smp_init_iSeries(void); void smp_init_pSeries(void); void smp_init_cell(void); void smp_init_celleb(void); diff --git a/arch/powerpc/include/asm/udbg.h b/arch/powerpc/include/asm/udbg.h index 8338aef5a4d..b3038817b8d 100644 --- a/arch/powerpc/include/asm/udbg.h +++ b/arch/powerpc/include/asm/udbg.h @@ -44,7 +44,6 @@ extern void __init udbg_init_debug_lpar_hvsi(void); extern void __init udbg_init_pmac_realmode(void); extern void __init udbg_init_maple_realmode(void); extern void __init udbg_init_pas_realmode(void); -extern void __init udbg_init_iseries(void); extern void __init udbg_init_rtas_panel(void); extern void __init udbg_init_rtas_console(void); extern void __init udbg_init_debug_beat(void); diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h index 0a290a19594..6bfd5ffe1d4 100644 --- a/arch/powerpc/include/asm/vio.h +++ b/arch/powerpc/include/asm/vio.h @@ -69,6 +69,7 @@ struct vio_dev { }; struct vio_driver { + const char *name; const struct vio_device_id *id_table; int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); int (*remove)(struct vio_dev *dev); @@ -76,10 +77,17 @@ struct vio_driver { * be loaded in a CMO environment if it uses DMA. */ unsigned long (*get_desired_dma)(struct vio_dev *dev); + const struct dev_pm_ops *pm; struct device_driver driver; }; -extern int vio_register_driver(struct vio_driver *drv); +extern int __vio_register_driver(struct vio_driver *drv, struct module *owner, + const char *mod_name); +/* + * vio_register_driver must be a macro so that KBUILD_MODNAME can be expanded + */ +#define vio_register_driver(driver) \ + __vio_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) extern void vio_unregister_driver(struct vio_driver *drv); extern int vio_cmo_entitlement_update(size_t); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index cc492e48ddf..34b8afe94a5 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -412,16 +412,23 @@ int main(void) DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2)); DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3)); #endif - DEFINE(VCPU_SPRG4, offsetof(struct kvm_vcpu, arch.sprg4)); - DEFINE(VCPU_SPRG5, offsetof(struct kvm_vcpu, arch.sprg5)); - DEFINE(VCPU_SPRG6, offsetof(struct kvm_vcpu, arch.sprg6)); - DEFINE(VCPU_SPRG7, offsetof(struct kvm_vcpu, arch.sprg7)); + DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4)); + DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5)); + DEFINE(VCPU_SHARED_SPRG6, offsetof(struct kvm_vcpu_arch_shared, sprg6)); + DEFINE(VCPU_SHARED_SPRG7, offsetof(struct kvm_vcpu_arch_shared, sprg7)); DEFINE(VCPU_SHADOW_PID, offsetof(struct kvm_vcpu, arch.shadow_pid)); DEFINE(VCPU_SHADOW_PID1, offsetof(struct kvm_vcpu, arch.shadow_pid1)); DEFINE(VCPU_SHARED, offsetof(struct kvm_vcpu, arch.shared)); DEFINE(VCPU_SHARED_MSR, offsetof(struct kvm_vcpu_arch_shared, msr)); DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); + DEFINE(VCPU_SHARED_MAS0, offsetof(struct kvm_vcpu_arch_shared, mas0)); + DEFINE(VCPU_SHARED_MAS1, offsetof(struct kvm_vcpu_arch_shared, mas1)); + DEFINE(VCPU_SHARED_MAS2, offsetof(struct kvm_vcpu_arch_shared, mas2)); + DEFINE(VCPU_SHARED_MAS7_3, offsetof(struct kvm_vcpu_arch_shared, mas7_3)); + DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4)); + DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6)); + /* book3s */ #ifdef CONFIG_KVM_BOOK3S_64_HV DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid)); @@ -434,6 +441,7 @@ int main(void) DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); + DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); #endif diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2d0868a4e2f..cb705fdbb45 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -101,14 +101,14 @@ data_access_not_stab: END_MMU_FTR_SECTION_IFCLR(MMU_FTR_SLB) #endif EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD, - KVMTEST_PR, 0x300) + KVMTEST, 0x300) . = 0x380 .globl data_access_slb_pSeries data_access_slb_pSeries: HMT_MEDIUM SET_SCRATCH0(r13) - EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) + EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) std r3,PACA_EXSLB+EX_R3(r13) mfspr r3,SPRN_DAR #ifdef __DISABLED__ @@ -330,8 +330,8 @@ do_stab_bolted_pSeries: EXCEPTION_PROLOG_PSERIES_1(.do_stab_bolted, EXC_STD) #endif /* CONFIG_POWER4_ONLY */ - KVM_HANDLER_PR_SKIP(PACA_EXGEN, EXC_STD, 0x300) - KVM_HANDLER_PR_SKIP(PACA_EXSLB, EXC_STD, 0x380) + KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300) + KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x400) KVM_HANDLER_PR(PACA_EXSLB, EXC_STD, 0x480) KVM_HANDLER_PR(PACA_EXGEN, EXC_STD, 0x900) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 2c5635dce05..243dbabfe74 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -208,8 +208,8 @@ notrace void arch_local_irq_restore(unsigned long en) * we are checking the "new" CPU instead of the old one. This * is only a problem if an event happened on the "old" CPU. * - * External interrupt events on non-iseries will have caused - * interrupts to be hard-disabled, so there is no problem, we + * External interrupt events will have caused interrupts to + * be hard-disabled, so there is no problem, we * cannot have preempted. */ irq_happened = get_irq_happened(); @@ -445,9 +445,9 @@ void do_IRQ(struct pt_regs *regs) may_hard_irq_enable(); /* And finally process it */ - if (irq != NO_IRQ && irq != NO_IRQ_IGNORE) + if (irq != NO_IRQ) handle_one_irq(irq); - else if (irq != NO_IRQ_IGNORE) + else __get_cpu_var(irq_stat).spurious_irqs++; irq_exit(); diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 2985338d0e1..62bdf238966 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -1,5 +1,6 @@ /* * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. + * Copyright 2010-2011 Freescale Semiconductor, Inc. * * Authors: * Alexander Graf <agraf@suse.de> @@ -29,6 +30,7 @@ #include <asm/sections.h> #include <asm/cacheflush.h> #include <asm/disassemble.h> +#include <asm/ppc-opcode.h> #define KVM_MAGIC_PAGE (-4096L) #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x) @@ -41,34 +43,30 @@ #define KVM_INST_B 0x48000000 #define KVM_INST_B_MASK 0x03ffffff #define KVM_INST_B_MAX 0x01ffffff +#define KVM_INST_LI 0x38000000 #define KVM_MASK_RT 0x03e00000 #define KVM_RT_30 0x03c00000 #define KVM_MASK_RB 0x0000f800 #define KVM_INST_MFMSR 0x7c0000a6 -#define KVM_INST_MFSPR_SPRG0 0x7c1042a6 -#define KVM_INST_MFSPR_SPRG1 0x7c1142a6 -#define KVM_INST_MFSPR_SPRG2 0x7c1242a6 -#define KVM_INST_MFSPR_SPRG3 0x7c1342a6 -#define KVM_INST_MFSPR_SRR0 0x7c1a02a6 -#define KVM_INST_MFSPR_SRR1 0x7c1b02a6 -#define KVM_INST_MFSPR_DAR 0x7c1302a6 -#define KVM_INST_MFSPR_DSISR 0x7c1202a6 - -#define KVM_INST_MTSPR_SPRG0 0x7c1043a6 -#define KVM_INST_MTSPR_SPRG1 0x7c1143a6 -#define KVM_INST_MTSPR_SPRG2 0x7c1243a6 -#define KVM_INST_MTSPR_SPRG3 0x7c1343a6 -#define KVM_INST_MTSPR_SRR0 0x7c1a03a6 -#define KVM_INST_MTSPR_SRR1 0x7c1b03a6 -#define KVM_INST_MTSPR_DAR 0x7c1303a6 -#define KVM_INST_MTSPR_DSISR 0x7c1203a6 + +#define SPR_FROM 0 +#define SPR_TO 0x100 + +#define KVM_INST_SPR(sprn, moveto) (0x7c0002a6 | \ + (((sprn) & 0x1f) << 16) | \ + (((sprn) & 0x3e0) << 6) | \ + (moveto)) + +#define KVM_INST_MFSPR(sprn) KVM_INST_SPR(sprn, SPR_FROM) +#define KVM_INST_MTSPR(sprn) KVM_INST_SPR(sprn, SPR_TO) #define KVM_INST_TLBSYNC 0x7c00046c #define KVM_INST_MTMSRD_L0 0x7c000164 #define KVM_INST_MTMSRD_L1 0x7c010164 #define KVM_INST_MTMSR 0x7c000124 +#define KVM_INST_WRTEE 0x7c000106 #define KVM_INST_WRTEEI_0 0x7c000146 #define KVM_INST_WRTEEI_1 0x7c008146 @@ -270,26 +268,27 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt) #ifdef CONFIG_BOOKE -extern u32 kvm_emulate_wrteei_branch_offs; -extern u32 kvm_emulate_wrteei_ee_offs; -extern u32 kvm_emulate_wrteei_len; -extern u32 kvm_emulate_wrteei[]; +extern u32 kvm_emulate_wrtee_branch_offs; +extern u32 kvm_emulate_wrtee_reg_offs; +extern u32 kvm_emulate_wrtee_orig_ins_offs; +extern u32 kvm_emulate_wrtee_len; +extern u32 kvm_emulate_wrtee[]; -static void kvm_patch_ins_wrteei(u32 *inst) +static void kvm_patch_ins_wrtee(u32 *inst, u32 rt, int imm_one) { u32 *p; int distance_start; int distance_end; ulong next_inst; - p = kvm_alloc(kvm_emulate_wrteei_len * 4); + p = kvm_alloc(kvm_emulate_wrtee_len * 4); if (!p) return; /* Find out where we are and put everything there */ distance_start = (ulong)p - (ulong)inst; next_inst = ((ulong)inst + 4); - distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs]; + distance_end = next_inst - (ulong)&p[kvm_emulate_wrtee_branch_offs]; /* Make sure we only write valid b instructions */ if (distance_start > KVM_INST_B_MAX) { @@ -298,10 +297,65 @@ static void kvm_patch_ins_wrteei(u32 *inst) } /* Modify the chunk to fit the invocation */ - memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4); - p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK; - p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE); - flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4); + memcpy(p, kvm_emulate_wrtee, kvm_emulate_wrtee_len * 4); + p[kvm_emulate_wrtee_branch_offs] |= distance_end & KVM_INST_B_MASK; + + if (imm_one) { + p[kvm_emulate_wrtee_reg_offs] = + KVM_INST_LI | __PPC_RT(30) | MSR_EE; + } else { + /* Make clobbered registers work too */ + switch (get_rt(rt)) { + case 30: + kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs], + magic_var(scratch2), KVM_RT_30); + break; + case 31: + kvm_patch_ins_ll(&p[kvm_emulate_wrtee_reg_offs], + magic_var(scratch1), KVM_RT_30); + break; + default: + p[kvm_emulate_wrtee_reg_offs] |= rt; + break; + } + } + + p[kvm_emulate_wrtee_orig_ins_offs] = *inst; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrtee_len * 4); + + /* Patch the invocation */ + kvm_patch_ins_b(inst, distance_start); +} + +extern u32 kvm_emulate_wrteei_0_branch_offs; +extern u32 kvm_emulate_wrteei_0_len; +extern u32 kvm_emulate_wrteei_0[]; + +static void kvm_patch_ins_wrteei_0(u32 *inst) +{ + u32 *p; + int distance_start; + int distance_end; + ulong next_inst; + + p = kvm_alloc(kvm_emulate_wrteei_0_len * 4); + if (!p) + return; + + /* Find out where we are and put everything there */ + distance_start = (ulong)p - (ulong)inst; + next_inst = ((ulong)inst + 4); + distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_0_branch_offs]; + + /* Make sure we only write valid b instructions */ + if (distance_start > KVM_INST_B_MAX) { + kvm_patching_worked = false; + return; + } + + memcpy(p, kvm_emulate_wrteei_0, kvm_emulate_wrteei_0_len * 4); + p[kvm_emulate_wrteei_0_branch_offs] |= distance_end & KVM_INST_B_MASK; + flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_0_len * 4); /* Patch the invocation */ kvm_patch_ins_b(inst, distance_start); @@ -380,56 +434,191 @@ static void kvm_check_ins(u32 *inst, u32 features) case KVM_INST_MFMSR: kvm_patch_ins_ld(inst, magic_var(msr), inst_rt); break; - case KVM_INST_MFSPR_SPRG0: + case KVM_INST_MFSPR(SPRN_SPRG0): kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt); break; - case KVM_INST_MFSPR_SPRG1: + case KVM_INST_MFSPR(SPRN_SPRG1): kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt); break; - case KVM_INST_MFSPR_SPRG2: + case KVM_INST_MFSPR(SPRN_SPRG2): kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt); break; - case KVM_INST_MFSPR_SPRG3: + case KVM_INST_MFSPR(SPRN_SPRG3): kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt); break; - case KVM_INST_MFSPR_SRR0: + case KVM_INST_MFSPR(SPRN_SRR0): kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt); break; - case KVM_INST_MFSPR_SRR1: + case KVM_INST_MFSPR(SPRN_SRR1): kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt); break; - case KVM_INST_MFSPR_DAR: +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_DEAR): +#else + case KVM_INST_MFSPR(SPRN_DAR): +#endif kvm_patch_ins_ld(inst, magic_var(dar), inst_rt); break; - case KVM_INST_MFSPR_DSISR: + case KVM_INST_MFSPR(SPRN_DSISR): kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt); break; +#ifdef CONFIG_PPC_BOOK3E_MMU + case KVM_INST_MFSPR(SPRN_MAS0): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas0), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS1): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas1), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS2): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(mas2), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS3): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas7_3) + 4, inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas4), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas6), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_MAS7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(mas7_3), inst_rt); + break; +#endif /* CONFIG_PPC_BOOK3E_MMU */ + + case KVM_INST_MFSPR(SPRN_SPRG4): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG4R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg4), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG5): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG5R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg5), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG6): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG6R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg6), inst_rt); + break; + case KVM_INST_MFSPR(SPRN_SPRG7): +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_SPRG7R): +#endif + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_ld(inst, magic_var(sprg7), inst_rt); + break; + +#ifdef CONFIG_BOOKE + case KVM_INST_MFSPR(SPRN_ESR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(esr), inst_rt); + break; +#endif + + case KVM_INST_MFSPR(SPRN_PIR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_lwz(inst, magic_var(pir), inst_rt); + break; + + /* Stores */ - case KVM_INST_MTSPR_SPRG0: + case KVM_INST_MTSPR(SPRN_SPRG0): kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt); break; - case KVM_INST_MTSPR_SPRG1: + case KVM_INST_MTSPR(SPRN_SPRG1): kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt); break; - case KVM_INST_MTSPR_SPRG2: + case KVM_INST_MTSPR(SPRN_SPRG2): kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt); break; - case KVM_INST_MTSPR_SPRG3: + case KVM_INST_MTSPR(SPRN_SPRG3): kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt); break; - case KVM_INST_MTSPR_SRR0: + case KVM_INST_MTSPR(SPRN_SRR0): kvm_patch_ins_std(inst, magic_var(srr0), inst_rt); break; - case KVM_INST_MTSPR_SRR1: + case KVM_INST_MTSPR(SPRN_SRR1): kvm_patch_ins_std(inst, magic_var(srr1), inst_rt); break; - case KVM_INST_MTSPR_DAR: +#ifdef CONFIG_BOOKE + case KVM_INST_MTSPR(SPRN_DEAR): +#else + case KVM_INST_MTSPR(SPRN_DAR): +#endif kvm_patch_ins_std(inst, magic_var(dar), inst_rt); break; - case KVM_INST_MTSPR_DSISR: + case KVM_INST_MTSPR(SPRN_DSISR): kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt); break; +#ifdef CONFIG_PPC_BOOK3E_MMU + case KVM_INST_MTSPR(SPRN_MAS0): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas0), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS1): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas1), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS2): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(mas2), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS3): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas7_3) + 4, inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas4), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas6), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_MAS7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(mas7_3), inst_rt); + break; +#endif /* CONFIG_PPC_BOOK3E_MMU */ + + case KVM_INST_MTSPR(SPRN_SPRG4): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg4), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG5): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg5), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG6): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg6), inst_rt); + break; + case KVM_INST_MTSPR(SPRN_SPRG7): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_std(inst, magic_var(sprg7), inst_rt); + break; + +#ifdef CONFIG_BOOKE + case KVM_INST_MTSPR(SPRN_ESR): + if (features & KVM_MAGIC_FEAT_MAS0_TO_SPRG7) + kvm_patch_ins_stw(inst, magic_var(esr), inst_rt); + break; +#endif /* Nops */ case KVM_INST_TLBSYNC: @@ -444,6 +633,11 @@ static void kvm_check_ins(u32 *inst, u32 features) case KVM_INST_MTMSRD_L0: kvm_patch_ins_mtmsr(inst, inst_rt); break; +#ifdef CONFIG_BOOKE + case KVM_INST_WRTEE: + kvm_patch_ins_wrtee(inst, inst_rt, 0); + break; +#endif } switch (inst_no_rt & ~KVM_MASK_RB) { @@ -461,13 +655,19 @@ static void kvm_check_ins(u32 *inst, u32 features) switch (_inst) { #ifdef CONFIG_BOOKE case KVM_INST_WRTEEI_0: + kvm_patch_ins_wrteei_0(inst); + break; + case KVM_INST_WRTEEI_1: - kvm_patch_ins_wrteei(inst); + kvm_patch_ins_wrtee(inst, 0, 1); break; #endif } } +extern u32 kvm_template_start[]; +extern u32 kvm_template_end[]; + static void kvm_use_magic_page(void) { u32 *p; @@ -488,8 +688,23 @@ static void kvm_use_magic_page(void) start = (void*)_stext; end = (void*)_etext; - for (p = start; p < end; p++) + /* + * Being interrupted in the middle of patching would + * be bad for SPRG4-7, which KVM can't keep in sync + * with emulated accesses because reads don't trap. + */ + local_irq_disable(); + + for (p = start; p < end; p++) { + /* Avoid patching the template code */ + if (p >= kvm_template_start && p < kvm_template_end) { + p = kvm_template_end - 1; + continue; + } kvm_check_ins(p, features); + } + + local_irq_enable(); printk(KERN_INFO "KVM: Live patching for a fast VM %s\n", kvm_patching_worked ? "worked" : "failed"); diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S index f2b1b2523e6..e291cf3cf95 100644 --- a/arch/powerpc/kernel/kvm_emul.S +++ b/arch/powerpc/kernel/kvm_emul.S @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright SUSE Linux Products GmbH 2010 + * Copyright 2010-2011 Freescale Semiconductor, Inc. * * Authors: Alexander Graf <agraf@suse.de> */ @@ -65,6 +66,9 @@ kvm_hypercall_start: shared->critical == r1 and r2 is always != r1 */ \ STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0); +.global kvm_template_start +kvm_template_start: + .global kvm_emulate_mtmsrd kvm_emulate_mtmsrd: @@ -167,6 +171,9 @@ maybe_stay_in_guest: kvm_emulate_mtmsr_reg2: ori r30, r0, 0 + /* Put MSR into magic page because we don't call mtmsr */ + STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + /* Check if we have to fetch an interrupt */ lwz r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) cmpwi r31, 0 @@ -174,15 +181,10 @@ kvm_emulate_mtmsr_reg2: /* Check if we may trigger an interrupt */ andi. r31, r30, MSR_EE - beq no_mtmsr - - b do_mtmsr + bne do_mtmsr no_mtmsr: - /* Put MSR into magic page because we don't call mtmsr */ - STL64(r30, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) - SCRATCH_RESTORE /* Go back to caller */ @@ -210,24 +212,80 @@ kvm_emulate_mtmsr_orig_ins_offs: kvm_emulate_mtmsr_len: .long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4 +/* also used for wrteei 1 */ +.global kvm_emulate_wrtee +kvm_emulate_wrtee: + + SCRATCH_SAVE + + /* Fetch old MSR in r31 */ + LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + /* Insert new MSR[EE] */ +kvm_emulate_wrtee_reg: + ori r30, r0, 0 + rlwimi r31, r30, 0, MSR_EE + + /* + * If MSR[EE] is now set, check for a pending interrupt. + * We could skip this if MSR[EE] was already on, but that + * should be rare, so don't bother. + */ + andi. r30, r30, MSR_EE + + /* Put MSR into magic page because we don't call wrtee */ + STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) + + beq no_wrtee + + /* Check if we have to fetch an interrupt */ + lwz r30, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0) + cmpwi r30, 0 + bne do_wrtee + +no_wrtee: + SCRATCH_RESTORE + + /* Go back to caller */ +kvm_emulate_wrtee_branch: + b . + +do_wrtee: + SCRATCH_RESTORE + /* Just fire off the wrtee if it's critical */ +kvm_emulate_wrtee_orig_ins: + wrtee r0 -.global kvm_emulate_wrteei -kvm_emulate_wrteei: + b kvm_emulate_wrtee_branch +kvm_emulate_wrtee_end: + +.global kvm_emulate_wrtee_branch_offs +kvm_emulate_wrtee_branch_offs: + .long (kvm_emulate_wrtee_branch - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_reg_offs +kvm_emulate_wrtee_reg_offs: + .long (kvm_emulate_wrtee_reg - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_orig_ins_offs +kvm_emulate_wrtee_orig_ins_offs: + .long (kvm_emulate_wrtee_orig_ins - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrtee_len +kvm_emulate_wrtee_len: + .long (kvm_emulate_wrtee_end - kvm_emulate_wrtee) / 4 + +.global kvm_emulate_wrteei_0 +kvm_emulate_wrteei_0: SCRATCH_SAVE /* Fetch old MSR in r31 */ LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) /* Remove MSR_EE from old MSR */ - li r30, 0 - ori r30, r30, MSR_EE - andc r31, r31, r30 - - /* OR new MSR_EE onto the old MSR */ -kvm_emulate_wrteei_ee: - ori r31, r31, 0 + rlwinm r31, r31, 0, ~MSR_EE /* Write new MSR value back */ STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0) @@ -235,22 +293,17 @@ kvm_emulate_wrteei_ee: SCRATCH_RESTORE /* Go back to caller */ -kvm_emulate_wrteei_branch: +kvm_emulate_wrteei_0_branch: b . -kvm_emulate_wrteei_end: - -.global kvm_emulate_wrteei_branch_offs -kvm_emulate_wrteei_branch_offs: - .long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4 +kvm_emulate_wrteei_0_end: -.global kvm_emulate_wrteei_ee_offs -kvm_emulate_wrteei_ee_offs: - .long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4 - -.global kvm_emulate_wrteei_len -kvm_emulate_wrteei_len: - .long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4 +.global kvm_emulate_wrteei_0_branch_offs +kvm_emulate_wrteei_0_branch_offs: + .long (kvm_emulate_wrteei_0_branch - kvm_emulate_wrteei_0) / 4 +.global kvm_emulate_wrteei_0_len +kvm_emulate_wrteei_0_len: + .long (kvm_emulate_wrteei_0_end - kvm_emulate_wrteei_0) / 4 .global kvm_emulate_mtsrin kvm_emulate_mtsrin: @@ -300,3 +353,6 @@ kvm_emulate_mtsrin_orig_ins_offs: .global kvm_emulate_mtsrin_len kvm_emulate_mtsrin_len: .long (kvm_emulate_mtsrin_end - kvm_emulate_mtsrin) / 4 + +.global kvm_template_end +kvm_template_end: diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index d0373bcb7c9..8e78e93c818 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -49,9 +49,6 @@ static int global_phb_number; /* Global phb counter */ /* ISA Memory physical address */ resource_size_t isa_mem_base; -/* Default PCI flags is 0 on ppc32, modified at boot on ppc64 */ -unsigned int pci_flags = 0; - static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; @@ -834,60 +831,6 @@ int pci_proc_domain(struct pci_bus *bus) return 1; } -void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, - struct resource *res) -{ - resource_size_t offset = 0, mask = (resource_size_t)-1; - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - if (!hose) - return; - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - - region->start = (res->start - offset) & mask; - region->end = (res->end - offset) & mask; -} -EXPORT_SYMBOL(pcibios_resource_to_bus); - -void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, - struct pci_bus_region *region) -{ - resource_size_t offset = 0, mask = (resource_size_t)-1; - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - if (!hose) - return; - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - res->start = (region->start + offset) & mask; - res->end = (region->end + offset) & mask; -} -EXPORT_SYMBOL(pcibios_bus_to_resource); - -/* Fixup a bus resource into a linux resource */ -static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - resource_size_t offset = 0, mask = (resource_size_t)-1; - - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - - res->start = (res->start + offset) & mask; - res->end = (res->end + offset) & mask; -} - - /* This header fixup will do the resource fixup for all devices as they are * probed, but not for bridge ranges */ @@ -927,18 +870,11 @@ static void __devinit pcibios_fixup_resources(struct pci_dev *dev) continue; } - pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] fixup...\n", + pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]\n", pci_name(dev), i, (unsigned long long)res->start,\ (unsigned long long)res->end, (unsigned int)res->flags); - - fixup_resource(res, dev); - - pr_debug("PCI:%s %016llx-%016llx\n", - pci_name(dev), - (unsigned long long)res->start, - (unsigned long long)res->end); } /* Call machine specific resource fixup */ @@ -1040,27 +976,18 @@ static void __devinit pcibios_fixup_bridge(struct pci_bus *bus) continue; } - pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x] fixup...\n", + pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x]\n", pci_name(dev), i, (unsigned long long)res->start,\ (unsigned long long)res->end, (unsigned int)res->flags); - /* Perform fixup */ - fixup_resource(res, dev); - /* Try to detect uninitialized P2P bridge resources, * and clear them out so they get re-assigned later */ if (pcibios_uninitialized_bridge_resource(bus, res)) { res->flags = 0; pr_debug("PCI:%s (unassigned)\n", pci_name(dev)); - } else { - - pr_debug("PCI:%s %016llx-%016llx\n", - pci_name(dev), - (unsigned long long)res->start, - (unsigned long long)res->end); } } } @@ -1550,6 +1477,11 @@ int pcibios_enable_device(struct pci_dev *dev, int mask) return pci_enable_resources(dev, mask); } +resource_size_t pcibios_io_space_offset(struct pci_controller *hose) +{ + return (unsigned long) hose->io_base_virt - _IO_BASE; +} + static void __devinit pcibios_setup_phb_resources(struct pci_controller *hose, struct list_head *resources) { struct resource *res; @@ -1574,7 +1506,7 @@ static void __devinit pcibios_setup_phb_resources(struct pci_controller *hose, s (unsigned long long)res->start, (unsigned long long)res->end, (unsigned long)res->flags); - pci_add_resource(resources, res); + pci_add_resource_offset(resources, res, pcibios_io_space_offset(hose)); /* Hookup PHB Memory resources */ for (i = 0; i < 3; ++i) { @@ -1597,7 +1529,7 @@ static void __devinit pcibios_setup_phb_resources(struct pci_controller *hose, s (unsigned long long)res->start, (unsigned long long)res->end, (unsigned long)res->flags); - pci_add_resource(resources, res); + pci_add_resource_offset(resources, res, hose->pci_mem_offset); } pr_debug("PCI: PHB MEM offset = %016llx\n", diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index fdd1a3d951d..4b06ec5a502 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -219,9 +219,9 @@ void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose) struct resource *res = &hose->io_resource; /* Fixup IO space offset */ - io_offset = (unsigned long)hose->io_base_virt - isa_io_base; - res->start = (res->start + io_offset) & 0xffffffffu; - res->end = (res->end + io_offset) & 0xffffffffu; + io_offset = pcibios_io_space_offset(hose); + res->start += io_offset; + res->end += io_offset; } static int __init pcibios_init(void) diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index 3318d39b7d4..94a54f61d34 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -33,8 +33,6 @@ #include <asm/machdep.h> #include <asm/ppc-pci.h> -unsigned long pci_probe_only = 1; - /* pci_io_base -- the base address from which io bars are offsets. * This is the lowest I/O base address (so bar values are always positive), * and it *must* be the start of ISA space if an ISA bus exists because @@ -55,9 +53,6 @@ static int __init pcibios_init(void) */ ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; - if (pci_probe_only) - pci_add_flags(PCI_PROBE_ONLY); - /* On ppc64, we always enable PCI domains and we keep domain 0 * backward compatible in /proc for video cards */ @@ -173,7 +168,7 @@ static int __devinit pcibios_map_phb_io_space(struct pci_controller *hose) return -ENOMEM; /* Fixup hose IO resource */ - io_virt_offset = (unsigned long)hose->io_base_virt - _IO_BASE; + io_virt_offset = pcibios_io_space_offset(hose); hose->io_resource.start += io_virt_offset; hose->io_resource.end += io_virt_offset; diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index b37d0b5a796..89dde171a6f 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -75,6 +75,7 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) { u64 base, size; unsigned int flags; + struct pci_bus_region region; struct resource *res; const u32 *addrs; u32 i; @@ -106,10 +107,11 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); continue; } - res->start = base; - res->end = base + size - 1; res->flags = flags; res->name = pci_name(dev); + region.start = base; + region.end = base + size - 1; + pcibios_bus_to_resource(dev, res, ®ion); } } @@ -209,6 +211,7 @@ void __devinit of_scan_pci_bridge(struct pci_dev *dev) struct pci_bus *bus; const u32 *busrange, *ranges; int len, i, mode; + struct pci_bus_region region; struct resource *res; unsigned int flags; u64 size; @@ -270,9 +273,10 @@ void __devinit of_scan_pci_bridge(struct pci_dev *dev) res = bus->resource[i]; ++i; } - res->start = of_read_number(&ranges[1], 2); - res->end = res->start + size - 1; res->flags = flags; + region.start = of_read_number(&ranges[1], 2); + region.end = region.start + size - 1; + pcibios_bus_to_resource(dev, res, ®ion); } sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), bus->number); diff --git a/arch/powerpc/kernel/pmc.c b/arch/powerpc/kernel/pmc.c index a841a9d136a..58eaa3ddf7b 100644 --- a/arch/powerpc/kernel/pmc.c +++ b/arch/powerpc/kernel/pmc.c @@ -13,6 +13,7 @@ */ #include <linux/errno.h> +#include <linux/bug.h> #include <linux/spinlock.h> #include <linux/export.h> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index b2aae219b4b..99860273211 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -446,7 +446,7 @@ static void __init __attribute__((noreturn)) prom_panic(const char *reason) if (RELOC(of_platform) == PLATFORM_POWERMAC) asm("trap\n"); - /* ToDo: should put up an SRC here on p/iSeries */ + /* ToDo: should put up an SRC here on pSeries */ call_prom("exit", 0, 0); for (;;) /* should never get here */ diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 3dc679930d0..fcec38241f7 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -867,6 +867,40 @@ int rtas_ibm_suspend_me(struct rtas_args *args) } #endif +/** + * Find a specific pseries error log in an RTAS extended event log. + * @log: RTAS error/event log + * @section_id: two character section identifier + * + * Returns a pointer to the specified errorlog or NULL if not found. + */ +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id) +{ + struct rtas_ext_event_log_v6 *ext_log = + (struct rtas_ext_event_log_v6 *)log->buffer; + struct pseries_errorlog *sect; + unsigned char *p, *log_end; + + /* Check that we understand the format */ + if (log->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || + ext_log->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || + ext_log->company_id != RTAS_V6EXT_COMPANY_ID_IBM) + return NULL; + + log_end = log->buffer + log->extended_log_length; + p = ext_log->vendor_log; + + while (p < log_end) { + sect = (struct pseries_errorlog *)p; + if (sect->id == section_id) + return sect; + p += sect->length; + } + + return NULL; +} + asmlinkage int ppc_rtas(struct rtas_args __user *uargs) { struct rtas_args args; diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 517bd86bc3f..179af906dcd 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -279,7 +279,7 @@ void __init find_and_init_phbs(void) eeh_dev_phb_init(); /* - * pci_probe_only and pci_assign_all_buses can be set via properties + * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties * in chosen. */ if (of_chosen) { @@ -287,8 +287,12 @@ void __init find_and_init_phbs(void) prop = of_get_property(of_chosen, "linux,pci-probe-only", NULL); - if (prop) - pci_probe_only = *prop; + if (prop) { + if (*prop) + pci_add_flags(PCI_PROBE_ONLY); + else + pci_clear_flags(PCI_PROBE_ONLY); + } #ifdef CONFIG_PPC32 /* Will be made generic soon */ prop = of_get_property(of_chosen, diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6d639b2d24c..389bd4f0cdb 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -597,7 +597,7 @@ void __init setup_arch(char **cmdline_p) /* Initialize the MMU context management stuff */ mmu_context_init(); - kvm_rma_init(); + kvm_linear_init(); ppc64_boot_msg(0x15, "Setup Done"); } diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index 57fa2c0a531..c39c1ca77f4 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -46,9 +46,6 @@ void __init udbg_early_init(void) #elif defined(CONFIG_PPC_EARLY_DEBUG_MAPLE) /* Maple real mode debug */ udbg_init_maple_realmode(); -#elif defined(CONFIG_PPC_EARLY_DEBUG_ISERIES) - /* For iSeries - hit Ctrl-x Ctrl-x to see the output */ - udbg_init_iseries(); #elif defined(CONFIG_PPC_EARLY_DEBUG_BEAT) udbg_init_debug_beat(); #elif defined(CONFIG_PPC_EARLY_DEBUG_PAS_REALMODE) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 1ce1ec410a1..9eb5b9b536a 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -262,17 +262,11 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) * the "data" page of the vDSO or you'll stop getting kernel updates * and your nice userland gettimeofday will be totally dead. * It's fine to use that for setting breakpoints in the vDSO code - * pages though - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. + * pages though. */ rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pagelist); if (rc) { current->mm->context.vdso_base = 0; @@ -726,10 +720,10 @@ static int __init vdso_init(void) vdso_data->version.minor = SYSTEMCFG_MINOR; vdso_data->processor = mfspr(SPRN_PVR); /* - * Fake the old platform number for pSeries and iSeries and add + * Fake the old platform number for pSeries and add * in LPAR bit if necessary */ - vdso_data->platform = machine_is(iseries) ? 0x200 : 0x100; + vdso_data->platform = 0x100; if (firmware_has_feature(FW_FEATURE_LPAR)) vdso_data->platform |= 1; vdso_data->physicalMemorySize = memblock_phys_mem_size(); diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index bca3fc427b4..b2f7c8480bf 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1159,17 +1159,21 @@ static int vio_bus_remove(struct device *dev) * vio_register_driver: - Register a new vio driver * @drv: The vio_driver structure to be registered. */ -int vio_register_driver(struct vio_driver *viodrv) +int __vio_register_driver(struct vio_driver *viodrv, struct module *owner, + const char *mod_name) { - printk(KERN_DEBUG "%s: driver %s registering\n", __func__, - viodrv->driver.name); + pr_debug("%s: driver %s registering\n", __func__, viodrv->name); /* fill in 'struct driver' fields */ + viodrv->driver.name = viodrv->name; + viodrv->driver.pm = viodrv->pm; viodrv->driver.bus = &vio_bus_type; + viodrv->driver.owner = owner; + viodrv->driver.mod_name = mod_name; return driver_register(&viodrv->driver); } -EXPORT_SYMBOL(vio_register_driver); +EXPORT_SYMBOL(__vio_register_driver); /** * vio_unregister_driver - Remove registration of vio driver. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 78133deb4b6..8f64709ae33 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -69,6 +69,7 @@ config KVM_BOOK3S_64 config KVM_BOOK3S_64_HV bool "KVM support for POWER7 and PPC970 using hypervisor mode in host" depends on KVM_BOOK3S_64 + select MMU_NOTIFIER ---help--- Support running unmodified book3s_64 guest kernels in virtual machines on POWER7 and PPC970 processors that have diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index e41ac6f7dcf..7d54f4ed6d9 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -258,7 +258,7 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) return true; } -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; unsigned long old_pending = vcpu->arch.pending_exceptions; @@ -423,10 +423,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -450,10 +450,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -477,41 +477,10 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return 0; } -/* - * Get (and clear) the dirty memory log for a memory slot. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, - struct kvm_dirty_log *log) +void kvmppc_decrementer_func(unsigned long data) { - struct kvm_memory_slot *memslot; - struct kvm_vcpu *vcpu; - ulong ga, ga_end; - int is_dirty = 0; - int r; - unsigned long n; - - mutex_lock(&kvm->slots_lock); - - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) - goto out; - - /* If nothing is dirty, don't bother messing with page tables. */ - if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - ga = memslot->base_gfn << PAGE_SHIFT; - ga_end = ga + (memslot->npages << PAGE_SHIFT); - - kvm_for_each_vcpu(n, vcpu, kvm) - kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); - - n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); - } - - r = 0; -out: - mutex_unlock(&kvm->slots_lock); - return r; + kvmppc_core_queue_dec(vcpu); + kvm_vcpu_kick(vcpu); } diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 9fecbfbce77..f922c29bb23 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -151,13 +151,15 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) bool primary = false; bool evict = false; struct hpte_cache *pte; + int r = 0; /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; @@ -249,7 +251,8 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); - return 0; +out: + return r; } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -297,12 +300,14 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) u64 gvsid; u32 sr; struct kvmppc_sid_map *map; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + int r = 0; if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ svcpu->sr[esid] = SR_INVALID; - return -ENOENT; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -315,17 +320,21 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { int i; - struct kvmppc_book3s_shadow_vcpu *svcpu = to_svcpu(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr)); for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++) svcpu->sr[i] = SR_INVALID; + + svcpu_put(svcpu); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index fa2f08434ba..6f87f39a1ac 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -88,12 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) int vflags = 0; int attempt = 0; struct kvmppc_sid_map *map; + int r = 0; /* Get host physical address for gpa */ hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); if (is_error_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); - return -EINVAL; + r = -EINVAL; + goto out; } hpaddr <<= PAGE_SHIFT; hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); @@ -110,7 +112,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n", vsid, orig_pte->eaddr); WARN_ON(true); - return -EINVAL; + r = -EINVAL; + goto out; } vsid = map->host_vsid; @@ -131,8 +134,10 @@ map_again: /* In case we tried normal mapping already, let's nuke old entries */ if (attempt > 1) - if (ppc_md.hpte_remove(hpteg) < 0) - return -1; + if (ppc_md.hpte_remove(hpteg) < 0) { + r = -1; + goto out; + } ret = ppc_md.hpte_insert(hpteg, va, hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); @@ -162,7 +167,8 @@ map_again: kvmppc_mmu_hpte_cache_map(vcpu, pte); } - return 0; +out: + return r; } static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) @@ -207,25 +213,30 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); int i; int max_slb_size = 64; int found_inval = -1; int r; - if (!to_svcpu(vcpu)->slb_max) - to_svcpu(vcpu)->slb_max = 1; + if (!svcpu->slb_max) + svcpu->slb_max = 1; /* Are we overwriting? */ - for (i = 1; i < to_svcpu(vcpu)->slb_max; i++) { - if (!(to_svcpu(vcpu)->slb[i].esid & SLB_ESID_V)) + for (i = 1; i < svcpu->slb_max; i++) { + if (!(svcpu->slb[i].esid & SLB_ESID_V)) found_inval = i; - else if ((to_svcpu(vcpu)->slb[i].esid & ESID_MASK) == esid) - return i; + else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { + r = i; + goto out; + } } /* Found a spare entry that was invalidated before */ - if (found_inval > 0) - return found_inval; + if (found_inval > 0) { + r = found_inval; + goto out; + } /* No spare invalid entry, so create one */ @@ -233,30 +244,35 @@ static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) max_slb_size = mmu_slb_size; /* Overflowing -> purge */ - if ((to_svcpu(vcpu)->slb_max) == max_slb_size) + if ((svcpu->slb_max) == max_slb_size) kvmppc_mmu_flush_segments(vcpu); - r = to_svcpu(vcpu)->slb_max; - to_svcpu(vcpu)->slb_max++; + r = svcpu->slb_max; + svcpu->slb_max++; +out: + svcpu_put(svcpu); return r; } int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); u64 esid = eaddr >> SID_SHIFT; u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; u64 slb_vsid = SLB_VSID_USER; u64 gvsid; int slb_index; struct kvmppc_sid_map *map; + int r = 0; slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { /* Invalidate an entry */ - to_svcpu(vcpu)->slb[slb_index].esid = 0; - return -ENOENT; + svcpu->slb[slb_index].esid = 0; + r = -ENOENT; + goto out; } map = find_sid_vsid(vcpu, gvsid); @@ -269,18 +285,22 @@ int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) slb_vsid &= ~SLB_VSID_KP; slb_esid |= slb_index; - to_svcpu(vcpu)->slb[slb_index].esid = slb_esid; - to_svcpu(vcpu)->slb[slb_index].vsid = slb_vsid; + svcpu->slb[slb_index].esid = slb_esid; + svcpu->slb[slb_index].vsid = slb_vsid; trace_kvm_book3s_slbmte(slb_vsid, slb_esid); - return 0; +out: + svcpu_put(svcpu); + return r; } void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) { - to_svcpu(vcpu)->slb_max = 1; - to_svcpu(vcpu)->slb[0].esid = 0; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->slb_max = 1; + svcpu->slb[0].esid = 0; + svcpu_put(svcpu); } void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index bc3a2ea9421..ddc485a529f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -23,6 +23,7 @@ #include <linux/gfp.h> #include <linux/slab.h> #include <linux/hugetlb.h> +#include <linux/vmalloc.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -33,15 +34,6 @@ #include <asm/ppc-opcode.h> #include <asm/cputable.h> -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) - -/* Pages in the VRMA are 16MB pages */ -#define VRMA_PAGE_ORDER 24 -#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 #define NR_LPIDS (LPID_RSVD + 1) @@ -51,21 +43,41 @@ long kvmppc_alloc_hpt(struct kvm *kvm) { unsigned long hpt; unsigned long lpid; + struct revmap_entry *rev; + struct kvmppc_linear_info *li; + + /* Allocate guest's hashed page table */ + li = kvm_alloc_hpt(); + if (li) { + /* using preallocated memory */ + hpt = (ulong)li->base_virt; + kvm->arch.hpt_li = li; + } else { + /* using dynamic memory */ + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| + __GFP_NOWARN, HPT_ORDER - PAGE_SHIFT); + } - hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|__GFP_NOWARN, - HPT_ORDER - PAGE_SHIFT); if (!hpt) { pr_err("kvm_alloc_hpt: Couldn't alloc HPT\n"); return -ENOMEM; } kvm->arch.hpt_virt = hpt; + /* Allocate reverse map array */ + rev = vmalloc(sizeof(struct revmap_entry) * HPT_NPTE); + if (!rev) { + pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); + goto out_freehpt; + } + kvm->arch.revmap = rev; + + /* Allocate the guest's logical partition ID */ do { lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS); if (lpid >= NR_LPIDS) { pr_err("kvm_alloc_hpt: No LPIDs free\n"); - free_pages(hpt, HPT_ORDER - PAGE_SHIFT); - return -ENOMEM; + goto out_freeboth; } } while (test_and_set_bit(lpid, lpid_inuse)); @@ -74,37 +86,64 @@ long kvmppc_alloc_hpt(struct kvm *kvm) pr_info("KVM guest htab at %lx, LPID %lx\n", hpt, lpid); return 0; + + out_freeboth: + vfree(rev); + out_freehpt: + free_pages(hpt, HPT_ORDER - PAGE_SHIFT); + return -ENOMEM; } void kvmppc_free_hpt(struct kvm *kvm) { clear_bit(kvm->arch.lpid, lpid_inuse); - free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); + vfree(kvm->arch.revmap); + if (kvm->arch.hpt_li) + kvm_release_hpt(kvm->arch.hpt_li); + else + free_pages(kvm->arch.hpt_virt, HPT_ORDER - PAGE_SHIFT); +} + +/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; +} + +/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize == 0x10000) ? 0x1000 : 0; } -void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + unsigned long porder) { unsigned long i; - unsigned long npages = kvm->arch.ram_npages; - unsigned long pfn; - unsigned long *hpte; - unsigned long hash; - struct kvmppc_pginfo *pginfo = kvm->arch.ram_pginfo; + unsigned long npages; + unsigned long hp_v, hp_r; + unsigned long addr, hash; + unsigned long psize; + unsigned long hp0, hp1; + long ret; - if (!pginfo) - return; + psize = 1ul << porder; + npages = memslot->npages >> (porder - PAGE_SHIFT); /* VRMA can't be > 1TB */ - if (npages > 1ul << (40 - kvm->arch.ram_porder)) - npages = 1ul << (40 - kvm->arch.ram_porder); + if (npages > 1ul << (40 - porder)) + npages = 1ul << (40 - porder); /* Can't use more than 1 HPTE per HPTEG */ if (npages > HPT_NPTEG) npages = HPT_NPTEG; + hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); + hp1 = hpte1_pgsize_encoding(psize) | + HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + for (i = 0; i < npages; ++i) { - pfn = pginfo[i].pfn; - if (!pfn) - break; + addr = i << porder; /* can't use hpt_hash since va > 64 bits */ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & HPT_HASH_MASK; /* @@ -113,15 +152,15 @@ void kvmppc_map_vrma(struct kvm *kvm, struct kvm_userspace_memory_region *mem) * at most one HPTE per HPTEG, we just assume entry 7 * is available and use it. */ - hpte = (unsigned long *) (kvm->arch.hpt_virt + (hash << 7)); - hpte += 7 * 2; - /* HPTE low word - RPN, protection, etc. */ - hpte[1] = (pfn << PAGE_SHIFT) | HPTE_R_R | HPTE_R_C | - HPTE_R_M | PP_RWXX; - wmb(); - hpte[0] = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | - (i << (VRMA_PAGE_ORDER - 16)) | HPTE_V_BOLTED | - HPTE_V_LARGE | HPTE_V_VALID; + hash = (hash << 3) + 7; + hp_v = hp0 | ((addr >> 16) & ~0x7fUL); + hp_r = hp1 | addr; + ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); + if (ret != H_SUCCESS) { + pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", + addr, ret); + break; + } } } @@ -158,10 +197,814 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, MSR_SF | MSR_ME); } +/* + * This is called to get a reference to a guest page if there isn't + * one already in the kvm->arch.slot_phys[][] arrays. + */ +static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, + struct kvm_memory_slot *memslot, + unsigned long psize) +{ + unsigned long start; + long np, err; + struct page *page, *hpage, *pages[1]; + unsigned long s, pgsize; + unsigned long *physp; + unsigned int is_io, got, pgorder; + struct vm_area_struct *vma; + unsigned long pfn, i, npages; + + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return -EINVAL; + if (physp[gfn - memslot->base_gfn]) + return 0; + + is_io = 0; + got = 0; + page = NULL; + pgsize = psize; + err = -EINVAL; + start = gfn_to_hva_memslot(memslot, gfn); + + /* Instantiate and get the page we want access to */ + np = get_user_pages_fast(start, 1, 1, pages); + if (np != 1) { + /* Look up the vma for the page */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, start); + if (!vma || vma->vm_start > start || + start + psize > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + goto up_err; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + /* check alignment of pfn vs. requested page size */ + if (psize > PAGE_SIZE && (pfn & ((psize >> PAGE_SHIFT) - 1))) + goto up_err; + up_read(¤t->mm->mmap_sem); + + } else { + page = pages[0]; + got = KVMPPC_GOT_PAGE; + + /* See if this is a large page */ + s = PAGE_SIZE; + if (PageHuge(page)) { + hpage = compound_head(page); + s <<= compound_order(hpage); + /* Get the whole large page if slot alignment is ok */ + if (s > psize && slot_is_aligned(memslot, s) && + !(memslot->userspace_addr & (s - 1))) { + start &= ~(s - 1); + pgsize = s; + page = hpage; + } + } + if (s < psize) + goto out; + pfn = page_to_pfn(page); + } + + npages = pgsize >> PAGE_SHIFT; + pgorder = __ilog2(npages); + physp += (gfn - memslot->base_gfn) & ~(npages - 1); + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) { + if (!physp[i]) { + physp[i] = ((pfn + i) << PAGE_SHIFT) + + got + is_io + pgorder; + got = 0; + } + } + spin_unlock(&kvm->arch.slot_phys_lock); + err = 0; + + out: + if (got) { + if (PageHuge(page)) + page = compound_head(page); + put_page(page); + } + return err; + + up_err: + up_read(¤t->mm->mmap_sem); + return err; +} + +/* + * We come here on a H_ENTER call from the guest when we are not + * using mmu notifiers and we don't have the requested page pinned + * already. + */ +long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long psize, gpa, gfn; + struct kvm_memory_slot *memslot; + long ret; + + if (kvm->arch.using_mmu_notifiers) + goto do_insert; + + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + if (memslot && !(memslot->flags & KVM_MEMSLOT_INVALID)) { + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; + if (kvmppc_get_guest_page(kvm, gfn, memslot, psize) < 0) + return H_PARAMETER; + } + + do_insert: + /* Protect linux PTE lookup from page table destruction */ + rcu_read_lock_sched(); /* this disables preemption too */ + vcpu->arch.pgdir = current->mm->pgd; + ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); + rcu_read_unlock_sched(); + if (ret == H_TOO_HARD) { + /* this can't happen */ + pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); + ret = H_RESOURCE; /* or something */ + } + return ret; + +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, + gva_t eaddr) +{ + u64 mask; + int i; + + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) + continue; + + if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T) + mask = ESID_MASK_1T; + else + mask = ESID_MASK; + + if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) + return &vcpu->arch.slb[i]; + } + return NULL; +} + +static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, + unsigned long ea) +{ + unsigned long ra_mask; + + ra_mask = hpte_page_size(v, r) - 1; + return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); +} + static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, - struct kvmppc_pte *gpte, bool data) + struct kvmppc_pte *gpte, bool data) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_slb *slbe; + unsigned long slb_v; + unsigned long pp, key; + unsigned long v, gr; + unsigned long *hptep; + int index; + int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + + /* Get SLB entry */ + if (virtmode) { + slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); + if (!slbe) + return -EINVAL; + slb_v = slbe->origv; + } else { + /* real mode access */ + slb_v = vcpu->kvm->arch.vrma_slb_v; + } + + /* Find the HPTE in the hash table */ + index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, + HPTE_V_VALID | HPTE_V_ABSENT); + if (index < 0) + return -ENOENT; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hptep[0] & ~HPTE_V_HVLOCK; + gr = kvm->arch.revmap[index].guest_rpte; + + /* Unlock the HPTE */ + asm volatile("lwsync" : : : "memory"); + hptep[0] = v; + + gpte->eaddr = eaddr; + gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); + + /* Get PP bits and key for permission check */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + key &= slb_v; + + /* Calculate permissions */ + gpte->may_read = hpte_read_permission(pp, key); + gpte->may_write = hpte_write_permission(pp, key); + gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); + + /* Storage key permission check for POWER7 */ + if (data && virtmode && cpu_has_feature(CPU_FTR_ARCH_206)) { + int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (amrfield & 1) + gpte->may_read = 0; + if (amrfield & 2) + gpte->may_write = 0; + } + + /* Get the guest physical address */ + gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); + return 0; +} + +/* + * Quick test for whether an instruction is a load or a store. + * If the instruction is a load or a store, then this will indicate + * which it is, at least on server processors. (Embedded processors + * have some external PID instructions that don't follow the rule + * embodied here.) If the instruction isn't a load or store, then + * this doesn't return anything useful. + */ +static int instruction_is_store(unsigned int instr) +{ + unsigned int mask; + + mask = 0x10000000; + if ((instr & 0xfc000000) == 0x7c000000) + mask = 0x100; /* major opcode 31 */ + return (instr & mask) != 0; +} + +static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, int is_store) +{ + int ret; + u32 last_inst; + unsigned long srr0 = kvmppc_get_pc(vcpu); + + /* We try to load the last instruction. We don't let + * emulate_instruction do it as it doesn't check what + * kvmppc_ld returns. + * If we fail, we just return to the guest and try executing it again. + */ + if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED) { + ret = kvmppc_ld(vcpu, &srr0, sizeof(u32), &last_inst, false); + if (ret != EMULATE_DONE || last_inst == KVM_INST_FETCH_FAILED) + return RESUME_GUEST; + vcpu->arch.last_inst = last_inst; + } + + /* + * WARNING: We do not know for sure whether the instruction we just + * read from memory is the same that caused the fault in the first + * place. If the instruction we read is neither an load or a store, + * then it can't access memory, so we don't need to worry about + * enforcing access permissions. So, assuming it is a load or + * store, we just check that its direction (load or store) is + * consistent with the original fault, since that's what we + * checked the access permissions against. If there is a mismatch + * we just return and retry the instruction. + */ + + if (instruction_is_store(vcpu->arch.last_inst) != !!is_store) + return RESUME_GUEST; + + /* + * Emulated accesses are emulated by looking at the hash for + * translation once, then performing the access later. The + * translation could be invalidated in the meantime in which + * point performing the subsequent memory access on the old + * physical address could possibly be a security hole for the + * guest (but not the host). + * + * This is less of an issue for MMIO stores since they aren't + * globally visible. It could be an issue for MMIO loads to + * a certain extent but we'll ignore it for now. + */ + + vcpu->arch.paddr_accessed = gpa; + return kvmppc_emulate_mmio(run, vcpu); +} + +int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *hptep, hpte[3], r; + unsigned long mmu_seq, psize, pte_size; + unsigned long gfn, hva, pfn; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + struct revmap_entry *rev; + struct page *page, *pages[1]; + long index, ret, npages; + unsigned long is_io; + unsigned int writing, write_ok; + struct vm_area_struct *vma; + unsigned long rcbits; + + /* + * Real-mode code has already searched the HPT and found the + * entry we're interested in. Lock the entry and check that + * it hasn't changed. If it has, just return and re-execute the + * instruction. + */ + if (ea != vcpu->arch.pgfault_addr) + return RESUME_GUEST; + index = vcpu->arch.pgfault_index; + hptep = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + rev = &kvm->arch.revmap[index]; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + hpte[0] = hptep[0] & ~HPTE_V_HVLOCK; + hpte[1] = hptep[1]; + hpte[2] = r = rev->guest_rpte; + asm volatile("lwsync" : : : "memory"); + hptep[0] = hpte[0]; + preempt_enable(); + + if (hpte[0] != vcpu->arch.pgfault_hpte[0] || + hpte[1] != vcpu->arch.pgfault_hpte[1]) + return RESUME_GUEST; + + /* Translate the logical address and get the page */ + psize = hpte_page_size(hpte[0], r); + gfn = hpte_rpn(r, psize); + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, + dsisr & DSISR_ISSTORE); + } + + if (!kvm->arch.using_mmu_notifiers) + return -EFAULT; /* should never get here */ + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + is_io = 0; + pfn = 0; + page = NULL; + pte_size = PAGE_SIZE; + writing = (dsisr & DSISR_ISSTORE) != 0; + /* If writing != 0, then the HPTE must allow writing, if we get here */ + write_ok = writing; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pte_size = psize; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + write_ok = vma->vm_flags & VM_WRITE; + } + up_read(¤t->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + } + /* if the guest wants write access, see if that is OK */ + if (!writing && hpte_is_writable(r)) { + pte_t *ptep, pte; + + /* + * We need to protect against page table destruction + * while looking up and updating the pte. + */ + rcu_read_lock_sched(); + ptep = find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL); + if (ptep && pte_present(*ptep)) { + pte = kvmppc_read_update_linux_pte(ptep, 1); + if (pte_write(pte)) + write_ok = 1; + } + rcu_read_unlock_sched(); + } + pfn = page_to_pfn(page); + } + + ret = -EFAULT; + if (psize > pte_size) + goto out_put; + + /* Check WIMG vs. the actual page we're accessing */ + if (!hpte_cache_flags_ok(r, is_io)) { + if (is_io) + return -EFAULT; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; + } + + /* Set the HPTE to point to pfn */ + r = (r & ~(HPTE_R_PP0 - pte_size)) | (pfn << PAGE_SHIFT); + if (hpte_is_writable(r) && !write_ok) + r = hpte_make_readonly(r); + ret = RESUME_GUEST; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + if ((hptep[0] & ~HPTE_V_HVLOCK) != hpte[0] || hptep[1] != hpte[1] || + rev->guest_rpte != hpte[2]) + /* HPTE has been changed under us; let the guest retry */ + goto out_unlock; + hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + + rmap = &memslot->rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + + /* Check if we might have been invalidated; let the guest retry if so */ + ret = RESUME_GUEST; + if (mmu_notifier_retry(vcpu, mmu_seq)) { + unlock_rmap(rmap); + goto out_unlock; + } + + /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + r &= rcbits | ~(HPTE_R_R | HPTE_R_C); + + if (hptep[0] & HPTE_V_VALID) { + /* HPTE was previously valid, so we need to invalidate it */ + unlock_rmap(rmap); + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, index); + /* don't lose previous R and C bits */ + r |= hptep[1] & (HPTE_R_R | HPTE_R_C); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + } + + hptep[1] = r; + eieio(); + hptep[0] = hpte[0]; + asm volatile("ptesync" : : : "memory"); + preempt_enable(); + if (page && hpte_is_writable(r)) + SetPageDirty(page); + + out_put: + if (page) + put_page(page); + return ret; + + out_unlock: + hptep[0] &= ~HPTE_V_HVLOCK; + preempt_enable(); + goto out_put; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + int ret; + int retval = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long start = memslot->userspace_addr; + unsigned long end; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + + ret = handler(kvm, &memslot->rmap[gfn_offset], + memslot->base_gfn + gfn_offset); + retval |= ret; + } + } + + return retval; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long h, i, j; + unsigned long *hptep; + unsigned long ptel, psize, rcbits; + + for (;;) { + lock_rmap(rmapp); + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + break; + } + + /* + * To avoid an ABBA deadlock with the HPTE lock bit, + * we can't spin on the HPTE lock while holding the + * rmap chain lock. + */ + i = *rmapp & KVMPPC_RMAP_INDEX; + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + continue; + } + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(hptep[0], ptel); + if ((hptep[0] & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + /* Harvest R and C */ + rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + rev[i].guest_rpte = ptel | rcbits; + } + unlock_rmap(rmapp); + hptep[0] &= ~HPTE_V_HVLOCK; + } + return 0; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + if (kvm->arch.using_mmu_notifiers) + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return 0; +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) { + *rmapp &= ~KVMPPC_RMAP_REFERENCED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* If this HPTE isn't referenced, ignore it */ + if (!(hptep[1] & HPTE_R_R)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_R)) { + kvmppc_clear_ref_hpte(kvm, hptep, i); + rev[i].guest_rpte |= HPTE_R_R; + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hp; + int ret = 1; + + if (*rmapp & KVMPPC_RMAP_REFERENCED) + return 1; + + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) + goto out; + + if (*rmapp & KVMPPC_RMAP_PRESENT) { + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + if (hp[1] & HPTE_R_R) + goto out; + } while ((i = j) != head); + } + ret = 0; + + out: + unlock_rmap(rmapp); + return ret; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + if (!kvm->arch.using_mmu_notifiers) + return 0; + return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) { - return -ENOENT; + if (!kvm->arch.using_mmu_notifiers) + return; + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_CHANGED) { + *rmapp &= ~KVMPPC_RMAP_CHANGED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + if (!(hptep[1] & HPTE_R_C)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) { + /* need to make it temporarily absent to clear C */ + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~HPTE_R_C; + eieio(); + hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + rev[i].guest_rpte |= HPTE_R_C; + ret = 1; + } + hptep[0] &= ~HPTE_V_HVLOCK; + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + unsigned long i; + unsigned long *rmapp, *map; + + preempt_disable(); + rmapp = memslot->rmap; + map = memslot->dirty_bitmap; + for (i = 0; i < memslot->npages; ++i) { + if (kvm_test_clear_dirty(kvm, rmapp)) + __set_bit_le(i, map); + ++rmapp; + } + preempt_enable(); + return 0; +} + +void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, + unsigned long *nb_ret) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn = gpa >> PAGE_SHIFT; + struct page *page, *pages[1]; + int npages; + unsigned long hva, psize, offset; + unsigned long pa; + unsigned long *physp; + + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return NULL; + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return NULL; + physp += gfn - memslot->base_gfn; + pa = *physp; + if (!pa) { + if (kvmppc_get_guest_page(kvm, gfn, memslot, + PAGE_SIZE) < 0) + return NULL; + pa = *physp; + } + page = pfn_to_page(pa >> PAGE_SHIFT); + } else { + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) + return NULL; + page = pages[0]; + } + psize = PAGE_SIZE; + if (PageHuge(page)) { + page = compound_head(page); + psize <<= compound_order(page); + } + if (!kvm->arch.using_mmu_notifiers) + get_page(page); + offset = gpa & (psize - 1); + if (nb_ret) + *nb_ret = psize - offset; + return page_address(page) + offset; +} + +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) +{ + struct page *page = virt_to_page(va); + + page = compound_head(page); + put_page(page); } void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 0c9dc62532d..f1950d13182 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -230,9 +230,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, r = kvmppc_st(vcpu, &addr, 32, zeros, true); if ((r == -ENOENT) || (r == -EPERM)) { + struct kvmppc_book3s_shadow_vcpu *svcpu; + + svcpu = svcpu_get(vcpu); *advance = 0; vcpu->arch.shared->dar = vaddr; - to_svcpu(vcpu)->fault_dar = vaddr; + svcpu->fault_dar = vaddr; dsisr = DSISR_ISSTORE; if (r == -ENOENT) @@ -241,7 +244,8 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->dsisr = dsisr; - to_svcpu(vcpu)->fault_dsisr = dsisr; + svcpu->fault_dsisr = dsisr; + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ab4267760b7..01294a5099d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -49,22 +49,14 @@ #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> - -/* - * For now, limit memory to 64GB and require it to be large pages. - * This value is chosen because it makes the ram_pginfo array be - * 64kB in size, which is about as large as we want to be trying - * to allocate with kmalloc. - */ -#define MAX_MEM_ORDER 36 - -#define LARGE_PAGE_ORDER 24 /* 16MB pages */ +#include <linux/hugetlb.h> /* #define EXIT_DEBUG */ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ static void kvmppc_end_cede(struct kvm_vcpu *vcpu); +static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu); void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { @@ -147,10 +139,10 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, unsigned long vcpuid, unsigned long vpa) { struct kvm *kvm = vcpu->kvm; - unsigned long pg_index, ra, len; - unsigned long pg_offset; + unsigned long len, nb; void *va; struct kvm_vcpu *tvcpu; + int err = H_PARAMETER; tvcpu = kvmppc_find_vcpu(kvm, vcpuid); if (!tvcpu) @@ -163,45 +155,41 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, if (flags < 4) { if (vpa & 0x7f) return H_PARAMETER; + if (flags >= 2 && !tvcpu->arch.vpa) + return H_RESOURCE; /* registering new area; convert logical addr to real */ - pg_index = vpa >> kvm->arch.ram_porder; - pg_offset = vpa & (kvm->arch.ram_psize - 1); - if (pg_index >= kvm->arch.ram_npages) + va = kvmppc_pin_guest_page(kvm, vpa, &nb); + if (va == NULL) return H_PARAMETER; - if (kvm->arch.ram_pginfo[pg_index].pfn == 0) - return H_PARAMETER; - ra = kvm->arch.ram_pginfo[pg_index].pfn << PAGE_SHIFT; - ra |= pg_offset; - va = __va(ra); if (flags <= 1) len = *(unsigned short *)(va + 4); else len = *(unsigned int *)(va + 4); - if (pg_offset + len > kvm->arch.ram_psize) - return H_PARAMETER; + if (len > nb) + goto out_unpin; switch (flags) { case 1: /* register VPA */ if (len < 640) - return H_PARAMETER; + goto out_unpin; + if (tvcpu->arch.vpa) + kvmppc_unpin_guest_page(kvm, vcpu->arch.vpa); tvcpu->arch.vpa = va; init_vpa(vcpu, va); break; case 2: /* register DTL */ if (len < 48) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; + goto out_unpin; len -= len % 48; + if (tvcpu->arch.dtl) + kvmppc_unpin_guest_page(kvm, vcpu->arch.dtl); tvcpu->arch.dtl = va; tvcpu->arch.dtl_end = va + len; break; case 3: /* register SLB shadow buffer */ - if (len < 8) - return H_PARAMETER; - if (!tvcpu->arch.vpa) - return H_RESOURCE; - tvcpu->arch.slb_shadow = va; - len = (len - 16) / 16; + if (len < 16) + goto out_unpin; + if (tvcpu->arch.slb_shadow) + kvmppc_unpin_guest_page(kvm, vcpu->arch.slb_shadow); tvcpu->arch.slb_shadow = va; break; } @@ -210,17 +198,30 @@ static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, case 5: /* unregister VPA */ if (tvcpu->arch.slb_shadow || tvcpu->arch.dtl) return H_RESOURCE; + if (!tvcpu->arch.vpa) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.vpa); tvcpu->arch.vpa = NULL; break; case 6: /* unregister DTL */ + if (!tvcpu->arch.dtl) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.dtl); tvcpu->arch.dtl = NULL; break; case 7: /* unregister SLB shadow buffer */ + if (!tvcpu->arch.slb_shadow) + break; + kvmppc_unpin_guest_page(kvm, tvcpu->arch.slb_shadow); tvcpu->arch.slb_shadow = NULL; break; } } return H_SUCCESS; + + out_unpin: + kvmppc_unpin_guest_page(kvm, va); + return err; } int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) @@ -230,6 +231,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) struct kvm_vcpu *tvcpu; switch (req) { + case H_ENTER: + ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + break; case H_CEDE: break; case H_PROD: @@ -319,20 +326,19 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } /* - * We get these next two if the guest does a bad real-mode access, - * as we have enabled VRMA (virtualized real mode area) mode in the - * LPCR. We just generate an appropriate DSI/ISI to the guest. + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresonding + * host page has been paged out. Any other HDSI/HISI interrupts + * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - vcpu->arch.shregs.dsisr = vcpu->arch.fault_dsisr; - vcpu->arch.shregs.dar = vcpu->arch.fault_dar; - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE, 0); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_INST_STORAGE, - 0x08000000); - r = RESUME_GUEST; + r = kvmppc_book3s_hv_page_fault(run, vcpu, + kvmppc_get_pc(vcpu), 0); break; /* * This occurs if the guest executes an illegal instruction. @@ -392,6 +398,42 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = put_user(0, (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + { + u64 hior; + /* Only allow this to be set to zero */ + r = get_user(hior, (u64 __user *)reg->addr); + if (!r && (hior != 0)) + r = -EINVAL; + break; + } + default: + break; + } + + return r; +} + int kvmppc_core_check_processor_compat(void) { if (cpu_has_feature(CPU_FTR_HVMODE)) @@ -411,7 +453,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) goto out; err = -ENOMEM; - vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL); + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); if (!vcpu) goto out; @@ -463,15 +505,21 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) return vcpu; free_vcpu: - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); out: return ERR_PTR(err); } void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.dtl) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.dtl); + if (vcpu->arch.slb_shadow) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.slb_shadow); + if (vcpu->arch.vpa) + kvmppc_unpin_guest_page(vcpu->kvm, vcpu->arch.vpa); kvm_vcpu_uninit(vcpu); - kfree(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); } static void kvmppc_set_timer(struct kvm_vcpu *vcpu) @@ -482,7 +530,7 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu) if (now > vcpu->arch.dec_expires) { /* decrementer has already gone negative */ kvmppc_core_queue_dec(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); return; } dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC @@ -797,7 +845,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { - kvmppc_core_deliver_interrupts(v); + kvmppc_core_prepare_to_enter(v); if (signal_pending(v->arch.run_task)) { kvmppc_remove_runnable(vc, v); v->stat.signal_exits++; @@ -836,20 +884,26 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) return -EINVAL; } + kvmppc_core_prepare_to_enter(vcpu); + /* No need to go into the guest when all we'll do is come back out */ if (signal_pending(current)) { run->exit_reason = KVM_EXIT_INTR; return -EINTR; } - /* On PPC970, check that we have an RMA region */ - if (!vcpu->kvm->arch.rma && cpu_has_feature(CPU_FTR_ARCH_201)) - return -EPERM; + /* On the first time here, set up VRMA or RMA */ + if (!vcpu->kvm->arch.rma_setup_done) { + r = kvmppc_hv_setup_rma(vcpu); + if (r) + return r; + } flush_fp_to_thread(current); flush_altivec_to_thread(current); flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.pgdir = current->mm->pgd; do { r = kvmppc_run_vcpu(run, vcpu); @@ -857,7 +911,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) if (run->exit_reason == KVM_EXIT_PAPR_HCALL && !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); } } while (r == RESUME_GUEST); return r; @@ -1001,7 +1055,7 @@ static inline int lpcr_rmls(unsigned long rma_size) static int kvm_rma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct kvmppc_rma_info *ri = vma->vm_file->private_data; + struct kvmppc_linear_info *ri = vma->vm_file->private_data; struct page *page; if (vmf->pgoff >= ri->npages) @@ -1026,7 +1080,7 @@ static int kvm_rma_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_rma_release(struct inode *inode, struct file *filp) { - struct kvmppc_rma_info *ri = filp->private_data; + struct kvmppc_linear_info *ri = filp->private_data; kvm_release_rma(ri); return 0; @@ -1039,7 +1093,7 @@ static struct file_operations kvm_rma_fops = { long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) { - struct kvmppc_rma_info *ri; + struct kvmppc_linear_info *ri; long fd; ri = kvm_alloc_rma(); @@ -1054,89 +1108,189 @@ long kvm_vm_ioctl_allocate_rma(struct kvm *kvm, struct kvm_allocate_rma *ret) return fd; } -static struct page *hva_to_page(unsigned long addr) +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - struct page *page[1]; - int npages; + struct kvm_memory_slot *memslot; + int r; + unsigned long n; - might_sleep(); + mutex_lock(&kvm->slots_lock); - npages = get_user_pages_fast(addr, 1, 1, page); + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; - if (unlikely(npages != 1)) - return 0; + memslot = id_to_memslot(kvm->memslots, log->slot); + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + + r = kvmppc_hv_get_dirty_log(kvm, memslot); + if (r) + goto out; - return page[0]; + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static unsigned long slb_pgsize_encoding(unsigned long psize) +{ + unsigned long senc = 0; + + if (psize > 0x1000) { + senc = SLB_VSID_L; + if (psize == 0x10000) + senc |= SLB_VSID_LP_01; + } + return senc; } int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { - unsigned long psize, porder; - unsigned long i, npages, totalpages; - unsigned long pg_ix; - struct kvmppc_pginfo *pginfo; - unsigned long hva; - struct kvmppc_rma_info *ri = NULL; + unsigned long npages; + unsigned long *phys; + + /* Allocate a slot_phys array */ + phys = kvm->arch.slot_phys[mem->slot]; + if (!kvm->arch.using_mmu_notifiers && !phys) { + npages = mem->memory_size >> PAGE_SHIFT; + phys = vzalloc(npages * sizeof(unsigned long)); + if (!phys) + return -ENOMEM; + kvm->arch.slot_phys[mem->slot] = phys; + kvm->arch.slot_npages[mem->slot] = npages; + } + + return 0; +} + +static void unpin_slot(struct kvm *kvm, int slot_id) +{ + unsigned long *physp; + unsigned long j, npages, pfn; struct page *page; - /* For now, only allow 16MB pages */ - porder = LARGE_PAGE_ORDER; - psize = 1ul << porder; - if ((mem->memory_size & (psize - 1)) || - (mem->guest_phys_addr & (psize - 1))) { - pr_err("bad memory_size=%llx @ %llx\n", - mem->memory_size, mem->guest_phys_addr); - return -EINVAL; + physp = kvm->arch.slot_phys[slot_id]; + npages = kvm->arch.slot_npages[slot_id]; + if (physp) { + spin_lock(&kvm->arch.slot_phys_lock); + for (j = 0; j < npages; j++) { + if (!(physp[j] & KVMPPC_GOT_PAGE)) + continue; + pfn = physp[j] >> PAGE_SHIFT; + page = pfn_to_page(pfn); + if (PageHuge(page)) + page = compound_head(page); + SetPageDirty(page); + put_page(page); + } + kvm->arch.slot_phys[slot_id] = NULL; + spin_unlock(&kvm->arch.slot_phys_lock); + vfree(physp); } +} - npages = mem->memory_size >> porder; - totalpages = (mem->guest_phys_addr + mem->memory_size) >> porder; +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem) +{ +} - /* More memory than we have space to track? */ - if (totalpages > (1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER))) - return -EINVAL; +static int kvmppc_hv_setup_rma(struct kvm_vcpu *vcpu) +{ + int err = 0; + struct kvm *kvm = vcpu->kvm; + struct kvmppc_linear_info *ri = NULL; + unsigned long hva; + struct kvm_memory_slot *memslot; + struct vm_area_struct *vma; + unsigned long lpcr, senc; + unsigned long psize, porder; + unsigned long rma_size; + unsigned long rmls; + unsigned long *physp; + unsigned long i, npages; - /* Do we already have an RMA registered? */ - if (mem->guest_phys_addr == 0 && kvm->arch.rma) - return -EINVAL; + mutex_lock(&kvm->lock); + if (kvm->arch.rma_setup_done) + goto out; /* another vcpu beat us to it */ - if (totalpages > kvm->arch.ram_npages) - kvm->arch.ram_npages = totalpages; + /* Look up the memslot for guest physical address 0 */ + memslot = gfn_to_memslot(kvm, 0); + + /* We must have some memory at 0 by now */ + err = -EINVAL; + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto out; + + /* Look up the VMA for the start of this memory slot */ + hva = memslot->userspace_addr; + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO)) + goto up_out; + + psize = vma_kernel_pagesize(vma); + porder = __ilog2(psize); /* Is this one of our preallocated RMAs? */ - if (mem->guest_phys_addr == 0) { - struct vm_area_struct *vma; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, mem->userspace_addr); - if (vma && vma->vm_file && - vma->vm_file->f_op == &kvm_rma_fops && - mem->userspace_addr == vma->vm_start) - ri = vma->vm_file->private_data; - up_read(¤t->mm->mmap_sem); - if (!ri && cpu_has_feature(CPU_FTR_ARCH_201)) { - pr_err("CPU requires an RMO\n"); - return -EINVAL; + if (vma->vm_file && vma->vm_file->f_op == &kvm_rma_fops && + hva == vma->vm_start) + ri = vma->vm_file->private_data; + + up_read(¤t->mm->mmap_sem); + + if (!ri) { + /* On POWER7, use VRMA; on PPC970, give up */ + err = -EPERM; + if (cpu_has_feature(CPU_FTR_ARCH_201)) { + pr_err("KVM: CPU requires an RMO\n"); + goto out; } - } - if (ri) { - unsigned long rma_size; - unsigned long lpcr; - long rmls; + /* We can handle 4k, 64k or 16M pages in the VRMA */ + err = -EINVAL; + if (!(psize == 0x1000 || psize == 0x10000 || + psize == 0x1000000)) + goto out; + + /* Update VRMASD field in the LPCR */ + senc = slb_pgsize_encoding(psize); + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = kvm->arch.lpcr & ~LPCR_VRMASD; + lpcr |= senc << (LPCR_VRMASD_SH - 4); + kvm->arch.lpcr = lpcr; - rma_size = ri->npages << PAGE_SHIFT; - if (rma_size > mem->memory_size) - rma_size = mem->memory_size; + /* Create HPTEs in the hash page table for the VRMA */ + kvmppc_map_vrma(vcpu, memslot, porder); + + } else { + /* Set up to use an RMO region */ + rma_size = ri->npages; + if (rma_size > memslot->npages) + rma_size = memslot->npages; + rma_size <<= PAGE_SHIFT; rmls = lpcr_rmls(rma_size); + err = -EINVAL; if (rmls < 0) { - pr_err("Can't use RMA of 0x%lx bytes\n", rma_size); - return -EINVAL; + pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); + goto out; } atomic_inc(&ri->use_count); kvm->arch.rma = ri; - kvm->arch.n_rma_pages = rma_size >> porder; /* Update LPCR and RMOR */ lpcr = kvm->arch.lpcr; @@ -1156,53 +1310,35 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, kvm->arch.rmor = kvm->arch.rma->base_pfn << PAGE_SHIFT; } kvm->arch.lpcr = lpcr; - pr_info("Using RMO at %lx size %lx (LPCR = %lx)\n", + pr_info("KVM: Using RMO at %lx size %lx (LPCR = %lx)\n", ri->base_pfn << PAGE_SHIFT, rma_size, lpcr); - } - pg_ix = mem->guest_phys_addr >> porder; - pginfo = kvm->arch.ram_pginfo + pg_ix; - for (i = 0; i < npages; ++i, ++pg_ix) { - if (ri && pg_ix < kvm->arch.n_rma_pages) { - pginfo[i].pfn = ri->base_pfn + - (pg_ix << (porder - PAGE_SHIFT)); - continue; - } - hva = mem->userspace_addr + (i << porder); - page = hva_to_page(hva); - if (!page) { - pr_err("oops, no pfn for hva %lx\n", hva); - goto err; - } - /* Check it's a 16MB page */ - if (!PageHead(page) || - compound_order(page) != (LARGE_PAGE_ORDER - PAGE_SHIFT)) { - pr_err("page at %lx isn't 16MB (o=%d)\n", - hva, compound_order(page)); - goto err; - } - pginfo[i].pfn = page_to_pfn(page); + /* Initialize phys addrs of pages in RMO */ + npages = ri->npages; + porder = __ilog2(npages); + physp = kvm->arch.slot_phys[memslot->id]; + spin_lock(&kvm->arch.slot_phys_lock); + for (i = 0; i < npages; ++i) + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; + spin_unlock(&kvm->arch.slot_phys_lock); } - return 0; - - err: - return -EINVAL; -} + /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ + smp_wmb(); + kvm->arch.rma_setup_done = 1; + err = 0; + out: + mutex_unlock(&kvm->lock); + return err; -void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) -{ - if (mem->guest_phys_addr == 0 && mem->memory_size != 0 && - !kvm->arch.rma) - kvmppc_map_vrma(kvm, mem); + up_out: + up_read(¤t->mm->mmap_sem); + goto out; } int kvmppc_core_init_vm(struct kvm *kvm) { long r; - unsigned long npages = 1ul << (MAX_MEM_ORDER - LARGE_PAGE_ORDER); - long err = -ENOMEM; unsigned long lpcr; /* Allocate hashed page table */ @@ -1212,19 +1348,7 @@ int kvmppc_core_init_vm(struct kvm *kvm) INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); - kvm->arch.ram_pginfo = kzalloc(npages * sizeof(struct kvmppc_pginfo), - GFP_KERNEL); - if (!kvm->arch.ram_pginfo) { - pr_err("kvmppc_core_init_vm: couldn't alloc %lu bytes\n", - npages * sizeof(struct kvmppc_pginfo)); - goto out_free; - } - - kvm->arch.ram_npages = 0; - kvm->arch.ram_psize = 1ul << LARGE_PAGE_ORDER; - kvm->arch.ram_porder = LARGE_PAGE_ORDER; kvm->arch.rma = NULL; - kvm->arch.n_rma_pages = 0; kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); @@ -1242,30 +1366,25 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); lpcr &= LPCR_PECE | LPCR_LPES; lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | - LPCR_VPM0 | LPCR_VRMA_L; + LPCR_VPM0 | LPCR_VPM1; + kvm->arch.vrma_slb_v = SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); } kvm->arch.lpcr = lpcr; + kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); + spin_lock_init(&kvm->arch.slot_phys_lock); return 0; - - out_free: - kvmppc_free_hpt(kvm); - return err; } void kvmppc_core_destroy_vm(struct kvm *kvm) { - struct kvmppc_pginfo *pginfo; unsigned long i; - if (kvm->arch.ram_pginfo) { - pginfo = kvm->arch.ram_pginfo; - kvm->arch.ram_pginfo = NULL; - for (i = kvm->arch.n_rma_pages; i < kvm->arch.ram_npages; ++i) - if (pginfo[i].pfn) - put_page(pfn_to_page(pginfo[i].pfn)); - kfree(pginfo); - } + if (!kvm->arch.using_mmu_notifiers) + for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) + unpin_slot(kvm, i); + if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index a795a13f4a7..bed1279aa6a 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -18,6 +18,15 @@ #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> +#define KVM_LINEAR_RMA 0 +#define KVM_LINEAR_HPT 1 + +static void __init kvm_linear_init_one(ulong size, int count, int type); +static struct kvmppc_linear_info *kvm_alloc_linear(int type); +static void kvm_release_linear(struct kvmppc_linear_info *ri); + +/*************** RMA *************/ + /* * This maintains a list of RMAs (real mode areas) for KVM guests to use. * Each RMA has to be physically contiguous and of a size that the @@ -29,32 +38,6 @@ static unsigned long kvm_rma_size = 64 << 20; /* 64MB */ static unsigned long kvm_rma_count; -static int __init early_parse_rma_size(char *p) -{ - if (!p) - return 1; - - kvm_rma_size = memparse(p, &p); - - return 0; -} -early_param("kvm_rma_size", early_parse_rma_size); - -static int __init early_parse_rma_count(char *p) -{ - if (!p) - return 1; - - kvm_rma_count = simple_strtoul(p, NULL, 0); - - return 0; -} -early_param("kvm_rma_count", early_parse_rma_count); - -static struct kvmppc_rma_info *rma_info; -static LIST_HEAD(free_rmas); -static DEFINE_SPINLOCK(rma_lock); - /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@ -81,45 +64,106 @@ static inline int lpcr_rmls(unsigned long rma_size) } } +static int __init early_parse_rma_size(char *p) +{ + if (!p) + return 1; + + kvm_rma_size = memparse(p, &p); + + return 0; +} +early_param("kvm_rma_size", early_parse_rma_size); + +static int __init early_parse_rma_count(char *p) +{ + if (!p) + return 1; + + kvm_rma_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_rma_count", early_parse_rma_count); + +struct kvmppc_linear_info *kvm_alloc_rma(void) +{ + return kvm_alloc_linear(KVM_LINEAR_RMA); +} +EXPORT_SYMBOL_GPL(kvm_alloc_rma); + +void kvm_release_rma(struct kvmppc_linear_info *ri) +{ + kvm_release_linear(ri); +} +EXPORT_SYMBOL_GPL(kvm_release_rma); + +/*************** HPT *************/ + /* - * Called at boot time while the bootmem allocator is active, - * to allocate contiguous physical memory for the real memory - * areas for guests. + * This maintains a list of big linear HPT tables that contain the GVA->HPA + * memory mappings. If we don't reserve those early on, we might not be able + * to get a big (usually 16MB) linear memory region from the kernel anymore. */ -void __init kvm_rma_init(void) + +static unsigned long kvm_hpt_count; + +static int __init early_parse_hpt_count(char *p) +{ + if (!p) + return 1; + + kvm_hpt_count = simple_strtoul(p, NULL, 0); + + return 0; +} +early_param("kvm_hpt_count", early_parse_hpt_count); + +struct kvmppc_linear_info *kvm_alloc_hpt(void) +{ + return kvm_alloc_linear(KVM_LINEAR_HPT); +} +EXPORT_SYMBOL_GPL(kvm_alloc_hpt); + +void kvm_release_hpt(struct kvmppc_linear_info *li) +{ + kvm_release_linear(li); +} +EXPORT_SYMBOL_GPL(kvm_release_hpt); + +/*************** generic *************/ + +static LIST_HEAD(free_linears); +static DEFINE_SPINLOCK(linear_lock); + +static void __init kvm_linear_init_one(ulong size, int count, int type) { unsigned long i; unsigned long j, npages; - void *rma; + void *linear; struct page *pg; + const char *typestr; + struct kvmppc_linear_info *linear_info; - /* Only do this on PPC970 in HV mode */ - if (!cpu_has_feature(CPU_FTR_HVMODE) || - !cpu_has_feature(CPU_FTR_ARCH_201)) - return; - - if (!kvm_rma_size || !kvm_rma_count) + if (!count) return; - /* Check that the requested size is one supported in hardware */ - if (lpcr_rmls(kvm_rma_size) < 0) { - pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); - return; - } - - npages = kvm_rma_size >> PAGE_SHIFT; - rma_info = alloc_bootmem(kvm_rma_count * sizeof(struct kvmppc_rma_info)); - for (i = 0; i < kvm_rma_count; ++i) { - rma = alloc_bootmem_align(kvm_rma_size, kvm_rma_size); - pr_info("Allocated KVM RMA at %p (%ld MB)\n", rma, - kvm_rma_size >> 20); - rma_info[i].base_virt = rma; - rma_info[i].base_pfn = __pa(rma) >> PAGE_SHIFT; - rma_info[i].npages = npages; - list_add_tail(&rma_info[i].list, &free_rmas); - atomic_set(&rma_info[i].use_count, 0); - - pg = pfn_to_page(rma_info[i].base_pfn); + typestr = (type == KVM_LINEAR_RMA) ? "RMA" : "HPT"; + + npages = size >> PAGE_SHIFT; + linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); + for (i = 0; i < count; ++i) { + linear = alloc_bootmem_align(size, size); + pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, + size >> 20); + linear_info[i].base_virt = linear; + linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; + linear_info[i].npages = npages; + linear_info[i].type = type; + list_add_tail(&linear_info[i].list, &free_linears); + atomic_set(&linear_info[i].use_count, 0); + + pg = pfn_to_page(linear_info[i].base_pfn); for (j = 0; j < npages; ++j) { atomic_inc(&pg->_count); ++pg; @@ -127,30 +171,59 @@ void __init kvm_rma_init(void) } } -struct kvmppc_rma_info *kvm_alloc_rma(void) +static struct kvmppc_linear_info *kvm_alloc_linear(int type) { - struct kvmppc_rma_info *ri; + struct kvmppc_linear_info *ri; ri = NULL; - spin_lock(&rma_lock); - if (!list_empty(&free_rmas)) { - ri = list_first_entry(&free_rmas, struct kvmppc_rma_info, list); + spin_lock(&linear_lock); + list_for_each_entry(ri, &free_linears, list) { + if (ri->type != type) + continue; + list_del(&ri->list); atomic_inc(&ri->use_count); + break; } - spin_unlock(&rma_lock); + spin_unlock(&linear_lock); + memset(ri->base_virt, 0, ri->npages << PAGE_SHIFT); return ri; } -EXPORT_SYMBOL_GPL(kvm_alloc_rma); -void kvm_release_rma(struct kvmppc_rma_info *ri) +static void kvm_release_linear(struct kvmppc_linear_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - spin_lock(&rma_lock); - list_add_tail(&ri->list, &free_rmas); - spin_unlock(&rma_lock); + spin_lock(&linear_lock); + list_add_tail(&ri->list, &free_linears); + spin_unlock(&linear_lock); } } -EXPORT_SYMBOL_GPL(kvm_release_rma); +/* + * Called at boot time while the bootmem allocator is active, + * to allocate contiguous physical memory for the hash page + * tables for guests. + */ +void __init kvm_linear_init(void) +{ + /* HPT */ + kvm_linear_init_one(1 << HPT_ORDER, kvm_hpt_count, KVM_LINEAR_HPT); + + /* RMA */ + /* Only do this on PPC970 in HV mode */ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_201)) + return; + + if (!kvm_rma_size || !kvm_rma_count) + return; + + /* Check that the requested size is one supported in hardware */ + if (lpcr_rmls(kvm_rma_size) < 0) { + pr_err("RMA size of 0x%lx not supported\n", kvm_rma_size); + return; + } + + kvm_linear_init_one(kvm_rma_size, kvm_rma_count, KVM_LINEAR_RMA); +} diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index bacb0cfa360..def880aea63 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -11,6 +11,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/hugetlb.h> +#include <linux/module.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -20,95 +21,307 @@ #include <asm/synch.h> #include <asm/ppc-opcode.h> -/* For now use fixed-size 16MB page table */ -#define HPT_ORDER 24 -#define HPT_NPTEG (1ul << (HPT_ORDER - 7)) /* 128B per pteg */ -#define HPT_HASH_MASK (HPT_NPTEG - 1) +/* Translate address of a vmalloc'd thing to a linear map address */ +static void *real_vmalloc_addr(void *x) +{ + unsigned long addr = (unsigned long) x; + pte_t *p; -#define HPTE_V_HVLOCK 0x40UL + p = find_linux_pte(swapper_pg_dir, addr); + if (!p || !pte_present(*p)) + return NULL; + /* assume we don't have huge pages in vmalloc space... */ + addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); + return __va(addr); +} -static inline long lock_hpte(unsigned long *hpte, unsigned long bits) +/* + * Add this HPTE into the chain for the real page. + * Must be called with the chain locked; it unlocks the chain. + */ +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode) { - unsigned long tmp, old; + struct revmap_entry *head, *tail; + unsigned long i; - asm volatile(" ldarx %0,0,%2\n" - " and. %1,%0,%3\n" - " bne 2f\n" - " ori %0,%0,%4\n" - " stdcx. %0,0,%2\n" - " beq+ 2f\n" - " li %1,%3\n" - "2: isync" - : "=&r" (tmp), "=&r" (old) - : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) - : "cc", "memory"); - return old == 0; + if (*rmap & KVMPPC_RMAP_PRESENT) { + i = *rmap & KVMPPC_RMAP_INDEX; + head = &kvm->arch.revmap[i]; + if (realmode) + head = real_vmalloc_addr(head); + tail = &kvm->arch.revmap[head->back]; + if (realmode) + tail = real_vmalloc_addr(tail); + rev->forw = i; + rev->back = head->back; + tail->forw = pte_index; + head->back = pte_index; + } else { + rev->forw = rev->back = pte_index; + i = pte_index; + } + smp_wmb(); + *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ +} +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); + +/* Remove this HPTE from the chain for a real page */ +static void remove_revmap_chain(struct kvm *kvm, long pte_index, + struct revmap_entry *rev, + unsigned long hpte_v, unsigned long hpte_r) +{ + struct revmap_entry *next, *prev; + unsigned long gfn, ptel, head; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + unsigned long rcbits; + + rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); + ptel = rev->guest_rpte |= rcbits; + gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return; + + rmap = real_vmalloc_addr(&memslot->rmap[gfn - memslot->base_gfn]); + lock_rmap(rmap); + + head = *rmap & KVMPPC_RMAP_INDEX; + next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next->back = rev->back; + prev->forw = rev->forw; + if (head == pte_index) { + head = rev->forw; + if (head == pte_index) + *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + else + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; + } + *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; + unlock_rmap(rmap); +} + +static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, + int writing, unsigned long *pte_sizep) +{ + pte_t *ptep; + unsigned long ps = *pte_sizep; + unsigned int shift; + + ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); + if (!ptep) + return __pte(0); + if (shift) + *pte_sizep = 1ul << shift; + else + *pte_sizep = PAGE_SIZE; + if (ps > *pte_sizep) + return __pte(0); + if (!pte_present(*ptep)) + return __pte(0); + return kvmppc_read_update_linux_pte(ptep, writing); +} + +static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v) +{ + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); + hpte[0] = hpte_v; } long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel) { - unsigned long porder; struct kvm *kvm = vcpu->kvm; - unsigned long i, lpn, pa; + unsigned long i, pa, gpa, gfn, psize; + unsigned long slot_fn, hva; unsigned long *hpte; + struct revmap_entry *rev; + unsigned long g_ptel = ptel; + struct kvm_memory_slot *memslot; + unsigned long *physp, pte_size; + unsigned long is_io; + unsigned long *rmap; + pte_t pte; + unsigned int writing; + unsigned long mmu_seq; + unsigned long rcbits; + bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; - /* only handle 4k, 64k and 16M pages for now */ - porder = 12; - if (pteh & HPTE_V_LARGE) { - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (ptel & 0xf000) == 0x1000) { - /* 64k page */ - porder = 16; - } else if ((ptel & 0xff000) == 0) { - /* 16M page */ - porder = 24; - /* lowest AVA bit must be 0 for 16M pages */ - if (pteh & 0x80) - return H_PARAMETER; - } else + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + writing = hpte_is_writable(ptel); + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + + /* used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); + pa = 0; + is_io = ~0ul; + rmap = NULL; + if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { + /* PPC970 can't do emulated MMIO */ + if (!cpu_has_feature(CPU_FTR_ARCH_206)) return H_PARAMETER; + /* Emulated MMIO - mark this with key=31 */ + pteh |= HPTE_V_ABSENT; + ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; + goto do_insert; } - lpn = (ptel & HPTE_R_RPN) >> kvm->arch.ram_porder; - if (lpn >= kvm->arch.ram_npages || porder > kvm->arch.ram_porder) - return H_PARAMETER; - pa = kvm->arch.ram_pginfo[lpn].pfn << PAGE_SHIFT; - if (!pa) + + /* Check if the requested page fits entirely in the memslot. */ + if (!slot_is_aligned(memslot, psize)) return H_PARAMETER; - /* Check WIMG */ - if ((ptel & HPTE_R_WIMG) != HPTE_R_M && - (ptel & HPTE_R_WIMG) != (HPTE_R_W | HPTE_R_I | HPTE_R_M)) + slot_fn = gfn - memslot->base_gfn; + rmap = &memslot->rmap[slot_fn]; + + if (!kvm->arch.using_mmu_notifiers) { + physp = kvm->arch.slot_phys[memslot->id]; + if (!physp) + return H_PARAMETER; + physp += slot_fn; + if (realmode) + physp = real_vmalloc_addr(physp); + pa = *physp; + if (!pa) + return H_TOO_HARD; + is_io = pa & (HPTE_R_I | HPTE_R_W); + pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK); + pa &= PAGE_MASK; + } else { + /* Translate to host virtual address */ + hva = gfn_to_hva_memslot(memslot, gfn); + + /* Look up the Linux PTE for the backing page */ + pte_size = psize; + pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); + if (pte_present(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + } + } + if (pte_size < psize) return H_PARAMETER; - pteh &= ~0x60UL; - ptel &= ~(HPTE_R_PP0 - kvm->arch.ram_psize); + if (pa && pte_size > psize) + pa |= gpa & (pte_size - 1); + + ptel &= ~(HPTE_R_PP0 - psize); ptel |= pa; - if (pte_index >= (HPT_NPTEG << 3)) + + if (pa) + pteh |= HPTE_V_VALID; + else + pteh |= HPTE_V_ABSENT; + + /* Check WIMG */ + if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { + if (is_io) + return H_PARAMETER; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); + ptel |= HPTE_R_M; + } + + /* Find and lock the HPTEG slot to use */ + do_insert: + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (likely((flags & H_EXACT) == 0)) { pte_index &= ~7UL; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - for (i = 0; ; ++i) { - if (i == 8) - return H_PTEG_FULL; + for (i = 0; i < 8; ++i) { if ((*hpte & HPTE_V_VALID) == 0 && - lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) + try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) break; hpte += 2; } + if (i == 8) { + /* + * Since try_lock_hpte doesn't retry (not even stdcx. + * failures), it could be that there is a free slot + * but we transiently failed to lock it. Try again, + * actually locking each slot and checking it. + */ + hpte -= 16; + for (i = 0; i < 8; ++i) { + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT))) + break; + *hpte &= ~HPTE_V_HVLOCK; + hpte += 2; + } + if (i == 8) + return H_PTEG_FULL; + } + pte_index += i; } else { - i = 0; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - if (!lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID)) - return H_PTEG_FULL; + if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) { + /* Lock the slot and check again */ + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + *hpte &= ~HPTE_V_HVLOCK; + return H_PTEG_FULL; + } + } } + + /* Save away the guest's idea of the second HPTE dword */ + rev = &kvm->arch.revmap[pte_index]; + if (realmode) + rev = real_vmalloc_addr(rev); + if (rev) + rev->guest_rpte = g_ptel; + + /* Link HPTE into reverse-map chain */ + if (pteh & HPTE_V_VALID) { + if (realmode) + rmap = real_vmalloc_addr(rmap); + lock_rmap(rmap); + /* Check for pending invalidations under the rmap chain lock */ + if (kvm->arch.using_mmu_notifiers && + mmu_notifier_retry(vcpu, mmu_seq)) { + /* inval in progress, write a non-present HPTE */ + pteh |= HPTE_V_ABSENT; + pteh &= ~HPTE_V_VALID; + unlock_rmap(rmap); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, + realmode); + /* Only set R/C in real HPTE if already set in *rmap */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); + } + } + hpte[1] = ptel; + + /* Write the first HPTE dword, unlocking the HPTE and making it valid */ eieio(); hpte[0] = pteh; asm volatile("ptesync" : : : "memory"); - atomic_inc(&kvm->arch.ram_pginfo[lpn].refcnt); - vcpu->arch.gpr[4] = pte_index + i; + + vcpu->arch.gpr[4] = pte_index; return H_SUCCESS; } +EXPORT_SYMBOL_GPL(kvmppc_h_enter); #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) @@ -137,37 +350,46 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, struct kvm *kvm = vcpu->kvm; unsigned long *hpte; unsigned long v, r, rb; + struct revmap_entry *rev; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } - if (atomic_read(&kvm->online_vcpus) == 1) - flags |= H_LOCAL; - vcpu->arch.gpr[4] = v = hpte[0] & ~HPTE_V_HVLOCK; - vcpu->arch.gpr[5] = r = hpte[1]; - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = 0; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + v = hpte[0] & ~HPTE_V_HVLOCK; + if (v & HPTE_V_VALID) { + hpte[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(v, hpte[1], pte_index); + if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } + /* Read PTE low word after tlbie to get final R/C values */ + remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); } + r = rev->guest_rpte; + unlock_hpte(hpte, 0); + + vcpu->arch.gpr[4] = v; + vcpu->arch.gpr[5] = r; return H_SUCCESS; } @@ -175,78 +397,117 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; unsigned long *args = &vcpu->arch.gpr[4]; - unsigned long *hp, tlbrb[4]; - long int i, found; - long int n_inval = 0; - unsigned long flags, req, pte_index; + unsigned long *hp, *hptes[4], tlbrb[4]; + long int i, j, k, n, found, indexes[4]; + unsigned long flags, req, pte_index, rcbits; long int local = 0; long int ret = H_SUCCESS; + struct revmap_entry *rev, *revs[4]; if (atomic_read(&kvm->online_vcpus) == 1) local = 1; - for (i = 0; i < 4; ++i) { - pte_index = args[i * 2]; - flags = pte_index >> 56; - pte_index &= ((1ul << 56) - 1); - req = flags >> 6; - flags &= 3; - if (req == 3) - break; - if (req != 1 || flags == 3 || - pte_index >= (HPT_NPTEG << 3)) { - /* parameter error */ - args[i * 2] = ((0xa0 | flags) << 56) + pte_index; - ret = H_PARAMETER; - break; - } - hp = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hp, HPTE_V_HVLOCK)) - cpu_relax(); - found = 0; - if (hp[0] & HPTE_V_VALID) { - switch (flags & 3) { - case 0: /* absolute */ - found = 1; + for (i = 0; i < 4 && ret == H_SUCCESS; ) { + n = 0; + for (; i < 4; ++i) { + j = i * 2; + pte_index = args[j]; + flags = pte_index >> 56; + pte_index &= ((1ul << 56) - 1); + req = flags >> 6; + flags &= 3; + if (req == 3) { /* no more requests */ + i = 4; break; - case 1: /* andcond */ - if (!(hp[0] & args[i * 2 + 1])) - found = 1; + } + if (req != 1 || flags == 3 || pte_index >= HPT_NPTE) { + /* parameter error */ + args[j] = ((0xa0 | flags) << 56) + pte_index; + ret = H_PARAMETER; break; - case 2: /* AVPN */ - if ((hp[0] & ~0x7fUL) == args[i * 2 + 1]) + } + hp = (unsigned long *) + (kvm->arch.hpt_virt + (pte_index << 4)); + /* to avoid deadlock, don't spin except for first */ + if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { + if (n) + break; + while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) + cpu_relax(); + } + found = 0; + if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) { + switch (flags & 3) { + case 0: /* absolute */ found = 1; - break; + break; + case 1: /* andcond */ + if (!(hp[0] & args[j + 1])) + found = 1; + break; + case 2: /* AVPN */ + if ((hp[0] & ~0x7fUL) == args[j + 1]) + found = 1; + break; + } + } + if (!found) { + hp[0] &= ~HPTE_V_HVLOCK; + args[j] = ((0x90 | flags) << 56) + pte_index; + continue; } + + args[j] = ((0x80 | flags) << 56) + pte_index; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + + if (!(hp[0] & HPTE_V_VALID)) { + /* insert R and C bits from PTE */ + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + continue; + } + + hp[0] &= ~HPTE_V_VALID; /* leave it locked */ + tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index); + indexes[n] = j; + hptes[n] = hp; + revs[n] = rev; + ++n; + } + + if (!n) + break; + + /* Now that we've collected a batch, do the tlbies */ + if (!local) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + for (k = 0; k < n; ++k) + asm volatile(PPC_TLBIE(%1,%0) : : + "r" (tlbrb[k]), + "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + for (k = 0; k < n; ++k) + asm volatile("tlbiel %0" : : "r" (tlbrb[k])); + asm volatile("ptesync" : : : "memory"); } - if (!found) { - hp[0] &= ~HPTE_V_HVLOCK; - args[i * 2] = ((0x90 | flags) << 56) + pte_index; - continue; + + /* Read PTE low words after tlbie to get final R/C values */ + for (k = 0; k < n; ++k) { + j = indexes[k]; + pte_index = args[j] & ((1ul << 56) - 1); + hp = hptes[k]; + rev = revs[k]; + remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]); + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + hp[0] = 0; } - /* insert R and C bits from PTE */ - flags |= (hp[1] >> 5) & 0x0c; - args[i * 2] = ((0x80 | flags) << 56) + pte_index; - tlbrb[n_inval++] = compute_tlbie_rb(hp[0], hp[1], pte_index); - hp[0] = 0; - } - if (n_inval == 0) - return ret; - - if (!local) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile(PPC_TLBIE(%1,%0) - : : "r" (tlbrb[i]), "r" (kvm->arch.lpid)); - asm volatile("eieio; tlbsync; ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - for (i = 0; i < n_inval; ++i) - asm volatile("tlbiel %0" : : "r" (tlbrb[i])); - asm volatile("ptesync" : : : "memory"); } + return ret; } @@ -256,40 +517,55 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, { struct kvm *kvm = vcpu->kvm; unsigned long *hpte; - unsigned long v, r, rb; + struct revmap_entry *rev; + unsigned long v, r, rb, mask, bits; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; + hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); - while (!lock_hpte(hpte, HPTE_V_HVLOCK)) + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - if ((hpte[0] & HPTE_V_VALID) == 0 || + if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) { hpte[0] &= ~HPTE_V_HVLOCK; return H_NOT_FOUND; } + if (atomic_read(&kvm->online_vcpus) == 1) flags |= H_LOCAL; v = hpte[0]; - r = hpte[1] & ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | - HPTE_R_KEY_HI | HPTE_R_KEY_LO); - r |= (flags << 55) & HPTE_R_PP0; - r |= (flags << 48) & HPTE_R_KEY_HI; - r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); - rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = v & ~HPTE_V_VALID; - if (!(flags & H_LOCAL)) { - while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) - cpu_relax(); - asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" - : : "r" (rb), "r" (kvm->arch.lpid)); - asm volatile("ptesync" : : : "memory"); - kvm->arch.tlbie_lock = 0; - } else { - asm volatile("ptesync" : : : "memory"); - asm volatile("tlbiel %0" : : "r" (rb)); - asm volatile("ptesync" : : : "memory"); + bits = (flags << 55) & HPTE_R_PP0; + bits |= (flags << 48) & HPTE_R_KEY_HI; + bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + /* Update guest view of 2nd HPTE dword */ + mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | + HPTE_R_KEY_HI | HPTE_R_KEY_LO; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + if (rev) { + r = (rev->guest_rpte & ~mask) | bits; + rev->guest_rpte = r; + } + r = (hpte[1] & ~mask) | bits; + + /* Update HPTE */ + if (v & HPTE_V_VALID) { + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = v & ~HPTE_V_VALID; + if (!(flags & H_LOCAL)) { + while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + asm volatile("ptesync" : : : "memory"); + asm volatile("tlbiel %0" : : "r" (rb)); + asm volatile("ptesync" : : : "memory"); + } } hpte[1] = r; eieio(); @@ -298,40 +574,243 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } -static unsigned long reverse_xlate(struct kvm *kvm, unsigned long realaddr) -{ - long int i; - unsigned long offset, rpn; - - offset = realaddr & (kvm->arch.ram_psize - 1); - rpn = (realaddr - offset) >> PAGE_SHIFT; - for (i = 0; i < kvm->arch.ram_npages; ++i) - if (rpn == kvm->arch.ram_pginfo[i].pfn) - return (i << PAGE_SHIFT) + offset; - return HPTE_R_RPN; /* all 1s in the RPN field */ -} - long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long pte_index) { struct kvm *kvm = vcpu->kvm; - unsigned long *hpte, r; + unsigned long *hpte, v, r; int i, n = 1; + struct revmap_entry *rev = NULL; - if (pte_index >= (HPT_NPTEG << 3)) + if (pte_index >= HPT_NPTE) return H_PARAMETER; if (flags & H_READ_4) { pte_index &= ~3; n = 4; } + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); for (i = 0; i < n; ++i, ++pte_index) { hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; r = hpte[1]; - if ((flags & H_R_XLATE) && (hpte[0] & HPTE_V_VALID)) - r = reverse_xlate(kvm, r & HPTE_R_RPN) | - (r & ~HPTE_R_RPN); - vcpu->arch.gpr[4 + i * 2] = hpte[0]; + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + if (v & HPTE_V_VALID) + r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); + vcpu->arch.gpr[4 + i * 2] = v; vcpu->arch.gpr[5 + i * 2] = r; } return H_SUCCESS; } + +void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + + hptep[0] &= ~HPTE_V_VALID; + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + +void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep, + unsigned long pte_index) +{ + unsigned long rb; + unsigned char rbyte; + + rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index); + rbyte = (hptep[1] & ~HPTE_R_R) >> 8; + /* modify only the second-last byte, which contains the ref bit */ + *((char *)hptep + 14) = rbyte; + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync" + : : "r" (rb), "r" (kvm->arch.lpid)); + asm volatile("ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; +} +EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); + +static int slb_base_page_shift[4] = { + 24, /* 16M */ + 16, /* 64k */ + 34, /* 16G */ + 20, /* 1M, unsupported */ +}; + +long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, + unsigned long valid) +{ + unsigned int i; + unsigned int pshift; + unsigned long somask; + unsigned long vsid, hash; + unsigned long avpn; + unsigned long *hpte; + unsigned long mask, val; + unsigned long v, r; + + /* Get page shift, work out hash and AVPN etc. */ + mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; + val = 0; + pshift = 12; + if (slb_v & SLB_VSID_L) { + mask |= HPTE_V_LARGE; + val |= HPTE_V_LARGE; + pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4]; + } + if (slb_v & SLB_VSID_B_1T) { + somask = (1UL << 40) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T; + vsid ^= vsid << 25; + } else { + somask = (1UL << 28) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; + } + hash = (vsid ^ ((eaddr & somask) >> pshift)) & HPT_HASH_MASK; + avpn = slb_v & ~(somask >> 16); /* also includes B */ + avpn |= (eaddr & somask) >> 16; + + if (pshift >= 24) + avpn &= ~((1UL << (pshift - 16)) - 1); + else + avpn &= ~0x7fUL; + val |= avpn; + + for (;;) { + hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7)); + + for (i = 0; i < 16; i += 2) { + /* Read the PTE racily */ + v = hpte[i] & ~HPTE_V_HVLOCK; + + /* Check valid/absent, hash, segment size and AVPN */ + if (!(v & valid) || (v & mask) != val) + continue; + + /* Lock the PTE and read it under the lock */ + while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) + cpu_relax(); + v = hpte[i] & ~HPTE_V_HVLOCK; + r = hpte[i+1]; + + /* + * Check the HPTE again, including large page size + * Since we don't currently allow any MPSS (mixed + * page-size segment) page sizes, it is sufficient + * to check against the actual page size. + */ + if ((v & valid) && (v & mask) == val && + hpte_page_size(v, r) == (1ul << pshift)) + /* Return with the HPTE still locked */ + return (hash << 3) + (i >> 1); + + /* Unlock and move on */ + hpte[i] = v; + } + + if (val & HPTE_V_SECONDARY) + break; + val |= HPTE_V_SECONDARY; + hash = hash ^ HPT_HASH_MASK; + } + return -1; +} +EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); + +/* + * Called in real mode to check whether an HPTE not found fault + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. + * Returns a possibly modified status (DSISR) value if not + * (i.e. pass the interrupt to the guest), + * -1 to pass the fault up to host kernel mode code, -2 to do that + * and also load the instruction word (for MMIO emulation), + * or 0 if we should make the guest retry the access. + */ +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status, bool data) +{ + struct kvm *kvm = vcpu->kvm; + long int index; + unsigned long v, r, gr; + unsigned long *hpte; + unsigned long valid; + struct revmap_entry *rev; + unsigned long pp, key; + + /* For protection fault, expect to find a valid HPTE */ + valid = HPTE_V_VALID; + if (status & DSISR_NOHPTE) + valid |= HPTE_V_ABSENT; + + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } + hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4)); + v = hpte[0] & ~HPTE_V_HVLOCK; + r = hpte[1]; + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; + + unlock_hpte(hpte, v); + + /* For not found, if the HPTE is valid by now, retry the instruction */ + if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) + return 0; + + /* Check access permissions to the page */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS; + status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */ + if (!data) { + if (gr & (HPTE_R_N | HPTE_R_G)) + return status | SRR1_ISI_N_OR_G; + if (!hpte_read_permission(pp, slb_v & key)) + return status | SRR1_ISI_PROT; + } else if (status & DSISR_ISSTORE) { + /* check write permission */ + if (!hpte_write_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } else { + if (!hpte_read_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } + + /* Check storage key, if applicable */ + if (data && (vcpu->arch.shregs.msr & MSR_DR)) { + unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (status & DSISR_ISSTORE) + perm >>= 1; + if (perm & 1) + return status | DSISR_KEYFAULT; + } + + /* Save HPTE info for virtual-mode handler */ + vcpu->arch.pgfault_addr = addr; + vcpu->arch.pgfault_index = index; + vcpu->arch.pgfault_hpte[0] = v; + vcpu->arch.pgfault_hpte[1] = r; + + /* Check the storage key to see if it is possibly emulated MMIO */ + if (data && (vcpu->arch.shregs.msr & MSR_IR) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) + return -2; /* MMIO emulation - load instr word */ + + return -1; /* send fault up to host kernel mode */ +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 5c8b26183f5..b70bf22a3ff 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -601,6 +601,30 @@ kvmppc_interrupt: stw r12,VCPU_TRAP(r9) + /* Save HEIR (HV emulation assist reg) in last_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,KVM_INST_FETCH_FAILED +BEGIN_FTR_SECTION + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +11: stw r3,VCPU_LAST_INST(r9) + + /* these are volatile across C function calls */ + mfctr r3 + mfxer r4 + std r3, VCPU_CTR(r9) + stw r4, VCPU_XER(r9) + +BEGIN_FTR_SECTION + /* If this is a page table miss then see if it's theirs or ours */ + cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + beq kvmppc_hdsi + cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE + beq kvmppc_hisi +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) + /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -608,7 +632,7 @@ kvmppc_interrupt: cmpwi r3,0 bge ignore_hdec 2: - /* See if this is something we can handle in real mode */ + /* See if this is an hcall we can handle in real mode */ cmpwi r12,BOOK3S_INTERRUPT_SYSCALL beq hcall_try_real_mode @@ -624,6 +648,7 @@ BEGIN_FTR_SECTION 1: END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) +nohpte_cont: hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ /* Save DEC */ mfspr r5,SPRN_DEC @@ -632,36 +657,21 @@ hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ add r5,r5,r6 std r5,VCPU_DEC_EXPIRES(r9) - /* Save HEIR (HV emulation assist reg) in last_inst - if this is an HEI (HV emulation interrupt, e40) */ - li r3,-1 -BEGIN_FTR_SECTION - cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST - bne 11f - mfspr r3,SPRN_HEIR -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -11: stw r3,VCPU_LAST_INST(r9) - /* Save more register state */ - mfxer r5 mfdar r6 mfdsisr r7 - mfctr r8 - - stw r5, VCPU_XER(r9) std r6, VCPU_DAR(r9) stw r7, VCPU_DSISR(r9) - std r8, VCPU_CTR(r9) - /* grab HDAR & HDSISR if HV data storage interrupt (HDSI) */ BEGIN_FTR_SECTION + /* don't overwrite fault_dar/fault_dsisr if HDSI */ cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE beq 6f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) -7: std r6, VCPU_FAULT_DAR(r9) + std r6, VCPU_FAULT_DAR(r9) stw r7, VCPU_FAULT_DSISR(r9) /* Save guest CTRL register, set runlatch to 1 */ - mfspr r6,SPRN_CTRLF +6: mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) andi. r0,r6,1 bne 4f @@ -1094,9 +1104,131 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) mtspr SPRN_HSRR1, r7 ba 0x500 -6: mfspr r6,SPRN_HDAR - mfspr r7,SPRN_HDSISR - b 7b +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path. In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: + mfspr r4, SPRN_HDAR + mfspr r6, SPRN_HDSISR + /* HPTE not found fault or protection fault? */ + andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h + beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f + clrrdi r0, r4, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) + + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + li r7, 1 /* data fault */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + cmpdi r3, -2 /* MMIO emulation; need instr word */ + beq 2f + + /* Synthesize a DSI for the guest */ + ld r4, VCPU_FAULT_DAR(r9) + mr r6, r3 +1: mtspr SPRN_DAR, r4 + mtspr SPRN_DSISR, r6 + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_DATA_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r5) + b 4b + + /* If this is for emulated MMIO, load the instruction word */ +2: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r0, KVM_GUEST_MODE_SKIP + stb r0, HSTATE_IN_GUEST(r13) + + /* Do the access with MSR:DR enabled */ + mfmsr r3 + ori r4, r3, MSR_DR /* Enable paging for data */ + mtmsrd r4 + lwz r8, 0(r10) + mtmsrd r3 + + /* Store the result */ + stw r8, VCPU_LAST_INST(r9) + + /* Unset guest mode. */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + b nohpte_cont + +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: + andis. r0, r11, SRR1_ISI_NOPT@h + beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f + clrrdi r0, r10, 28 + PPC_SLBFEE_DOT(r5, r0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: + /* Search the hash table. */ + mr r3, r9 /* vcpu pointer */ + mr r4, r10 + mr r6, r11 + li r7, 0 /* instruction fault */ + bl .kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_INST_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq nohpte_cont + + /* Synthesize an ISI for the guest */ + mr r11, r3 +1: mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_INST_STORAGE + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ + rotldi r11, r11, 63 +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r6) + b 4b /* * Try to handle an hcall in real mode. diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c index 7b0ee96c1be..e70ef2d8643 100644 --- a/arch/powerpc/kvm/book3s_paired_singles.c +++ b/arch/powerpc/kvm/book3s_paired_singles.c @@ -196,7 +196,8 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, len, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + len, 1); goto done_load; } @@ -286,11 +287,13 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_inject_pf(vcpu, addr, false); goto done_load; } else if ((r == EMULATE_DO_MMIO) && w) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FPR | rs, 4, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + 4, 1); vcpu->arch.qpr[rs] = tmp[1]; goto done_load; } else if (r == EMULATE_DO_MMIO) { - emulated = kvmppc_handle_load(run, vcpu, KVM_REG_FQPR | rs, 8, 1); + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs, + 8, 1); goto done_load; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 220fcdf2697..7340e1090b7 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -51,15 +51,19 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, #define MSR_USER32 MSR_USER #define MSR_USER64 MSR_USER #define HW_PAGE_SIZE PAGE_SIZE +#define __hard_irq_disable local_irq_disable +#define __hard_irq_enable local_irq_enable #endif void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_svcpu(vcpu)->slb, to_book3s(vcpu)->slb_shadow, sizeof(to_svcpu(vcpu)->slb)); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, sizeof(svcpu->slb)); memcpy(&get_paca()->shadow_vcpu, to_book3s(vcpu)->shadow_vcpu, sizeof(get_paca()->shadow_vcpu)); - to_svcpu(vcpu)->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu_put(svcpu); #endif #ifdef CONFIG_PPC_BOOK3S_32 @@ -70,10 +74,12 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { #ifdef CONFIG_PPC_BOOK3S_64 - memcpy(to_book3s(vcpu)->slb_shadow, to_svcpu(vcpu)->slb, sizeof(to_svcpu(vcpu)->slb)); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); memcpy(to_book3s(vcpu)->shadow_vcpu, &get_paca()->shadow_vcpu, sizeof(get_paca()->shadow_vcpu)); - to_book3s(vcpu)->slb_shadow_max = to_svcpu(vcpu)->slb_max; + to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; + svcpu_put(svcpu); #endif kvmppc_giveup_ext(vcpu, MSR_FP); @@ -151,14 +157,16 @@ void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) #ifdef CONFIG_PPC_BOOK3S_64 if ((pvr >= 0x330000) && (pvr < 0x70330000)) { kvmppc_mmu_book3s_64_init(vcpu); - to_book3s(vcpu)->hior = 0xfff00000; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0xfff00000; to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_64; } else #endif { kvmppc_mmu_book3s_32_init(vcpu); - to_book3s(vcpu)->hior = 0; + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0; to_book3s(vcpu)->msr_mask = 0xffffffffULL; vcpu->arch.cpu_type = KVM_CPU_3S_32; } @@ -308,19 +316,22 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (page_found == -ENOENT) { /* Page not found in guest PTE entries */ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dsisr = svcpu->fault_dsisr; vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + (svcpu->shadow_srr1 & 0x00000000f8000000ULL); + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EPERM) { /* Storage protection */ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); vcpu->arch.shared->dar = kvmppc_get_fault_dar(vcpu); - vcpu->arch.shared->dsisr = - to_svcpu(vcpu)->fault_dsisr & ~DSISR_NOHPTE; + vcpu->arch.shared->dsisr = svcpu->fault_dsisr & ~DSISR_NOHPTE; vcpu->arch.shared->dsisr |= DSISR_PROTFAULT; vcpu->arch.shared->msr |= - (to_svcpu(vcpu)->shadow_srr1 & 0x00000000f8000000ULL); + svcpu->shadow_srr1 & 0x00000000f8000000ULL; + svcpu_put(svcpu); kvmppc_book3s_queue_irqprio(vcpu, vec); } else if (page_found == -EINVAL) { /* Page not found in guest SLB */ @@ -517,24 +528,29 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, run->ready_for_interrupt_injection = 1; trace_kvm_book3s_exit(exit_nr, vcpu); + preempt_enable(); kvm_resched(vcpu); switch (exit_nr) { case BOOK3S_INTERRUPT_INST_STORAGE: + { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong shadow_srr1 = svcpu->shadow_srr1; vcpu->stat.pf_instruc++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if (to_svcpu(vcpu)->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] - == SR_INVALID) { + if (svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT] == SR_INVALID) { kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); r = RESUME_GUEST; + svcpu_put(svcpu); break; } #endif + svcpu_put(svcpu); /* only care about PTEG not found errors, but leave NX alone */ - if (to_svcpu(vcpu)->shadow_srr1 & 0x40000000) { + if (shadow_srr1 & 0x40000000) { r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); vcpu->stat.sp_instruc++; } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && @@ -547,33 +563,37 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); r = RESUME_GUEST; } else { - vcpu->arch.shared->msr |= - to_svcpu(vcpu)->shadow_srr1 & 0x58000000; + vcpu->arch.shared->msr |= shadow_srr1 & 0x58000000; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } break; + } case BOOK3S_INTERRUPT_DATA_STORAGE: { ulong dar = kvmppc_get_fault_dar(vcpu); + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u32 fault_dsisr = svcpu->fault_dsisr; vcpu->stat.pf_storage++; #ifdef CONFIG_PPC_BOOK3S_32 /* We set segments as unused segments when invalidating them. So * treat the respective fault as segment fault. */ - if ((to_svcpu(vcpu)->sr[dar >> SID_SHIFT]) == SR_INVALID) { + if ((svcpu->sr[dar >> SID_SHIFT]) == SR_INVALID) { kvmppc_mmu_map_segment(vcpu, dar); r = RESUME_GUEST; + svcpu_put(svcpu); break; } #endif + svcpu_put(svcpu); /* The only case we need to handle is missing shadow PTEs */ - if (to_svcpu(vcpu)->fault_dsisr & DSISR_NOHPTE) { + if (fault_dsisr & DSISR_NOHPTE) { r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); } else { vcpu->arch.shared->dar = dar; - vcpu->arch.shared->dsisr = to_svcpu(vcpu)->fault_dsisr; + vcpu->arch.shared->dsisr = fault_dsisr; kvmppc_book3s_queue_irqprio(vcpu, exit_nr); r = RESUME_GUEST; } @@ -609,10 +629,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PROGRAM: { enum emulation_result er; + struct kvmppc_book3s_shadow_vcpu *svcpu; ulong flags; program_interrupt: - flags = to_svcpu(vcpu)->shadow_srr1 & 0x1f0000ull; + svcpu = svcpu_get(vcpu); + flags = svcpu->shadow_srr1 & 0x1f0000ull; + svcpu_put(svcpu); if (vcpu->arch.shared->msr & MSR_PR) { #ifdef EXIT_DEBUG @@ -740,20 +763,33 @@ program_interrupt: r = RESUME_GUEST; break; default: + { + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong shadow_srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); /* Ugh - bork here! What did we get? */ printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", - exit_nr, kvmppc_get_pc(vcpu), to_svcpu(vcpu)->shadow_srr1); + exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); r = RESUME_HOST; BUG(); break; } - + } if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if * we aren't already exiting to userspace for some other * reason. */ + + /* + * Interrupts could be timers for the guest which we have to + * inject again, so let's postpone them until we're in the guest + * and if we really did time things so badly, then we just exit + * again due to a host external interrupt. + */ + __hard_irq_disable(); if (signal_pending(current)) { + __hard_irq_enable(); #ifdef EXIT_DEBUG printk(KERN_EMERG "KVM: Going back to host\n"); #endif @@ -761,10 +797,12 @@ program_interrupt: run->exit_reason = KVM_EXIT_INTR; r = -EINTR; } else { + preempt_disable(); + /* In case an interrupt came in that was triggered * from userspace (like DEC), we need to check what * to inject now! */ - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); } } @@ -836,6 +874,38 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = put_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + break; + default: + break; + } + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = -EINVAL; + + switch (reg->id) { + case KVM_REG_PPC_HIOR: + r = get_user(to_book3s(vcpu)->hior, (u64 __user *)reg->addr); + if (!r) + to_book3s(vcpu)->hior_explicit = true; + break; + default: + break; + } + + return r; +} + int kvmppc_core_check_processor_compat(void) { return 0; @@ -923,16 +993,31 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) #endif ulong ext_msr; + preempt_disable(); + /* Check if we can run the vcpu at all */ if (!vcpu->arch.sane) { kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return -EINVAL; + ret = -EINVAL; + goto out; } + kvmppc_core_prepare_to_enter(vcpu); + + /* + * Interrupts could be timers for the guest which we have to inject + * again, so let's postpone them until we're in the guest and if we + * really did time things so badly, then we just exit again due to + * a host external interrupt. + */ + __hard_irq_disable(); + /* No need to go into the guest when all we do is going out */ if (signal_pending(current)) { + __hard_irq_enable(); kvm_run->exit_reason = KVM_EXIT_INTR; - return -EINTR; + ret = -EINTR; + goto out; } /* Save FPU state in stack */ @@ -974,8 +1059,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvm_guest_exit(); - local_irq_disable(); - current->thread.regs->msr = ext_msr; /* Make sure we save the guest FPU/Altivec/VSX state */ @@ -1002,9 +1085,50 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) current->thread.used_vsr = used_vsr; #endif +out: + preempt_enable(); return ret; } +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + ulong ga, ga_end; + int is_dirty = 0; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + memslot = id_to_memslot(kvm->memslots, log->slot); + + ga = memslot->base_gfn << PAGE_SHIFT; + ga_end = ga + (memslot->npages << PAGE_SHIFT); + + kvm_for_each_vcpu(n, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem) { diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bb6c988f010..ee9e1ee9c85 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -124,12 +124,6 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) vcpu->arch.shared->msr = new_msr; kvmppc_mmu_msr_notify(vcpu, old_msr); - - if (vcpu->arch.shared->msr & MSR_WE) { - kvm_vcpu_block(vcpu); - kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); - }; - kvmppc_vcpu_sync_spe(vcpu); } @@ -258,9 +252,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, allowed = vcpu->arch.shared->msr & MSR_ME; msr_mask = 0; break; - case BOOKE_IRQPRIO_EXTERNAL: case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: + keep_irq = true; + /* fall through */ + case BOOKE_IRQPRIO_EXTERNAL: allowed = vcpu->arch.shared->msr & MSR_EE; allowed = allowed && !crit; msr_mask = MSR_CE|MSR_ME|MSR_DE; @@ -276,7 +272,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, vcpu->arch.shared->srr1 = vcpu->arch.shared->msr; vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; if (update_esr == true) - vcpu->arch.esr = vcpu->arch.queued_esr; + vcpu->arch.shared->esr = vcpu->arch.queued_esr; if (update_dear == true) vcpu->arch.shared->dar = vcpu->arch.queued_dear; kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); @@ -288,13 +284,26 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, return allowed; } -/* Check pending exceptions and deliver one, if possible. */ -void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +static void update_timer_ints(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) + kvmppc_core_queue_dec(vcpu); + else + kvmppc_core_dequeue_dec(vcpu); +} + +static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) { unsigned long *pending = &vcpu->arch.pending_exceptions; - unsigned long old_pending = vcpu->arch.pending_exceptions; unsigned int priority; + if (vcpu->requests) { + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) { + smp_mb(); + update_timer_ints(vcpu); + } + } + priority = __ffs(*pending); while (priority <= BOOKE_IRQPRIO_MAX) { if (kvmppc_booke_irqprio_deliver(vcpu, priority)) @@ -306,10 +315,24 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) } /* Tell the guest about our interrupt status */ - if (*pending) - vcpu->arch.shared->int_pending = 1; - else if (old_pending) - vcpu->arch.shared->int_pending = 0; + vcpu->arch.shared->int_pending = !!*pending; +} + +/* Check pending exceptions and deliver one, if possible. */ +void kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + WARN_ON_ONCE(!irqs_disabled()); + + kvmppc_core_check_exceptions(vcpu); + + if (vcpu->arch.shared->msr & MSR_WE) { + local_irq_enable(); + kvm_vcpu_block(vcpu); + local_irq_disable(); + + kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); + kvmppc_core_check_exceptions(vcpu); + }; } int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) @@ -322,11 +345,21 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } local_irq_disable(); + + kvmppc_core_prepare_to_enter(vcpu); + + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + ret = -EINTR; + goto out; + } + kvm_guest_enter(); ret = __kvmppc_vcpu_run(kvm_run, vcpu); kvm_guest_exit(); - local_irq_enable(); +out: + local_irq_enable(); return ret; } @@ -603,7 +636,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, local_irq_disable(); - kvmppc_core_deliver_interrupts(vcpu); + kvmppc_core_prepare_to_enter(vcpu); if (!(r & RESUME_HOST)) { /* To avoid clobbering exit_reason, only check for signals if @@ -628,6 +661,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pc = 0; vcpu->arch.shared->msr = 0; vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; + vcpu->arch.shared->pir = vcpu->vcpu_id; kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ vcpu->arch.shadow_pid = 1; @@ -662,10 +696,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg4 = vcpu->arch.sprg4; - regs->sprg5 = vcpu->arch.sprg5; - regs->sprg6 = vcpu->arch.sprg6; - regs->sprg7 = vcpu->arch.sprg7; + regs->sprg4 = vcpu->arch.shared->sprg4; + regs->sprg5 = vcpu->arch.shared->sprg5; + regs->sprg6 = vcpu->arch.shared->sprg6; + regs->sprg7 = vcpu->arch.shared->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -690,10 +724,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg4 = regs->sprg4; - vcpu->arch.sprg5 = regs->sprg5; - vcpu->arch.sprg6 = regs->sprg6; - vcpu->arch.sprg7 = regs->sprg7; + vcpu->arch.shared->sprg4 = regs->sprg4; + vcpu->arch.shared->sprg5 = regs->sprg5; + vcpu->arch.shared->sprg6 = regs->sprg6; + vcpu->arch.shared->sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); @@ -711,7 +745,7 @@ static void get_sregs_base(struct kvm_vcpu *vcpu, sregs->u.e.csrr0 = vcpu->arch.csrr0; sregs->u.e.csrr1 = vcpu->arch.csrr1; sregs->u.e.mcsr = vcpu->arch.mcsr; - sregs->u.e.esr = vcpu->arch.esr; + sregs->u.e.esr = vcpu->arch.shared->esr; sregs->u.e.dear = vcpu->arch.shared->dar; sregs->u.e.tsr = vcpu->arch.tsr; sregs->u.e.tcr = vcpu->arch.tcr; @@ -729,28 +763,19 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, vcpu->arch.csrr0 = sregs->u.e.csrr0; vcpu->arch.csrr1 = sregs->u.e.csrr1; vcpu->arch.mcsr = sregs->u.e.mcsr; - vcpu->arch.esr = sregs->u.e.esr; + vcpu->arch.shared->esr = sregs->u.e.esr; vcpu->arch.shared->dar = sregs->u.e.dear; vcpu->arch.vrsave = sregs->u.e.vrsave; - vcpu->arch.tcr = sregs->u.e.tcr; + kvmppc_set_tcr(vcpu, sregs->u.e.tcr); - if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) { vcpu->arch.dec = sregs->u.e.dec; - - kvmppc_emulate_dec(vcpu); + kvmppc_emulate_dec(vcpu); + } if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { - /* - * FIXME: existing KVM timer handling is incomplete. - * TSR cannot be read by the guest, and its value in - * vcpu->arch is always zero. For now, just handle - * the case where the caller is trying to inject a - * decrementer interrupt. - */ - - if ((sregs->u.e.tsr & TSR_DIS) && - (vcpu->arch.tcr & TCR_DIE)) - kvmppc_core_queue_dec(vcpu); + vcpu->arch.tsr = sregs->u.e.tsr; + update_timer_ints(vcpu); } return 0; @@ -761,7 +786,7 @@ static void get_sregs_arch206(struct kvm_vcpu *vcpu, { sregs->u.e.features |= KVM_SREGS_E_ARCH206; - sregs->u.e.pir = 0; + sregs->u.e.pir = vcpu->vcpu_id; sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; sregs->u.e.decar = vcpu->arch.decar; @@ -774,7 +799,7 @@ static int set_sregs_arch206(struct kvm_vcpu *vcpu, if (!(sregs->u.e.features & KVM_SREGS_E_ARCH206)) return 0; - if (sregs->u.e.pir != 0) + if (sregs->u.e.pir != vcpu->vcpu_id) return -EINVAL; vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; @@ -862,6 +887,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return kvmppc_core_set_sregs(vcpu, sregs); } +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + return -EINVAL; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; @@ -906,6 +941,33 @@ void kvmppc_core_destroy_vm(struct kvm *kvm) { } +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) +{ + vcpu->arch.tcr = new_tcr; + update_timer_ints(vcpu); +} + +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + set_bits(tsr_bits, &vcpu->arch.tsr); + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + clear_bits(tsr_bits, &vcpu->arch.tsr); + update_timer_ints(vcpu); +} + +void kvmppc_decrementer_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + + kvmppc_set_tsr_bits(vcpu, TSR_DIS); +} + int __init kvmppc_booke_init(void) { unsigned long ivor[16]; diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h index 8e1fe33d64e..2fe202705a3 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -55,6 +55,10 @@ extern unsigned long kvmppc_booke_handlers; void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); + int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c index 1260f5f24c0..3e652da3653 100644 --- a/arch/powerpc/kvm/booke_emulate.c +++ b/arch/powerpc/kvm/booke_emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2008 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -107,7 +108,7 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_DEAR: vcpu->arch.shared->dar = spr_val; break; case SPRN_ESR: - vcpu->arch.esr = spr_val; break; + vcpu->arch.shared->esr = spr_val; break; case SPRN_DBCR0: vcpu->arch.dbcr0 = spr_val; break; case SPRN_DBCR1: @@ -115,23 +116,23 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) case SPRN_DBSR: vcpu->arch.dbsr &= ~spr_val; break; case SPRN_TSR: - vcpu->arch.tsr &= ~spr_val; break; + kvmppc_clr_tsr_bits(vcpu, spr_val); + break; case SPRN_TCR: - vcpu->arch.tcr = spr_val; - kvmppc_emulate_dec(vcpu); + kvmppc_set_tcr(vcpu, spr_val); break; /* Note: SPRG4-7 are user-readable. These values are * loaded into the real SPRGs when resuming the * guest. */ case SPRN_SPRG4: - vcpu->arch.sprg4 = spr_val; break; + vcpu->arch.shared->sprg4 = spr_val; break; case SPRN_SPRG5: - vcpu->arch.sprg5 = spr_val; break; + vcpu->arch.shared->sprg5 = spr_val; break; case SPRN_SPRG6: - vcpu->arch.sprg6 = spr_val; break; + vcpu->arch.shared->sprg6 = spr_val; break; case SPRN_SPRG7: - vcpu->arch.sprg7 = spr_val; break; + vcpu->arch.shared->sprg7 = spr_val; break; case SPRN_IVPR: vcpu->arch.ivpr = spr_val; @@ -202,13 +203,17 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_DEAR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->dar); break; case SPRN_ESR: - kvmppc_set_gpr(vcpu, rt, vcpu->arch.esr); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->esr); break; case SPRN_DBCR0: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr0); break; case SPRN_DBCR1: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbcr1); break; case SPRN_DBSR: kvmppc_set_gpr(vcpu, rt, vcpu->arch.dbsr); break; + case SPRN_TSR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.tsr); break; + case SPRN_TCR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.tcr); break; case SPRN_IVOR0: kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]); diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index 42f2fb1f66e..10d8ef602e5 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -402,19 +402,25 @@ lightweight_exit: /* Save vcpu pointer for the exception handlers. */ mtspr SPRN_SPRG_WVCPU, r4 + lwz r5, VCPU_SHARED(r4) + /* Can't switch the stack pointer until after IVPR is switched, * because host interrupt handlers would get confused. */ lwz r1, VCPU_GPR(r1)(r4) - /* Host interrupt handlers may have clobbered these guest-readable - * SPRGs, so we need to reload them here with the guest's values. */ - lwz r3, VCPU_SPRG4(r4) + /* + * Host interrupt handlers may have clobbered these + * guest-readable SPRGs, or the guest kernel may have + * written directly to the shared area, so we + * need to reload them here with the guest's values. + */ + lwz r3, VCPU_SHARED_SPRG4(r5) mtspr SPRN_SPRG4W, r3 - lwz r3, VCPU_SPRG5(r4) + lwz r3, VCPU_SHARED_SPRG5(r5) mtspr SPRN_SPRG5W, r3 - lwz r3, VCPU_SPRG6(r4) + lwz r3, VCPU_SHARED_SPRG6(r5) mtspr SPRN_SPRG6W, r3 - lwz r3, VCPU_SPRG7(r4) + lwz r3, VCPU_SHARED_SPRG7(r5) mtspr SPRN_SPRG7W, r3 #ifdef CONFIG_KVM_EXIT_TIMING diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 8c0d45a6faf..ddcd896fa2f 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -71,9 +71,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.pvr = mfspr(SPRN_PVR); vcpu_e500->svr = mfspr(SPRN_SVR); - /* Since booke kvm only support one core, update all vcpus' PIR to 0 */ - vcpu->vcpu_id = 0; - vcpu->arch.cpu_type = KVM_CPU_E500V2; return 0; @@ -118,12 +115,12 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; - sregs->u.e.mas0 = vcpu_e500->mas0; - sregs->u.e.mas1 = vcpu_e500->mas1; - sregs->u.e.mas2 = vcpu_e500->mas2; - sregs->u.e.mas7_3 = ((u64)vcpu_e500->mas7 << 32) | vcpu_e500->mas3; - sregs->u.e.mas4 = vcpu_e500->mas4; - sregs->u.e.mas6 = vcpu_e500->mas6; + sregs->u.e.mas0 = vcpu->arch.shared->mas0; + sregs->u.e.mas1 = vcpu->arch.shared->mas1; + sregs->u.e.mas2 = vcpu->arch.shared->mas2; + sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; + sregs->u.e.mas4 = vcpu->arch.shared->mas4; + sregs->u.e.mas6 = vcpu->arch.shared->mas6; sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG); sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg; @@ -151,13 +148,12 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) } if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { - vcpu_e500->mas0 = sregs->u.e.mas0; - vcpu_e500->mas1 = sregs->u.e.mas1; - vcpu_e500->mas2 = sregs->u.e.mas2; - vcpu_e500->mas7 = sregs->u.e.mas7_3 >> 32; - vcpu_e500->mas3 = (u32)sregs->u.e.mas7_3; - vcpu_e500->mas4 = sregs->u.e.mas4; - vcpu_e500->mas6 = sregs->u.e.mas6; + vcpu->arch.shared->mas0 = sregs->u.e.mas0; + vcpu->arch.shared->mas1 = sregs->u.e.mas1; + vcpu->arch.shared->mas2 = sregs->u.e.mas2; + vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; + vcpu->arch.shared->mas4 = sregs->u.e.mas4; + vcpu->arch.shared->mas6 = sregs->u.e.mas6; } if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) @@ -233,6 +229,10 @@ static int __init kvmppc_e500_init(void) unsigned long ivor[3]; unsigned long max_ivor = 0; + r = kvmppc_core_check_processor_compat(); + if (r) + return r; + r = kvmppc_booke_init(); if (r) return r; diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index d48ae396f41..6d0b2bd54fb 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -89,19 +89,23 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) return EMULATE_FAIL; vcpu_e500->pid[2] = spr_val; break; case SPRN_MAS0: - vcpu_e500->mas0 = spr_val; break; + vcpu->arch.shared->mas0 = spr_val; break; case SPRN_MAS1: - vcpu_e500->mas1 = spr_val; break; + vcpu->arch.shared->mas1 = spr_val; break; case SPRN_MAS2: - vcpu_e500->mas2 = spr_val; break; + vcpu->arch.shared->mas2 = spr_val; break; case SPRN_MAS3: - vcpu_e500->mas3 = spr_val; break; + vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= spr_val; + break; case SPRN_MAS4: - vcpu_e500->mas4 = spr_val; break; + vcpu->arch.shared->mas4 = spr_val; break; case SPRN_MAS6: - vcpu_e500->mas6 = spr_val; break; + vcpu->arch.shared->mas6 = spr_val; break; case SPRN_MAS7: - vcpu_e500->mas7 = spr_val; break; + vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; + break; case SPRN_L1CSR0: vcpu_e500->l1csr0 = spr_val; vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); @@ -143,6 +147,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int emulated = EMULATE_DONE; + unsigned long val; switch (sprn) { case SPRN_PID: @@ -152,20 +157,23 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) case SPRN_PID2: kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[2]); break; case SPRN_MAS0: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas0); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas0); break; case SPRN_MAS1: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas1); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas1); break; case SPRN_MAS2: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas2); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas2); break; case SPRN_MAS3: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas3); break; + val = (u32)vcpu->arch.shared->mas7_3; + kvmppc_set_gpr(vcpu, rt, val); + break; case SPRN_MAS4: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas4); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas4); break; case SPRN_MAS6: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas6); break; + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->mas6); break; case SPRN_MAS7: - kvmppc_set_gpr(vcpu, rt, vcpu_e500->mas7); break; - + val = vcpu->arch.shared->mas7_3 >> 32; + kvmppc_set_gpr(vcpu, rt, val); + break; case SPRN_TLB0CFG: kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break; case SPRN_TLB1CFG: diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c index 13c432ea2fa..6e53e4164de 100644 --- a/arch/powerpc/kvm/e500_tlb.c +++ b/arch/powerpc/kvm/e500_tlb.c @@ -12,12 +12,19 @@ * published by the Free Software Foundation. */ +#include <linux/kernel.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> #include <asm/kvm_ppc.h> #include <asm/kvm_e500.h> @@ -26,7 +33,7 @@ #include "trace.h" #include "timing.h" -#define to_htlb1_esel(esel) (tlb1_entry_num - (esel) - 1) +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) struct id { unsigned long val; @@ -63,7 +70,14 @@ static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); * The valid range of shadow ID is [1..255] */ static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); -static unsigned int tlb1_entry_num; +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; + +static struct kvm_book3e_206_tlb_entry *get_entry( + struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ + int offset = vcpu_e500->gtlb_offset[tlbsel]; + return &vcpu_e500->gtlb_arch[offset + entry]; +} /* * Allocate a free shadow id and setup a valid sid mapping in given entry. @@ -116,13 +130,11 @@ static inline int local_sid_lookup(struct id *entry) return -1; } -/* Invalidate all id mappings on local core */ +/* Invalidate all id mappings on local core -- call with preempt disabled */ static inline void local_sid_destroy_all(void) { - preempt_disable(); __get_cpu_var(pcpu_last_used_sid) = 0; memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids))); - preempt_enable(); } static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) @@ -218,34 +230,13 @@ void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) preempt_enable(); } -void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu) -{ - struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *tlbe; - int i, tlbsel; - - printk("| %8s | %8s | %8s | %8s | %8s |\n", - "nr", "mas1", "mas2", "mas3", "mas7"); - - for (tlbsel = 0; tlbsel < 2; tlbsel++) { - printk("Guest TLB%d:\n", tlbsel); - for (i = 0; i < vcpu_e500->gtlb_size[tlbsel]; i++) { - tlbe = &vcpu_e500->gtlb_arch[tlbsel][i]; - if (tlbe->mas1 & MAS1_VALID) - printk(" G[%d][%3d] | %08X | %08X | %08X | %08X |\n", - tlbsel, i, tlbe->mas1, tlbe->mas2, - tlbe->mas3, tlbe->mas7); - } - } -} - -static inline unsigned int tlb0_get_next_victim( +static inline unsigned int gtlb0_get_next_victim( struct kvmppc_vcpu_e500 *vcpu_e500) { unsigned int victim; victim = vcpu_e500->gtlb_nv[0]++; - if (unlikely(vcpu_e500->gtlb_nv[0] >= KVM_E500_TLB0_WAY_NUM)) + if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways)) vcpu_e500->gtlb_nv[0] = 0; return victim; @@ -254,12 +245,12 @@ static inline unsigned int tlb0_get_next_victim( static inline unsigned int tlb1_max_shadow_size(void) { /* reserve one entry for magic page */ - return tlb1_entry_num - tlbcam_index - 1; + return host_tlb_params[1].entries - tlbcam_index - 1; } -static inline int tlbe_is_writable(struct tlbe *tlbe) +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) { - return tlbe->mas3 & (MAS3_SW|MAS3_UW); + return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) @@ -290,40 +281,66 @@ static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode) /* * writing shadow tlb entry to host TLB */ -static inline void __write_host_tlbe(struct tlbe *stlbe, uint32_t mas0) +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, + uint32_t mas0) { unsigned long flags; local_irq_save(flags); mtspr(SPRN_MAS0, mas0); mtspr(SPRN_MAS1, stlbe->mas1); - mtspr(SPRN_MAS2, stlbe->mas2); - mtspr(SPRN_MAS3, stlbe->mas3); - mtspr(SPRN_MAS7, stlbe->mas7); + mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); + mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); + mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); asm volatile("isync; tlbwe" : : : "memory"); local_irq_restore(flags); + + trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, + stlbe->mas2, stlbe->mas7_3); +} + +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB. Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. + */ +static u32 get_host_mas0(unsigned long eaddr) +{ + unsigned long flags; + u32 mas0; + + local_irq_save(flags); + mtspr(SPRN_MAS6, 0); + asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); + mas0 = mfspr(SPRN_MAS0); + local_irq_restore(flags); + + return mas0; } +/* sesel is for tlb1 only */ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel, struct tlbe *stlbe) + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) { + u32 mas0; + if (tlbsel == 0) { - __write_host_tlbe(stlbe, - MAS0_TLBSEL(0) | - MAS0_ESEL(esel & (KVM_E500_TLB0_WAY_NUM - 1))); + mas0 = get_host_mas0(stlbe->mas2); + __write_host_tlbe(stlbe, mas0); } else { __write_host_tlbe(stlbe, MAS0_TLBSEL(1) | - MAS0_ESEL(to_htlb1_esel(esel))); + MAS0_ESEL(to_htlb1_esel(sesel))); } - trace_kvm_stlb_write(index_of(tlbsel, esel), stlbe->mas1, stlbe->mas2, - stlbe->mas3, stlbe->mas7); } void kvmppc_map_magic(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe magic; + struct kvm_book3e_206_tlb_entry magic; ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; unsigned int stid; pfn_t pfn; @@ -337,9 +354,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | MAS1_TSIZE(BOOK3E_PAGESZ_4K); magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; - magic.mas3 = (pfn << PAGE_SHIFT) | - MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; - magic.mas7 = pfn >> (32 - PAGE_SHIFT); + magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas8 = 0; __write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index)); preempt_enable(); @@ -357,10 +374,11 @@ void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu) { } -static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel, int esel) +static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); struct vcpu_id_table *idt = vcpu_e500->idt; unsigned int pr, tid, ts, pid; u32 val, eaddr; @@ -414,25 +432,57 @@ static void kvmppc_e500_stlbe_invalidate(struct kvmppc_vcpu_e500 *vcpu_e500, preempt_enable(); } +static int tlb0_set_base(gva_t addr, int sets, int ways) +{ + int set_base; + + set_base = (addr >> PAGE_SHIFT) & (sets - 1); + set_base *= ways; + + return set_base; +} + +static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) +{ + return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets, + vcpu_e500->gtlb_params[0].ways); +} + +static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel = get_tlb_esel_bit(vcpu); + + if (tlbsel == 0) { + esel &= vcpu_e500->gtlb_params[0].ways - 1; + esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2); + } else { + esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; + } + + return esel; +} + /* Search the guest TLB for a matching entry. */ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t eaddr, int tlbsel, unsigned int pid, int as) { - int size = vcpu_e500->gtlb_size[tlbsel]; - int set_base; + int size = vcpu_e500->gtlb_params[tlbsel].entries; + unsigned int set_base, offset; int i; if (tlbsel == 0) { - int mask = size / KVM_E500_TLB0_WAY_NUM - 1; - set_base = (eaddr >> PAGE_SHIFT) & mask; - set_base *= KVM_E500_TLB0_WAY_NUM; - size = KVM_E500_TLB0_WAY_NUM; + set_base = gtlb0_set_base(vcpu_e500, eaddr); + size = vcpu_e500->gtlb_params[0].ways; } else { set_base = 0; } + offset = vcpu_e500->gtlb_offset[tlbsel]; + for (i = 0; i < size; i++) { - struct tlbe *tlbe = &vcpu_e500->gtlb_arch[tlbsel][set_base + i]; + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + set_base + i]; unsigned int tid; if (eaddr < get_tlb_eaddr(tlbe)) @@ -457,27 +507,55 @@ static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, return -1; } -static inline void kvmppc_e500_priv_setup(struct tlbe_priv *priv, - struct tlbe *gtlbe, - pfn_t pfn) +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, + struct kvm_book3e_206_tlb_entry *gtlbe, + pfn_t pfn) { - priv->pfn = pfn; - priv->flags = E500_TLB_VALID; + ref->pfn = pfn; + ref->flags = E500_TLB_VALID; if (tlbe_is_writable(gtlbe)) - priv->flags |= E500_TLB_DIRTY; + ref->flags |= E500_TLB_DIRTY; } -static inline void kvmppc_e500_priv_release(struct tlbe_priv *priv) +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) { - if (priv->flags & E500_TLB_VALID) { - if (priv->flags & E500_TLB_DIRTY) - kvm_release_pfn_dirty(priv->pfn); + if (ref->flags & E500_TLB_VALID) { + if (ref->flags & E500_TLB_DIRTY) + kvm_release_pfn_dirty(ref->pfn); else - kvm_release_pfn_clean(priv->pfn); + kvm_release_pfn_clean(ref->pfn); + + ref->flags = 0; + } +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int tlbsel = 0; + int i; + + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); + } +} + +static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int stlbsel = 1; + int i; + + kvmppc_e500_id_table_reset_all(vcpu_e500); - priv->flags = 0; + for (i = 0; i < host_tlb_params[stlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->tlb_refs[stlbsel][i]; + kvmppc_e500_ref_release(ref); } + + clear_tlb_privs(vcpu_e500); } static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, @@ -488,59 +566,54 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, int tlbsel; /* since we only have two TLBs, only lower bit is used. */ - tlbsel = (vcpu_e500->mas4 >> 28) & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; - pidsel = (vcpu_e500->mas4 >> 16) & 0xf; - tsized = (vcpu_e500->mas4 >> 7) & 0x1f; + tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; + pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf; + tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) + vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) | MAS1_TID(vcpu_e500->pid[pidsel]) | MAS1_TSIZE(tsized); - vcpu_e500->mas2 = (eaddr & MAS2_EPN) - | (vcpu_e500->mas4 & MAS2_ATTRIB_MASK); - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas6 = (vcpu_e500->mas6 & MAS6_SPID1) + vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) + | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1) | (get_cur_pid(vcpu) << 16) | (as ? MAS6_SAS : 0); - vcpu_e500->mas7 = 0; } -static inline void kvmppc_e500_setup_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, - struct tlbe *gtlbe, int tsize, - struct tlbe_priv *priv, - u64 gvaddr, struct tlbe *stlbe) +/* TID must be supplied by the caller */ +static inline void kvmppc_e500_setup_stlbe( + struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + int tsize, struct tlbe_ref *ref, u64 gvaddr, + struct kvm_book3e_206_tlb_entry *stlbe) { - pfn_t pfn = priv->pfn; - unsigned int stid; + pfn_t pfn = ref->pfn; - stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), - get_tlb_tid(gtlbe), - get_cur_pr(&vcpu_e500->vcpu), 0); + BUG_ON(!(ref->flags & E500_TLB_VALID)); /* Force TS=1 IPROT=0 for all guest mappings. */ - stlbe->mas1 = MAS1_TSIZE(tsize) - | MAS1_TID(stid) | MAS1_TS | MAS1_VALID; + stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID; stlbe->mas2 = (gvaddr & MAS2_EPN) | e500_shadow_mas2_attrib(gtlbe->mas2, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas3 = ((pfn << PAGE_SHIFT) & MAS3_RPN) - | e500_shadow_mas3_attrib(gtlbe->mas3, + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) + | e500_shadow_mas3_attrib(gtlbe->mas7_3, vcpu_e500->vcpu.arch.shared->msr & MSR_PR); - stlbe->mas7 = (pfn >> (32 - PAGE_SHIFT)) & MAS7_RPN; } - static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, int tlbsel, int esel, - struct tlbe *stlbe) + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, + struct tlbe_ref *ref) { struct kvm_memory_slot *slot; unsigned long pfn, hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; - struct tlbe_priv *priv; /* * Translate guest physical to true physical, acquiring @@ -621,12 +694,31 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pfn &= ~(tsize_pages - 1); break; } + } else if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_HUGETLB)) { + unsigned long psize = vma_kernel_pagesize(vma); + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Take the largest page size that satisfies both host + * and guest mapping + */ + tsize = min(__ilog2(psize) - 10, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); } up_read(¤t->mm->mmap_sem); } if (likely(!pfnmap)) { + unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(vcpu_e500->vcpu.kvm, slot, gfn); if (is_error_pfn(pfn)) { printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", @@ -634,45 +726,52 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvm_release_pfn_clean(pfn); return; } + + /* Align guest and physical address to page map boundaries */ + pfn &= ~(tsize_pages - 1); + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); } - /* Drop old priv and setup new one. */ - priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; - kvmppc_e500_priv_release(priv); - kvmppc_e500_priv_setup(priv, gtlbe, pfn); + /* Drop old ref and setup new one. */ + kvmppc_e500_ref_release(ref); + kvmppc_e500_ref_setup(ref, gtlbe, pfn); - kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, priv, gvaddr, stlbe); + kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe); } /* XXX only map the one-one case, for now use TLB0 */ -static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, - int esel, struct tlbe *stlbe) +static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, + int esel, + struct kvm_book3e_206_tlb_entry *stlbe) { - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; + struct tlbe_ref *ref; - gtlbe = &vcpu_e500->gtlb_arch[0][esel]; + gtlbe = get_entry(vcpu_e500, 0, esel); + ref = &vcpu_e500->gtlb_priv[0][esel].ref; kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), get_tlb_raddr(gtlbe) >> PAGE_SHIFT, - gtlbe, 0, esel, stlbe); - - return esel; + gtlbe, 0, stlbe, ref); } /* Caller must ensure that the specified guest TLB entry is safe to insert into * the shadow TLB. */ /* XXX for both one-one and one-to-many , for now use TLB1 */ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, - u64 gvaddr, gfn_t gfn, struct tlbe *gtlbe, struct tlbe *stlbe) + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe) { + struct tlbe_ref *ref; unsigned int victim; - victim = vcpu_e500->gtlb_nv[1]++; + victim = vcpu_e500->host_tlb1_nv++; - if (unlikely(vcpu_e500->gtlb_nv[1] >= tlb1_max_shadow_size())) - vcpu_e500->gtlb_nv[1] = 0; + if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) + vcpu_e500->host_tlb1_nv = 0; - kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, victim, stlbe); + ref = &vcpu_e500->tlb_refs[1][victim]; + kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref); return victim; } @@ -689,7 +788,8 @@ static inline int kvmppc_e500_gtlbe_invalidate( struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int esel) { - struct tlbe *gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); if (unlikely(get_tlb_iprot(gtlbe))) return -1; @@ -704,10 +804,10 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) int esel; if (value & MMUCSR0_TLB0FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[0]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); if (value & MMUCSR0_TLB1FI) - for (esel = 0; esel < vcpu_e500->gtlb_size[1]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); /* Invalidate all vcpu id mappings */ @@ -732,7 +832,8 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) if (ia) { /* invalidate all entries */ - for (esel = 0; esel < vcpu_e500->gtlb_size[tlbsel]; esel++) + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; + esel++) kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); } else { ea &= 0xfffff000; @@ -752,18 +853,17 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); int tlbsel, esel; - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; - vcpu_e500->mas0 &= ~MAS0_NV(~0); - vcpu_e500->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + vcpu->arch.shared->mas0 &= ~MAS0_NV(~0); + vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; return EMULATE_DONE; } @@ -771,10 +871,10 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - int as = !!get_cur_sas(vcpu_e500); - unsigned int pid = get_cur_spid(vcpu_e500); + int as = !!get_cur_sas(vcpu); + unsigned int pid = get_cur_spid(vcpu); int esel, tlbsel; - struct tlbe *gtlbe = NULL; + struct kvm_book3e_206_tlb_entry *gtlbe = NULL; gva_t ea; ea = kvmppc_get_gpr(vcpu, rb); @@ -782,70 +882,90 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) for (tlbsel = 0; tlbsel < 2; tlbsel++) { esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); if (esel >= 0) { - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); break; } } if (gtlbe) { - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) + esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = gtlbe->mas1; - vcpu_e500->mas2 = gtlbe->mas2; - vcpu_e500->mas3 = gtlbe->mas3; - vcpu_e500->mas7 = gtlbe->mas7; + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; } else { int victim; /* since we only have two TLBs, only lower bit is used. */ - tlbsel = vcpu_e500->mas4 >> 28 & 0x1; - victim = (tlbsel == 0) ? tlb0_get_next_victim(vcpu_e500) : 0; + tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; - vcpu_e500->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) + | MAS0_ESEL(victim) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); - vcpu_e500->mas1 = (vcpu_e500->mas6 & MAS6_SPID0) - | (vcpu_e500->mas6 & (MAS6_SAS ? MAS1_TS : 0)) - | (vcpu_e500->mas4 & MAS4_TSIZED(~0)); - vcpu_e500->mas2 &= MAS2_EPN; - vcpu_e500->mas2 |= vcpu_e500->mas4 & MAS2_ATTRIB_MASK; - vcpu_e500->mas3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; - vcpu_e500->mas7 = 0; + vcpu->arch.shared->mas1 = + (vcpu->arch.shared->mas6 & MAS6_SPID0) + | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0)) + | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); + vcpu->arch.shared->mas2 &= MAS2_EPN; + vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & + MAS2_ATTRIB_MASK; + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | + MAS3_U2 | MAS3_U3; } kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); return EMULATE_DONE; } +/* sesel is for tlb1 only */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, + int stlbsel, int sesel) +{ + int stid; + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), + get_cur_pr(&vcpu_e500->vcpu), 0); + + stlbe->mas1 |= MAS1_TID(stid); + write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); + preempt_enable(); +} + int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe; + struct kvm_book3e_206_tlb_entry *gtlbe; int tlbsel, esel; - tlbsel = get_tlb_tlbsel(vcpu_e500); - esel = get_tlb_esel(vcpu_e500, tlbsel); + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); if (get_tlb_v(gtlbe)) - kvmppc_e500_stlbe_invalidate(vcpu_e500, tlbsel, esel); + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); - gtlbe->mas1 = vcpu_e500->mas1; - gtlbe->mas2 = vcpu_e500->mas2; - gtlbe->mas3 = vcpu_e500->mas3; - gtlbe->mas7 = vcpu_e500->mas7; + gtlbe->mas1 = vcpu->arch.shared->mas1; + gtlbe->mas2 = vcpu->arch.shared->mas2; + gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; - trace_kvm_gtlb_write(vcpu_e500->mas0, gtlbe->mas1, gtlbe->mas2, - gtlbe->mas3, gtlbe->mas7); + trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, + gtlbe->mas2, gtlbe->mas7_3); /* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */ if (tlbe_is_host_safe(vcpu, gtlbe)) { - struct tlbe stlbe; + struct kvm_book3e_206_tlb_entry stlbe; int stlbsel, sesel; u64 eaddr; u64 raddr; - preempt_disable(); switch (tlbsel) { case 0: /* TLB0 */ @@ -853,7 +973,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); stlbsel = 0; - sesel = kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + sesel = 0; /* unused */ break; @@ -874,8 +995,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) default: BUG(); } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); + + write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); @@ -914,9 +1035,11 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, gva_t eaddr) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); - struct tlbe *gtlbe = - &vcpu_e500->gtlb_arch[tlbsel_of(index)][esel_of(index)]; - u64 pgmask = get_tlb_bytes(gtlbe) - 1; + struct kvm_book3e_206_tlb_entry *gtlbe; + u64 pgmask; + + gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index)); + pgmask = get_tlb_bytes(gtlbe) - 1; return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } @@ -930,22 +1053,21 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); struct tlbe_priv *priv; - struct tlbe *gtlbe, stlbe; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; int tlbsel = tlbsel_of(index); int esel = esel_of(index); int stlbsel, sesel; - gtlbe = &vcpu_e500->gtlb_arch[tlbsel][esel]; + gtlbe = get_entry(vcpu_e500, tlbsel, esel); - preempt_disable(); switch (tlbsel) { case 0: stlbsel = 0; - sesel = esel; - priv = &vcpu_e500->gtlb_priv[stlbsel][sesel]; + sesel = 0; /* unused */ + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K, - priv, eaddr, &stlbe); + &priv->ref, eaddr, &stlbe); break; case 1: { @@ -962,8 +1084,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, break; } - write_host_tlbe(vcpu_e500, stlbsel, sesel, &stlbe); - preempt_enable(); + write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); } int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, @@ -993,85 +1114,279 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) { - struct tlbe *tlbe; + struct kvm_book3e_206_tlb_entry *tlbe; /* Insert large initial mapping for guest. */ - tlbe = &vcpu_e500->gtlb_arch[1][0]; + tlbe = get_entry(vcpu_e500, 1, 0); tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); tlbe->mas2 = 0; - tlbe->mas3 = E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; + tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; /* 4K map for serial output. Used by kernel wrapper. */ - tlbe = &vcpu_e500->gtlb_arch[1][1]; + tlbe = get_entry(vcpu_e500, 1, 1); tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K); tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G; - tlbe->mas3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; - tlbe->mas7 = 0; + tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK; +} + +static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int i; + + clear_tlb_refs(vcpu_e500); + kfree(vcpu_e500->gtlb_priv[0]); + kfree(vcpu_e500->gtlb_priv[1]); + + if (vcpu_e500->shared_tlb_pages) { + vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch, + PAGE_SIZE))); + + for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) { + set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]); + put_page(vcpu_e500->shared_tlb_pages[i]); + } + + vcpu_e500->num_shared_tlb_pages = 0; + vcpu_e500->shared_tlb_pages = NULL; + } else { + kfree(vcpu_e500->gtlb_arch); + } + + vcpu_e500->gtlb_arch = NULL; +} + +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_params params; + char *virt; + struct page **pages; + struct tlbe_priv *privs[2] = {}; + size_t array_len; + u32 sets; + int num_pages, ret, i; + + if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV) + return -EINVAL; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)cfg->params, + sizeof(params))) + return -EFAULT; + + if (params.tlb_sizes[1] > 64) + return -EINVAL; + if (params.tlb_ways[1] != params.tlb_sizes[1]) + return -EINVAL; + if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0) + return -EINVAL; + if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0) + return -EINVAL; + + if (!is_power_of_2(params.tlb_ways[0])) + return -EINVAL; + + sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]); + if (!is_power_of_2(sets)) + return -EINVAL; + + array_len = params.tlb_sizes[0] + params.tlb_sizes[1]; + array_len *= sizeof(struct kvm_book3e_206_tlb_entry); + + if (cfg->array_len < array_len) + return -EINVAL; + + num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - + cfg->array / PAGE_SIZE; + pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (!pages) + return -ENOMEM; + + ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + if (ret < 0) + goto err_pages; + + if (ret != num_pages) { + num_pages = ret; + ret = -EFAULT; + goto err_put_page; + } + + virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); + if (!virt) + goto err_put_page; + + privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], + GFP_KERNEL); + privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], + GFP_KERNEL); + + if (!privs[0] || !privs[1]) + goto err_put_page; + + free_gtlb(vcpu_e500); + + vcpu_e500->gtlb_priv[0] = privs[0]; + vcpu_e500->gtlb_priv[1] = privs[1]; + + vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) + (virt + (cfg->array & (PAGE_SIZE - 1))); + + vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0]; + vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1]; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; + + vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + if (params.tlb_sizes[0] <= 2048) + vcpu_e500->tlb0cfg |= params.tlb_sizes[0]; + vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb1cfg |= params.tlb_sizes[1]; + vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->shared_tlb_pages = pages; + vcpu_e500->num_shared_tlb_pages = num_pages; + + vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0]; + vcpu_e500->gtlb_params[0].sets = sets; + + vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; + vcpu_e500->gtlb_params[1].sets = 1; + + return 0; + +err_put_page: + kfree(privs[0]); + kfree(privs[1]); + + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + +err_pages: + kfree(pages); + return ret; +} + +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *dirty) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + clear_tlb_refs(vcpu_e500); + return 0; } int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) { - tlb1_entry_num = mfspr(SPRN_TLB1CFG) & 0xFFF; - - vcpu_e500->gtlb_size[0] = KVM_E500_TLB0_SIZE; - vcpu_e500->gtlb_arch[0] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[0] == NULL) - goto err_out; - - vcpu_e500->gtlb_size[1] = KVM_E500_TLB1_SIZE; - vcpu_e500->gtlb_arch[1] = - kzalloc(sizeof(struct tlbe) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_arch[1] == NULL) - goto err_out_guest0; - - vcpu_e500->gtlb_priv[0] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB0_SIZE, GFP_KERNEL); - if (vcpu_e500->gtlb_priv[0] == NULL) - goto err_out_guest1; - vcpu_e500->gtlb_priv[1] = (struct tlbe_priv *) - kzalloc(sizeof(struct tlbe_priv) * KVM_E500_TLB1_SIZE, GFP_KERNEL); - - if (vcpu_e500->gtlb_priv[1] == NULL) - goto err_out_priv0; + int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); + int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; + + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; + host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* + * This should never happen on real e500 hardware, but is + * architecturally possible -- e.g. in some weird nested + * virtualization case. + */ + if (host_tlb_params[0].entries == 0 || + host_tlb_params[1].entries == 0) { + pr_err("%s: need to know host tlb size\n", __func__); + return -ENODEV; + } + + host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> + TLBnCFG_ASSOC_SHIFT; + host_tlb_params[1].ways = host_tlb_params[1].entries; + + if (!is_power_of_2(host_tlb_params[0].entries) || + !is_power_of_2(host_tlb_params[0].ways) || + host_tlb_params[0].entries < host_tlb_params[0].ways || + host_tlb_params[0].ways == 0) { + pr_err("%s: bad tlb0 host config: %u entries %u ways\n", + __func__, host_tlb_params[0].entries, + host_tlb_params[0].ways); + return -ENODEV; + } + + host_tlb_params[0].sets = + host_tlb_params[0].entries / host_tlb_params[0].ways; + host_tlb_params[1].sets = 1; + + vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; + vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; + + vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM; + vcpu_e500->gtlb_params[0].sets = + KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + + vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; + vcpu_e500->gtlb_params[1].sets = 1; + + vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + if (!vcpu_e500->gtlb_arch) + return -ENOMEM; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; + + vcpu_e500->tlb_refs[0] = + kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->tlb_refs[0]) + goto err; + + vcpu_e500->tlb_refs[1] = + kzalloc(sizeof(struct tlbe_ref) * host_tlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->tlb_refs[1]) + goto err; + + vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[0]) + goto err; + + vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[1]) + goto err; if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) - goto err_out_priv1; + goto err; /* Init TLB configuration register */ - vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & ~0xfffUL; - vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_size[0]; - vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & ~0xfffUL; - vcpu_e500->tlb1cfg |= vcpu_e500->gtlb_size[1]; + vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries; + vcpu_e500->tlb0cfg |= + vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries; + vcpu_e500->tlb0cfg |= + vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT; return 0; -err_out_priv1: - kfree(vcpu_e500->gtlb_priv[1]); -err_out_priv0: - kfree(vcpu_e500->gtlb_priv[0]); -err_out_guest1: - kfree(vcpu_e500->gtlb_arch[1]); -err_out_guest0: - kfree(vcpu_e500->gtlb_arch[0]); -err_out: +err: + free_gtlb(vcpu_e500); + kfree(vcpu_e500->tlb_refs[0]); + kfree(vcpu_e500->tlb_refs[1]); return -1; } void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) { - int stlbsel, i; - - /* release all privs */ - for (stlbsel = 0; stlbsel < 2; stlbsel++) - for (i = 0; i < vcpu_e500->gtlb_size[stlbsel]; i++) { - struct tlbe_priv *priv = - &vcpu_e500->gtlb_priv[stlbsel][i]; - kvmppc_e500_priv_release(priv); - } - + free_gtlb(vcpu_e500); kvmppc_e500_id_table_free(vcpu_e500); - kfree(vcpu_e500->gtlb_arch[1]); - kfree(vcpu_e500->gtlb_arch[0]); + + kfree(vcpu_e500->tlb_refs[0]); + kfree(vcpu_e500->tlb_refs[1]); } diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500_tlb.h index 59b88e99a23..5c6d2d7bf05 100644 --- a/arch/powerpc/kvm/e500_tlb.h +++ b/arch/powerpc/kvm/e500_tlb.h @@ -20,13 +20,9 @@ #include <asm/tlb.h> #include <asm/kvm_e500.h> -#define KVM_E500_TLB0_WAY_SIZE_BIT 7 /* Fixed */ -#define KVM_E500_TLB0_WAY_SIZE (1UL << KVM_E500_TLB0_WAY_SIZE_BIT) -#define KVM_E500_TLB0_WAY_SIZE_MASK (KVM_E500_TLB0_WAY_SIZE - 1) - -#define KVM_E500_TLB0_WAY_NUM_BIT 1 /* No greater than 7 */ -#define KVM_E500_TLB0_WAY_NUM (1UL << KVM_E500_TLB0_WAY_NUM_BIT) -#define KVM_E500_TLB0_WAY_NUM_MASK (KVM_E500_TLB0_WAY_NUM - 1) +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE 128 +#define KVM_E500_TLB0_WAY_NUM 2 #define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) #define KVM_E500_TLB1_SIZE 16 @@ -58,50 +54,54 @@ extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *); extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *); /* TLB helper functions */ -static inline unsigned int get_tlb_size(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 7) & 0x1f; } -static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe) +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) { return tlbe->mas2 & 0xfffff000; } -static inline u64 get_tlb_bytes(const struct tlbe *tlbe) +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) { unsigned int pgsize = get_tlb_size(tlbe); return 1ULL << 10 << pgsize; } -static inline gva_t get_tlb_end(const struct tlbe *tlbe) +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) { u64 bytes = get_tlb_bytes(tlbe); return get_tlb_eaddr(tlbe) + bytes - 1; } -static inline u64 get_tlb_raddr(const struct tlbe *tlbe) +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) { - u64 rpn = tlbe->mas7; - return (rpn << 32) | (tlbe->mas3 & 0xfffff000); + return tlbe->mas7_3 & ~0xfffULL; } -static inline unsigned int get_tlb_tid(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 16) & 0xff; } -static inline unsigned int get_tlb_ts(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 12) & 0x1; } -static inline unsigned int get_tlb_v(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 31) & 0x1; } -static inline unsigned int get_tlb_iprot(const struct tlbe *tlbe) +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) { return (tlbe->mas1 >> 30) & 0x1; } @@ -121,59 +121,37 @@ static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) return !!(vcpu->arch.shared->msr & MSR_PR); } -static inline unsigned int get_cur_spid( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) { - return (vcpu_e500->mas6 >> 16) & 0xff; + return (vcpu->arch.shared->mas6 >> 16) & 0xff; } -static inline unsigned int get_cur_sas( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) { - return vcpu_e500->mas6 & 0x1; + return vcpu->arch.shared->mas6 & 0x1; } -static inline unsigned int get_tlb_tlbsel( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) { /* * Manual says that tlbsel has 2 bits wide. * Since we only have two TLBs, only lower bit is used. */ - return (vcpu_e500->mas0 >> 28) & 0x1; -} - -static inline unsigned int get_tlb_nv_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) -{ - return vcpu_e500->mas0 & 0xfff; + return (vcpu->arch.shared->mas0 >> 28) & 0x1; } -static inline unsigned int get_tlb_esel_bit( - const struct kvmppc_vcpu_e500 *vcpu_e500) +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) { - return (vcpu_e500->mas0 >> 16) & 0xfff; + return vcpu->arch.shared->mas0 & 0xfff; } -static inline unsigned int get_tlb_esel( - const struct kvmppc_vcpu_e500 *vcpu_e500, - int tlbsel) +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) { - unsigned int esel = get_tlb_esel_bit(vcpu_e500); - - if (tlbsel == 0) { - esel &= KVM_E500_TLB0_WAY_NUM_MASK; - esel |= ((vcpu_e500->mas2 >> 12) & KVM_E500_TLB0_WAY_SIZE_MASK) - << KVM_E500_TLB0_WAY_NUM_BIT; - } else { - esel &= KVM_E500_TLB1_SIZE - 1; - } - - return esel; + return (vcpu->arch.shared->mas0 >> 16) & 0xfff; } static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, - const struct tlbe *tlbe) + const struct kvm_book3e_206_tlb_entry *tlbe) { gpa_t gpa; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index 141dce3c681..968f4010188 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -13,6 +13,7 @@ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. * * Authors: Hollis Blanchard <hollisb@us.ibm.com> */ @@ -69,54 +70,55 @@ #define OP_STH 44 #define OP_STHU 45 -#ifdef CONFIG_PPC_BOOK3S -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return 1; -} -#else -static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.tcr & TCR_DIE; -} -#endif - void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { unsigned long dec_nsec; + unsigned long long dec_time; pr_debug("mtDEC: %x\n", vcpu->arch.dec); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + #ifdef CONFIG_PPC_BOOK3S /* mtdec lowers the interrupt line when positive. */ kvmppc_core_dequeue_dec(vcpu); /* POWER4+ triggers a dec interrupt if the value is < 0 */ if (vcpu->arch.dec & 0x80000000) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); kvmppc_core_queue_dec(vcpu); return; } #endif - if (kvmppc_dec_enabled(vcpu)) { - /* The decrementer ticks at the same rate as the timebase, so - * that's how we convert the guest DEC value to the number of - * host ticks. */ - - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - dec_nsec = vcpu->arch.dec; - dec_nsec *= 1000; - dec_nsec /= tb_ticks_per_usec; - hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), - HRTIMER_MODE_REL); - vcpu->arch.dec_jiffies = get_tb(); - } else { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - } + +#ifdef CONFIG_BOOKE + /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ + if (vcpu->arch.dec == 0) + return; +#endif + + /* + * The decrementer ticks at the same rate as the timebase, so + * that's how we convert the guest DEC value to the number of + * host ticks. + */ + + dec_time = vcpu->arch.dec; + dec_time *= 1000; + do_div(dec_time, tb_ticks_per_usec); + dec_nsec = do_div(dec_time, NSEC_PER_SEC); + hrtimer_start(&vcpu->arch.dec_timer, + ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); + vcpu->arch.dec_jiffies = get_tb(); } u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) { u64 jd = tb - vcpu->arch.dec_jiffies; + +#ifdef CONFIG_BOOKE + if (vcpu->arch.dec < jd) + return 0; +#endif + return vcpu->arch.dec - jd; } @@ -159,7 +161,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_TRAP_64: kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); #else - kvmppc_core_queue_program(vcpu, vcpu->arch.esr | ESR_PTR); + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); #endif advance = 0; break; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 607fbdf24b8..00d7e345b3f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -39,7 +39,8 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) { return !(v->arch.shared->msr & MSR_WE) || - !!(v->arch.pending_exceptions); + !!(v->arch.pending_exceptions) || + v->requests; } int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) @@ -66,7 +67,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) vcpu->arch.magic_page_pa = param1; vcpu->arch.magic_page_ea = param2; - r2 = KVM_MAGIC_FEAT_SR; + r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; r = HC_EV_SUCCESS; break; @@ -171,8 +172,11 @@ void kvm_arch_check_processor_compat(void *rtn) *(int *)rtn = kvmppc_core_check_processor_compat(); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + return kvmppc_core_init_vm(kvm); } @@ -208,17 +212,22 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PPC_BOOKE_SREGS: #else case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: case KVM_CAP_PPC_PAPR: #endif case KVM_CAP_PPC_UNSET_IRQ: case KVM_CAP_PPC_IRQ_LEVEL: case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ONE_REG: r = 1; break; #ifndef CONFIG_KVM_BOOK3S_64_HV case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: +#ifdef CONFIG_KVM_E500 + case KVM_CAP_SW_TLB: +#endif r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -238,7 +247,26 @@ int kvm_dev_ioctl_check_extension(long ext) if (cpu_has_feature(CPU_FTR_ARCH_201)) r = 2; break; + case KVM_CAP_SYNC_MMU: + r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; + break; #endif + case KVM_CAP_NR_VCPUS: + /* + * Recommending a number of CPUs is somewhat arbitrary; we + * return the number of present CPUs for -HV (since a host + * will have secondary threads "offline"), and for other KVM + * implementations just count online CPUs. + */ +#ifdef CONFIG_KVM_BOOK3S_64_HV + r = num_present_cpus(); +#else + r = num_online_cpus(); +#endif + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; default: r = 0; break; @@ -253,6 +281,16 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + return 0; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, @@ -279,9 +317,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { struct kvm_vcpu *vcpu; vcpu = kvmppc_core_vcpu_create(kvm, id); - vcpu->arch.wqp = &vcpu->wq; - if (!IS_ERR(vcpu)) + if (!IS_ERR(vcpu)) { + vcpu->arch.wqp = &vcpu->wq; kvmppc_create_vcpu_debugfs(vcpu, id); + } return vcpu; } @@ -305,18 +344,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -static void kvmppc_decrementer_func(unsigned long data) -{ - struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; - - kvmppc_core_queue_dec(vcpu); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } -} - /* * low level hrtimer wake routine. Because this runs in hardirq context * we schedule a tasklet to do the real work. @@ -431,20 +458,20 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_REG_EXT_MASK) { - case KVM_REG_GPR: + switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { + case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); break; - case KVM_REG_FPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FPR: + vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #ifdef CONFIG_PPC_BOOK3S - case KVM_REG_QPR: - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_QPR: + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; - case KVM_REG_FQPR: - vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; - vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_REG_MASK] = gpr; + case KVM_MMIO_REG_FQPR: + vcpu->arch.fpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #endif default: @@ -553,8 +580,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->arch.hcall_needed = 0; } - kvmppc_core_deliver_interrupts(vcpu); - r = kvmppc_vcpu_run(run, vcpu); if (vcpu->sigset_active) @@ -563,6 +588,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) return r; } +void kvm_vcpu_kick(struct kvm_vcpu *vcpu) +{ + int me; + int cpu = vcpu->cpu; + + me = get_cpu(); + if (waitqueue_active(vcpu->arch.wqp)) { + wake_up_interruptible(vcpu->arch.wqp); + vcpu->stat.halt_wakeup++; + } else if (cpu != me && cpu != -1) { + smp_send_reschedule(vcpu->cpu); + } + put_cpu(); +} + int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) { if (irq->irq == KVM_INTERRUPT_UNSET) { @@ -571,13 +611,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) } kvmppc_core_queue_external(vcpu, irq); - - if (waitqueue_active(vcpu->arch.wqp)) { - wake_up_interruptible(vcpu->arch.wqp); - vcpu->stat.halt_wakeup++; - } else if (vcpu->cpu != -1) { - smp_send_reschedule(vcpu->cpu); - } + kvm_vcpu_kick(vcpu); return 0; } @@ -599,6 +633,19 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, r = 0; vcpu->arch.papr_enabled = true; break; +#ifdef CONFIG_KVM_E500 + case KVM_CAP_SW_TLB: { + struct kvm_config_tlb cfg; + void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; + + r = -EFAULT; + if (copy_from_user(&cfg, user_ptr, sizeof(cfg))) + break; + + r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); + break; + } +#endif default: r = -EINVAL; break; @@ -648,6 +695,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } + + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: + { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + goto out; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } + +#ifdef CONFIG_KVM_E500 + case KVM_DIRTY_TLB: { + struct kvm_dirty_tlb dirty; + r = -EFAULT; + if (copy_from_user(&dirty, argp, sizeof(dirty))) + goto out; + r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); + break; + } +#endif + default: r = -EINVAL; } @@ -656,6 +729,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) { u32 inst_lis = 0x3c000000; diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index b135d3d397d..877186b7b1c 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -118,11 +118,14 @@ TRACE_EVENT(kvm_book3s_exit, ), TP_fast_assign( + struct kvmppc_book3s_shadow_vcpu *svcpu; __entry->exit_nr = exit_nr; __entry->pc = kvmppc_get_pc(vcpu); __entry->dar = kvmppc_get_fault_dar(vcpu); __entry->msr = vcpu->arch.shared->msr; - __entry->srr1 = to_svcpu(vcpu)->shadow_srr1; + svcpu = svcpu_get(vcpu); + __entry->srr1 = svcpu->shadow_srr1; + svcpu_put(svcpu); ), TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", @@ -337,6 +340,63 @@ TRACE_EVENT(kvm_book3s_slbmte, #endif /* CONFIG_PPC_BOOK3S */ + +/************************************************************************* + * Book3E trace points * + *************************************************************************/ + +#ifdef CONFIG_BOOKE + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +#endif + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 57c7465e656..fb05b123218 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -12,6 +12,7 @@ #include <linux/io.h> #include <linux/slab.h> #include <linux/hugetlb.h> +#include <linux/export.h> #include <linux/of_fdt.h> #include <linux/memblock.h> #include <linux/bootmem.h> @@ -103,6 +104,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift *shift = hugepd_shift(*hpdp); return hugepte_offset(hpdp, ea, pdshift); } +EXPORT_SYMBOL_GPL(find_linux_pte_or_hugepte); pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) { @@ -310,7 +312,8 @@ void __init reserve_hugetlb_gpages(void) int i; strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup); + parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, + &do_gpage_early_setup); /* * Walk gpage list in reverse, allocating larger page sizes first. diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index c2e27ede07e..02aee03e713 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -116,14 +116,45 @@ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) *addrp = mfspr(SPRN_SDAR); } +static inline u32 perf_flags_from_msr(struct pt_regs *regs) +{ + if (regs->msr & MSR_PR) + return PERF_RECORD_MISC_USER; + if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV) + return PERF_RECORD_MISC_HYPERVISOR; + return PERF_RECORD_MISC_KERNEL; +} + static inline u32 perf_get_misc_flags(struct pt_regs *regs) { unsigned long mmcra = regs->dsisr; unsigned long sihv = MMCRA_SIHV; unsigned long sipr = MMCRA_SIPR; + /* Not a PMU interrupt: Make up flags from regs->msr */ if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ + return perf_flags_from_msr(regs); + + /* + * If we don't support continuous sampling and this + * is not a marked event, same deal + */ + if ((ppmu->flags & PPMU_NO_CONT_SAMPLING) && + !(mmcra & MMCRA_SAMPLE_ENABLE)) + return perf_flags_from_msr(regs); + + /* + * If we don't have flags in MMCRA, rather than using + * the MSR, we intuit the flags from the address in + * SIAR which should give slightly more reliable + * results + */ + if (ppmu->flags & PPMU_NO_SIPR) { + unsigned long siar = mfspr(SPRN_SIAR); + if (siar >= PAGE_OFFSET) + return PERF_RECORD_MISC_KERNEL; + return PERF_RECORD_MISC_USER; + } if (ppmu->flags & PPMU_ALT_SIPR) { sihv = POWER6_MMCRA_SIHV; @@ -1299,13 +1330,18 @@ unsigned long perf_misc_flags(struct pt_regs *regs) */ unsigned long perf_instruction_pointer(struct pt_regs *regs) { - unsigned long ip; + unsigned long mmcra = regs->dsisr; + /* Not a PMU interrupt */ if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ + return regs->nip; + + /* Processor doesn't support sampling non marked events */ + if ((ppmu->flags & PPMU_NO_CONT_SAMPLING) && + !(mmcra & MMCRA_SAMPLE_ENABLE)) + return regs->nip; - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; + return mfspr(SPRN_SIAR) + perf_ip_adjust(regs); } static bool pmc_overflow(unsigned long val) diff --git a/arch/powerpc/perf/power4-pmu.c b/arch/powerpc/perf/power4-pmu.c index b4f1dda4d08..9103a1de864 100644 --- a/arch/powerpc/perf/power4-pmu.c +++ b/arch/powerpc/perf/power4-pmu.c @@ -607,6 +607,7 @@ static struct power_pmu power4_pmu = { .n_generic = ARRAY_SIZE(p4_generic_events), .generic_events = p4_generic_events, .cache_events = &power4_cache_events, + .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; static int __init init_power4_pmu(void) diff --git a/arch/powerpc/perf/ppc970-pmu.c b/arch/powerpc/perf/ppc970-pmu.c index 111eb25bb0b..20139ceeacf 100644 --- a/arch/powerpc/perf/ppc970-pmu.c +++ b/arch/powerpc/perf/ppc970-pmu.c @@ -487,6 +487,7 @@ static struct power_pmu ppc970_pmu = { .n_generic = ARRAY_SIZE(ppc970_generic_events), .generic_events = ppc970_generic_events, .cache_events = &ppc970_cache_events, + .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; static int __init init_ppc970_pmu(void) diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index 2516c1cf846..943c9d39aa1 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -95,7 +95,6 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group, unsigned long lpar_rc; u64 hpte_v, hpte_r, slot; - /* same as iseries */ if (vflags & HPTE_V_SECONDARY) return -1; @@ -319,7 +318,6 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, unsigned long lpar_rc; u64 hpte_v, hpte_r, slot; - /* same as iseries */ if (vflags & HPTE_V_SECONDARY) return -1; diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index 401e3f3f74c..465ee8f5c08 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -620,7 +620,7 @@ void __init maple_pci_init(void) } /* Tell pci.c to not change any resource allocations. */ - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); } int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index b6a0ec45c69..aa862713258 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -229,9 +229,6 @@ void __init pas_pci_init(void) /* Setup the linkage between OF nodes and PHBs */ pci_devs_phb_init(); - - /* Use the common resource allocation mechanism */ - pci_probe_only = 1; } void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset) diff --git a/arch/powerpc/platforms/powermac/pci.c b/arch/powerpc/platforms/powermac/pci.c index 31a7d3a7ce2..43bbe1bda93 100644 --- a/arch/powerpc/platforms/powermac/pci.c +++ b/arch/powerpc/platforms/powermac/pci.c @@ -1059,9 +1059,6 @@ void __init pmac_pci_init(void) } /* pmac_check_ht_link(); */ - /* We can allocate missing resources if any */ - pci_probe_only = 0; - #else /* CONFIG_PPC64 */ init_p2pbridge(); init_second_ohare(); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 5e155dfc432..fbdd74dac3a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1299,15 +1299,14 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np) /* Setup MSI support */ pnv_pci_init_ioda_msis(phb); - /* We set both probe_only and PCI_REASSIGN_ALL_RSRC. This is an + /* We set both PCI_PROBE_ONLY and PCI_REASSIGN_ALL_RSRC. This is an * odd combination which essentially means that we skip all resource * fixups and assignments in the generic code, and do it all * ourselves here */ - pci_probe_only = 1; ppc_md.pcibios_fixup_phb = pnv_pci_ioda_fixup_phb; ppc_md.pcibios_enable_device_hook = pnv_pci_enable_device_hook; - pci_add_flags(PCI_REASSIGN_ALL_RSRC); + pci_add_flags(PCI_PROBE_ONLY | PCI_REASSIGN_ALL_RSRC); /* Reset IODA tables to a clean state */ rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET); diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 214478d781a..be3cfc5ceab 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -562,10 +562,7 @@ void __init pnv_pci_init(void) { struct device_node *np; - pci_set_flags(PCI_CAN_SKIP_ISA_ALIGN); - - /* We do not want to just probe */ - pci_probe_only = 0; + pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN); /* OPAL absent, try POPAL first then RTAS detection of PHBs */ if (!firmware_has_feature(FW_FEATURE_OPAL)) { diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c index 8011088392d..309d38ef732 100644 --- a/arch/powerpc/platforms/pseries/eeh.c +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -984,7 +984,8 @@ int __exit eeh_ops_unregister(const char *name) */ void __init eeh_init(void) { - struct device_node *phb, *np; + struct pci_controller *hose, *tmp; + struct device_node *phb; int ret; /* call platform initialization function */ @@ -1000,19 +1001,9 @@ void __init eeh_init(void) raw_spin_lock_init(&confirm_error_lock); - np = of_find_node_by_path("/rtas"); - if (np == NULL) - return; - - /* Enable EEH for all adapters. Note that eeh requires buid's */ - for (phb = of_find_node_by_name(NULL, "pci"); phb; - phb = of_find_node_by_name(phb, "pci")) { - unsigned long buid; - - buid = get_phb_buid(phb); - if (buid == 0 || !of_node_to_eeh_dev(phb)) - continue; - + /* Enable EEH for all adapters */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { + phb = hose->dn; traverse_pci_devices(phb, eeh_early_enable, NULL); } diff --git a/arch/powerpc/platforms/pseries/eeh_dev.c b/arch/powerpc/platforms/pseries/eeh_dev.c index f3aed7dcae9..c4507d09590 100644 --- a/arch/powerpc/platforms/pseries/eeh_dev.c +++ b/arch/powerpc/platforms/pseries/eeh_dev.c @@ -62,7 +62,7 @@ void * __devinit eeh_dev_init(struct device_node *dn, void *data) } /* Associate EEH device with OF node */ - dn->edev = edev; + PCI_DN(dn)->edev = edev; edev->dn = dn; edev->phb = phb; diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c index 1a709bc48ce..ef9d9d84c7d 100644 --- a/arch/powerpc/platforms/pseries/io_event_irq.c +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -63,73 +63,9 @@ EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list); static int ioei_check_exception_token; -/* pSeries event log format */ - -/* Two bytes ASCII section IDs */ -#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S') -#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T') -#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S') -#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W') -#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P') -#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R') -#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') -#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') -#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') -#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') -#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') -#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') - -/* Vendor specific Platform Event Log Format, Version 6, section header */ -struct pseries_elog_section { - uint16_t id; /* 0x00 2-byte ASCII section ID */ - uint16_t length; /* 0x02 Section length in bytes */ - uint8_t version; /* 0x04 Section version */ - uint8_t subtype; /* 0x05 Section subtype */ - uint16_t creator_component; /* 0x06 Creator component ID */ - uint8_t data[]; /* 0x08 Start of section data */ -}; - static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; /** - * Find data portion of a specific section in RTAS extended event log. - * @elog: RTAS error/event log. - * @sect_id: secsion ID. - * - * Return: - * pointer to the section data of the specified section - * NULL if not found - */ -static struct pseries_elog_section *find_xelog_section(struct rtas_error_log *elog, - uint16_t sect_id) -{ - struct rtas_ext_event_log_v6 *xelog = - (struct rtas_ext_event_log_v6 *) elog->buffer; - struct pseries_elog_section *sect; - unsigned char *p, *log_end; - - /* Check that we understand the format */ - if (elog->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || - xelog->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || - xelog->company_id != RTAS_V6EXT_COMPANY_ID_IBM) - return NULL; - - log_end = elog->buffer + elog->extended_log_length; - p = xelog->vendor_log; - while (p < log_end) { - sect = (struct pseries_elog_section *)p; - if (sect->id == sect_id) - return sect; - p += sect->length; - } - return NULL; -} - -/** * Find the data portion of an IO Event section from event log. * @elog: RTAS error/event log. * @@ -138,7 +74,7 @@ static struct pseries_elog_section *find_xelog_section(struct rtas_error_log *el */ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) { - struct pseries_elog_section *sect; + struct pseries_errorlog *sect; /* We should only ever get called for io-event interrupts, but if * we do get called for another type then something went wrong so @@ -152,7 +88,7 @@ static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) return NULL; } - sect = find_xelog_section(elog, PSERIES_ELOG_SECT_ID_IO_EVENT); + sect = get_pseries_errorlog(elog, PSERIES_ELOG_SECT_ID_IO_EVENT); if (unlikely(!sect)) { printk_once(KERN_WARNING "io_event_irq: RTAS extended event " "log does not contain an IO Event section. " diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index c442f2b1980..0915b1ad66c 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -809,8 +809,7 @@ machine_arch_initcall(pseries, find_existing_ddw_windows); static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_query_response *query) { - struct device_node *dn; - struct pci_dn *pcidn; + struct eeh_dev *edev; u32 cfg_addr; u64 buid; int ret; @@ -821,12 +820,12 @@ static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - dn = pci_device_to_OF_node(dev); - pcidn = PCI_DN(dn); - cfg_addr = pcidn->eeh_config_addr; - if (pcidn->eeh_pe_config_addr) - cfg_addr = pcidn->eeh_pe_config_addr; - buid = pcidn->phb->buid; + edev = pci_dev_to_eeh_dev(dev); + cfg_addr = edev->config_addr; + if (edev->pe_config_addr) + cfg_addr = edev->pe_config_addr; + buid = edev->phb->buid; + ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, cfg_addr, BUID_HI(buid), BUID_LO(buid)); dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" @@ -839,8 +838,7 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, struct ddw_create_response *create, int page_shift, int window_shift) { - struct device_node *dn; - struct pci_dn *pcidn; + struct eeh_dev *edev; u32 cfg_addr; u64 buid; int ret; @@ -851,12 +849,11 @@ static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, * Retrieve them from the pci device, not the node with the * dma-window property */ - dn = pci_device_to_OF_node(dev); - pcidn = PCI_DN(dn); - cfg_addr = pcidn->eeh_config_addr; - if (pcidn->eeh_pe_config_addr) - cfg_addr = pcidn->eeh_pe_config_addr; - buid = pcidn->phb->buid; + edev = pci_dev_to_eeh_dev(dev); + cfg_addr = edev->config_addr; + if (edev->pe_config_addr) + cfg_addr = edev->pe_config_addr; + buid = edev->phb->buid; do { /* extra outputs are LIOBN and dma-addr (hi, lo) */ diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index fbb21fc3080..8b7bafa489c 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -84,7 +84,7 @@ void pcibios_remove_pci_devices(struct pci_bus *bus) list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { pr_debug(" * Removing %s...\n", pci_name(dev)); eeh_remove_bus_device(dev); - pci_remove_bus_device(dev); + pci_stop_and_remove_bus_device(dev); } } EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 9e248ef6cc6..c4dfccd3a3d 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -16,36 +16,15 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -/* Change Activity: - * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support. - * End Change Activity - */ - -#include <linux/errno.h> -#include <linux/threads.h> -#include <linux/kernel_stat.h> -#include <linux/signal.h> #include <linux/sched.h> -#include <linux/ioport.h> #include <linux/interrupt.h> -#include <linux/timex.h> -#include <linux/init.h> -#include <linux/delay.h> #include <linux/irq.h> -#include <linux/random.h> -#include <linux/sysrq.h> -#include <linux/bitops.h> - -#include <asm/uaccess.h> -#include <asm/io.h> -#include <asm/pgtable.h> -#include <asm/irq.h> -#include <asm/cache.h> -#include <asm/prom.h> -#include <asm/ptrace.h> +#include <linux/of.h> +#include <linux/fs.h> +#include <linux/reboot.h> + #include <asm/machdep.h> #include <asm/rtas.h> -#include <asm/udbg.h> #include <asm/firmware.h> #include "pseries.h" @@ -56,7 +35,6 @@ static DEFINE_SPINLOCK(ras_log_buf_lock); static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; static DEFINE_PER_CPU(__u64, mce_data_buf); -static int ras_get_sensor_state_token; static int ras_check_exception_token; #define EPOW_SENSOR_TOKEN 9 @@ -74,7 +52,6 @@ static int __init init_ras_IRQ(void) { struct device_node *np; - ras_get_sensor_state_token = rtas_token("get-sensor-state"); ras_check_exception_token = rtas_token("check-exception"); /* Internal Errors */ @@ -94,26 +71,126 @@ static int __init init_ras_IRQ(void) return 0; } -__initcall(init_ras_IRQ); +subsys_initcall(init_ras_IRQ); -/* - * Handle power subsystem events (EPOW). - * - * Presently we just log the event has occurred. This should be fixed - * to examine the type of power failure and take appropriate action where - * the time horizon permits something useful to be done. - */ +#define EPOW_SHUTDOWN_NORMAL 1 +#define EPOW_SHUTDOWN_ON_UPS 2 +#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 +#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 + +static void handle_system_shutdown(char event_modifier) +{ + switch (event_modifier) { + case EPOW_SHUTDOWN_NORMAL: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(1); + break; + + case EPOW_SHUTDOWN_ON_UPS: + pr_emerg("Loss of power reported by firmware, system is " + "running on UPS/battery"); + break; + + case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: + pr_emerg("Loss of system critical functions reported by " + "firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(1); + break; + + case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: + pr_emerg("Ambient temperature too high reported by firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(1); + break; + + default: + pr_err("Unknown power/cooling shutdown event (modifier %d)", + event_modifier); + } +} + +struct epow_errorlog { + unsigned char sensor_value; + unsigned char event_modifier; + unsigned char extended_modifier; + unsigned char reserved; + unsigned char platform_reason; +}; + +#define EPOW_RESET 0 +#define EPOW_WARN_COOLING 1 +#define EPOW_WARN_POWER 2 +#define EPOW_SYSTEM_SHUTDOWN 3 +#define EPOW_SYSTEM_HALT 4 +#define EPOW_MAIN_ENCLOSURE 5 +#define EPOW_POWER_OFF 7 + +void rtas_parse_epow_errlog(struct rtas_error_log *log) +{ + struct pseries_errorlog *pseries_log; + struct epow_errorlog *epow_log; + char action_code; + char modifier; + + pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); + if (pseries_log == NULL) + return; + + epow_log = (struct epow_errorlog *)pseries_log->data; + action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ + modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ + + switch (action_code) { + case EPOW_RESET: + pr_err("Non critical power or cooling issue cleared"); + break; + + case EPOW_WARN_COOLING: + pr_err("Non critical cooling issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_WARN_POWER: + pr_err("Non critical power issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_SYSTEM_SHUTDOWN: + handle_system_shutdown(epow_log->event_modifier); + break; + + case EPOW_SYSTEM_HALT: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(1); + break; + + case EPOW_MAIN_ENCLOSURE: + case EPOW_POWER_OFF: + pr_emerg("Critical power/cooling issue reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); + break; + + default: + pr_err("Unknown power/cooling event (action code %d)", + action_code); + } +} + +/* Handle environmental and power warning (EPOW) interrupts. */ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) { - int status = 0xdeadbeef; - int state = 0; + int status; + int state; int critical; - status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, - EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); + status = rtas_get_sensor(EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX, &state); if (state > 3) - critical = 1; /* Time Critical */ + critical = 1; /* Time Critical */ else critical = 0; @@ -122,18 +199,14 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), - RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS, + RTAS_EPOW_WARNING, critical, __pa(&ras_log_buf), rtas_get_error_log_max()); - udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - - /* format and print the extended information */ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); + spin_unlock(&ras_log_buf_lock); return IRQ_HANDLED; } @@ -149,7 +222,7 @@ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) static irqreturn_t ras_error_interrupt(int irq, void *dev_id) { struct rtas_error_log *rtas_elog; - int status = 0xdeadbeef; + int status; int fatal; spin_lock(&ras_log_buf_lock); @@ -157,7 +230,7 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) status = rtas_call(ras_check_exception_token, 6, 1, NULL, RTAS_VECTOR_EXTERNAL_INTERRUPT, virq_to_hw(irq), - RTAS_INTERNAL_ERROR, 1 /*Time Critical */, + RTAS_INTERNAL_ERROR, 1 /* Time Critical */, __pa(&ras_log_buf), rtas_get_error_log_max()); @@ -172,24 +245,13 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); if (fatal) { - udbg_printf("Fatal HW Error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - -#ifndef DEBUG_RTAS_POWER_OFF - /* Don't actually power off when debugging so we can test - * without actually failing while injecting errors. - * Error data will not be logged to syslog. - */ - ppc_md.power_off(); -#endif + pr_emerg("Fatal hardware error reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); } else { - udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); - printk(KERN_WARNING - "Warning: Recoverable hardware error <0x%lx 0x%x>\n", - *((unsigned long *)&ras_log_buf), status); + pr_err("Recoverable hardware error reported by firmware"); } spin_unlock(&ras_log_buf_lock); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 8f137af616a..51ecac920dd 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -383,6 +383,9 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); + /* By default, only probe PCI (can be overriden by rtas_pci) */ + pci_add_flags(PCI_PROBE_ONLY); + /* Find and initialize PCI host bridges */ init_pci_config_tokens(); eeh_pseries_init(); diff --git a/arch/powerpc/platforms/wsp/wsp_pci.c b/arch/powerpc/platforms/wsp/wsp_pci.c index 386879412e9..1526551f9fe 100644 --- a/arch/powerpc/platforms/wsp/wsp_pci.c +++ b/arch/powerpc/platforms/wsp/wsp_pci.c @@ -683,7 +683,6 @@ static int __init wsp_setup_one_phb(struct device_node *np) /* XXX Force re-assigning of everything for now */ pci_add_flags(PCI_REASSIGN_ALL_BUS | PCI_REASSIGN_ALL_RSRC | PCI_ENABLE_PROC_DOMAINS); - pci_probe_only = 0; /* Calculate how the TCE space is divided */ phb->dma32_base = 0; diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c index af3780e52e7..6845e91ba04 100644 --- a/arch/powerpc/xmon/ppc-opc.c +++ b/arch/powerpc/xmon/ppc-opc.c @@ -22,6 +22,7 @@ #include <linux/stddef.h> #include <linux/kernel.h> +#include <linux/bug.h> #include "nonstdio.h" #include "ppc.h" diff --git a/arch/powerpc/xmon/spu-opc.c b/arch/powerpc/xmon/spu-opc.c index 530df3d6d7b..7d37597c4bc 100644 --- a/arch/powerpc/xmon/spu-opc.c +++ b/arch/powerpc/xmon/spu-opc.c @@ -19,6 +19,7 @@ 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ #include <linux/kernel.h> +#include <linux/bug.h> #include "spu.h" /* This file holds the Spu opcode table */ |