Diffstat (limited to 'arch')
96 files changed, 4792 insertions, 1115 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2ba14e77296..61abde11a45 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -56,6 +56,16 @@ config IRQ_PER_CPU
 	bool
 	default y
 
+config NR_IRQS
+	int "Number of virtual interrupt numbers"
+	range 32 512
+	default "512"
+	help
+	  This defines the number of virtual interrupt numbers the kernel
+	  can manage. Virtual interrupt numbers are what you see in
+	  /proc/interrupts. If you configure your system to have too few,
+	  drivers will fail to load or worse - handle with care.
+
 config STACKTRACE_SUPPORT
 	bool
 	default y
@@ -199,19 +209,8 @@ config DEFAULT_UIMAGE
 
 config REDBOOT
 	bool
 
-config HIBERNATE_32
-	bool
-	depends on (PPC_PMAC && !SMP) || BROKEN
-	default y
-
-config HIBERNATE_64
-	bool
-	depends on BROKEN || (PPC_PMAC64 && EXPERIMENTAL)
-	default y
-
 config ARCH_HIBERNATION_POSSIBLE
 	bool
-	depends on (PPC64 && HIBERNATE_64) || (PPC32 && HIBERNATE_32)
 	default y
 
 config ARCH_SUSPEND_POSSIBLE
@@ -378,6 +377,19 @@ config IRQ_ALL_CPUS
 	  CPU. Generally saying Y is safe, although some problems have been
 	  reported with SMP Power Macintoshes with this option enabled.
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	default y
+	help
+	  This enables support for sparse irqs. This is useful for distro
+	  kernels that want to define a high CONFIG_NR_CPUS value but still
+	  want to have low kernel memory footprint on smaller machines.
+
+	  ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+	  out the irq_desc[] array in a more NUMA-friendly way. )
+
+	  If you don't know what to do here, say Y.
+
 config NUMA
 	bool "NUMA support"
 	depends on PPC64
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index a98653b2623..57c40007199 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -147,6 +147,7 @@
 	.globl label##_pSeries;				\
 label##_pSeries:					\
 	HMT_MEDIUM;					\
+	DO_KVM	n;					\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */	\
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common)
 
@@ -170,6 +171,7 @@ label##_pSeries:					\
 	.globl label##_pSeries;				\
 label##_pSeries:					\
 	HMT_MEDIUM;					\
+	DO_KVM	n;					\
 	mtspr	SPRN_SPRG_SCRATCH0,r13;	/* save r13 */	\
 	mfspr	r13,SPRN_SPRG_PACA;	/* get paca address into r13 */	\
 	std	r9,PACA_EXGEN+EX_R9(r13);	/* save r9, r10 */	\
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index b1dafb6a974..5856a66ab40 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -3,6 +3,10 @@
 
 #include <asm/page.h>
 
+pte_t *huge_pte_offset_and_shift(struct mm_struct *mm,
+				 unsigned long addr, unsigned *shift);
+
+void flush_dcache_icache_hugepage(struct page *page);
 
 int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
 			   unsigned long len);
@@ -11,12 +15,6 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 			    unsigned long end, unsigned long floor,
 			    unsigned long ceiling);
 
-void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
-		     pte_t *ptep, pte_t pte);
-
-pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
-			      pte_t *ptep);
-
 /*
  * The version of vma_mmu_pagesize() in arch/powerpc/mm/hugetlbpage.c needs
  * to override the version in mm/hugetlb.c
@@ -42,9 +40,26 @@ static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm)
 {
 }
 
+
+static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+				   pte_t *ptep, pte_t pte)
+{
+	set_pte_at(mm, addr, ptep, pte);
+}
+
+static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
+					    unsigned long addr, pte_t *ptep)
+{
+	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
+	return __pte(old);
+}
+
 static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
 					 unsigned long addr, pte_t *ptep)
 {
+	pte_t pte;
+	pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
+	flush_tlb_page(vma, addr);
 }
 
 static inline int huge_pte_none(pte_t pte)
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index abbc2aaaced..9f4c9d4f580 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -64,11 +64,6 @@ extern void iseries_handle_interrupts(void);
 		get_paca()->hard_enabled = 0;	\
 	} while(0)
 
-static inline int irqs_disabled_flags(unsigned long flags)
-{
-	return flags == 0;
-}
-
 #else
 
 #if defined(CONFIG_BOOKE)
diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h
index bbcd1aaf3df..c85a32f1a17 100644
--- a/arch/powerpc/include/asm/irq.h
+++ b/arch/powerpc/include/asm/irq.h
@@ -17,8 +17,6 @@
 
 #include <asm/atomic.h>
 
-#define get_irq_desc(irq) (&irq_desc[(irq)])
-
 /* Define a way to iterate across irqs. */
 #define for_each_irq(i) \
 	for ((i) = 0; (i) < NR_IRQS; ++(i))
@@ -34,12 +32,15 @@ extern atomic_t ppc_n_lost_interrupts;
  */
 #define NO_IRQ_IGNORE		((unsigned int)-1)
 
-/* Total number of virq in the platform (make it a CONFIG_* option ? */
-#define NR_IRQS		512
+/* Total number of virq in the platform */
+#define NR_IRQS		CONFIG_NR_IRQS
 
 /* Number of irqs reserved for the legacy controller */
 #define NUM_ISA_INTERRUPTS	16
 
+/* Same thing, used by the generic IRQ code */
+#define NR_IRQS_LEGACY	NUM_ISA_INTERRUPTS
+
 /* This type is the placeholder for a hardware interrupt number. It has to
  * be big enough to enclose whatever representation is used by a given
  * platform.
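The SPARSE_IRQ option introduced above changes how interrupt descriptors are stored: instead of a static irq_desc[NR_IRQS] array, descriptors are allocated on demand, so irq_to_desc() can legitimately return NULL for an unused number. That is why the irq.c hunks further down replace get_irq_desc()/irq_desc[i] with irq_to_desc() plus a NULL check. A minimal sketch of the resulting pattern, assuming the 2.6.32-era generic IRQ API (count_active_irqs is a hypothetical helper, not part of this patch):

/* Illustrative sketch only, not part of the patch. */
static void count_active_irqs(void)
{
	struct irq_desc *desc;
	unsigned int irq, active = 0;

	for_each_irq(irq) {
		desc = irq_to_desc(irq);	/* may be NULL with CONFIG_SPARSE_IRQ */
		if (!desc || !desc->action)
			continue;
		active++;
	}
	printk(KERN_DEBUG "%u irqs have handlers attached\n", active);
}

The same NULL-tolerant lookup appears in the fixup_irqs() and virq_debug_show() changes later in this patch.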
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h index bb2de6aa5ce..c9ca97f43bc 100644 --- a/arch/powerpc/include/asm/kvm.h +++ b/arch/powerpc/include/asm/kvm.h @@ -46,6 +46,8 @@ struct kvm_regs { }; struct kvm_sregs { + __u32 pvr; + char pad[1020]; }; struct kvm_fpu { diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 56bfae59837..19ddb352fd0 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -49,6 +49,45 @@ #define BOOKE_INTERRUPT_SPE_FP_ROUND 34 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35 +/* book3s */ + +#define BOOK3S_INTERRUPT_SYSTEM_RESET 0x100 +#define BOOK3S_INTERRUPT_MACHINE_CHECK 0x200 +#define BOOK3S_INTERRUPT_DATA_STORAGE 0x300 +#define BOOK3S_INTERRUPT_DATA_SEGMENT 0x380 +#define BOOK3S_INTERRUPT_INST_STORAGE 0x400 +#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480 +#define BOOK3S_INTERRUPT_EXTERNAL 0x500 +#define BOOK3S_INTERRUPT_ALIGNMENT 0x600 +#define BOOK3S_INTERRUPT_PROGRAM 0x700 +#define BOOK3S_INTERRUPT_FP_UNAVAIL 0x800 +#define BOOK3S_INTERRUPT_DECREMENTER 0x900 +#define BOOK3S_INTERRUPT_SYSCALL 0xc00 +#define BOOK3S_INTERRUPT_TRACE 0xd00 +#define BOOK3S_INTERRUPT_PERFMON 0xf00 +#define BOOK3S_INTERRUPT_ALTIVEC 0xf20 +#define BOOK3S_INTERRUPT_VSX 0xf40 + +#define BOOK3S_IRQPRIO_SYSTEM_RESET 0 +#define BOOK3S_IRQPRIO_DATA_SEGMENT 1 +#define BOOK3S_IRQPRIO_INST_SEGMENT 2 +#define BOOK3S_IRQPRIO_DATA_STORAGE 3 +#define BOOK3S_IRQPRIO_INST_STORAGE 4 +#define BOOK3S_IRQPRIO_ALIGNMENT 5 +#define BOOK3S_IRQPRIO_PROGRAM 6 +#define BOOK3S_IRQPRIO_FP_UNAVAIL 7 +#define BOOK3S_IRQPRIO_ALTIVEC 8 +#define BOOK3S_IRQPRIO_VSX 9 +#define BOOK3S_IRQPRIO_SYSCALL 10 +#define BOOK3S_IRQPRIO_MACHINE_CHECK 11 +#define BOOK3S_IRQPRIO_DEBUG 12 +#define BOOK3S_IRQPRIO_EXTERNAL 13 +#define BOOK3S_IRQPRIO_DECREMENTER 14 +#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15 +#define BOOK3S_IRQPRIO_MAX 16 + +#define BOOK3S_HFLAG_DCBZ32 0x1 + #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h new file mode 100644 index 00000000000..c6011336371 --- /dev/null +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -0,0 +1,136 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#ifndef __ASM_KVM_BOOK3S_H__ +#define __ASM_KVM_BOOK3S_H__ + +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <asm/kvm_ppc.h> + +struct kvmppc_slb { + u64 esid; + u64 vsid; + u64 orige; + u64 origv; + bool valid; + bool Ks; + bool Kp; + bool nx; + bool large; + bool class; +}; + +struct kvmppc_sr { + u32 raw; + u32 vsid; + bool Ks; + bool Kp; + bool nx; +}; + +struct kvmppc_bat { + u32 bepi; + u32 bepi_mask; + bool vs; + bool vp; + u32 brpn; + u8 wimg; + u8 pp; +}; + +struct kvmppc_sid_map { + u64 guest_vsid; + u64 guest_esid; + u64 host_vsid; + bool valid; +}; + +#define SID_MAP_BITS 9 +#define SID_MAP_NUM (1 << SID_MAP_BITS) +#define SID_MAP_MASK (SID_MAP_NUM - 1) + +struct kvmppc_vcpu_book3s { + struct kvm_vcpu vcpu; + struct kvmppc_sid_map sid_map[SID_MAP_NUM]; + struct kvmppc_slb slb[64]; + struct { + u64 esid; + u64 vsid; + } slb_shadow[64]; + u8 slb_shadow_max; + struct kvmppc_sr sr[16]; + struct kvmppc_bat ibat[8]; + struct kvmppc_bat dbat[8]; + u64 hid[6]; + int slb_nr; + u64 sdr1; + u64 dsisr; + u64 hior; + u64 msr_mask; + u64 vsid_first; + u64 vsid_next; + u64 vsid_max; + int context_id; +}; + +#define CONTEXT_HOST 0 +#define CONTEXT_GUEST 1 +#define CONTEXT_GUEST_END 2 + +#define VSID_REAL 0xfffffffffff00000 +#define VSID_REAL_DR 0xffffffffffe00000 +#define VSID_REAL_IR 0xffffffffffd00000 +#define VSID_BAT 0xffffffffffc00000 +#define VSID_PR 0x8000000000000000 + +extern void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 ea, u64 ea_mask); +extern void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 vp, u64 vp_mask); +extern void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end); +extern void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 new_msr); +extern void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu); +extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu); +extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); +extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); +extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); +extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data); +extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, bool data); +extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr); +extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); + +extern u32 kvmppc_trampoline_lowmem; +extern u32 kvmppc_trampoline_enter; + +static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct kvmppc_vcpu_book3s, vcpu); +} + +static inline ulong dsisr(void) +{ + ulong r; + asm ( "mfdsisr %0 " : "=r" (r) ); + return r; +} + +extern void kvm_return_point(void); + +#define INS_DCBZ 0x7c0007ec + +#endif /* __ASM_KVM_BOOK3S_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_64_asm.h b/arch/powerpc/include/asm/kvm_book3s_64_asm.h new file mode 100644 index 00000000000..2e06ee8184e --- /dev/null +++ b/arch/powerpc/include/asm/kvm_book3s_64_asm.h @@ -0,0 +1,58 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#ifndef __ASM_KVM_BOOK3S_ASM_H__ +#define __ASM_KVM_BOOK3S_ASM_H__ + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + +#include <asm/kvm_asm.h> + +.macro DO_KVM intno + .if (\intno == BOOK3S_INTERRUPT_SYSTEM_RESET) || \ + (\intno == BOOK3S_INTERRUPT_MACHINE_CHECK) || \ + (\intno == BOOK3S_INTERRUPT_DATA_STORAGE) || \ + (\intno == BOOK3S_INTERRUPT_INST_STORAGE) || \ + (\intno == BOOK3S_INTERRUPT_DATA_SEGMENT) || \ + (\intno == BOOK3S_INTERRUPT_INST_SEGMENT) || \ + (\intno == BOOK3S_INTERRUPT_EXTERNAL) || \ + (\intno == BOOK3S_INTERRUPT_ALIGNMENT) || \ + (\intno == BOOK3S_INTERRUPT_PROGRAM) || \ + (\intno == BOOK3S_INTERRUPT_FP_UNAVAIL) || \ + (\intno == BOOK3S_INTERRUPT_DECREMENTER) || \ + (\intno == BOOK3S_INTERRUPT_SYSCALL) || \ + (\intno == BOOK3S_INTERRUPT_TRACE) || \ + (\intno == BOOK3S_INTERRUPT_PERFMON) || \ + (\intno == BOOK3S_INTERRUPT_ALTIVEC) || \ + (\intno == BOOK3S_INTERRUPT_VSX) + + b kvmppc_trampoline_\intno +kvmppc_resume_\intno: + + .endif +.endm + +#else + +.macro DO_KVM intno +.endm + +#endif /* CONFIG_KVM_BOOK3S_64_HANDLER */ + +#endif /* __ASM_KVM_BOOK3S_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index c9c930ed11d..1201f62d0d7 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -21,7 +21,8 @@ #define __POWERPC_KVM_HOST_H__ #include <linux/mutex.h> -#include <linux/timer.h> +#include <linux/hrtimer.h> +#include <linux/interrupt.h> #include <linux/types.h> #include <linux/kvm_types.h> #include <asm/kvm_asm.h> @@ -37,6 +38,8 @@ #define KVM_NR_PAGE_SIZES 1 #define KVM_PAGES_PER_HPAGE(x) (1UL<<31) +#define HPTEG_CACHE_NUM 1024 + struct kvm; struct kvm_run; struct kvm_vcpu; @@ -63,6 +66,17 @@ struct kvm_vcpu_stat { u32 dec_exits; u32 ext_intr_exits; u32 halt_wakeup; +#ifdef CONFIG_PPC64 + u32 pf_storage; + u32 pf_instruc; + u32 sp_storage; + u32 sp_instruc; + u32 queue_intr; + u32 ld; + u32 ld_slow; + u32 st; + u32 st_slow; +#endif }; enum kvm_exit_types { @@ -109,9 +123,53 @@ struct kvmppc_exit_timing { struct kvm_arch { }; +struct kvmppc_pte { + u64 eaddr; + u64 vpage; + u64 raddr; + bool may_read; + bool may_write; + bool may_execute; +}; + +struct kvmppc_mmu { + /* book3s_64 only */ + void (*slbmte)(struct kvm_vcpu *vcpu, u64 rb, u64 rs); + u64 (*slbmfee)(struct kvm_vcpu *vcpu, u64 slb_nr); + u64 (*slbmfev)(struct kvm_vcpu *vcpu, u64 slb_nr); + void (*slbie)(struct kvm_vcpu *vcpu, u64 slb_nr); + void (*slbia)(struct kvm_vcpu *vcpu); + /* book3s */ + void (*mtsrin)(struct kvm_vcpu *vcpu, u32 srnum, ulong value); + u32 (*mfsrin)(struct kvm_vcpu *vcpu, u32 srnum); + int (*xlate)(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *pte, bool data); + void (*reset_msr)(struct kvm_vcpu *vcpu); + void (*tlbie)(struct kvm_vcpu *vcpu, ulong addr, bool large); + int (*esid_to_vsid)(struct kvm_vcpu *vcpu, u64 esid, u64 *vsid); + u64 (*ea_to_vp)(struct kvm_vcpu *vcpu, gva_t eaddr, bool data); + bool (*is_dcbz32)(struct kvm_vcpu *vcpu); 
+}; + +struct hpte_cache { + u64 host_va; + u64 pfn; + ulong slot; + struct kvmppc_pte pte; +}; + struct kvm_vcpu_arch { - u32 host_stack; + ulong host_stack; u32 host_pid; +#ifdef CONFIG_PPC64 + ulong host_msr; + ulong host_r2; + void *host_retip; + ulong trampoline_lowmem; + ulong trampoline_enter; + ulong highmem_handler; + ulong host_paca_phys; + struct kvmppc_mmu mmu; +#endif u64 fpr[32]; ulong gpr[32]; @@ -123,6 +181,10 @@ struct kvm_vcpu_arch { ulong xer; ulong msr; +#ifdef CONFIG_PPC64 + ulong shadow_msr; + ulong hflags; +#endif u32 mmucr; ulong sprg0; ulong sprg1; @@ -149,6 +211,7 @@ struct kvm_vcpu_arch { u32 ivor[64]; ulong ivpr; u32 pir; + u32 pvr; u32 shadow_pid; u32 pid; @@ -174,6 +237,9 @@ struct kvm_vcpu_arch { #endif u32 last_inst; +#ifdef CONFIG_PPC64 + ulong fault_dsisr; +#endif ulong fault_dear; ulong fault_esr; gpa_t paddr_accessed; @@ -185,8 +251,15 @@ struct kvm_vcpu_arch { u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */ - struct timer_list dec_timer; + struct hrtimer dec_timer; + struct tasklet_struct tasklet; + u64 dec_jiffies; unsigned long pending_exceptions; + +#ifdef CONFIG_PPC64 + struct hpte_cache hpte_cache[HPTEG_CACHE_NUM]; + int hpte_cache_offset; +#endif }; #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2c6ee349df5..269ee46ab02 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -39,6 +39,7 @@ enum emulation_result { extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); extern char kvmppc_handlers_start[]; extern unsigned long kvmppc_handler_len; +extern void kvmppc_handler_highmem(void); extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu); extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index bebe31c2e90..7514ec2f854 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -173,14 +173,6 @@ extern unsigned long tce_alloc_start, tce_alloc_end; */ extern int mmu_ci_restrictions; -#ifdef CONFIG_HUGETLB_PAGE -/* - * The page size indexes of the huge pages for use by hugetlbfs - */ -extern unsigned int mmu_huge_psizes[MMU_PAGE_COUNT]; - -#endif /* CONFIG_HUGETLB_PAGE */ - /* * This function sets the AVPN and L fields of the HPTE appropriately * for the page size @@ -253,10 +245,11 @@ extern int __hash_page_64K(unsigned long ea, unsigned long access, unsigned long vsid, pte_t *ptep, unsigned long trap, unsigned int local, int ssize); struct mm_struct; +unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap); extern int hash_page(unsigned long ea, unsigned long access, unsigned long trap); -extern int hash_huge_page(struct mm_struct *mm, unsigned long access, - unsigned long ea, unsigned long vsid, int local, - unsigned long trap); +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, int local, int ssize, + unsigned int shift, unsigned int mmu_psize); extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long pstart, unsigned long prot, diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b34e94d9443..26383e0778a 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -23,6 +23,8 @@ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); extern 
void set_context(unsigned long id, pgd_t *pgd); #ifdef CONFIG_PPC_BOOK3S_64 +extern int __init_new_context(void); +extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } #else extern void mmu_context_init(void); diff --git a/arch/powerpc/include/asm/nvram.h b/arch/powerpc/include/asm/nvram.h index 6c587eddee5..850b72f2744 100644 --- a/arch/powerpc/include/asm/nvram.h +++ b/arch/powerpc/include/asm/nvram.h @@ -73,7 +73,6 @@ extern int nvram_write_error_log(char * buff, int length, extern int nvram_read_error_log(char * buff, int length, unsigned int * err_type, unsigned int *err_seq); extern int nvram_clear_error_log(void); -extern struct nvram_partition *nvram_find_partition(int sig, const char *name); extern int pSeries_nvram_init(void); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 7d8514cecea..5e9b4ef7141 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -129,6 +129,15 @@ struct paca_struct { u64 system_time; /* accumulated system TB ticks */ u64 startpurr; /* PURR/TB value snapshot */ u64 startspurr; /* SPURR value snapshot */ + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + struct { + u64 esid; + u64 vsid; + } kvm_slb[64]; /* guest SLB */ + u8 kvm_slb_max; /* highest used guest slb entry */ + u8 kvm_in_guest; /* are we inside the guest? */ +#endif }; extern struct paca_struct paca[]; diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index ff24254990e..e96d52a516b 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -229,6 +229,20 @@ typedef unsigned long pgprot_t; #endif +typedef struct { signed long pd; } hugepd_t; +#define HUGEPD_SHIFT_MASK 0x3f + +#ifdef CONFIG_HUGETLB_PAGE +static inline int hugepd_ok(hugepd_t hpd) +{ + return (hpd.pd > 0); +} + +#define is_hugepd(pdep) (hugepd_ok(*((hugepd_t *)(pdep)))) +#else /* CONFIG_HUGETLB_PAGE */ +#define is_hugepd(pdep) 0 +#endif /* CONFIG_HUGETLB_PAGE */ + struct page; extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); extern void copy_user_page(void *to, void *from, unsigned long vaddr, diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 3f17b83f55a..bfc4e027e2a 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -90,7 +90,7 @@ extern unsigned int HPAGE_SHIFT; #define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define HUGE_MAX_HSTATE 3 +#define HUGE_MAX_HSTATE (MMU_PAGE_COUNT-1) #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h index c9500d666a1..580cf73b96e 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/pgalloc-32.h @@ -3,7 +3,8 @@ #include <linux/threads.h> -#define PTE_NONCACHE_NUM 0 /* dummy for now to share code w/ppc64 */ +/* For 32-bit, all levels of page tables are just drawn from get_free_page() */ +#define MAX_PGTABLE_INDEX_SIZE 0 extern void __bad_pte(pmd_t *pmd); @@ -36,11 +37,10 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr); -static inline void pgtable_free(pgtable_free_t pgf) +static inline void pgtable_free(void *table, unsigned index_size) { - void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK); - - 
free_page((unsigned long)p); + BUG_ON(index_size); /* 32-bit doesn't use this */ + free_page((unsigned long)table); } #define check_pgt_cache() do { } while (0) diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index e6f069c4f71..5c1cd73dafa 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -11,27 +11,39 @@ #include <linux/cpumask.h> #include <linux/percpu.h> +/* + * Functions that deal with pagetables that could be at any level of + * the table need to be passed an "index_size" so they know how to + * handle allocation. For PTE pages (which are linked to a struct + * page for now, and drawn from the main get_free_pages() pool), the + * allocation size will be (2^index_size * sizeof(pointer)) and + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). + * + * The maximum index size needs to be big enough to allow any + * pagetable sizes we need, but small enough to fit in the low bits of + * any page table pointer. In other words all pagetables, even tiny + * ones, must be aligned to allow at least enough low 0 bits to + * contain this value. This value is also used as a mask, so it must + * be one less than a power of two. + */ +#define MAX_PGTABLE_INDEX_SIZE 0xf + #ifndef CONFIG_PPC_SUBPAGE_PROT static inline void subpage_prot_free(pgd_t *pgd) {} #endif extern struct kmem_cache *pgtable_cache[]; - -#define PGD_CACHE_NUM 0 -#define PUD_CACHE_NUM 1 -#define PMD_CACHE_NUM 1 -#define HUGEPTE_CACHE_NUM 2 -#define PTE_NONCACHE_NUM 7 /* from GFP rather than kmem_cache */ +#define PGT_CACHE(shift) (pgtable_cache[(shift)-1]) static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return kmem_cache_alloc(pgtable_cache[PGD_CACHE_NUM], GFP_KERNEL); + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { subpage_prot_free(pgd); - kmem_cache_free(pgtable_cache[PGD_CACHE_NUM], pgd); + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); } #ifndef CONFIG_PPC_64K_PAGES @@ -40,13 +52,13 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(pgtable_cache[PUD_CACHE_NUM], + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), GFP_KERNEL|__GFP_REPEAT); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) { - kmem_cache_free(pgtable_cache[PUD_CACHE_NUM], pud); + kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); } static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) @@ -78,13 +90,13 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return kmem_cache_alloc(pgtable_cache[PMD_CACHE_NUM], + return kmem_cache_alloc(PGT_CACHE(PMD_INDEX_SIZE), GFP_KERNEL|__GFP_REPEAT); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { - kmem_cache_free(pgtable_cache[PMD_CACHE_NUM], pmd); + kmem_cache_free(PGT_CACHE(PMD_INDEX_SIZE), pmd); } static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, @@ -107,24 +119,22 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, return page; } -static inline void pgtable_free(pgtable_free_t pgf) +static inline void pgtable_free(void *table, unsigned index_size) { - void *p = (void *)(pgf.val & ~PGF_CACHENUM_MASK); - int cachenum = pgf.val & PGF_CACHENUM_MASK; - - if (cachenum == PTE_NONCACHE_NUM) - free_page((unsigned long)p); - else - 
kmem_cache_free(pgtable_cache[cachenum], p); + if (!index_size) + free_page((unsigned long)table); + else { + BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); + kmem_cache_free(PGT_CACHE(index_size), table); + } } -#define __pmd_free_tlb(tlb, pmd,addr) \ - pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \ - PMD_CACHE_NUM, PMD_TABLE_SIZE-1)) +#define __pmd_free_tlb(tlb, pmd, addr) \ + pgtable_free_tlb(tlb, pmd, PMD_INDEX_SIZE) #ifndef CONFIG_PPC_64K_PAGES #define __pud_free_tlb(tlb, pud, addr) \ - pgtable_free_tlb(tlb, pgtable_free_cache(pud, \ - PUD_CACHE_NUM, PUD_TABLE_SIZE-1)) + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) + #endif /* CONFIG_PPC_64K_PAGES */ #define check_pgt_cache() do { } while (0) diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index f2e812de7c3..abe8532bd14 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -24,25 +24,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) __free_page(ptepage); } -typedef struct pgtable_free { - unsigned long val; -} pgtable_free_t; - -/* This needs to be big enough to allow for MMU_PAGE_COUNT + 2 to be stored - * and small enough to fit in the low bits of any naturally aligned page - * table cache entry. Arbitrarily set to 0x1f, that should give us some - * room to grow - */ -#define PGF_CACHENUM_MASK 0x1f - -static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, - unsigned long mask) -{ - BUG_ON(cachenum > PGF_CACHENUM_MASK); - - return (pgtable_free_t){.val = ((unsigned long) p & ~mask) | cachenum}; -} - #ifdef CONFIG_PPC64 #include <asm/pgalloc-64.h> #else @@ -50,12 +31,12 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, #endif #ifdef CONFIG_SMP -extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); +extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift); extern void pte_free_finish(void); #else /* CONFIG_SMP */ -static inline void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) { - pgtable_free(pgf); + pgtable_free(table, shift); } static inline void pte_free_finish(void) { } #endif /* !CONFIG_SMP */ @@ -63,12 +44,9 @@ static inline void pte_free_finish(void) { } static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, unsigned long address) { - pgtable_free_t pgf = pgtable_free_cache(page_address(ptepage), - PTE_NONCACHE_NUM, - PTE_TABLE_SIZE-1); tlb_flush_pgtable(tlb, address); pgtable_page_dtor(ptepage); - pgtable_free_tlb(tlb, pgf); + pgtable_free_tlb(tlb, page_address(ptepage), 0); } #endif /* __KERNEL__ */ diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 806abe7a3fa..49865045d56 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -354,6 +354,7 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) #define pgoff_to_pte(off) ((pte_t) {((off) << PTE_RPN_SHIFT)|_PAGE_FILE}) #define PTE_FILE_MAX_BITS (BITS_PER_LONG - PTE_RPN_SHIFT) +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); /* @@ -378,7 +379,18 @@ void pgtable_cache_init(void); return pt; } -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long address); +#ifdef CONFIG_HUGETLB_PAGE +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift); +#else +static inline pte_t 
*find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, + unsigned *shift) +{ + if (shift) + *shift = 0; + return find_linux_pte(pgdir, ea); +} +#endif /* !CONFIG_HUGETLB_PAGE */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 2a5da069714..21207e54825 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -211,6 +211,9 @@ extern void paging_init(void); */ extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t); +extern int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr); + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b23664a0b86..c002b041021 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -42,10 +42,11 @@ obj-$(CONFIG_ALTIVEC) += vecemu.o obj-$(CONFIG_PPC_970_NAP) += idle_power4.o obj-$(CONFIG_PPC_OF) += of_device.o of_platform.o prom_parse.o obj-$(CONFIG_PPC_CLOCK) += clock.o -procfs-$(CONFIG_PPC64) := proc_ppc64.o +procfs-y := proc_powerpc.o obj-$(CONFIG_PROC_FS) += $(procfs-y) rtaspci-$(CONFIG_PPC64)-$(CONFIG_PCI) := rtas_pci.o obj-$(CONFIG_PPC_RTAS) += rtas.o rtas-rtc.o $(rtaspci-y-y) +obj-$(CONFIG_PPC_RTAS_DAEMON) += rtasd.o obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o obj-$(CONFIG_RTAS_PROC) += rtas-proc.o obj-$(CONFIG_LPARCFG) += lparcfg.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0812b0f414b..e2e2082acf2 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -190,6 +190,11 @@ int main(void) DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset)); DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + DEFINE(PACA_KVM_IN_GUEST, offsetof(struct paca_struct, kvm_in_guest)); + DEFINE(PACA_KVM_SLB, offsetof(struct paca_struct, kvm_slb)); + DEFINE(PACA_KVM_SLB_MAX, offsetof(struct paca_struct, kvm_slb_max)); +#endif #endif /* CONFIG_PPC64 */ /* RTAS */ @@ -398,6 +403,19 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + + /* book3s_64 */ +#ifdef CONFIG_PPC64 + DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); + DEFINE(VCPU_HOST_RETIP, offsetof(struct kvm_vcpu, arch.host_retip)); + DEFINE(VCPU_HOST_R2, offsetof(struct kvm_vcpu, arch.host_r2)); + DEFINE(VCPU_HOST_MSR, offsetof(struct kvm_vcpu, arch.host_msr)); + DEFINE(VCPU_SHADOW_MSR, offsetof(struct kvm_vcpu, arch.shadow_msr)); + DEFINE(VCPU_TRAMPOLINE_LOWMEM, offsetof(struct kvm_vcpu, arch.trampoline_lowmem)); + DEFINE(VCPU_TRAMPOLINE_ENTER, offsetof(struct kvm_vcpu, arch.trampoline_enter)); + DEFINE(VCPU_HIGHMEM_HANDLER, offsetof(struct kvm_vcpu, arch.highmem_handler)); + DEFINE(VCPU_HFLAGS, offsetof(struct kvm_vcpu, arch.hflags)); +#endif #endif #ifdef CONFIG_44x DEFINE(PGD_T_LOG2, PGD_T_LOG2); diff --git a/arch/powerpc/kernel/crash.c b/arch/powerpc/kernel/crash.c index 0a8439aafdd..6f4613dd05e 100644 --- a/arch/powerpc/kernel/crash.c +++ b/arch/powerpc/kernel/crash.c @@ -373,7 +373,7 @@ void default_machine_crash_shutdown(struct pt_regs *regs) hard_irq_disable(); for_each_irq(i) { - struct irq_desc *desc = irq_desc + i; + 
struct irq_desc *desc = irq_to_desc(i); if (desc->status & IRQ_INPROGRESS) desc->chip->eoi(i); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1808876edcc..fc3ead066ce 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -41,6 +41,7 @@ __start_interrupts: . = 0x200 _machine_check_pSeries: HMT_MEDIUM + DO_KVM 0x200 mtspr SPRN_SPRG_SCRATCH0,r13 /* save r13 */ EXCEPTION_PROLOG_PSERIES(PACA_EXMC, machine_check_common) @@ -48,6 +49,7 @@ _machine_check_pSeries: .globl data_access_pSeries data_access_pSeries: HMT_MEDIUM + DO_KVM 0x300 mtspr SPRN_SPRG_SCRATCH0,r13 BEGIN_FTR_SECTION mfspr r13,SPRN_SPRG_PACA @@ -77,6 +79,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_SLB) .globl data_access_slb_pSeries data_access_slb_pSeries: HMT_MEDIUM + DO_KVM 0x380 mtspr SPRN_SPRG_SCRATCH0,r13 mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) @@ -115,6 +118,7 @@ data_access_slb_pSeries: .globl instruction_access_slb_pSeries instruction_access_slb_pSeries: HMT_MEDIUM + DO_KVM 0x480 mtspr SPRN_SPRG_SCRATCH0,r13 mfspr r13,SPRN_SPRG_PACA /* get paca address into r13 */ std r3,PACA_EXSLB+EX_R3(r13) @@ -154,6 +158,7 @@ instruction_access_slb_pSeries: .globl system_call_pSeries system_call_pSeries: HMT_MEDIUM + DO_KVM 0xc00 BEGIN_FTR_SECTION cmpdi r0,0x1ebe beq- 1f @@ -186,12 +191,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) * trickery is thus necessary */ . = 0xf00 + DO_KVM 0xf00 b performance_monitor_pSeries . = 0xf20 + DO_KVM 0xf20 b altivec_unavailable_pSeries . = 0xf40 + DO_KVM 0xf40 b vsx_unavailable_pSeries #ifdef CONFIG_CBE_RAS diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index c38afdb45d7..92580748802 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -37,6 +37,7 @@ #include <asm/firmware.h> #include <asm/page_64.h> #include <asm/irqflags.h> +#include <asm/kvm_book3s_64_asm.h> /* The physical memory is layed out such that the secondary processor * spin code sits at 0x0000...0x00ff. On server, the vectors follow @@ -165,6 +166,12 @@ exception_marker: #include "exceptions-64s.S" #endif +/* KVM trampoline code needs to be close to the interrupt handlers */ + +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER +#include "../kvm/book3s_64_rmhandlers.S" +#endif + _GLOBAL(generic_secondary_thread_init) mr r24,r3 diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index e5d12117798..eba53923630 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -85,7 +85,10 @@ extern int tau_interrupts(int); #endif /* CONFIG_PPC32 */ #ifdef CONFIG_PPC64 + +#ifndef CONFIG_SPARSE_IRQ EXPORT_SYMBOL(irq_desc); +#endif int distribute_irqs = 1; @@ -187,33 +190,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "CPU%d ", j); seq_putc(p, '\n'); - } - - if (i < NR_IRQS) { - desc = get_irq_desc(i); - spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - if (!action || !action->handler) - goto skip; - seq_printf(p, "%3d: ", i); -#ifdef CONFIG_SMP - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#else - seq_printf(p, "%10u ", kstat_irqs(i)); -#endif /* CONFIG_SMP */ - if (desc->chip) - seq_printf(p, " %s ", desc->chip->typename); - else - seq_puts(p, " None "); - seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? 
"Level " : "Edge "); - seq_printf(p, " %s", action->name); - for (action = action->next; action; action = action->next) - seq_printf(p, ", %s", action->name); - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == NR_IRQS) { + } else if (i == nr_irqs) { #if defined(CONFIG_PPC32) && defined(CONFIG_TAU_INT) if (tau_initialized){ seq_puts(p, "TAU: "); @@ -223,30 +200,68 @@ skip: } #endif /* CONFIG_PPC32 && CONFIG_TAU_INT*/ seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts); + + return 0; } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + spin_lock_irqsave(&desc->lock, flags); + + action = desc->action; + if (!action || !action->handler) + goto skip; + + seq_printf(p, "%3d: ", i); +#ifdef CONFIG_SMP + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +#else + seq_printf(p, "%10u ", kstat_irqs(i)); +#endif /* CONFIG_SMP */ + + if (desc->chip) + seq_printf(p, " %s ", desc->chip->typename); + else + seq_puts(p, " None "); + + seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge "); + seq_printf(p, " %s", action->name); + + for (action = action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + seq_putc(p, '\n'); + +skip: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; } #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(cpumask_t map) { + struct irq_desc *desc; unsigned int irq; static int warned; for_each_irq(irq) { cpumask_t mask; - if (irq_desc[irq].status & IRQ_PER_CPU) + desc = irq_to_desc(irq); + if (desc && desc->status & IRQ_PER_CPU) continue; - cpumask_and(&mask, irq_desc[irq].affinity, &map); + cpumask_and(&mask, desc->affinity, &map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, &mask); - else if (irq_desc[irq].action && !(warned++)) + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, &mask); + else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } @@ -273,7 +288,7 @@ static inline void handle_one_irq(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); saved_sp_limit = current->thread.ksp_limit; irqtp->task = curtp->task; @@ -535,7 +550,7 @@ struct irq_host *irq_alloc_host(struct device_node *of_node, smp_wmb(); /* Clear norequest flags */ - get_irq_desc(i)->status &= ~IRQ_NOREQUEST; + irq_to_desc(i)->status &= ~IRQ_NOREQUEST; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to @@ -601,8 +616,16 @@ void irq_set_virq_count(unsigned int count) static int irq_setup_virq(struct irq_host *host, unsigned int virq, irq_hw_number_t hwirq) { + struct irq_desc *desc; + + desc = irq_to_desc_alloc_node(virq, 0); + if (!desc) { + pr_debug("irq: -> allocating desc failed\n"); + goto error; + } + /* Clear IRQ_NOREQUEST flag */ - get_irq_desc(virq)->status &= ~IRQ_NOREQUEST; + desc->status &= ~IRQ_NOREQUEST; /* map it */ smp_wmb(); @@ -611,11 +634,14 @@ static int irq_setup_virq(struct irq_host *host, unsigned int virq, if (host->ops->map(host, virq, hwirq)) { pr_debug("irq: -> mapping failed, freeing\n"); - irq_free_virt(virq, 1); - return -1; + goto error; } return 0; + +error: + irq_free_virt(virq, 1); + return -1; } unsigned int irq_create_direct_mapping(struct irq_host *host) @@ -732,7 +758,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Set type if specified and different than the current one */ if (type != 
IRQ_TYPE_NONE && - type != (get_irq_desc(virq)->status & IRQF_TRIGGER_MASK)) + type != (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK)) set_irq_type(virq, type); return virq; } @@ -804,7 +830,7 @@ void irq_dispose_mapping(unsigned int virq) irq_map[virq].hwirq = host->inval_irq; /* Set some flags */ - get_irq_desc(virq)->status |= IRQ_NOREQUEST; + irq_to_desc(virq)->status |= IRQ_NOREQUEST; /* Free it */ irq_free_virt(virq, 1); @@ -996,12 +1022,24 @@ void irq_free_virt(unsigned int virq, unsigned int count) spin_unlock_irqrestore(&irq_big_lock, flags); } -void irq_early_init(void) +int arch_early_irq_init(void) { - unsigned int i; + struct irq_desc *desc; + int i; + + for (i = 0; i < NR_IRQS; i++) { + desc = irq_to_desc(i); + if (desc) + desc->status |= IRQ_NOREQUEST; + } - for (i = 0; i < NR_IRQS; i++) - get_irq_desc(i)->status |= IRQ_NOREQUEST; + return 0; +} + +int arch_init_chip_data(struct irq_desc *desc, int node) +{ + desc->status |= IRQ_NOREQUEST; + return 0; } /* We need to create the radix trees late */ @@ -1063,8 +1101,11 @@ static int virq_debug_show(struct seq_file *m, void *private) seq_printf(m, "%-5s %-7s %-15s %s\n", "virq", "hwirq", "chip name", "host name"); - for (i = 1; i < NR_IRQS; i++) { - desc = get_irq_desc(i); + for (i = 1; i < nr_irqs; i++) { + desc = irq_to_desc(i); + if (!desc) + continue; + spin_lock_irqsave(&desc->lock, flags); if (desc->action && desc->action->handler) { diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index ed0ac4e4b8d..79a00bb9c64 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -781,9 +781,9 @@ static int __init lparcfg_init(void) !firmware_has_feature(FW_FEATURE_ISERIES)) mode |= S_IWUSR; - ent = proc_create("ppc64/lparcfg", mode, NULL, &lparcfg_fops); + ent = proc_create("powerpc/lparcfg", mode, NULL, &lparcfg_fops); if (!ent) { - printk(KERN_ERR "Failed to create ppc64/lparcfg\n"); + printk(KERN_ERR "Failed to create powerpc/lparcfg\n"); return -EIO; } diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 0ed31f22048..a8f9251f4ca 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -184,7 +184,7 @@ static struct miscdevice nvram_dev = { #ifdef DEBUG_NVRAM -static void nvram_print_partitions(char * label) +static void __init nvram_print_partitions(char * label) { struct list_head * p; struct nvram_partition * tmp_part; @@ -202,7 +202,7 @@ static void nvram_print_partitions(char * label) #endif -static int nvram_write_header(struct nvram_partition * part) +static int __init nvram_write_header(struct nvram_partition * part) { loff_t tmp_index; int rc; @@ -214,7 +214,7 @@ static int nvram_write_header(struct nvram_partition * part) } -static unsigned char nvram_checksum(struct nvram_header *p) +static unsigned char __init nvram_checksum(struct nvram_header *p) { unsigned int c_sum, c_sum2; unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */ @@ -228,32 +228,7 @@ static unsigned char nvram_checksum(struct nvram_header *p) return c_sum; } - -/* - * Find an nvram partition, sig can be 0 for any - * partition or name can be NULL for any name, else - * tries to match both - */ -struct nvram_partition *nvram_find_partition(int sig, const char *name) -{ - struct nvram_partition * part; - struct list_head * p; - - list_for_each(p, &nvram_part->partition) { - part = list_entry(p, struct nvram_partition, partition); - - if (sig && part->header.signature != sig) - continue; - if (name && 0 != strncmp(name, 
part->header.name, 12)) - continue; - return part; - } - return NULL; -} -EXPORT_SYMBOL(nvram_find_partition); - - -static int nvram_remove_os_partition(void) +static int __init nvram_remove_os_partition(void) { struct list_head *i; struct list_head *j; @@ -319,7 +294,7 @@ static int nvram_remove_os_partition(void) * Will create a partition starting at the first free * space found if space has enough room. */ -static int nvram_create_os_partition(void) +static int __init nvram_create_os_partition(void) { struct nvram_partition *part; struct nvram_partition *new_part; @@ -422,7 +397,7 @@ static int nvram_create_os_partition(void) * 5.) If the max chunk cannot be allocated then try finding a chunk * that will satisfy the minum needed (NVRAM_MIN_REQ). */ -static int nvram_setup_partition(void) +static int __init nvram_setup_partition(void) { struct list_head * p; struct nvram_partition * part; @@ -480,7 +455,7 @@ static int nvram_setup_partition(void) } -static int nvram_scan_partitions(void) +static int __init nvram_scan_partitions(void) { loff_t cur_index = 0; struct nvram_header phead; @@ -706,6 +681,9 @@ int nvram_clear_error_log(void) int clear_word = ERR_FLAG_ALREADY_LOGGED; int rc; + if (nvram_error_log_index == -1) + return -1; + tmp_index = nvram_error_log_index; rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index 0a03cf70d24..936f04dbfc6 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -119,13 +119,6 @@ static void perf_callchain_kernel(struct pt_regs *regs, } #ifdef CONFIG_PPC64 - -#ifdef CONFIG_HUGETLB_PAGE -#define is_huge_psize(pagesize) (HPAGE_SHIFT && mmu_huge_psizes[pagesize]) -#else -#define is_huge_psize(pagesize) 0 -#endif - /* * On 64-bit we don't want to invoke hash_page on user addresses from * interrupt context, so if the access faults, we read the page tables @@ -135,7 +128,7 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb) { pgd_t *pgdir; pte_t *ptep, pte; - int pagesize; + unsigned shift; unsigned long addr = (unsigned long) ptr; unsigned long offset; unsigned long pfn; @@ -145,17 +138,14 @@ static int read_user_stack_slow(void __user *ptr, void *ret, int nb) if (!pgdir) return -EFAULT; - pagesize = get_slice_psize(current->mm, addr); + ptep = find_linux_pte_or_hugepte(pgdir, addr, &shift); + if (!shift) + shift = PAGE_SHIFT; /* align address to page boundary */ - offset = addr & ((1ul << mmu_psize_defs[pagesize].shift) - 1); + offset = addr & ((1UL << shift) - 1); addr -= offset; - if (is_huge_psize(pagesize)) - ptep = huge_pte_offset(current->mm, addr); - else - ptep = find_linux_pte(pgdir, addr); - if (ptep == NULL) return -EFAULT; pte = *ptep; diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index c8b27bb4dbd..07115d6cd4b 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -162,7 +162,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(cacheable_memcpy); EXPORT_SYMBOL(cacheable_memzero); diff --git a/arch/powerpc/kernel/proc_ppc64.c b/arch/powerpc/kernel/proc_powerpc.c index c647ddef40d..1ed3b8d7981 100644 --- a/arch/powerpc/kernel/proc_ppc64.c +++ b/arch/powerpc/kernel/proc_powerpc.c @@ -28,55 +28,7 @@ #include <asm/uaccess.h> #include <asm/prom.h> -static loff_t page_map_seek( struct file *file, loff_t off, int 
whence); -static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes, - loff_t *ppos); -static int page_map_mmap( struct file *file, struct vm_area_struct *vma ); - -static const struct file_operations page_map_fops = { - .llseek = page_map_seek, - .read = page_map_read, - .mmap = page_map_mmap -}; - -/* - * Create the ppc64 and ppc64/rtas directories early. This allows us to - * assume that they have been previously created in drivers. - */ -static int __init proc_ppc64_create(void) -{ - struct proc_dir_entry *root; - - root = proc_mkdir("ppc64", NULL); - if (!root) - return 1; - - if (!of_find_node_by_path("/rtas")) - return 0; - - if (!proc_mkdir("rtas", root)) - return 1; - - if (!proc_symlink("rtas", NULL, "ppc64/rtas")) - return 1; - - return 0; -} -core_initcall(proc_ppc64_create); - -static int __init proc_ppc64_init(void) -{ - struct proc_dir_entry *pde; - - pde = proc_create_data("ppc64/systemcfg", S_IFREG|S_IRUGO, NULL, - &page_map_fops, vdso_data); - if (!pde) - return 1; - pde->size = PAGE_SIZE; - - return 0; -} -__initcall(proc_ppc64_init); +#ifdef CONFIG_PPC64 static loff_t page_map_seek( struct file *file, loff_t off, int whence) { @@ -120,3 +72,55 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma ) return 0; } +static const struct file_operations page_map_fops = { + .llseek = page_map_seek, + .read = page_map_read, + .mmap = page_map_mmap +}; + + +static int __init proc_ppc64_init(void) +{ + struct proc_dir_entry *pde; + + pde = proc_create_data("powerpc/systemcfg", S_IFREG|S_IRUGO, NULL, + &page_map_fops, vdso_data); + if (!pde) + return 1; + pde->size = PAGE_SIZE; + + return 0; +} +__initcall(proc_ppc64_init); + +#endif /* CONFIG_PPC64 */ + +/* + * Create the ppc64 and ppc64/rtas directories early. This allows us to + * assume that they have been previously created in drivers. + */ +static int __init proc_ppc64_create(void) +{ + struct proc_dir_entry *root; + + root = proc_mkdir("powerpc", NULL); + if (!root) + return 1; + +#ifdef CONFIG_PPC64 + if (!proc_symlink("ppc64", NULL, "powerpc")) + pr_err("Failed to create link /proc/ppc64 -> /proc/powerpc\n"); +#endif + + if (!of_find_node_by_path("/rtas")) + return 0; + + if (!proc_mkdir("rtas", root)) + return 1; + + if (!proc_symlink("rtas", NULL, "powerpc/rtas")) + return 1; + + return 0; +} +core_initcall(proc_ppc64_create); diff --git a/arch/powerpc/kernel/rtas_flash.c b/arch/powerpc/kernel/rtas_flash.c index 13011a96a97..a85117d5c9a 100644 --- a/arch/powerpc/kernel/rtas_flash.c +++ b/arch/powerpc/kernel/rtas_flash.c @@ -6,7 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * /proc/ppc64/rtas/firmware_flash interface + * /proc/powerpc/rtas/firmware_flash interface * * This file implements a firmware_flash interface to pump a firmware * image into the kernel. 
At reboot time rtas_restart() will see the @@ -740,7 +740,7 @@ static int __init rtas_flash_init(void) return 1; } - firmware_flash_pde = create_flash_pde("ppc64/rtas/" + firmware_flash_pde = create_flash_pde("powerpc/rtas/" FIRMWARE_FLASH_NAME, &rtas_flash_operations); if (firmware_flash_pde == NULL) { @@ -754,7 +754,7 @@ static int __init rtas_flash_init(void) if (rc != 0) goto cleanup; - firmware_update_pde = create_flash_pde("ppc64/rtas/" + firmware_update_pde = create_flash_pde("powerpc/rtas/" FIRMWARE_UPDATE_NAME, &rtas_flash_operations); if (firmware_update_pde == NULL) { @@ -768,7 +768,7 @@ static int __init rtas_flash_init(void) if (rc != 0) goto cleanup; - validate_pde = create_flash_pde("ppc64/rtas/" VALIDATE_FLASH_NAME, + validate_pde = create_flash_pde("powerpc/rtas/" VALIDATE_FLASH_NAME, &validate_flash_operations); if (validate_pde == NULL) { rc = -ENOMEM; @@ -781,7 +781,7 @@ static int __init rtas_flash_init(void) if (rc != 0) goto cleanup; - manage_pde = create_flash_pde("ppc64/rtas/" MANAGE_FLASH_NAME, + manage_pde = create_flash_pde("powerpc/rtas/" MANAGE_FLASH_NAME, &manage_flash_operations); if (manage_pde == NULL) { rc = -ENOMEM; diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/kernel/rtasd.c index b3cbac85592..2e4832ab210 100644 --- a/arch/powerpc/platforms/pseries/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -39,6 +39,7 @@ static unsigned long rtas_log_start; static unsigned long rtas_log_size; static int surveillance_timeout = -1; + static unsigned int rtas_error_log_max; static unsigned int rtas_error_log_buffer_max; @@ -213,9 +214,11 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) return; } +#ifdef CONFIG_PPC64 /* Write error to NVRAM */ if (logging_enabled && !(err_type & ERR_FLAG_BOOT)) nvram_write_error_log(buf, len, err_type, error_log_cnt); +#endif /* CONFIG_PPC64 */ /* * rtas errors can occur during boot, and we do want to capture @@ -264,7 +267,6 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal) } - static int rtas_log_open(struct inode * inode, struct file * file) { return 0; @@ -300,6 +302,7 @@ static ssize_t rtas_log_read(struct file * file, char __user * buf, return -ENOMEM; spin_lock_irqsave(&rtasd_log_lock, s); + /* if it's 0, then we know we got the last one (the one in NVRAM) */ while (rtas_log_size == 0) { if (file->f_flags & O_NONBLOCK) { @@ -313,7 +316,9 @@ static ssize_t rtas_log_read(struct file * file, char __user * buf, error = -ENODATA; goto out; } +#ifdef CONFIG_PPC64 nvram_clear_error_log(); +#endif /* CONFIG_PPC64 */ spin_unlock_irqrestore(&rtasd_log_lock, s); error = wait_event_interruptible(rtas_log_wait, rtas_log_size); @@ -427,14 +432,11 @@ static void rtas_event_scan(struct work_struct *w) put_online_cpus(); } -static void start_event_scan(void) +#ifdef CONFIG_PPC64 +static void retreive_nvram_error_log(void) { - unsigned int err_type; - int rc; - - printk(KERN_DEBUG "RTAS daemon started\n"); - pr_debug("rtasd: will sleep for %d milliseconds\n", - (30000 / rtas_event_scan_rate)); + unsigned int err_type ; + int rc ; /* See if we have any error stored in NVRAM */ memset(logdata, 0, rtas_error_log_max); @@ -442,12 +444,26 @@ static void start_event_scan(void) &err_type, &error_log_cnt); /* We can use rtas_log_buf now */ logging_enabled = 1; - if (!rc) { if (err_type != ERR_FLAG_ALREADY_LOGGED) { pSeries_log_error(logdata, err_type | ERR_FLAG_BOOT, 0); } } +} +#else /* CONFIG_PPC64 */ +static void retreive_nvram_error_log(void) +{ +} +#endif /* CONFIG_PPC64 */ + +static void 
start_event_scan(void) +{ + printk(KERN_DEBUG "RTAS daemon started\n"); + pr_debug("rtasd: will sleep for %d milliseconds\n", + (30000 / rtas_event_scan_rate)); + + /* Retreive errors from nvram if any */ + retreive_nvram_error_log(); schedule_delayed_work_on(first_cpu(cpu_online_map), &event_scan_work, event_scan_delay); @@ -457,13 +473,13 @@ static int __init rtas_init(void) { struct proc_dir_entry *entry; - if (!machine_is(pseries)) + if (!machine_is(pseries) && !machine_is(chrp)) return 0; /* No RTAS */ event_scan = rtas_token("event-scan"); if (event_scan == RTAS_UNKNOWN_SERVICE) { - printk(KERN_DEBUG "rtasd: no event-scan on system\n"); + printk(KERN_INFO "rtasd: No event-scan on system\n"); return -ENODEV; } @@ -483,7 +499,7 @@ static int __init rtas_init(void) return -ENOMEM; } - entry = proc_create("ppc64/rtas/error_log", S_IRUSR, NULL, + entry = proc_create("powerpc/rtas/error_log", S_IRUSR, NULL, &proc_rtas_log_operations); if (!entry) printk(KERN_ERR "Failed to create error_log proc entry\n"); @@ -492,11 +508,16 @@ static int __init rtas_init(void) return 0; } +__initcall(rtas_init); static int __init surveillance_setup(char *str) { int i; + /* We only do surveillance on pseries */ + if (!machine_is(pseries)) + return 0; + if (get_option(&str,&i)) { if (i >= 0 && i <= 255) surveillance_timeout = i; @@ -504,6 +525,7 @@ static int __init surveillance_setup(char *str) return 1; } +__setup("surveillance=", surveillance_setup); static int __init rtasmsgs_setup(char *str) { @@ -514,6 +536,4 @@ static int __init rtasmsgs_setup(char *str) return 1; } -__initcall(rtas_init); -__setup("surveillance=", surveillance_setup); __setup("rtasmsgs=", rtasmsgs_setup); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 04f638d82fb..fd785f7a279 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -356,11 +356,6 @@ void __init setup_system(void) */ initialize_cache_info(); - /* - * Initialize irq remapping subsystem - */ - irq_early_init(); - #ifdef CONFIG_PPC_RTAS /* * Initialize RTAS if available diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index a136a11c490..6c9e20898fa 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -268,6 +268,7 @@ void account_system_vtime(struct task_struct *tsk) per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(account_system_vtime); /* * Transfer the user and system times accumulated in the paca diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c2992684661..07703f72330 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -21,6 +21,23 @@ config KVM select PREEMPT_NOTIFIERS select ANON_INODES +config KVM_BOOK3S_64_HANDLER + bool + +config KVM_BOOK3S_64 + tristate "KVM support for PowerPC book3s_64 processors" + depends on EXPERIMENTAL && PPC64 + select KVM + select KVM_BOOK3S_64_HANDLER + ---help--- + Support running unmodified book3s_64 and book3s_32 guest kernels + in virtual machines on book3s_64 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + config KVM_440 bool "KVM support for PowerPC 440 processors" depends on EXPERIMENTAL && 44x diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 37655fe19f2..56484d65237 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -12,26 +12,45 @@ CFLAGS_44x_tlb.o := -I. 
CFLAGS_e500_tlb.o := -I. CFLAGS_emulate.o := -I. -kvm-objs := $(common-objs-y) powerpc.o emulate.o +common-objs-y += powerpc.o emulate.o obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o -obj-$(CONFIG_KVM) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64_HANDLER) += book3s_64_exports.o AFLAGS_booke_interrupts.o := -I$(obj) kvm-440-objs := \ + $(common-objs-y) \ booke.o \ booke_emulate.o \ booke_interrupts.o \ 44x.o \ 44x_tlb.o \ 44x_emulate.o -obj-$(CONFIG_KVM_440) += kvm-440.o +kvm-objs-$(CONFIG_KVM_440) := $(kvm-440-objs) kvm-e500-objs := \ + $(common-objs-y) \ booke.o \ booke_emulate.o \ booke_interrupts.o \ e500.o \ e500_tlb.o \ e500_emulate.o -obj-$(CONFIG_KVM_E500) += kvm-e500.o +kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs) + +kvm-book3s_64-objs := \ + $(common-objs-y) \ + book3s.o \ + book3s_64_emulate.o \ + book3s_64_interrupts.o \ + book3s_64_mmu_host.o \ + book3s_64_mmu.o \ + book3s_32_mmu.o +kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-objs) + +kvm-objs := $(kvm-objs-m) $(kvm-objs-y) + +obj-$(CONFIG_KVM_440) += kvm.o +obj-$(CONFIG_KVM_E500) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o + diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c new file mode 100644 index 00000000000..42037d46a41 --- /dev/null +++ b/arch/powerpc/kvm/book3s.c @@ -0,0 +1,925 @@ +/* + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * + * Description: + * This file is derived from arch/powerpc/kvm/44x.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/err.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> + +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU + +/* #define EXIT_DEBUG */ +/* #define EXIT_DEBUG_SIMPLE */ + +/* Without AGGRESSIVE_DEC we only fire off a DEC interrupt when DEC turns 0. + * When set, we retrigger a DEC interrupt after that if DEC <= 0. + * PPC32 Linux runs faster without AGGRESSIVE_DEC, PPC64 Linux requires it. 
*/ + +/* #define AGGRESSIVE_DEC */ + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "exits", VCPU_STAT(sum_exits) }, + { "mmio", VCPU_STAT(mmio_exits) }, + { "sig", VCPU_STAT(signal_exits) }, + { "sysc", VCPU_STAT(syscall_exits) }, + { "inst_emu", VCPU_STAT(emulated_inst_exits) }, + { "dec", VCPU_STAT(dec_exits) }, + { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "pf_storage", VCPU_STAT(pf_storage) }, + { "sp_storage", VCPU_STAT(sp_storage) }, + { "pf_instruc", VCPU_STAT(pf_instruc) }, + { "sp_instruc", VCPU_STAT(sp_instruc) }, + { "ld", VCPU_STAT(ld) }, + { "ld_slow", VCPU_STAT(ld_slow) }, + { "st", VCPU_STAT(st) }, + { "st_slow", VCPU_STAT(st_slow) }, + { NULL } +}; + +void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) +{ +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + memcpy(get_paca()->kvm_slb, to_book3s(vcpu)->slb_shadow, sizeof(get_paca()->kvm_slb)); + get_paca()->kvm_slb_max = to_book3s(vcpu)->slb_shadow_max; +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + memcpy(to_book3s(vcpu)->slb_shadow, get_paca()->kvm_slb, sizeof(get_paca()->kvm_slb)); + to_book3s(vcpu)->slb_shadow_max = get_paca()->kvm_slb_max; +} + +#if defined(AGGRESSIVE_DEC) || defined(EXIT_DEBUG) +static u32 kvmppc_get_dec(struct kvm_vcpu *vcpu) +{ + u64 jd = mftb() - vcpu->arch.dec_jiffies; + return vcpu->arch.dec - jd; +} +#endif + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + ulong old_msr = vcpu->arch.msr; + +#ifdef EXIT_DEBUG + printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); +#endif + msr &= to_book3s(vcpu)->msr_mask; + vcpu->arch.msr = msr; + vcpu->arch.shadow_msr = msr | MSR_USER32; + vcpu->arch.shadow_msr &= ( MSR_VEC | MSR_VSX | MSR_FP | MSR_FE0 | + MSR_USER64 | MSR_SE | MSR_BE | MSR_DE | + MSR_FE1); + + if (msr & (MSR_WE|MSR_POW)) { + if (!vcpu->arch.pending_exceptions) { + kvm_vcpu_block(vcpu); + vcpu->stat.halt_wakeup++; + } + } + + if (((vcpu->arch.msr & (MSR_IR|MSR_DR)) != (old_msr & (MSR_IR|MSR_DR))) || + (vcpu->arch.msr & MSR_PR) != (old_msr & MSR_PR)) { + kvmppc_mmu_flush_segments(vcpu); + kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc); + } +} + +void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) +{ + vcpu->arch.srr0 = vcpu->arch.pc; + vcpu->arch.srr1 = vcpu->arch.msr | flags; + vcpu->arch.pc = to_book3s(vcpu)->hior + vec; + vcpu->arch.mmu.reset_msr(vcpu); +} + +void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +{ + unsigned int prio; + + vcpu->stat.queue_intr++; + switch (vec) { + case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break; + case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break; + case 0x300: prio = BOOK3S_IRQPRIO_DATA_STORAGE; break; + case 0x380: prio = BOOK3S_IRQPRIO_DATA_SEGMENT; break; + case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; + case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; + case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; + case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; + case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; + case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; + case 0x900: prio = BOOK3S_IRQPRIO_DECREMENTER; break; + case 0xc00: prio = BOOK3S_IRQPRIO_SYSCALL; break; + case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG; break; + case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; + case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; + default: prio = BOOK3S_IRQPRIO_MAX; break; + } + + 
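/* Only mark the interrupt as pending here; actual delivery happens later
 * in kvmppc_core_deliver_interrupts(), which scans this bitmap from the
 * lowest set bit upwards. */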
set_bit(prio, &vcpu->arch.pending_exceptions); +#ifdef EXIT_DEBUG + printk(KERN_INFO "Queueing interrupt %x\n", vec); +#endif +} + + +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_PROGRAM); +} + +void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); +} + +int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) +{ + return test_bit(BOOK3S_INTERRUPT_DECREMENTER >> 7, &vcpu->arch.pending_exceptions); +} + +void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); +} + +int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) +{ + int deliver = 1; + int vec = 0; + + switch (priority) { + case BOOK3S_IRQPRIO_DECREMENTER: + deliver = vcpu->arch.msr & MSR_EE; + vec = BOOK3S_INTERRUPT_DECREMENTER; + break; + case BOOK3S_IRQPRIO_EXTERNAL: + deliver = vcpu->arch.msr & MSR_EE; + vec = BOOK3S_INTERRUPT_EXTERNAL; + break; + case BOOK3S_IRQPRIO_SYSTEM_RESET: + vec = BOOK3S_INTERRUPT_SYSTEM_RESET; + break; + case BOOK3S_IRQPRIO_MACHINE_CHECK: + vec = BOOK3S_INTERRUPT_MACHINE_CHECK; + break; + case BOOK3S_IRQPRIO_DATA_STORAGE: + vec = BOOK3S_INTERRUPT_DATA_STORAGE; + break; + case BOOK3S_IRQPRIO_INST_STORAGE: + vec = BOOK3S_INTERRUPT_INST_STORAGE; + break; + case BOOK3S_IRQPRIO_DATA_SEGMENT: + vec = BOOK3S_INTERRUPT_DATA_SEGMENT; + break; + case BOOK3S_IRQPRIO_INST_SEGMENT: + vec = BOOK3S_INTERRUPT_INST_SEGMENT; + break; + case BOOK3S_IRQPRIO_ALIGNMENT: + vec = BOOK3S_INTERRUPT_ALIGNMENT; + break; + case BOOK3S_IRQPRIO_PROGRAM: + vec = BOOK3S_INTERRUPT_PROGRAM; + break; + case BOOK3S_IRQPRIO_VSX: + vec = BOOK3S_INTERRUPT_VSX; + break; + case BOOK3S_IRQPRIO_ALTIVEC: + vec = BOOK3S_INTERRUPT_ALTIVEC; + break; + case BOOK3S_IRQPRIO_FP_UNAVAIL: + vec = BOOK3S_INTERRUPT_FP_UNAVAIL; + break; + case BOOK3S_IRQPRIO_SYSCALL: + vec = BOOK3S_INTERRUPT_SYSCALL; + break; + case BOOK3S_IRQPRIO_DEBUG: + vec = BOOK3S_INTERRUPT_TRACE; + break; + case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: + vec = BOOK3S_INTERRUPT_PERFMON; + break; + default: + deliver = 0; + printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority); + break; + } + +#if 0 + printk(KERN_INFO "Deliver interrupt 0x%x? 
%x\n", vec, deliver); +#endif + + if (deliver) + kvmppc_inject_interrupt(vcpu, vec, 0ULL); + + return deliver; +} + +void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu) +{ + unsigned long *pending = &vcpu->arch.pending_exceptions; + unsigned int priority; + + /* XXX be more clever here - no need to mftb() on every entry */ + /* Issue DEC again if it's still active */ +#ifdef AGGRESSIVE_DEC + if (vcpu->arch.msr & MSR_EE) + if (kvmppc_get_dec(vcpu) & 0x80000000) + kvmppc_core_queue_dec(vcpu); +#endif + +#ifdef EXIT_DEBUG + if (vcpu->arch.pending_exceptions) + printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions); +#endif + priority = __ffs(*pending); + while (priority <= (sizeof(unsigned int) * 8)) { + if (kvmppc_book3s_irqprio_deliver(vcpu, priority)) { + clear_bit(priority, &vcpu->arch.pending_exceptions); + break; + } + + priority = find_next_bit(pending, + BITS_PER_BYTE * sizeof(*pending), + priority + 1); + } +} + +void kvmppc_set_pvr(struct kvm_vcpu *vcpu, u32 pvr) +{ + vcpu->arch.pvr = pvr; + if ((pvr >= 0x330000) && (pvr < 0x70330000)) { + kvmppc_mmu_book3s_64_init(vcpu); + to_book3s(vcpu)->hior = 0xfff00000; + to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; + } else { + kvmppc_mmu_book3s_32_init(vcpu); + to_book3s(vcpu)->hior = 0; + to_book3s(vcpu)->msr_mask = 0xffffffffULL; + } + + /* If we are in hypervisor level on 970, we can tell the CPU to + * treat DCBZ as 32 bytes store */ + vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; + if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && + !strcmp(cur_cpu_spec->platform, "ppc970")) + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; + +} + +/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To + * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to + * emulate 32 bytes dcbz length. + * + * The Book3s_64 inventors also realized this case and implemented a special bit + * in the HID5 register, which is a hypervisor ressource. Thus we can't use it. + * + * My approach here is to patch the dcbz instruction on executing pages. + */ +static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + bool touched = false; + hva_t hpage; + u32 *page; + int i; + + hpage = gfn_to_hva(vcpu->kvm, pte->raddr >> PAGE_SHIFT); + if (kvm_is_error_hva(hpage)) + return; + + hpage |= pte->raddr & ~PAGE_MASK; + hpage &= ~0xFFFULL; + + page = vmalloc(HW_PAGE_SIZE); + + if (copy_from_user(page, (void __user *)hpage, HW_PAGE_SIZE)) + goto out; + + for (i=0; i < HW_PAGE_SIZE / 4; i++) + if ((page[i] & 0xff0007ff) == INS_DCBZ) { + page[i] &= 0xfffffff7; // reserved instruction, so we trap + touched = true; + } + + if (touched) + copy_to_user((void __user *)hpage, page, HW_PAGE_SIZE); + +out: + vfree(page); +} + +static int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, bool data, + struct kvmppc_pte *pte) +{ + int relocated = (vcpu->arch.msr & (data ? 
MSR_DR : MSR_IR)); + int r; + + if (relocated) { + r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data); + } else { + pte->eaddr = eaddr; + pte->raddr = eaddr & 0xffffffff; + pte->vpage = eaddr >> 12; + switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + case 0: + pte->vpage |= VSID_REAL; + case MSR_DR: + pte->vpage |= VSID_REAL_DR; + case MSR_IR: + pte->vpage |= VSID_REAL_IR; + } + pte->may_read = true; + pte->may_write = true; + pte->may_execute = true; + r = 0; + } + + return r; +} + +static hva_t kvmppc_bad_hva(void) +{ + return PAGE_OFFSET; +} + +static hva_t kvmppc_pte_to_hva(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte, + bool read) +{ + hva_t hpage; + + if (read && !pte->may_read) + goto err; + + if (!read && !pte->may_write) + goto err; + + hpage = gfn_to_hva(vcpu->kvm, pte->raddr >> PAGE_SHIFT); + if (kvm_is_error_hva(hpage)) + goto err; + + return hpage | (pte->raddr & ~PAGE_MASK); +err: + return kvmppc_bad_hva(); +} + +int kvmppc_st(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr) +{ + struct kvmppc_pte pte; + hva_t hva = eaddr; + + vcpu->stat.st++; + + if (kvmppc_xlate(vcpu, eaddr, false, &pte)) + goto err; + + hva = kvmppc_pte_to_hva(vcpu, &pte, false); + if (kvm_is_error_hva(hva)) + goto err; + + if (copy_to_user((void __user *)hva, ptr, size)) { + printk(KERN_INFO "kvmppc_st at 0x%lx failed\n", hva); + goto err; + } + + return 0; + +err: + return -ENOENT; +} + +int kvmppc_ld(struct kvm_vcpu *vcpu, ulong eaddr, int size, void *ptr, + bool data) +{ + struct kvmppc_pte pte; + hva_t hva = eaddr; + + vcpu->stat.ld++; + + if (kvmppc_xlate(vcpu, eaddr, data, &pte)) + goto err; + + hva = kvmppc_pte_to_hva(vcpu, &pte, true); + if (kvm_is_error_hva(hva)) + goto err; + + if (copy_from_user(ptr, (void __user *)hva, size)) { + printk(KERN_INFO "kvmppc_ld at 0x%lx failed\n", hva); + goto err; + } + + return 0; + +err: + return -ENOENT; +} + +static int kvmppc_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + return kvm_is_visible_gfn(vcpu->kvm, gfn); +} + +int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, + ulong eaddr, int vec) +{ + bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + int r = RESUME_GUEST; + int relocated; + int page_found = 0; + struct kvmppc_pte pte; + bool is_mmio = false; + + if ( vec == BOOK3S_INTERRUPT_DATA_STORAGE ) { + relocated = (vcpu->arch.msr & MSR_DR); + } else { + relocated = (vcpu->arch.msr & MSR_IR); + } + + /* Resolve real address if translation turned on */ + if (relocated) { + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data); + } else { + pte.may_execute = true; + pte.may_read = true; + pte.may_write = true; + pte.raddr = eaddr & 0xffffffff; + pte.eaddr = eaddr; + pte.vpage = eaddr >> 12; + switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + case 0: + pte.vpage |= VSID_REAL; + case MSR_DR: + pte.vpage |= VSID_REAL_DR; + case MSR_IR: + pte.vpage |= VSID_REAL_IR; + } + } + + if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { + /* + * If we do the dcbz hack, we have to NX on every execution, + * so we can patch the executing code. This renders our guest + * NX-less. 
+ */ + pte.may_execute = !data; + } + + if (page_found == -ENOENT) { + /* Page not found in guest PTE entries */ + vcpu->arch.dear = vcpu->arch.fault_dear; + to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr; + vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + kvmppc_book3s_queue_irqprio(vcpu, vec); + } else if (page_found == -EPERM) { + /* Storage protection */ + vcpu->arch.dear = vcpu->arch.fault_dear; + to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr & ~DSISR_NOHPTE; + to_book3s(vcpu)->dsisr |= DSISR_PROTFAULT; + vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x00000000f8000000ULL); + kvmppc_book3s_queue_irqprio(vcpu, vec); + } else if (page_found == -EINVAL) { + /* Page not found in guest SLB */ + vcpu->arch.dear = vcpu->arch.fault_dear; + kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); + } else if (!is_mmio && + kvmppc_visible_gfn(vcpu, pte.raddr >> PAGE_SHIFT)) { + /* The guest's PTE is not mapped yet. Map on the host */ + kvmppc_mmu_map_page(vcpu, &pte); + if (data) + vcpu->stat.sp_storage++; + else if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + kvmppc_patch_dcbz(vcpu, &pte); + } else { + /* MMIO */ + vcpu->stat.mmio_exits++; + vcpu->arch.paddr_accessed = pte.raddr; + r = kvmppc_emulate_mmio(run, vcpu); + if ( r == RESUME_HOST_NV ) + r = RESUME_HOST; + if ( r == RESUME_GUEST_NV ) + r = RESUME_GUEST; + } + + return r; +} + +int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + int r = RESUME_HOST; + + vcpu->stat.sum_exits++; + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; +#ifdef EXIT_DEBUG + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | dec=0x%x | msr=0x%lx\n", + exit_nr, vcpu->arch.pc, vcpu->arch.fault_dear, + kvmppc_get_dec(vcpu), vcpu->arch.msr); +#elif defined (EXIT_DEBUG_SIMPLE) + if ((exit_nr != 0x900) && (exit_nr != 0x500)) + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | dar=0x%lx | msr=0x%lx\n", + exit_nr, vcpu->arch.pc, vcpu->arch.fault_dear, + vcpu->arch.msr); +#endif + kvm_resched(vcpu); + switch (exit_nr) { + case BOOK3S_INTERRUPT_INST_STORAGE: + vcpu->stat.pf_instruc++; + /* only care about PTEG not found errors, but leave NX alone */ + if (vcpu->arch.shadow_msr & 0x40000000) { + r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.pc, exit_nr); + vcpu->stat.sp_instruc++; + } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { + /* + * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, + * so we can't use the NX bit inside the guest. Let's cross our fingers, + * that no guest that needs the dcbz hack does NX. 
+ */ + kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); + } else { + vcpu->arch.msr |= (vcpu->arch.shadow_msr & 0x58000000); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_mmu_pte_flush(vcpu, vcpu->arch.pc, ~0xFFFULL); + r = RESUME_GUEST; + } + break; + case BOOK3S_INTERRUPT_DATA_STORAGE: + vcpu->stat.pf_storage++; + /* The only case we need to handle is missing shadow PTEs */ + if (vcpu->arch.fault_dsisr & DSISR_NOHPTE) { + r = kvmppc_handle_pagefault(run, vcpu, vcpu->arch.fault_dear, exit_nr); + } else { + vcpu->arch.dear = vcpu->arch.fault_dear; + to_book3s(vcpu)->dsisr = vcpu->arch.fault_dsisr; + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + kvmppc_mmu_pte_flush(vcpu, vcpu->arch.dear, ~0xFFFULL); + r = RESUME_GUEST; + } + break; + case BOOK3S_INTERRUPT_DATA_SEGMENT: + if (kvmppc_mmu_map_segment(vcpu, vcpu->arch.fault_dear) < 0) { + vcpu->arch.dear = vcpu->arch.fault_dear; + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_DATA_SEGMENT); + } + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_INST_SEGMENT: + if (kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc) < 0) { + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_INST_SEGMENT); + } + r = RESUME_GUEST; + break; + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PROGRAM: + { + enum emulation_result er; + + if (vcpu->arch.msr & MSR_PR) { +#ifdef EXIT_DEBUG + printk(KERN_INFO "Userspace triggered 0x700 exception at 0x%lx (0x%x)\n", vcpu->arch.pc, vcpu->arch.last_inst); +#endif + if ((vcpu->arch.last_inst & 0xff0007ff) != + (INS_DCBZ & 0xfffffff7)) { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + } + } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, vcpu->arch.pc, vcpu->arch.last_inst); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + default: + BUG(); + } + break; + } + case BOOK3S_INTERRUPT_SYSCALL: +#ifdef EXIT_DEBUG + printk(KERN_INFO "Syscall Nr %d\n", (int)vcpu->arch.gpr[0]); +#endif + vcpu->stat.syscall_exits++; + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + case BOOK3S_INTERRUPT_FP_UNAVAIL: + case BOOK3S_INTERRUPT_TRACE: + case BOOK3S_INTERRUPT_ALTIVEC: + case BOOK3S_INTERRUPT_VSX: + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + default: + /* Ugh - bork here! What did we get? */ + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", exit_nr, vcpu->arch.pc, vcpu->arch.shadow_msr); + r = RESUME_HOST; + BUG(); + break; + } + + + if (!(r & RESUME_HOST)) { + /* To avoid clobbering exit_reason, only check for signals if + * we aren't already exiting to userspace for some other + * reason. */ + if (signal_pending(current)) { +#ifdef EXIT_DEBUG + printk(KERN_EMERG "KVM: Going back to host\n"); +#endif + vcpu->stat.signal_exits++; + run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + } else { + /* In case an interrupt came in that was triggered + * from userspace (like DEC), we need to check what + * to inject now! 
*/ + kvmppc_core_deliver_interrupts(vcpu); + } + } + +#ifdef EXIT_DEBUG + printk(KERN_EMERG "KVM exit: vcpu=0x%p pc=0x%lx r=0x%x\n", vcpu, vcpu->arch.pc, r); +#endif + + return r; +} + +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + regs->pc = vcpu->arch.pc; + regs->cr = vcpu->arch.cr; + regs->ctr = vcpu->arch.ctr; + regs->lr = vcpu->arch.lr; + regs->xer = vcpu->arch.xer; + regs->msr = vcpu->arch.msr; + regs->srr0 = vcpu->arch.srr0; + regs->srr1 = vcpu->arch.srr1; + regs->pid = vcpu->arch.pid; + regs->sprg0 = vcpu->arch.sprg0; + regs->sprg1 = vcpu->arch.sprg1; + regs->sprg2 = vcpu->arch.sprg2; + regs->sprg3 = vcpu->arch.sprg3; + regs->sprg5 = vcpu->arch.sprg4; + regs->sprg6 = vcpu->arch.sprg5; + regs->sprg7 = vcpu->arch.sprg6; + + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + regs->gpr[i] = vcpu->arch.gpr[i]; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + vcpu->arch.pc = regs->pc; + vcpu->arch.cr = regs->cr; + vcpu->arch.ctr = regs->ctr; + vcpu->arch.lr = regs->lr; + vcpu->arch.xer = regs->xer; + kvmppc_set_msr(vcpu, regs->msr); + vcpu->arch.srr0 = regs->srr0; + vcpu->arch.srr1 = regs->srr1; + vcpu->arch.sprg0 = regs->sprg0; + vcpu->arch.sprg1 = regs->sprg1; + vcpu->arch.sprg2 = regs->sprg2; + vcpu->arch.sprg3 = regs->sprg3; + vcpu->arch.sprg5 = regs->sprg4; + vcpu->arch.sprg6 = regs->sprg5; + vcpu->arch.sprg7 = regs->sprg6; + + for (i = 0; i < ARRAY_SIZE(vcpu->arch.gpr); i++) + vcpu->arch.gpr[i] = regs->gpr[i]; + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + sregs->pvr = vcpu->arch.pvr; + return 0; +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + kvmppc_set_pvr(vcpu, sregs->pvr); + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -ENOTSUPP; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -ENOTSUPP; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return 0; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + ulong ga, ga_end; + int is_dirty = 0; + int r, n; + + down_write(&kvm->slots_lock); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. 
*/ + if (is_dirty) { + memslot = &kvm->memslots[log->slot]; + + ga = memslot->base_gfn << PAGE_SHIFT; + ga_end = ga + (memslot->npages << PAGE_SHIFT); + + kvm_for_each_vcpu(n, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + + n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; +out: + up_write(&kvm->slots_lock); + return r; +} + +int kvmppc_core_check_processor_compat(void) +{ + return 0; +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s; + struct kvm_vcpu *vcpu; + int err; + + vcpu_book3s = (struct kvmppc_vcpu_book3s *)__get_free_pages( GFP_KERNEL | __GFP_ZERO, + get_order(sizeof(struct kvmppc_vcpu_book3s))); + if (!vcpu_book3s) { + err = -ENOMEM; + goto out; + } + + vcpu = &vcpu_book3s->vcpu; + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + vcpu->arch.host_retip = kvm_return_point; + vcpu->arch.host_msr = mfmsr(); + /* default to book3s_64 (970fx) */ + vcpu->arch.pvr = 0x3C0301; + kvmppc_set_pvr(vcpu, vcpu->arch.pvr); + vcpu_book3s->slb_nr = 64; + + /* remember where some real-mode handlers are */ + vcpu->arch.trampoline_lowmem = kvmppc_trampoline_lowmem; + vcpu->arch.trampoline_enter = kvmppc_trampoline_enter; + vcpu->arch.highmem_handler = (ulong)kvmppc_handler_highmem; + + vcpu->arch.shadow_msr = MSR_USER64; + + err = __init_new_context(); + if (err < 0) + goto free_vcpu; + vcpu_book3s->context_id = err; + + vcpu_book3s->vsid_max = ((vcpu_book3s->context_id + 1) << USER_ESID_BITS) - 1; + vcpu_book3s->vsid_first = vcpu_book3s->context_id << USER_ESID_BITS; + vcpu_book3s->vsid_next = vcpu_book3s->vsid_first; + + return vcpu; + +free_vcpu: + free_pages((long)vcpu_book3s, get_order(sizeof(struct kvmppc_vcpu_book3s))); +out: + return ERR_PTR(err); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + + __destroy_context(vcpu_book3s->context_id); + kvm_vcpu_uninit(vcpu); + free_pages((long)vcpu_book3s, get_order(sizeof(struct kvmppc_vcpu_book3s))); +} + +extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); +int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int ret; + + /* No need to go into the guest when all we do is going out */ + if (signal_pending(current)) { + kvm_run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + /* XXX we get called with irq disabled - change that! */ + local_irq_enable(); + + ret = __kvmppc_vcpu_entry(kvm_run, vcpu); + + local_irq_disable(); + + return ret; +} + +static int kvmppc_book3s_init(void) +{ + return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), THIS_MODULE); +} + +static void kvmppc_book3s_exit(void) +{ + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c new file mode 100644 index 00000000000..faf99f20d99 --- /dev/null +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -0,0 +1,372 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +/* #define DEBUG_MMU */ +/* #define DEBUG_MMU_PTE */ +/* #define DEBUG_MMU_PTE_IP 0xfff14c40 */ + +#ifdef DEBUG_MMU +#define dprintk(X...) printk(KERN_INFO X) +#else +#define dprintk(X...) do { } while(0) +#endif + +#ifdef DEBUG_PTE +#define dprintk_pte(X...) printk(KERN_INFO X) +#else +#define dprintk_pte(X...) do { } while(0) +#endif + +#define PTEG_FLAG_ACCESSED 0x00000100 +#define PTEG_FLAG_DIRTY 0x00000080 + +static inline bool check_debug_ip(struct kvm_vcpu *vcpu) +{ +#ifdef DEBUG_MMU_PTE_IP + return vcpu->arch.pc == DEBUG_MMU_PTE_IP; +#else + return true; +#endif +} + +static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data); + +static struct kvmppc_sr *find_sr(struct kvmppc_vcpu_book3s *vcpu_book3s, gva_t eaddr) +{ + return &vcpu_book3s->sr[(eaddr >> 28) & 0xf]; +} + +static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, + bool data) +{ + struct kvmppc_sr *sre = find_sr(to_book3s(vcpu), eaddr); + struct kvmppc_pte pte; + + if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data)) + return pte.vpage; + + return (((u64)eaddr >> 12) & 0xffff) | (((u64)sre->vsid) << 16); +} + +static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) +{ + kvmppc_set_msr(vcpu, 0); +} + +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvmppc_vcpu_book3s *vcpu_book3s, + struct kvmppc_sr *sre, gva_t eaddr, + bool primary) +{ + u32 page, hash, pteg, htabmask; + hva_t r; + + page = (eaddr & 0x0FFFFFFF) >> 12; + htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0; + + hash = ((sre->vsid ^ page) << 6); + if (!primary) + hash = ~hash; + hash &= htabmask; + + pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; + + dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", + vcpu_book3s->vcpu.arch.pc, eaddr, vcpu_book3s->sdr1, pteg, + sre->vsid); + + r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + if (kvm_is_error_hva(r)) + return r; + return r | (pteg & ~PAGE_MASK); +} + +static u32 kvmppc_mmu_book3s_32_get_ptem(struct kvmppc_sr *sre, gva_t eaddr, + bool primary) +{ + return ((eaddr & 0x0fffffff) >> 22) | (sre->vsid << 7) | + (primary ? 0 : 0x40) | 0x80000000; +} + +static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_bat *bat; + int i; + + for (i = 0; i < 8; i++) { + if (data) + bat = &vcpu_book3s->dbat[i]; + else + bat = &vcpu_book3s->ibat[i]; + + if (vcpu->arch.msr & MSR_PR) { + if (!bat->vp) + continue; + } else { + if (!bat->vs) + continue; + } + + if (check_debug_ip(vcpu)) + { + dprintk_pte("%cBAT %02d: 0x%lx - 0x%x (0x%x)\n", + data ? 
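/* 'd' marks a DBAT, 'i' an IBAT */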
'd' : 'i', i, eaddr, bat->bepi, + bat->bepi_mask); + } + if ((eaddr & bat->bepi_mask) == bat->bepi) { + pte->raddr = bat->brpn | (eaddr & ~bat->bepi_mask); + pte->vpage = (eaddr >> 12) | VSID_BAT; + pte->may_read = bat->pp; + pte->may_write = bat->pp > 1; + pte->may_execute = true; + if (!pte->may_read) { + printk(KERN_INFO "BAT is not readable!\n"); + continue; + } + if (!pte->may_write) { + /* let's treat r/o BATs as not-readable for now */ + dprintk_pte("BAT is read-only!\n"); + continue; + } + + return 0; + } + } + + return -ENOENT; +} + +static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, + bool primary) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_sr *sre; + hva_t ptegp; + u32 pteg[16]; + u64 ptem = 0; + int i; + int found = 0; + + sre = find_sr(vcpu_book3s, eaddr); + + dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28, + sre->vsid, sre->raw); + + pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); + + ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu_book3s, sre, eaddr, primary); + if (kvm_is_error_hva(ptegp)) { + printk(KERN_INFO "KVM: Invalid PTEG!\n"); + goto no_page_found; + } + + ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary); + + if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { + printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp); + goto no_page_found; + } + + for (i=0; i<16; i+=2) { + if (ptem == pteg[i]) { + u8 pp; + + pte->raddr = (pteg[i+1] & ~(0xFFFULL)) | (eaddr & 0xFFF); + pp = pteg[i+1] & 3; + + if ((sre->Kp && (vcpu->arch.msr & MSR_PR)) || + (sre->Ks && !(vcpu->arch.msr & MSR_PR))) + pp |= 4; + + pte->may_write = false; + pte->may_read = false; + pte->may_execute = true; + switch (pp) { + case 0: + case 1: + case 2: + case 6: + pte->may_write = true; + case 3: + case 5: + case 7: + pte->may_read = true; + break; + } + + if ( !pte->may_read ) + continue; + + dprintk_pte("MMU: Found PTE -> %x %x - %x\n", + pteg[i], pteg[i+1], pp); + found = 1; + break; + } + } + + /* Update PTE C and A bits, so the guest's swapper knows we used the + page */ + if (found) { + u32 oldpte = pteg[i+1]; + + if (pte->may_read) + pteg[i+1] |= PTEG_FLAG_ACCESSED; + if (pte->may_write) + pteg[i+1] |= PTEG_FLAG_DIRTY; + else + dprintk_pte("KVM: Mapping read-only page!\n"); + + /* Write back into the PTEG */ + if (pteg[i+1] != oldpte) + copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); + + return 0; + } + +no_page_found: + + if (check_debug_ip(vcpu)) { + dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n", + to_book3s(vcpu)->sdr1, ptegp); + for (i=0; i<16; i+=2) { + dprintk_pte(" %02d: 0x%x - 0x%x (0x%llx)\n", + i, pteg[i], pteg[i+1], ptem); + } + } + + return -ENOENT; +} + +static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data) +{ + int r; + + pte->eaddr = eaddr; + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data); + if (r < 0) + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, true); + if (r < 0) + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, data, false); + + return r; +} + + +static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) +{ + return to_book3s(vcpu)->sr[srnum].raw; +} + +static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, + ulong value) +{ + struct kvmppc_sr *sre; + + sre = &to_book3s(vcpu)->sr[srnum]; + + /* Flush any left-over shadows from the previous SR */ + + /* XXX Not necessary? 
*/ + /* kvmppc_mmu_pte_flush(vcpu, ((u64)sre->vsid) << 28, 0xf0000000ULL); */ + + /* And then put in the new SR */ + sre->raw = value; + sre->vsid = (value & 0x0fffffff); + sre->Ks = (value & 0x40000000) ? true : false; + sre->Kp = (value & 0x20000000) ? true : false; + sre->nx = (value & 0x10000000) ? true : false; + + /* Map the new segment */ + kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); +} + +static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) +{ + kvmppc_mmu_pte_flush(vcpu, ea, ~0xFFFULL); +} + +static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, u64 esid, + u64 *vsid) +{ + /* In case we only have one of MSR_IR or MSR_DR set, let's put + that in the real-mode context (and hope RM doesn't access + high memory) */ + switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + case 0: + *vsid = (VSID_REAL >> 16) | esid; + break; + case MSR_IR: + *vsid = (VSID_REAL_IR >> 16) | esid; + break; + case MSR_DR: + *vsid = (VSID_REAL_DR >> 16) | esid; + break; + case MSR_DR|MSR_IR: + { + ulong ea; + ea = esid << SID_SHIFT; + *vsid = find_sr(to_book3s(vcpu), ea)->vsid; + break; + } + default: + BUG(); + } + + return 0; +} + +static bool kvmppc_mmu_book3s_32_is_dcbz32(struct kvm_vcpu *vcpu) +{ + return true; +} + + +void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_mmu *mmu = &vcpu->arch.mmu; + + mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin; + mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin; + mmu->xlate = kvmppc_mmu_book3s_32_xlate; + mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr; + mmu->tlbie = kvmppc_mmu_book3s_32_tlbie; + mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid; + mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp; + mmu->is_dcbz32 = kvmppc_mmu_book3s_32_is_dcbz32; + + mmu->slbmte = NULL; + mmu->slbmfee = NULL; + mmu->slbmfev = NULL; + mmu->slbie = NULL; + mmu->slbia = NULL; +} diff --git a/arch/powerpc/kvm/book3s_64_emulate.c b/arch/powerpc/kvm/book3s_64_emulate.c new file mode 100644 index 00000000000..c343e67306e --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_emulate.c @@ -0,0 +1,337 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/kvm_ppc.h> +#include <asm/disassemble.h> +#include <asm/kvm_book3s.h> +#include <asm/reg.h> + +#define OP_19_XOP_RFID 18 +#define OP_19_XOP_RFI 50 + +#define OP_31_XOP_MFMSR 83 +#define OP_31_XOP_MTMSR 146 +#define OP_31_XOP_MTMSRD 178 +#define OP_31_XOP_MTSRIN 242 +#define OP_31_XOP_TLBIEL 274 +#define OP_31_XOP_TLBIE 306 +#define OP_31_XOP_SLBMTE 402 +#define OP_31_XOP_SLBIE 434 +#define OP_31_XOP_SLBIA 498 +#define OP_31_XOP_MFSRIN 659 +#define OP_31_XOP_SLBMFEV 851 +#define OP_31_XOP_EIOIO 854 +#define OP_31_XOP_SLBMFEE 915 + +/* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */ +#define OP_31_XOP_DCBZ 1010 + +int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_op(inst)) { + case 19: + switch (get_xop(inst)) { + case OP_19_XOP_RFID: + case OP_19_XOP_RFI: + vcpu->arch.pc = vcpu->arch.srr0; + kvmppc_set_msr(vcpu, vcpu->arch.srr1); + *advance = 0; + break; + + default: + emulated = EMULATE_FAIL; + break; + } + break; + case 31: + switch (get_xop(inst)) { + case OP_31_XOP_MFMSR: + vcpu->arch.gpr[get_rt(inst)] = vcpu->arch.msr; + break; + case OP_31_XOP_MTMSRD: + { + ulong rs = vcpu->arch.gpr[get_rs(inst)]; + if (inst & 0x10000) { + vcpu->arch.msr &= ~(MSR_RI | MSR_EE); + vcpu->arch.msr |= rs & (MSR_RI | MSR_EE); + } else + kvmppc_set_msr(vcpu, rs); + break; + } + case OP_31_XOP_MTMSR: + kvmppc_set_msr(vcpu, vcpu->arch.gpr[get_rs(inst)]); + break; + case OP_31_XOP_MFSRIN: + { + int srnum; + + srnum = (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf; + if (vcpu->arch.mmu.mfsrin) { + u32 sr; + sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); + vcpu->arch.gpr[get_rt(inst)] = sr; + } + break; + } + case OP_31_XOP_MTSRIN: + vcpu->arch.mmu.mtsrin(vcpu, + (vcpu->arch.gpr[get_rb(inst)] >> 28) & 0xf, + vcpu->arch.gpr[get_rs(inst)]); + break; + case OP_31_XOP_TLBIE: + case OP_31_XOP_TLBIEL: + { + bool large = (inst & 0x00200000) ? 
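/* instruction bit 0x00200000 is tlbie's L field: non-zero requests a large-page invalidation */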
true : false; + ulong addr = vcpu->arch.gpr[get_rb(inst)]; + vcpu->arch.mmu.tlbie(vcpu, addr, large); + break; + } + case OP_31_XOP_EIOIO: + break; + case OP_31_XOP_SLBMTE: + if (!vcpu->arch.mmu.slbmte) + return EMULATE_FAIL; + + vcpu->arch.mmu.slbmte(vcpu, vcpu->arch.gpr[get_rs(inst)], + vcpu->arch.gpr[get_rb(inst)]); + break; + case OP_31_XOP_SLBIE: + if (!vcpu->arch.mmu.slbie) + return EMULATE_FAIL; + + vcpu->arch.mmu.slbie(vcpu, vcpu->arch.gpr[get_rb(inst)]); + break; + case OP_31_XOP_SLBIA: + if (!vcpu->arch.mmu.slbia) + return EMULATE_FAIL; + + vcpu->arch.mmu.slbia(vcpu); + break; + case OP_31_XOP_SLBMFEE: + if (!vcpu->arch.mmu.slbmfee) { + emulated = EMULATE_FAIL; + } else { + ulong t, rb; + + rb = vcpu->arch.gpr[get_rb(inst)]; + t = vcpu->arch.mmu.slbmfee(vcpu, rb); + vcpu->arch.gpr[get_rt(inst)] = t; + } + break; + case OP_31_XOP_SLBMFEV: + if (!vcpu->arch.mmu.slbmfev) { + emulated = EMULATE_FAIL; + } else { + ulong t, rb; + + rb = vcpu->arch.gpr[get_rb(inst)]; + t = vcpu->arch.mmu.slbmfev(vcpu, rb); + vcpu->arch.gpr[get_rt(inst)] = t; + } + break; + case OP_31_XOP_DCBZ: + { + ulong rb = vcpu->arch.gpr[get_rb(inst)]; + ulong ra = 0; + ulong addr; + u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (get_ra(inst)) + ra = vcpu->arch.gpr[get_ra(inst)]; + + addr = (ra + rb) & ~31ULL; + if (!(vcpu->arch.msr & MSR_SF)) + addr &= 0xffffffff; + + if (kvmppc_st(vcpu, addr, 32, zeros)) { + vcpu->arch.dear = addr; + vcpu->arch.fault_dear = addr; + to_book3s(vcpu)->dsisr = DSISR_PROTFAULT | + DSISR_ISSTORE; + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_DATA_STORAGE); + kvmppc_mmu_pte_flush(vcpu, addr, ~0xFFFULL); + } + + break; + } + default: + emulated = EMULATE_FAIL; + } + break; + default: + emulated = EMULATE_FAIL; + } + + return emulated; +} + +static void kvmppc_write_bat(struct kvm_vcpu *vcpu, int sprn, u64 val) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_bat *bat; + + switch (sprn) { + case SPRN_IBAT0U ... SPRN_IBAT3L: + bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2]; + break; + case SPRN_IBAT4U ... SPRN_IBAT7L: + bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT4U) / 2]; + break; + case SPRN_DBAT0U ... SPRN_DBAT3L: + bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2]; + break; + case SPRN_DBAT4U ... SPRN_DBAT7L: + bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT4U) / 2]; + break; + default: + BUG(); + } + + if (!(sprn % 2)) { + /* Upper BAT */ + u32 bl = (val >> 2) & 0x7ff; + bat->bepi_mask = (~bl << 17); + bat->bepi = val & 0xfffe0000; + bat->vs = (val & 2) ? 1 : 0; + bat->vp = (val & 1) ? 1 : 0; + } else { + /* Lower BAT */ + bat->brpn = val & 0xfffe0000; + bat->wimg = (val >> 3) & 0xf; + bat->pp = val & 3; + } +} + +int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_SDR1: + to_book3s(vcpu)->sdr1 = vcpu->arch.gpr[rs]; + break; + case SPRN_DSISR: + to_book3s(vcpu)->dsisr = vcpu->arch.gpr[rs]; + break; + case SPRN_DAR: + vcpu->arch.dear = vcpu->arch.gpr[rs]; + break; + case SPRN_HIOR: + to_book3s(vcpu)->hior = vcpu->arch.gpr[rs]; + break; + case SPRN_IBAT0U ... SPRN_IBAT3L: + case SPRN_IBAT4U ... SPRN_IBAT7L: + case SPRN_DBAT0U ... SPRN_DBAT3L: + case SPRN_DBAT4U ... 
SPRN_DBAT7L: + kvmppc_write_bat(vcpu, sprn, vcpu->arch.gpr[rs]); + /* BAT writes happen so rarely that we're ok to flush + * everything here */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + break; + case SPRN_HID0: + to_book3s(vcpu)->hid[0] = vcpu->arch.gpr[rs]; + break; + case SPRN_HID1: + to_book3s(vcpu)->hid[1] = vcpu->arch.gpr[rs]; + break; + case SPRN_HID2: + to_book3s(vcpu)->hid[2] = vcpu->arch.gpr[rs]; + break; + case SPRN_HID4: + to_book3s(vcpu)->hid[4] = vcpu->arch.gpr[rs]; + break; + case SPRN_HID5: + to_book3s(vcpu)->hid[5] = vcpu->arch.gpr[rs]; + /* guest HID5 set can change is_dcbz32 */ + if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (mfmsr() & MSR_HV)) + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; + break; + case SPRN_ICTC: + case SPRN_THRM1: + case SPRN_THRM2: + case SPRN_THRM3: + case SPRN_CTRLF: + case SPRN_CTRLT: + break; + default: + printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); +#ifndef DEBUG_SPR + emulated = EMULATE_FAIL; +#endif + break; + } + + return emulated; +} + +int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_SDR1: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->sdr1; + break; + case SPRN_DSISR: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->dsisr; + break; + case SPRN_DAR: + vcpu->arch.gpr[rt] = vcpu->arch.dear; + break; + case SPRN_HIOR: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->hior; + break; + case SPRN_HID0: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[0]; + break; + case SPRN_HID1: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[1]; + break; + case SPRN_HID2: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[2]; + break; + case SPRN_HID4: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[4]; + break; + case SPRN_HID5: + vcpu->arch.gpr[rt] = to_book3s(vcpu)->hid[5]; + break; + case SPRN_THRM1: + case SPRN_THRM2: + case SPRN_THRM3: + case SPRN_CTRLF: + case SPRN_CTRLT: + vcpu->arch.gpr[rt] = 0; + break; + default: + printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); +#ifndef DEBUG_SPR + emulated = EMULATE_FAIL; +#endif + break; + } + + return emulated; +} + diff --git a/arch/powerpc/kvm/book3s_64_exports.c b/arch/powerpc/kvm/book3s_64_exports.c new file mode 100644 index 00000000000..5b2db38ed86 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_exports.c @@ -0,0 +1,24 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <linux/module.h> +#include <asm/kvm_book3s.h> + +EXPORT_SYMBOL_GPL(kvmppc_trampoline_enter); +EXPORT_SYMBOL_GPL(kvmppc_trampoline_lowmem); diff --git a/arch/powerpc/kvm/book3s_64_interrupts.S b/arch/powerpc/kvm/book3s_64_interrupts.S new file mode 100644 index 00000000000..7b55d8094c8 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_interrupts.S @@ -0,0 +1,392 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> + +#define KVMPPC_HANDLE_EXIT .kvmppc_handle_exit +#define ULONG_SIZE 8 +#define VCPU_GPR(n) (VCPU_GPRS + (n * ULONG_SIZE)) + +.macro mfpaca tmp_reg, src_reg, offset, vcpu_reg + ld \tmp_reg, (PACA_EXMC+\offset)(r13) + std \tmp_reg, VCPU_GPR(\src_reg)(\vcpu_reg) +.endm + +.macro DISABLE_INTERRUPTS + mfmsr r0 + rldicl r0,r0,48,1 + rotldi r0,r0,16 + mtmsrd r0,1 +.endm + +/***************************************************************************** + * * + * Guest entry / exit code that is in kernel module memory (highmem) * + * * + ****************************************************************************/ + +/* Registers: + * r3: kvm_run pointer + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_entry) + +kvm_start_entry: + /* Write correct stack frame */ + mflr r0 + std r0,16(r1) + + /* Save host state to the stack */ + stdu r1, -SWITCH_FRAME_SIZE(r1) + + /* Save r3 (kvm_run) and r4 (vcpu) */ + SAVE_2GPRS(3, r1) + + /* Save non-volatile registers (r14 - r31) */ + SAVE_NVGPRS(r1) + + /* Save LR */ + mflr r14 + std r14, _LINK(r1) + +/* XXX optimize non-volatile loading away */ +kvm_start_lightweight: + + DISABLE_INTERRUPTS + + /* Save R1/R2 in the PACA */ + std r1, PACAR1(r13) + std r2, (PACA_EXMC+EX_SRR0)(r13) + ld r3, VCPU_HIGHMEM_HANDLER(r4) + std r3, PACASAVEDMSR(r13) + + /* Load non-volatile guest state from the vcpu */ + ld r14, VCPU_GPR(r14)(r4) + ld r15, VCPU_GPR(r15)(r4) + ld r16, VCPU_GPR(r16)(r4) + ld r17, VCPU_GPR(r17)(r4) + ld r18, VCPU_GPR(r18)(r4) + ld r19, VCPU_GPR(r19)(r4) + ld r20, VCPU_GPR(r20)(r4) + ld r21, VCPU_GPR(r21)(r4) + ld r22, VCPU_GPR(r22)(r4) + ld r23, VCPU_GPR(r23)(r4) + ld r24, VCPU_GPR(r24)(r4) + ld r25, VCPU_GPR(r25)(r4) + ld r26, VCPU_GPR(r26)(r4) + ld r27, VCPU_GPR(r27)(r4) + ld r28, VCPU_GPR(r28)(r4) + ld r29, VCPU_GPR(r29)(r4) + ld r30, VCPU_GPR(r30)(r4) + ld r31, VCPU_GPR(r31)(r4) + + ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */ + ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ + + ld r3, VCPU_TRAMPOLINE_ENTER(r4) + mtsrr0 r3 + + LOAD_REG_IMMEDIATE(r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + mtsrr1 r3 + + /* Load guest state in the respective registers */ + lwz r3, 
VCPU_CR(r4) /* r3 = vcpu->arch.cr */ + stw r3, (PACA_EXMC + EX_CCR)(r13) + + ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */ + mtctr r3 /* CTR = r3 */ + + ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */ + mtlr r3 /* LR = r3 */ + + ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */ + std r3, (PACA_EXMC + EX_R3)(r13) + + /* Some guests may need to have dcbz set to 32 byte length. + * + * Usually we ensure that by patching the guest's instructions + * to trap on dcbz and emulate it in the hypervisor. + * + * If we can, we should tell the CPU to use 32 byte dcbz though, + * because that's a lot faster. + */ + + ld r3, VCPU_HFLAGS(r4) + rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ + beq no_dcbz32_on + + mfspr r3,SPRN_HID5 + ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */ + mtspr SPRN_HID5,r3 + +no_dcbz32_on: + /* Load guest GPRs */ + + ld r3, VCPU_GPR(r9)(r4) + std r3, (PACA_EXMC + EX_R9)(r13) + ld r3, VCPU_GPR(r10)(r4) + std r3, (PACA_EXMC + EX_R10)(r13) + ld r3, VCPU_GPR(r11)(r4) + std r3, (PACA_EXMC + EX_R11)(r13) + ld r3, VCPU_GPR(r12)(r4) + std r3, (PACA_EXMC + EX_R12)(r13) + ld r3, VCPU_GPR(r13)(r4) + std r3, (PACA_EXMC + EX_R13)(r13) + + ld r0, VCPU_GPR(r0)(r4) + ld r1, VCPU_GPR(r1)(r4) + ld r2, VCPU_GPR(r2)(r4) + ld r3, VCPU_GPR(r3)(r4) + ld r5, VCPU_GPR(r5)(r4) + ld r6, VCPU_GPR(r6)(r4) + ld r7, VCPU_GPR(r7)(r4) + ld r8, VCPU_GPR(r8)(r4) + ld r4, VCPU_GPR(r4)(r4) + + /* This sets the Magic value for the trampoline */ + + li r11, 1 + stb r11, PACA_KVM_IN_GUEST(r13) + + /* Jump to SLB patching handlder and into our guest */ + RFI + +/* + * This is the handler in module memory. It gets jumped at from the + * lowmem trampoline code, so it's basically the guest exit code. + * + */ + +.global kvmppc_handler_highmem +kvmppc_handler_highmem: + + /* + * Register usage at this point: + * + * R00 = guest R13 + * R01 = host R1 + * R02 = host R2 + * R10 = guest PC + * R11 = guest MSR + * R12 = exit handler id + * R13 = PACA + * PACA.exmc.R9 = guest R1 + * PACA.exmc.R10 = guest R10 + * PACA.exmc.R11 = guest R11 + * PACA.exmc.R12 = guest R12 + * PACA.exmc.R13 = guest R2 + * PACA.exmc.DAR = guest DAR + * PACA.exmc.DSISR = guest DSISR + * PACA.exmc.LR = guest instruction + * PACA.exmc.CCR = guest CR + * PACA.exmc.SRR0 = guest R0 + * + */ + + std r3, (PACA_EXMC+EX_R3)(r13) + + /* save the exit id in R3 */ + mr r3, r12 + + /* R12 = vcpu */ + ld r12, GPR4(r1) + + /* Now save the guest state */ + + std r0, VCPU_GPR(r13)(r12) + std r4, VCPU_GPR(r4)(r12) + std r5, VCPU_GPR(r5)(r12) + std r6, VCPU_GPR(r6)(r12) + std r7, VCPU_GPR(r7)(r12) + std r8, VCPU_GPR(r8)(r12) + std r9, VCPU_GPR(r9)(r12) + + /* get registers from PACA */ + mfpaca r5, r0, EX_SRR0, r12 + mfpaca r5, r3, EX_R3, r12 + mfpaca r5, r1, EX_R9, r12 + mfpaca r5, r10, EX_R10, r12 + mfpaca r5, r11, EX_R11, r12 + mfpaca r5, r12, EX_R12, r12 + mfpaca r5, r2, EX_R13, r12 + + lwz r5, (PACA_EXMC+EX_LR)(r13) + stw r5, VCPU_LAST_INST(r12) + + lwz r5, (PACA_EXMC+EX_CCR)(r13) + stw r5, VCPU_CR(r12) + + ld r5, VCPU_HFLAGS(r12) + rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) == 0) */ + beq no_dcbz32_off + + mfspr r5,SPRN_HID5 + rldimi r5,r5,6,56 + mtspr SPRN_HID5,r5 + +no_dcbz32_off: + + /* XXX maybe skip on lightweight? 
*/ + std r14, VCPU_GPR(r14)(r12) + std r15, VCPU_GPR(r15)(r12) + std r16, VCPU_GPR(r16)(r12) + std r17, VCPU_GPR(r17)(r12) + std r18, VCPU_GPR(r18)(r12) + std r19, VCPU_GPR(r19)(r12) + std r20, VCPU_GPR(r20)(r12) + std r21, VCPU_GPR(r21)(r12) + std r22, VCPU_GPR(r22)(r12) + std r23, VCPU_GPR(r23)(r12) + std r24, VCPU_GPR(r24)(r12) + std r25, VCPU_GPR(r25)(r12) + std r26, VCPU_GPR(r26)(r12) + std r27, VCPU_GPR(r27)(r12) + std r28, VCPU_GPR(r28)(r12) + std r29, VCPU_GPR(r29)(r12) + std r30, VCPU_GPR(r30)(r12) + std r31, VCPU_GPR(r31)(r12) + + /* Restore non-volatile host registers (r14 - r31) */ + REST_NVGPRS(r1) + + /* Save guest PC (R10) */ + std r10, VCPU_PC(r12) + + /* Save guest msr (R11) */ + std r11, VCPU_SHADOW_MSR(r12) + + /* Save guest CTR (in R12) */ + mfctr r5 + std r5, VCPU_CTR(r12) + + /* Save guest LR */ + mflr r5 + std r5, VCPU_LR(r12) + + /* Save guest XER */ + mfxer r5 + std r5, VCPU_XER(r12) + + /* Save guest DAR */ + ld r5, (PACA_EXMC+EX_DAR)(r13) + std r5, VCPU_FAULT_DEAR(r12) + + /* Save guest DSISR */ + lwz r5, (PACA_EXMC+EX_DSISR)(r13) + std r5, VCPU_FAULT_DSISR(r12) + + /* Restore host msr -> SRR1 */ + ld r7, VCPU_HOST_MSR(r12) + mtsrr1 r7 + + /* Restore host IP -> SRR0 */ + ld r6, VCPU_HOST_RETIP(r12) + mtsrr0 r6 + + /* + * For some interrupts, we need to call the real Linux + * handler, so it can do work for us. This has to happen + * as if the interrupt arrived from the kernel though, + * so let's fake it here where most state is restored. + * + * Call Linux for hardware interrupts/decrementer + * r3 = address of interrupt handler (exit reason) + */ + + cmpwi r3, BOOK3S_INTERRUPT_EXTERNAL + beq call_linux_handler + cmpwi r3, BOOK3S_INTERRUPT_DECREMENTER + beq call_linux_handler + + /* Back to Interruptable Mode! (goto kvm_return_point) */ + RFI + +call_linux_handler: + + /* + * If we land here we need to jump back to the handler we + * came from. + * + * We have a page that we can access from real mode, so let's + * jump back to that and use it as a trampoline to get back into the + * interrupt handler! + * + * R3 still contains the exit code, + * R6 VCPU_HOST_RETIP and + * R7 VCPU_HOST_MSR + */ + + mtlr r3 + + ld r5, VCPU_TRAMPOLINE_LOWMEM(r12) + mtsrr0 r5 + LOAD_REG_IMMEDIATE(r5, MSR_KERNEL & ~(MSR_IR | MSR_DR)) + mtsrr1 r5 + + RFI + +.global kvm_return_point +kvm_return_point: + + /* Jump back to lightweight entry if we're supposed to */ + /* go back into the guest */ + mr r5, r3 + /* Restore r3 (kvm_run) and r4 (vcpu) */ + REST_2GPRS(3, r1) + bl KVMPPC_HANDLE_EXIT + +#if 0 /* XXX get lightweight exits back */ + cmpwi r3, RESUME_GUEST + bne kvm_exit_heavyweight + + /* put VCPU and KVM_RUN back into place and roll again! */ + REST_2GPRS(3, r1) + b kvm_start_lightweight + +kvm_exit_heavyweight: + /* Restore non-volatile host registers */ + ld r14, _LINK(r1) + mtlr r14 + REST_NVGPRS(r1) + + addi r1, r1, SWITCH_FRAME_SIZE +#else + ld r4, _LINK(r1) + mtlr r4 + + cmpwi r3, RESUME_GUEST + bne kvm_exit_heavyweight + + REST_2GPRS(3, r1) + + addi r1, r1, SWITCH_FRAME_SIZE + + b kvm_start_entry + +kvm_exit_heavyweight: + + addi r1, r1, SWITCH_FRAME_SIZE +#endif + + blr diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c new file mode 100644 index 00000000000..a31f9c677d2 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -0,0 +1,476 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +/* #define DEBUG_MMU */ + +#ifdef DEBUG_MMU +#define dprintk(X...) printk(KERN_INFO X) +#else +#define dprintk(X...) do { } while(0) +#endif + +static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) +{ + kvmppc_set_msr(vcpu, MSR_SF); +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( + struct kvmppc_vcpu_book3s *vcpu_book3s, + gva_t eaddr) +{ + int i; + u64 esid = GET_ESID(eaddr); + u64 esid_1t = GET_ESID_1T(eaddr); + + for (i = 0; i < vcpu_book3s->slb_nr; i++) { + u64 cmp_esid = esid; + + if (!vcpu_book3s->slb[i].valid) + continue; + + if (vcpu_book3s->slb[i].large) + cmp_esid = esid_1t; + + if (vcpu_book3s->slb[i].esid == cmp_esid) + return &vcpu_book3s->slb[i]; + } + + dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n", + eaddr, esid, esid_1t); + for (i = 0; i < vcpu_book3s->slb_nr; i++) { + if (vcpu_book3s->slb[i].vsid) + dprintk(" %d: %c%c %llx %llx\n", i, + vcpu_book3s->slb[i].valid ? 'v' : ' ', + vcpu_book3s->slb[i].large ? 'l' : ' ', + vcpu_book3s->slb[i].esid, + vcpu_book3s->slb[i].vsid); + } + + return NULL; +} + +static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, + bool data) +{ + struct kvmppc_slb *slb; + + slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), eaddr); + if (!slb) + return 0; + + if (slb->large) + return (((u64)eaddr >> 12) & 0xfffffff) | + (((u64)slb->vsid) << 28); + + return (((u64)eaddr >> 12) & 0xffff) | (((u64)slb->vsid) << 16); +} + +static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) +{ + return slbe->large ? 
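/* log2 of the page size: 24 (16MB) for large pages, 12 (4kB) otherwise */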
24 : 12; +} + +static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) +{ + int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); + return ((eaddr & 0xfffffff) >> p); +} + +static hva_t kvmppc_mmu_book3s_64_get_pteg( + struct kvmppc_vcpu_book3s *vcpu_book3s, + struct kvmppc_slb *slbe, gva_t eaddr, + bool second) +{ + u64 hash, pteg, htabsize; + u32 page; + hva_t r; + + page = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); + htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); + + hash = slbe->vsid ^ page; + if (second) + hash = ~hash; + hash &= ((1ULL << 39ULL) - 1ULL); + hash &= htabsize; + hash <<= 7ULL; + + pteg = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; + pteg |= hash; + + dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n", + page, vcpu_book3s->sdr1, pteg, slbe->vsid); + + r = gfn_to_hva(vcpu_book3s->vcpu.kvm, pteg >> PAGE_SHIFT); + if (kvm_is_error_hva(r)) + return r; + return r | (pteg & ~PAGE_MASK); +} + +static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) +{ + int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); + u64 avpn; + + avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); + avpn |= slbe->vsid << (28 - p); + + if (p < 24) + avpn >>= ((80 - p) - 56) - 8; + else + avpn <<= 8; + + return avpn; +} + +static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_slb *slbe; + hva_t ptegp; + u64 pteg[16]; + u64 avpn = 0; + int i; + u8 key = 0; + bool found = false; + bool perm_err = false; + int second = 0; + + slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, eaddr); + if (!slbe) + goto no_seg_found; + +do_second: + ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu_book3s, slbe, eaddr, second); + if (kvm_is_error_hva(ptegp)) + goto no_page_found; + + avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); + + if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { + printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); + goto no_page_found; + } + + if ((vcpu->arch.msr & MSR_PR) && slbe->Kp) + key = 4; + else if (!(vcpu->arch.msr & MSR_PR) && slbe->Ks) + key = 4; + + for (i=0; i<16; i+=2) { + u64 v = pteg[i]; + u64 r = pteg[i+1]; + + /* Valid check */ + if (!(v & HPTE_V_VALID)) + continue; + /* Hash check */ + if ((v & HPTE_V_SECONDARY) != second) + continue; + + /* AVPN compare */ + if (HPTE_V_AVPN_VAL(avpn) == HPTE_V_AVPN_VAL(v)) { + u8 pp = (r & HPTE_R_PP) | key; + int eaddr_mask = 0xFFF; + + gpte->eaddr = eaddr; + gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, + eaddr, + data); + if (slbe->large) + eaddr_mask = 0xFFFFFF; + gpte->raddr = (r & HPTE_R_RPN) | (eaddr & eaddr_mask); + gpte->may_execute = ((r & HPTE_R_N) ? 
false : true); + gpte->may_read = false; + gpte->may_write = false; + + switch (pp) { + case 0: + case 1: + case 2: + case 6: + gpte->may_write = true; + /* fall through */ + case 3: + case 5: + case 7: + gpte->may_read = true; + break; + } + + if (!gpte->may_read) { + perm_err = true; + continue; + } + + dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " + "-> 0x%llx\n", + eaddr, avpn, gpte->vpage, gpte->raddr); + found = true; + break; + } + } + + /* Update PTE R and C bits, so the guest's swapper knows we used the + * page */ + if (found) { + u32 oldr = pteg[i+1]; + + if (gpte->may_read) { + /* Set the accessed flag */ + pteg[i+1] |= HPTE_R_R; + } + if (gpte->may_write) { + /* Set the dirty flag */ + pteg[i+1] |= HPTE_R_C; + } else { + dprintk("KVM: Mapping read-only page!\n"); + } + + /* Write back into the PTEG */ + if (pteg[i+1] != oldr) + copy_to_user((void __user *)ptegp, pteg, sizeof(pteg)); + + return 0; + } else { + dprintk("KVM MMU: No PTE found (ea=0x%lx sdr1=0x%llx " + "ptegp=0x%lx)\n", + eaddr, to_book3s(vcpu)->sdr1, ptegp); + for (i = 0; i < 16; i += 2) + dprintk(" %02d: 0x%llx - 0x%llx (0x%llx)\n", + i, pteg[i], pteg[i+1], avpn); + + if (!second) { + second = HPTE_V_SECONDARY; + goto do_second; + } + } + + +no_page_found: + + + if (perm_err) + return -EPERM; + + return -ENOENT; + +no_seg_found: + + dprintk("KVM MMU: Trigger segment fault\n"); + return -EINVAL; +} + +static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s; + u64 esid, esid_1t; + int slb_nr; + struct kvmppc_slb *slbe; + + dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb); + + vcpu_book3s = to_book3s(vcpu); + + esid = GET_ESID(rb); + esid_1t = GET_ESID_1T(rb); + slb_nr = rb & 0xfff; + + if (slb_nr > vcpu_book3s->slb_nr) + return; + + slbe = &vcpu_book3s->slb[slb_nr]; + + slbe->large = (rs & SLB_VSID_L) ? 1 : 0; + slbe->esid = slbe->large ? esid_1t : esid; + slbe->vsid = rs >> 12; + slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; + slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; + slbe->Kp = (rs & SLB_VSID_KP) ? 1 : 0; + slbe->nx = (rs & SLB_VSID_N) ? 1 : 0; + slbe->class = (rs & SLB_VSID_C) ? 
1 : 0; + + slbe->orige = rb & (ESID_MASK | SLB_ESID_V); + slbe->origv = rs; + + /* Map the new segment */ + kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT); +} + +static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_slb *slbe; + + if (slb_nr > vcpu_book3s->slb_nr) + return 0; + + slbe = &vcpu_book3s->slb[slb_nr]; + + return slbe->orige; +} + +static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_slb *slbe; + + if (slb_nr > vcpu_book3s->slb_nr) + return 0; + + slbe = &vcpu_book3s->slb[slb_nr]; + + return slbe->origv; +} + +static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_slb *slbe; + + dprintk("KVM MMU: slbie(0x%llx)\n", ea); + + slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu_book3s, ea); + + if (!slbe) + return; + + dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid); + + slbe->valid = false; + + kvmppc_mmu_map_segment(vcpu, ea); +} + +static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + int i; + + dprintk("KVM MMU: slbia()\n"); + + for (i = 1; i < vcpu_book3s->slb_nr; i++) + vcpu_book3s->slb[i].valid = false; + + if (vcpu->arch.msr & MSR_IR) { + kvmppc_mmu_flush_segments(vcpu); + kvmppc_mmu_map_segment(vcpu, vcpu->arch.pc); + } +} + +static void kvmppc_mmu_book3s_64_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, + ulong value) +{ + u64 rb = 0, rs = 0; + + /* ESID = srnum */ + rb |= (srnum & 0xf) << 28; + /* Set the valid bit */ + rb |= 1 << 27; + /* Index = ESID */ + rb |= srnum; + + /* VSID = VSID */ + rs |= (value & 0xfffffff) << 12; + /* flags = flags */ + rs |= ((value >> 27) & 0xf) << 9; + + kvmppc_mmu_book3s_64_slbmte(vcpu, rs, rb); +} + +static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, + bool large) +{ + u64 mask = 0xFFFFFFFFFULL; + + dprintk("KVM MMU: tlbie(0x%lx)\n", va); + + if (large) + mask = 0xFFFFFF000ULL; + kvmppc_mmu_pte_vflush(vcpu, va >> 12, mask); +} + +static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, u64 esid, + u64 *vsid) +{ + switch (vcpu->arch.msr & (MSR_DR|MSR_IR)) { + case 0: + *vsid = (VSID_REAL >> 16) | esid; + break; + case MSR_IR: + *vsid = (VSID_REAL_IR >> 16) | esid; + break; + case MSR_DR: + *vsid = (VSID_REAL_DR >> 16) | esid; + break; + case MSR_DR|MSR_IR: + { + ulong ea; + struct kvmppc_slb *slb; + ea = esid << SID_SHIFT; + slb = kvmppc_mmu_book3s_64_find_slbe(to_book3s(vcpu), ea); + if (slb) + *vsid = slb->vsid; + else + return -ENOENT; + + break; + } + default: + BUG(); + break; + } + + return 0; +} + +static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu) +{ + return (to_book3s(vcpu)->hid[5] & 0x80); +} + +void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_mmu *mmu = &vcpu->arch.mmu; + + mmu->mfsrin = NULL; + mmu->mtsrin = kvmppc_mmu_book3s_64_mtsrin; + mmu->slbmte = kvmppc_mmu_book3s_64_slbmte; + mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee; + mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev; + mmu->slbie = kvmppc_mmu_book3s_64_slbie; + mmu->slbia = kvmppc_mmu_book3s_64_slbia; + mmu->xlate = kvmppc_mmu_book3s_64_xlate; + mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr; + mmu->tlbie = kvmppc_mmu_book3s_64_tlbie; + mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid; + mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp; + 
mmu->is_dcbz32 = kvmppc_mmu_book3s_64_is_dcbz32; +} diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c new file mode 100644 index 00000000000..f2899b297ff --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -0,0 +1,408 @@ +/* + * Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/machdep.h> +#include <asm/mmu_context.h> +#include <asm/hw_irq.h> + +#define PTE_SIZE 12 +#define VSID_ALL 0 + +/* #define DEBUG_MMU */ +/* #define DEBUG_SLB */ + +#ifdef DEBUG_MMU +#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__) +#else +#define dprintk_mmu(a, ...) do { } while(0) +#endif + +#ifdef DEBUG_SLB +#define dprintk_slb(a, ...) printk(KERN_INFO a, __VA_ARGS__) +#else +#define dprintk_slb(a, ...) do { } while(0) +#endif + +static void invalidate_pte(struct hpte_cache *pte) +{ + dprintk_mmu("KVM: Flushing SPT %d: 0x%llx (0x%llx) -> 0x%llx\n", + i, pte->pte.eaddr, pte->pte.vpage, pte->host_va); + + ppc_md.hpte_invalidate(pte->slot, pte->host_va, + MMU_PAGE_4K, MMU_SEGSIZE_256M, + false); + pte->host_va = 0; + kvm_release_pfn_dirty(pte->pfn); +} + +void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, u64 guest_ea, u64 ea_mask) +{ + int i; + + dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%llx & 0x%llx\n", + vcpu->arch.hpte_cache_offset, guest_ea, ea_mask); + BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); + + guest_ea &= ea_mask; + for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { + struct hpte_cache *pte; + + pte = &vcpu->arch.hpte_cache[i]; + if (!pte->host_va) + continue; + + if ((pte->pte.eaddr & ea_mask) == guest_ea) { + invalidate_pte(pte); + } + } + + /* Doing a complete flush -> start from scratch */ + if (!ea_mask) + vcpu->arch.hpte_cache_offset = 0; +} + +void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) +{ + int i; + + dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n", + vcpu->arch.hpte_cache_offset, guest_vp, vp_mask); + BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); + + guest_vp &= vp_mask; + for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { + struct hpte_cache *pte; + + pte = &vcpu->arch.hpte_cache[i]; + if (!pte->host_va) + continue; + + if ((pte->pte.vpage & vp_mask) == guest_vp) { + invalidate_pte(pte); + } + } +} + +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, u64 pa_start, u64 pa_end) +{ + int i; + + dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n", + vcpu->arch.hpte_cache_offset, guest_pa, pa_mask); + BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); + + for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { + struct hpte_cache *pte; + + pte = 
&vcpu->arch.hpte_cache[i]; + if (!pte->host_va) + continue; + + if ((pte->pte.raddr >= pa_start) && + (pte->pte.raddr < pa_end)) { + invalidate_pte(pte); + } + } +} + +struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data) +{ + int i; + u64 guest_vp; + + guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false); + for (i=0; i<vcpu->arch.hpte_cache_offset; i++) { + struct hpte_cache *pte; + + pte = &vcpu->arch.hpte_cache[i]; + if (!pte->host_va) + continue; + + if (pte->pte.vpage == guest_vp) + return &pte->pte; + } + + return NULL; +} + +static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM) + kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return vcpu->arch.hpte_cache_offset++; +} + +/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using + * a hash, so we don't waste cycles on looping */ +static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) +{ + return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); +} + + +static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) +{ + struct kvmppc_sid_map *map; + u16 sid_map_mask; + + if (vcpu->arch.msr & MSR_PR) + gvsid |= VSID_PR; + + sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); + map = &to_book3s(vcpu)->sid_map[sid_map_mask]; + if (map->guest_vsid == gvsid) { + dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n", + gvsid, map->host_vsid); + return map; + } + + map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask]; + if (map->guest_vsid == gvsid) { + dprintk_slb("SLB: Searching 0x%llx -> 0x%llx\n", + gvsid, map->host_vsid); + return map; + } + + dprintk_slb("SLB: Searching 0x%llx -> not found\n", gvsid); + return NULL; +} + +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte) +{ + pfn_t hpaddr; + ulong hash, hpteg, va; + u64 vsid; + int ret; + int rflags = 0x192; + int vflags = 0; + int attempt = 0; + struct kvmppc_sid_map *map; + + /* Get host physical address for gpa */ + hpaddr = gfn_to_pfn(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + if (kvm_is_error_hva(hpaddr)) { + printk(KERN_INFO "Couldn't get guest page for gfn %llx!\n", orig_pte->eaddr); + return -EINVAL; + } + hpaddr <<= PAGE_SHIFT; +#if PAGE_SHIFT == 12 +#elif PAGE_SHIFT == 16 + hpaddr |= orig_pte->raddr & 0xf000; +#else +#error Unknown page size +#endif + + /* and write the mapping ea -> hpa into the pt */ + vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); + map = find_sid_vsid(vcpu, vsid); + if (!map) { + kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr); + map = find_sid_vsid(vcpu, vsid); + } + BUG_ON(!map); + + vsid = map->host_vsid; + va = hpt_va(orig_pte->eaddr, vsid, MMU_SEGSIZE_256M); + + if (!orig_pte->may_write) + rflags |= HPTE_R_PP; + else + mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + + if (!orig_pte->may_execute) + rflags |= HPTE_R_N; + + hash = hpt_hash(va, PTE_SIZE, MMU_SEGSIZE_256M); + +map_again: + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + /* In case we tried normal mapping already, let's nuke old entries */ + if (attempt > 1) + if (ppc_md.hpte_remove(hpteg) < 0) + return -1; + + ret = ppc_md.hpte_insert(hpteg, va, 
hpaddr, rflags, vflags, MMU_PAGE_4K, MMU_SEGSIZE_256M); + + if (ret < 0) { + /* If we couldn't map a primary PTE, try a secondary */ +#ifdef USE_SECONDARY + hash = ~hash; + attempt++; + if (attempt % 2) + vflags = HPTE_V_SECONDARY; + else + vflags = 0; +#else + attempt = 2; +#endif + goto map_again; + } else { + int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu); + struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id]; + + dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%lx (0x%llx) -> %lx\n", + ((rflags & HPTE_R_PP) == 3) ? '-' : 'w', + (rflags & HPTE_R_N) ? '-' : 'x', + orig_pte->eaddr, hpteg, va, orig_pte->vpage, hpaddr); + + pte->slot = hpteg + (ret & 7); + pte->host_va = va; + pte->pte = *orig_pte; + pte->pfn = hpaddr >> PAGE_SHIFT; + } + + return 0; +} + +static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) +{ + struct kvmppc_sid_map *map; + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + u16 sid_map_mask; + static int backwards_map = 0; + + if (vcpu->arch.msr & MSR_PR) + gvsid |= VSID_PR; + + /* We might get collisions that trap in preceding order, so let's + map them differently */ + + sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); + if (backwards_map) + sid_map_mask = SID_MAP_MASK - sid_map_mask; + + map = &to_book3s(vcpu)->sid_map[sid_map_mask]; + + /* Make sure we're taking the other map next time */ + backwards_map = !backwards_map; + + /* Uh-oh ... out of mappings. Let's flush! */ + if (vcpu_book3s->vsid_next == vcpu_book3s->vsid_max) { + vcpu_book3s->vsid_next = vcpu_book3s->vsid_first; + memset(vcpu_book3s->sid_map, 0, + sizeof(struct kvmppc_sid_map) * SID_MAP_NUM); + kvmppc_mmu_pte_flush(vcpu, 0, 0); + kvmppc_mmu_flush_segments(vcpu); + } + map->host_vsid = vcpu_book3s->vsid_next++; + + map->guest_vsid = gvsid; + map->valid = true; + + return map; +} + +static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) +{ + int i; + int max_slb_size = 64; + int found_inval = -1; + int r; + + if (!get_paca()->kvm_slb_max) + get_paca()->kvm_slb_max = 1; + + /* Are we overwriting? 
*/ + for (i = 1; i < get_paca()->kvm_slb_max; i++) { + if (!(get_paca()->kvm_slb[i].esid & SLB_ESID_V)) + found_inval = i; + else if ((get_paca()->kvm_slb[i].esid & ESID_MASK) == esid) + return i; + } + + /* Found a spare entry that was invalidated before */ + if (found_inval > 0) + return found_inval; + + /* No spare invalid entry, so create one */ + + if (mmu_slb_size < 64) + max_slb_size = mmu_slb_size; + + /* Overflowing -> purge */ + if ((get_paca()->kvm_slb_max) == max_slb_size) + kvmppc_mmu_flush_segments(vcpu); + + r = get_paca()->kvm_slb_max; + get_paca()->kvm_slb_max++; + + return r; +} + +int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) +{ + u64 esid = eaddr >> SID_SHIFT; + u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; + u64 slb_vsid = SLB_VSID_USER; + u64 gvsid; + int slb_index; + struct kvmppc_sid_map *map; + + slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); + + if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { + /* Invalidate an entry */ + get_paca()->kvm_slb[slb_index].esid = 0; + return -ENOENT; + } + + map = find_sid_vsid(vcpu, gvsid); + if (!map) + map = create_sid_map(vcpu, gvsid); + + map->guest_esid = esid; + + slb_vsid |= (map->host_vsid << 12); + slb_vsid &= ~SLB_VSID_KP; + slb_esid |= slb_index; + + get_paca()->kvm_slb[slb_index].esid = slb_esid; + get_paca()->kvm_slb[slb_index].vsid = slb_vsid; + + dprintk_slb("slbmte %#llx, %#llx\n", slb_vsid, slb_esid); + + return 0; +} + +void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) +{ + get_paca()->kvm_slb_max = 1; + get_paca()->kvm_slb[0].esid = 0; +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + kvmppc_mmu_pte_flush(vcpu, 0, 0); +} diff --git a/arch/powerpc/kvm/book3s_64_rmhandlers.S b/arch/powerpc/kvm/book3s_64_rmhandlers.S new file mode 100644 index 00000000000..fb7dd2e9ac8 --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_rmhandlers.S @@ -0,0 +1,131 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> + +/***************************************************************************** + * * + * Real Mode handlers that need to be in low physical memory * + * * + ****************************************************************************/ + + +.macro INTERRUPT_TRAMPOLINE intno + +.global kvmppc_trampoline_\intno +kvmppc_trampoline_\intno: + + mtspr SPRN_SPRG_SCRATCH0, r13 /* Save r13 */ + + /* + * First thing to do is to find out if we're coming + * from a KVM guest or a Linux process. 
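
kvmppc_mmu_map_segment() above composes a shadow SLB entry from the guest effective address, the mapped host VSID and the chosen slot. A sketch of that composition; the mask and valid-bit values below are assumptions, and the flags argument stands in for the patch's SLB_VSID_USER with the Kp bit cleared:

    #include <stdint.h>

    #define ILL_ESID_MASK  0xfffffffff0000000ULL  /* 256MB segment mask (assumed) */
    #define ILL_SLB_ESID_V (1ULL << 27)           /* SLB valid bit (assumed) */

    struct shadow_slbe {
            uint64_t esid;
            uint64_t vsid;
    };

    /* Mirrors the slb_esid/slb_vsid assembly in kvmppc_mmu_map_segment(). */
    static struct shadow_slbe make_shadow_slbe(uint64_t eaddr, uint64_t host_vsid,
                                               unsigned int slb_index,
                                               uint64_t vsid_flags)
    {
            struct shadow_slbe e;

            e.esid = (eaddr & ILL_ESID_MASK) | ILL_SLB_ESID_V | slb_index;
            e.vsid = (host_vsid << 12) | vsid_flags;
            return e;
    }
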
+ * + * To distinguish, we check a magic byte in the PACA + */ + mfspr r13, SPRN_SPRG_PACA /* r13 = PACA */ + std r12, (PACA_EXMC + EX_R12)(r13) + mfcr r12 + stw r12, (PACA_EXMC + EX_CCR)(r13) + lbz r12, PACA_KVM_IN_GUEST(r13) + cmpwi r12, 0 + bne ..kvmppc_handler_hasmagic_\intno + /* No KVM guest? Then jump back to the Linux handler! */ + lwz r12, (PACA_EXMC + EX_CCR)(r13) + mtcr r12 + ld r12, (PACA_EXMC + EX_R12)(r13) + mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ + b kvmppc_resume_\intno /* Get back original handler */ + + /* Now we know we're handling a KVM guest */ +..kvmppc_handler_hasmagic_\intno: + /* Unset guest state */ + li r12, 0 + stb r12, PACA_KVM_IN_GUEST(r13) + + std r1, (PACA_EXMC+EX_R9)(r13) + std r10, (PACA_EXMC+EX_R10)(r13) + std r11, (PACA_EXMC+EX_R11)(r13) + std r2, (PACA_EXMC+EX_R13)(r13) + + mfsrr0 r10 + mfsrr1 r11 + + /* Restore R1/R2 so we can handle faults */ + ld r1, PACAR1(r13) + ld r2, (PACA_EXMC+EX_SRR0)(r13) + + /* Let's store which interrupt we're handling */ + li r12, \intno + + /* Jump into the SLB exit code that goes to the highmem handler */ + b kvmppc_handler_trampoline_exit + +.endm + +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_SYSTEM_RESET +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_MACHINE_CHECK +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_SEGMENT +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_SEGMENT +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DECREMENTER +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_SYSCALL +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_TRACE +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_VSX + +/* + * This trampoline brings us back to a real mode handler + * + * Input Registers: + * + * R6 = SRR0 + * R7 = SRR1 + * LR = real-mode IP + * + */ +.global kvmppc_handler_lowmem_trampoline +kvmppc_handler_lowmem_trampoline: + + mtsrr0 r6 + mtsrr1 r7 + blr +kvmppc_handler_lowmem_trampoline_end: + +.global kvmppc_trampoline_lowmem +kvmppc_trampoline_lowmem: + .long kvmppc_handler_lowmem_trampoline - _stext + +.global kvmppc_trampoline_enter +kvmppc_trampoline_enter: + .long kvmppc_handler_trampoline_enter - _stext + +#include "book3s_64_slb.S" + diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S new file mode 100644 index 00000000000..ecd237a03fd --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_slb.S @@ -0,0 +1,262 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#define SHADOW_SLB_ESID(num) (SLBSHADOW_SAVEAREA + (num * 0x10)) +#define SHADOW_SLB_VSID(num) (SLBSHADOW_SAVEAREA + (num * 0x10) + 0x8) +#define UNBOLT_SLB_ENTRY(num) \ + ld r9, SHADOW_SLB_ESID(num)(r12); \ + /* Invalid? Skip. */; \ + rldicl. r0, r9, 37, 63; \ + beq slb_entry_skip_ ## num; \ + xoris r9, r9, SLB_ESID_V@h; \ + std r9, SHADOW_SLB_ESID(num)(r12); \ + slb_entry_skip_ ## num: + +#define REBOLT_SLB_ENTRY(num) \ + ld r10, SHADOW_SLB_ESID(num)(r11); \ + cmpdi r10, 0; \ + beq slb_exit_skip_1; \ + oris r10, r10, SLB_ESID_V@h; \ + ld r9, SHADOW_SLB_VSID(num)(r11); \ + slbmte r9, r10; \ + std r10, SHADOW_SLB_ESID(num)(r11); \ +slb_exit_skip_ ## num: + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + +.global kvmppc_handler_trampoline_enter +kvmppc_handler_trampoline_enter: + + /* Required state: + * + * MSR = ~IR|DR + * R13 = PACA + * R9 = guest IP + * R10 = guest MSR + * R11 = free + * R12 = free + * PACA[PACA_EXMC + EX_R9] = guest R9 + * PACA[PACA_EXMC + EX_R10] = guest R10 + * PACA[PACA_EXMC + EX_R11] = guest R11 + * PACA[PACA_EXMC + EX_R12] = guest R12 + * PACA[PACA_EXMC + EX_R13] = guest R13 + * PACA[PACA_EXMC + EX_CCR] = guest CR + * PACA[PACA_EXMC + EX_R3] = guest XER + */ + + mtsrr0 r9 + mtsrr1 r10 + + mtspr SPRN_SPRG_SCRATCH0, r0 + + /* Remove LPAR shadow entries */ + +#if SLB_NUM_BOLTED == 3 + + ld r12, PACA_SLBSHADOWPTR(r13) + + /* Save off the first entry so we can slbie it later */ + ld r10, SHADOW_SLB_ESID(0)(r12) + ld r11, SHADOW_SLB_VSID(0)(r12) + + /* Remove bolted entries */ + UNBOLT_SLB_ENTRY(0) + UNBOLT_SLB_ENTRY(1) + UNBOLT_SLB_ENTRY(2) + +#else +#error unknown number of bolted entries +#endif + + /* Flush SLB */ + + slbia + + /* r0 = esid & ESID_MASK */ + rldicr r10, r10, 0, 35 + /* r0 |= CLASS_BIT(VSID) */ + rldic r12, r11, 56 - 36, 36 + or r10, r10, r12 + slbie r10 + + isync + + /* Fill SLB with our shadow */ + + lbz r12, PACA_KVM_SLB_MAX(r13) + mulli r12, r12, 16 + addi r12, r12, PACA_KVM_SLB + add r12, r12, r13 + + /* for (r11 = kvm_slb; r11 < kvm_slb + kvm_slb_size; r11+=slb_entry) */ + li r11, PACA_KVM_SLB + add r11, r11, r13 + +slb_loop_enter: + + ld r10, 0(r11) + + rldicl. 
r0, r10, 37, 63 + beq slb_loop_enter_skip + + ld r9, 8(r11) + slbmte r9, r10 + +slb_loop_enter_skip: + addi r11, r11, 16 + cmpd cr0, r11, r12 + blt slb_loop_enter + +slb_do_enter: + + /* Enter guest */ + + mfspr r0, SPRN_SPRG_SCRATCH0 + + ld r9, (PACA_EXMC+EX_R9)(r13) + ld r10, (PACA_EXMC+EX_R10)(r13) + ld r12, (PACA_EXMC+EX_R12)(r13) + + lwz r11, (PACA_EXMC+EX_CCR)(r13) + mtcr r11 + + ld r11, (PACA_EXMC+EX_R3)(r13) + mtxer r11 + + ld r11, (PACA_EXMC+EX_R11)(r13) + ld r13, (PACA_EXMC+EX_R13)(r13) + + RFI +kvmppc_handler_trampoline_enter_end: + + + +/****************************************************************************** + * * + * Exit code * + * * + *****************************************************************************/ + +.global kvmppc_handler_trampoline_exit +kvmppc_handler_trampoline_exit: + + /* Register usage at this point: + * + * SPRG_SCRATCH0 = guest R13 + * R01 = host R1 + * R02 = host R2 + * R10 = guest PC + * R11 = guest MSR + * R12 = exit handler id + * R13 = PACA + * PACA.exmc.CCR = guest CR + * PACA.exmc.R9 = guest R1 + * PACA.exmc.R10 = guest R10 + * PACA.exmc.R11 = guest R11 + * PACA.exmc.R12 = guest R12 + * PACA.exmc.R13 = guest R2 + * + */ + + /* Save registers */ + + std r0, (PACA_EXMC+EX_SRR0)(r13) + std r9, (PACA_EXMC+EX_R3)(r13) + std r10, (PACA_EXMC+EX_LR)(r13) + std r11, (PACA_EXMC+EX_DAR)(r13) + + /* + * In order for us to easily get the last instruction, + * we got the #vmexit at, we exploit the fact that the + * virtual layout is still the same here, so we can just + * ld from the guest's PC address + */ + + /* We only load the last instruction when it's safe */ + cmpwi r12, BOOK3S_INTERRUPT_DATA_STORAGE + beq ld_last_inst + cmpwi r12, BOOK3S_INTERRUPT_PROGRAM + beq ld_last_inst + + b no_ld_last_inst + +ld_last_inst: + /* Save off the guest instruction we're at */ + /* 1) enable paging for data */ + mfmsr r9 + ori r11, r9, MSR_DR /* Enable paging for data */ + mtmsr r11 + /* 2) fetch the instruction */ + lwz r0, 0(r10) + /* 3) disable paging again */ + mtmsr r9 + +no_ld_last_inst: + + /* Restore bolted entries from the shadow and fix it along the way */ + + /* We don't store anything in entry 0, so we don't need to take care of it */ + slbia + isync + +#if SLB_NUM_BOLTED == 3 + + ld r11, PACA_SLBSHADOWPTR(r13) + + REBOLT_SLB_ENTRY(0) + REBOLT_SLB_ENTRY(1) + REBOLT_SLB_ENTRY(2) + +#else +#error unknown number of bolted entries +#endif + +slb_do_exit: + + /* Restore registers */ + + ld r11, (PACA_EXMC+EX_DAR)(r13) + ld r10, (PACA_EXMC+EX_LR)(r13) + ld r9, (PACA_EXMC+EX_R3)(r13) + + /* Save last inst */ + stw r0, (PACA_EXMC+EX_LR)(r13) + + /* Save DAR and DSISR before going to paged mode */ + mfdar r0 + std r0, (PACA_EXMC+EX_DAR)(r13) + mfdsisr r0 + stw r0, (PACA_EXMC+EX_DSISR)(r13) + + /* RFI into the highmem handler */ + mfmsr r0 + ori r0, r0, MSR_IR|MSR_DR|MSR_RI /* Enable paging */ + mtsrr1 r0 + ld r0, PACASAVEDMSR(r13) /* Highmem handler address */ + mtsrr0 r0 + + mfspr r0, SPRN_SPRG_SCRATCH0 + + RFI +kvmppc_handler_trampoline_exit_end: + diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e7bf4d02948..06f5a9ecc42 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -520,6 +520,11 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return kvmppc_core_vcpu_translate(vcpu, tr); } +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return -ENOTSUPP; +} + int __init kvmppc_booke_init(void) { unsigned long ivor[16]; diff --git a/arch/powerpc/kvm/emulate.c 
b/arch/powerpc/kvm/emulate.c index 7737146af3f..4a9ac6640fa 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -18,7 +18,7 @@ */ #include <linux/jiffies.h> -#include <linux/timer.h> +#include <linux/hrtimer.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kvm_host.h> @@ -32,6 +32,7 @@ #include "trace.h" #define OP_TRAP 3 +#define OP_TRAP_64 2 #define OP_31_XOP_LWZX 23 #define OP_31_XOP_LBZX 87 @@ -64,19 +65,45 @@ #define OP_STH 44 #define OP_STHU 45 +#ifdef CONFIG_PPC64 +static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) +{ + return 1; +} +#else +static int kvmppc_dec_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.tcr & TCR_DIE; +} +#endif + void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) { - if (vcpu->arch.tcr & TCR_DIE) { + unsigned long dec_nsec; + + pr_debug("mtDEC: %x\n", vcpu->arch.dec); +#ifdef CONFIG_PPC64 + /* POWER4+ triggers a dec interrupt if the value is < 0 */ + if (vcpu->arch.dec & 0x80000000) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + kvmppc_core_queue_dec(vcpu); + return; + } +#endif + if (kvmppc_dec_enabled(vcpu)) { /* The decrementer ticks at the same rate as the timebase, so * that's how we convert the guest DEC value to the number of * host ticks. */ - unsigned long nr_jiffies; - nr_jiffies = vcpu->arch.dec / tb_ticks_per_jiffy; - mod_timer(&vcpu->arch.dec_timer, - get_jiffies_64() + nr_jiffies); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + dec_nsec = vcpu->arch.dec; + dec_nsec *= 1000; + dec_nsec /= tb_ticks_per_usec; + hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), + HRTIMER_MODE_REL); + vcpu->arch.dec_jiffies = get_tb(); } else { - del_timer(&vcpu->arch.dec_timer); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); } } @@ -111,9 +138,15 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); + pr_debug(KERN_INFO "Emulating opcode %d / %d\n", get_op(inst), get_xop(inst)); + switch (get_op(inst)) { case OP_TRAP: +#ifdef CONFIG_PPC64 + case OP_TRAP_64: +#else vcpu->arch.esr |= ESR_PTR; +#endif kvmppc_core_queue_program(vcpu); advance = 0; break; @@ -188,17 +221,19 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_SRR1: vcpu->arch.gpr[rt] = vcpu->arch.srr1; break; case SPRN_PVR: - vcpu->arch.gpr[rt] = mfspr(SPRN_PVR); break; + vcpu->arch.gpr[rt] = vcpu->arch.pvr; break; case SPRN_PIR: - vcpu->arch.gpr[rt] = mfspr(SPRN_PIR); break; + vcpu->arch.gpr[rt] = vcpu->vcpu_id; break; + case SPRN_MSSSR0: + vcpu->arch.gpr[rt] = 0; break; /* Note: mftb and TBRL/TBWL are user-accessible, so * the guest can always access the real TB anyways. * In fact, we probably will never see these traps. */ case SPRN_TBWL: - vcpu->arch.gpr[rt] = mftbl(); break; + vcpu->arch.gpr[rt] = get_tb() >> 32; break; case SPRN_TBWU: - vcpu->arch.gpr[rt] = mftbu(); break; + vcpu->arch.gpr[rt] = get_tb(); break; case SPRN_SPRG0: vcpu->arch.gpr[rt] = vcpu->arch.sprg0; break; @@ -211,6 +246,13 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) /* Note: SPRG4-7 are user-readable, so we don't get * a trap. 
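
kvmppc_emulate_dec() above now converts the guest's DEC value from timebase ticks to nanoseconds before arming an hrtimer; multiplying by 1000 before dividing by tb_ticks_per_usec keeps precision. A sketch of that conversion:

    /* dec ticks -> nanoseconds, as in kvmppc_emulate_dec() above. */
    static unsigned long dec_to_nsec(unsigned long dec_ticks,
                                     unsigned long tb_ticks_per_usec)
    {
            return dec_ticks * 1000 / tb_ticks_per_usec;
    }

For example, with a 512 MHz timebase (tb_ticks_per_usec = 512), a guest DEC write of 0x100000 arms the timer roughly 2 ms out.
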
*/ + case SPRN_DEC: + { + u64 jd = get_tb() - vcpu->arch.dec_jiffies; + vcpu->arch.gpr[rt] = vcpu->arch.dec - jd; + pr_debug(KERN_INFO "mfDEC: %x - %llx = %lx\n", vcpu->arch.dec, jd, vcpu->arch.gpr[rt]); + break; + } default: emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt); if (emulated == EMULATE_FAIL) { @@ -260,6 +302,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case SPRN_TBWL: break; case SPRN_TBWU: break; + case SPRN_MSSSR0: break; + case SPRN_DEC: vcpu->arch.dec = vcpu->arch.gpr[rs]; kvmppc_emulate_dec(vcpu); diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2a4551f78f6..692c3709011 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -23,6 +23,7 @@ #include <linux/kvm_host.h> #include <linux/module.h> #include <linux/vmalloc.h> +#include <linux/hrtimer.h> #include <linux/fs.h> #include <asm/cputable.h> #include <asm/uaccess.h> @@ -208,10 +209,25 @@ static void kvmppc_decrementer_func(unsigned long data) } } +/* + * low level hrtimer wake routine. Because this runs in hardirq context + * we schedule a tasklet to do the real work. + */ +enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); + tasklet_schedule(&vcpu->arch.tasklet); + + return HRTIMER_NORESTART; +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { - setup_timer(&vcpu->arch.dec_timer, kvmppc_decrementer_func, - (unsigned long)vcpu); + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); + vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; return 0; } @@ -409,11 +425,6 @@ out: return r; } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) -{ - return -ENOTSUPP; -} - long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c index 2aa371e3007..70378551c0c 100644 --- a/arch/powerpc/kvm/timing.c +++ b/arch/powerpc/kvm/timing.c @@ -23,6 +23,7 @@ #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/uaccess.h> +#include <linux/module.h> #include <asm/time.h> #include <asm-generic/div64.h> diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 67f219de045..a8e84001805 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -12,8 +12,8 @@ * Tracepoint for guest mode entry. 
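
The SPRN_DEC read above reconstructs the guest's current DEC by subtracting the timebase ticks elapsed since the last mtDEC from the value written then (dec_jiffies, despite its name, holds a timebase snapshot). As a standalone sketch:

    #include <stdint.h>

    /* Value returned for mfspr rD,DEC in the emulation above. */
    static uint32_t emulated_mfdec(uint32_t dec_at_write, uint64_t tb_at_write,
                                   uint64_t tb_now)
    {
            uint64_t elapsed = tb_now - tb_at_write;

            return dec_at_write - (uint32_t)elapsed;
    }
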
*/ TRACE_EVENT(kvm_ppc_instr, - TP_PROTO(unsigned int inst, unsigned long pc, unsigned int emulate), - TP_ARGS(inst, pc, emulate), + TP_PROTO(unsigned int inst, unsigned long _pc, unsigned int emulate), + TP_ARGS(inst, _pc, emulate), TP_STRUCT__entry( __field( unsigned int, inst ) @@ -23,7 +23,7 @@ TRACE_EVENT(kvm_ppc_instr, TP_fast_assign( __entry->inst = inst; - __entry->pc = pc; + __entry->pc = _pc; __entry->emulate = emulate; ), diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 6fb8fc8d2fe..ce68708bbad 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -28,7 +28,10 @@ obj-$(CONFIG_44x) += 44x_mmu.o obj-$(CONFIG_FSL_BOOKE) += fsl_booke_mmu.o obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o -obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o +ifeq ($(CONFIG_HUGETLB_PAGE),y) +obj-y += hugetlbpage.o +obj-$(CONFIG_PPC_STD_MMU_64) += hugetlbpage-hash64.o +endif obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index bc122a120bf..d7efdbf640c 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -55,57 +55,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 1; } -#ifdef CONFIG_HUGETLB_PAGE -static noinline int gup_huge_pte(pte_t *ptep, struct hstate *hstate, - unsigned long *addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long mask; - unsigned long pte_end; - struct page *head, *page; - pte_t pte; - int refs; - - pte_end = (*addr + huge_page_size(hstate)) & huge_page_mask(hstate); - if (pte_end < end) - end = pte_end; - - pte = *ptep; - mask = _PAGE_PRESENT|_PAGE_USER; - if (write) - mask |= _PAGE_RW; - if ((pte_val(pte) & mask) != mask) - return 0; - /* hugepages are never "special" */ - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - - refs = 0; - head = pte_page(pte); - page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (*addr += PAGE_SIZE, *addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - /* Could be optimized better */ - while (*nr) { - put_page(page); - (*nr)--; - } - } - - return 1; -} -#endif /* CONFIG_HUGETLB_PAGE */ - static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -119,7 +68,11 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, next = pmd_addr_end(addr, end); if (pmd_none(pmd)) return 0; - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + if (is_hugepd(pmdp)) { + if (!gup_hugepd((hugepd_t *)pmdp, PMD_SHIFT, + addr, next, write, pages, nr)) + return 0; + } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -139,7 +92,11 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none(pud)) return 0; - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + if (is_hugepd(pudp)) { + if (!gup_hugepd((hugepd_t *)pudp, PUD_SHIFT, + addr, next, write, pages, nr)) + return 0; + } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -154,10 +111,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, 
int write, unsigned long next; pgd_t *pgdp; int nr = 0; -#ifdef CONFIG_PPC64 - unsigned int shift; - int psize; -#endif pr_devel("%s(%lx,%x,%s)\n", __func__, start, nr_pages, write ? "write" : "read"); @@ -172,25 +125,6 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pr_devel(" aligned: %lx .. %lx\n", start, end); -#ifdef CONFIG_HUGETLB_PAGE - /* We bail out on slice boundary crossing when hugetlb is - * enabled in order to not have to deal with two different - * page table formats - */ - if (addr < SLICE_LOW_TOP) { - if (end > SLICE_LOW_TOP) - goto slow_irqon; - - if (unlikely(GET_LOW_SLICE_INDEX(addr) != - GET_LOW_SLICE_INDEX(end - 1))) - goto slow_irqon; - } else { - if (unlikely(GET_HIGH_SLICE_INDEX(addr) != - GET_HIGH_SLICE_INDEX(end - 1))) - goto slow_irqon; - } -#endif /* CONFIG_HUGETLB_PAGE */ - /* * XXX: batch / limit 'nr', to avoid large irq off latency * needs some instrumenting to determine the common sizes used by @@ -210,54 +144,23 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, */ local_irq_disable(); -#ifdef CONFIG_PPC64 - /* Those bits are related to hugetlbfs implementation and only exist - * on 64-bit for now - */ - psize = get_slice_psize(mm, addr); - shift = mmu_psize_defs[psize].shift; -#endif /* CONFIG_PPC64 */ - -#ifdef CONFIG_HUGETLB_PAGE - if (unlikely(mmu_huge_psizes[psize])) { - pte_t *ptep; - unsigned long a = addr; - unsigned long sz = ((1UL) << shift); - struct hstate *hstate = size_to_hstate(sz); - - BUG_ON(!hstate); - /* - * XXX: could be optimized to avoid hstate - * lookup entirely (just use shift) - */ - - do { - VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, a)].shift); - ptep = huge_pte_offset(mm, a); - pr_devel(" %016lx: huge ptep %p\n", a, ptep); - if (!ptep || !gup_huge_pte(ptep, hstate, &a, end, write, pages, - &nr)) - goto slow; - } while (a != end); - } else -#endif /* CONFIG_HUGETLB_PAGE */ - { - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - -#ifdef CONFIG_PPC64 - VM_BUG_ON(shift != mmu_psize_defs[get_slice_psize(mm, addr)].shift); -#endif - pr_devel(" %016lx: normal pgd %p\n", addr, - (void *)pgd_val(pgd)); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + pr_devel(" %016lx: normal pgd %p\n", addr, + (void *)pgd_val(pgd)); + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (is_hugepd(pgdp)) { + if (!gup_hugepd((hugepd_t *)pgdp, PGDIR_SHIFT, + addr, next, write, pages, &nr)) goto slow; - } while (pgdp++, addr = next, addr != end); - } + } else if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 1ade7eb6ae0..6810128aba3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -92,6 +92,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; struct hash_pte *htab_address; unsigned long htab_size_bytes; unsigned long htab_hash_mask; +EXPORT_SYMBOL_GPL(htab_hash_mask); int mmu_linear_psize = MMU_PAGE_4K; int mmu_virtual_psize = MMU_PAGE_4K; int mmu_vmalloc_psize = MMU_PAGE_4K; @@ -102,6 +103,7 @@ int mmu_io_psize = MMU_PAGE_4K; int mmu_kernel_ssize = MMU_SEGSIZE_256M; int mmu_highuser_ssize = MMU_SEGSIZE_256M; u16 mmu_slb_size = 64; +EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef 
CONFIG_HUGETLB_PAGE unsigned int HPAGE_SHIFT; #endif @@ -481,16 +483,6 @@ static void __init htab_init_page_sizes(void) #ifdef CONFIG_HUGETLB_PAGE /* Reserve 16G huge page memory sections for huge pages */ of_scan_flat_dt(htab_dt_scan_hugepage_blocks, NULL); - -/* Set default large page size. Currently, we pick 16M or 1M depending - * on what is available - */ - if (mmu_psize_defs[MMU_PAGE_16M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; - /* With 4k/4level pagetables, we can't (for now) cope with a - * huge page size < PMD_SIZE */ - else if (mmu_psize_defs[MMU_PAGE_1M].shift) - HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; #endif /* CONFIG_HUGETLB_PAGE */ } @@ -785,7 +777,7 @@ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) /* page is dirty */ if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { if (trap == 0x400) { - __flush_dcache_icache(page_address(page)); + flush_dcache_icache_page(page); set_bit(PG_arch_1, &page->flags); } else pp |= HPTE_R_N; @@ -891,6 +883,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) unsigned long vsid; struct mm_struct *mm; pte_t *ptep; + unsigned hugeshift; const struct cpumask *tmp; int rc, user_region = 0, local = 0; int psize, ssize; @@ -943,30 +936,31 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) if (user_region && cpumask_equal(mm_cpumask(mm), tmp)) local = 1; -#ifdef CONFIG_HUGETLB_PAGE - /* Handle hugepage regions */ - if (HPAGE_SHIFT && mmu_huge_psizes[psize]) { - DBG_LOW(" -> huge page !\n"); - return hash_huge_page(mm, access, ea, vsid, local, trap); - } -#endif /* CONFIG_HUGETLB_PAGE */ - #ifndef CONFIG_PPC_64K_PAGES - /* If we use 4K pages and our psize is not 4K, then we are hitting - * a special driver mapping, we need to align the address before - * we fetch the PTE + /* If we use 4K pages and our psize is not 4K, then we might + * be hitting a special driver mapping, and need to align the + * address before we fetch the PTE. + * + * It could also be a hugepage mapping, in which case this is + * not necessary, but it's not harmful, either. */ if (psize != MMU_PAGE_4K) ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); #endif /* CONFIG_PPC_64K_PAGES */ /* Get PTE and page size from page tables */ - ptep = find_linux_pte(pgdir, ea); + ptep = find_linux_pte_or_hugepte(pgdir, ea, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); return 1; } +#ifdef CONFIG_HUGETLB_PAGE + if (hugeshift) + return __hash_page_huge(ea, access, vsid, ptep, trap, local, + ssize, hugeshift, psize); +#endif /* CONFIG_HUGETLB_PAGE */ + #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" i-pte: %016lx\n", pte_val(*ptep)); #else diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c new file mode 100644 index 00000000000..199539882f9 --- /dev/null +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -0,0 +1,139 @@ +/* + * PPC64 Huge TLB Page Support for hash based MMUs (POWER4 and later) + * + * Copyright (C) 2003 David Gibson, IBM Corporation. 
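
Both hash-insert paths in this patch - kvmppc_mmu_map_page() earlier and __hash_page_huge() in the file that follows - pick a PTE group the same way: hash the virtual address, optionally invert it for the secondary group, mask with htab_hash_mask and scale by the group size (HPTES_PER_GROUP is 8). A sketch of that slot selection:

    #include <stdint.h>

    #define HPTES_PER_GROUP 8

    /* First slot of the primary or secondary PTE group for a given hash. */
    static uint64_t pteg_group(uint64_t hash, uint64_t htab_hash_mask,
                               int secondary)
    {
            if (secondary)
                    hash = ~hash;
            return ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
    }
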
+ * + * Based on the IA-32 version: + * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> + */ + +#include <linux/mm.h> +#include <linux/hugetlb.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/cacheflush.h> +#include <asm/machdep.h> + +int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, + pte_t *ptep, unsigned long trap, int local, int ssize, + unsigned int shift, unsigned int mmu_psize) +{ + unsigned long old_pte, new_pte; + unsigned long va, rflags, pa, sz; + long slot; + int err = 1; + + BUG_ON(shift != mmu_psize_defs[mmu_psize].shift); + + /* Search the Linux page table for a match with va */ + va = hpt_va(ea, vsid, ssize); + + /* + * Check the user's access rights to the page. If access should be + * prevented then send the problem up to do_page_fault. + */ + if (unlikely(access & ~pte_val(*ptep))) + goto out; + /* + * At this point, we have a pte (old_pte) which can be used to build + * or update an HPTE. There are 2 cases: + * + * 1. There is a valid (present) pte with no associated HPTE (this is + * the most common case) + * 2. There is a valid (present) pte with an associated HPTE. The + * current values of the pp bits in the HPTE prevent access + * because we are doing software DIRTY bit management and the + * page is currently not DIRTY. + */ + + + do { + old_pte = pte_val(*ptep); + if (old_pte & _PAGE_BUSY) + goto out; + new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, + old_pte, new_pte)); + + rflags = 0x2 | (!(new_pte & _PAGE_RW)); + /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ + rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N); + sz = ((1UL) << shift); + if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) + /* No CPU has hugepages but lacks no execute, so we + * don't need to worry about that case */ + rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); + + /* Check if pte already has an hpte (case 2) */ + if (unlikely(old_pte & _PAGE_HASHPTE)) { + /* There MIGHT be an HPTE for this pte */ + unsigned long hash, slot; + + hash = hpt_hash(va, shift, ssize); + if (old_pte & _PAGE_F_SECOND) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += (old_pte & _PAGE_F_GIX) >> 12; + + if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, + ssize, local) == -1) + old_pte &= ~_PAGE_HPTEFLAGS; + } + + if (likely(!(old_pte & _PAGE_HASHPTE))) { + unsigned long hash = hpt_hash(va, shift, ssize); + unsigned long hpte_group; + + pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; + +repeat: + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + + /* clear HPTE slot informations in new PTE */ +#ifdef CONFIG_PPC_64K_PAGES + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; +#else + new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; +#endif + /* Add in WIMG bits */ + rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | + _PAGE_COHERENT | _PAGE_GUARDED)); + + /* Insert into the hash table, primary slot */ + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, + mmu_psize, ssize); + + /* Primary is full, try the secondary */ + if (unlikely(slot == -1)) { + hpte_group = ((~hash & htab_hash_mask) * + HPTES_PER_GROUP) & ~0x7UL; + slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, + HPTE_V_SECONDARY, + mmu_psize, ssize); + if (slot == -1) { + if (mftb() & 0x1) + hpte_group = ((hash & htab_hash_mask) * + HPTES_PER_GROUP)&~0x7UL; + + ppc_md.hpte_remove(hpte_group); + goto repeat; + } + } + + if (unlikely(slot == -2)) + 
panic("hash_huge_page: pte_insert failed\n"); + + new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); + } + + /* + * No need to use ldarx/stdcx here + */ + *ptep = __pte(new_pte & ~_PAGE_BUSY); + + err = 0; + + out: + return err; +} diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 90df6ffe3a4..53b200abb02 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -7,29 +7,17 @@ * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com> */ -#include <linux/init.h> -#include <linux/fs.h> #include <linux/mm.h> +#include <linux/io.h> #include <linux/hugetlb.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/err.h> -#include <linux/sysctl.h> -#include <asm/mman.h> +#include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/tlb.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/machdep.h> -#include <asm/cputable.h> -#include <asm/spu.h> #define PAGE_SHIFT_64K 16 #define PAGE_SHIFT_16M 24 #define PAGE_SHIFT_16G 34 -#define NUM_LOW_AREAS (0x100000000UL >> SID_SHIFT) -#define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT) #define MAX_NUMBER_GPAGES 1024 /* Tracks the 16G pages after the device tree is scanned and before the @@ -37,53 +25,17 @@ static unsigned long gpage_freearray[MAX_NUMBER_GPAGES]; static unsigned nr_gpages; -/* Array of valid huge page sizes - non-zero value(hugepte_shift) is - * stored for the huge page sizes that are valid. - */ -unsigned int mmu_huge_psizes[MMU_PAGE_COUNT] = { }; /* initialize all to 0 */ - -#define hugepte_shift mmu_huge_psizes -#define PTRS_PER_HUGEPTE(psize) (1 << hugepte_shift[psize]) -#define HUGEPTE_TABLE_SIZE(psize) (sizeof(pte_t) << hugepte_shift[psize]) - -#define HUGEPD_SHIFT(psize) (mmu_psize_to_shift(psize) \ - + hugepte_shift[psize]) -#define HUGEPD_SIZE(psize) (1UL << HUGEPD_SHIFT(psize)) -#define HUGEPD_MASK(psize) (~(HUGEPD_SIZE(psize)-1)) - -/* Subtract one from array size because we don't need a cache for 4K since - * is not a huge page size */ -#define HUGE_PGTABLE_INDEX(psize) (HUGEPTE_CACHE_NUM + psize - 1) -#define HUGEPTE_CACHE_NAME(psize) (huge_pgtable_cache_name[psize]) - -static const char *huge_pgtable_cache_name[MMU_PAGE_COUNT] = { - [MMU_PAGE_64K] = "hugepte_cache_64K", - [MMU_PAGE_1M] = "hugepte_cache_1M", - [MMU_PAGE_16M] = "hugepte_cache_16M", - [MMU_PAGE_16G] = "hugepte_cache_16G", -}; - /* Flag to mark huge PD pointers. This means pmd_bad() and pud_bad() * will choke on pointers to hugepte tables, which is handy for * catching screwups early. 
*/ -#define HUGEPD_OK 0x1 - -typedef struct { unsigned long pd; } hugepd_t; - -#define hugepd_none(hpd) ((hpd).pd == 0) static inline int shift_to_mmu_psize(unsigned int shift) { - switch (shift) { -#ifndef CONFIG_PPC_64K_PAGES - case PAGE_SHIFT_64K: - return MMU_PAGE_64K; -#endif - case PAGE_SHIFT_16M: - return MMU_PAGE_16M; - case PAGE_SHIFT_16G: - return MMU_PAGE_16G; - } + int psize; + + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) + if (mmu_psize_defs[psize].shift == shift) + return psize; return -1; } @@ -94,71 +46,126 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize) BUG(); } +#define hugepd_none(hpd) ((hpd).pd == 0) + static inline pte_t *hugepd_page(hugepd_t hpd) { - BUG_ON(!(hpd.pd & HUGEPD_OK)); - return (pte_t *)(hpd.pd & ~HUGEPD_OK); + BUG_ON(!hugepd_ok(hpd)); + return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000); } -static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, - struct hstate *hstate) +static inline unsigned int hugepd_shift(hugepd_t hpd) { - unsigned int shift = huge_page_shift(hstate); - int psize = shift_to_mmu_psize(shift); - unsigned long idx = ((addr >> shift) & (PTRS_PER_HUGEPTE(psize)-1)); + return hpd.pd & HUGEPD_SHIFT_MASK; +} + +static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift) +{ + unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp); pte_t *dir = hugepd_page(*hpdp); return dir + idx; } +pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift) +{ + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pdshift = PGDIR_SHIFT; + + if (shift) + *shift = 0; + + pg = pgdir + pgd_index(ea); + if (is_hugepd(pg)) { + hpdp = (hugepd_t *)pg; + } else if (!pgd_none(*pg)) { + pdshift = PUD_SHIFT; + pu = pud_offset(pg, ea); + if (is_hugepd(pu)) + hpdp = (hugepd_t *)pu; + else if (!pud_none(*pu)) { + pdshift = PMD_SHIFT; + pm = pmd_offset(pu, ea); + if (is_hugepd(pm)) + hpdp = (hugepd_t *)pm; + else if (!pmd_none(*pm)) { + return pte_offset_map(pm, ea); + } + } + } + + if (!hpdp) + return NULL; + + if (shift) + *shift = hugepd_shift(*hpdp); + return hugepte_offset(hpdp, ea, pdshift); +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + return find_linux_pte_or_hugepte(mm->pgd, addr, NULL); +} + static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp, - unsigned long address, unsigned int psize) + unsigned long address, unsigned pdshift, unsigned pshift) { - pte_t *new = kmem_cache_zalloc(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], - GFP_KERNEL|__GFP_REPEAT); + pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift), + GFP_KERNEL|__GFP_REPEAT); + + BUG_ON(pshift > HUGEPD_SHIFT_MASK); + BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK); if (! 
new) return -ENOMEM; spin_lock(&mm->page_table_lock); if (!hugepd_none(*hpdp)) - kmem_cache_free(pgtable_cache[HUGE_PGTABLE_INDEX(psize)], new); + kmem_cache_free(PGT_CACHE(pdshift - pshift), new); else - hpdp->pd = (unsigned long)new | HUGEPD_OK; + hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift; spin_unlock(&mm->page_table_lock); return 0; } - -static pud_t *hpud_offset(pgd_t *pgd, unsigned long addr, struct hstate *hstate) +pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - if (huge_page_shift(hstate) < PUD_SHIFT) - return pud_offset(pgd, addr); - else - return (pud_t *) pgd; -} -static pud_t *hpud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, - struct hstate *hstate) -{ - if (huge_page_shift(hstate) < PUD_SHIFT) - return pud_alloc(mm, pgd, addr); - else - return (pud_t *) pgd; -} -static pmd_t *hpmd_offset(pud_t *pud, unsigned long addr, struct hstate *hstate) -{ - if (huge_page_shift(hstate) < PMD_SHIFT) - return pmd_offset(pud, addr); - else - return (pmd_t *) pud; -} -static pmd_t *hpmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long addr, - struct hstate *hstate) -{ - if (huge_page_shift(hstate) < PMD_SHIFT) - return pmd_alloc(mm, pud, addr); - else - return (pmd_t *) pud; + pgd_t *pg; + pud_t *pu; + pmd_t *pm; + hugepd_t *hpdp = NULL; + unsigned pshift = __ffs(sz); + unsigned pdshift = PGDIR_SHIFT; + + addr &= ~(sz-1); + + pg = pgd_offset(mm, addr); + if (pshift >= PUD_SHIFT) { + hpdp = (hugepd_t *)pg; + } else { + pdshift = PUD_SHIFT; + pu = pud_alloc(mm, pg, addr); + if (pshift >= PMD_SHIFT) { + hpdp = (hugepd_t *)pu; + } else { + pdshift = PMD_SHIFT; + pm = pmd_alloc(mm, pu, addr); + hpdp = (hugepd_t *)pm; + } + } + + if (!hpdp) + return NULL; + + BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp)); + + if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) + return NULL; + + return hugepte_offset(hpdp, addr, pdshift); } /* Build list of addresses of gigantic pages. This function is used in early @@ -192,94 +199,38 @@ int alloc_bootmem_huge_page(struct hstate *hstate) return 1; } - -/* Modelled after find_linux_pte() */ -pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - - unsigned int psize; - unsigned int shift; - unsigned long sz; - struct hstate *hstate; - psize = get_slice_psize(mm, addr); - shift = mmu_psize_to_shift(psize); - sz = ((1UL) << shift); - hstate = size_to_hstate(sz); - - addr &= hstate->mask; - - pg = pgd_offset(mm, addr); - if (!pgd_none(*pg)) { - pu = hpud_offset(pg, addr, hstate); - if (!pud_none(*pu)) { - pm = hpmd_offset(pu, addr, hstate); - if (!pmd_none(*pm)) - return hugepte_offset((hugepd_t *)pm, addr, - hstate); - } - } - - return NULL; -} - -pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) -{ - pgd_t *pg; - pud_t *pu; - pmd_t *pm; - hugepd_t *hpdp = NULL; - struct hstate *hstate; - unsigned int psize; - hstate = size_to_hstate(sz); - - psize = get_slice_psize(mm, addr); - BUG_ON(!mmu_huge_psizes[psize]); - - addr &= hstate->mask; - - pg = pgd_offset(mm, addr); - pu = hpud_alloc(mm, pg, addr, hstate); - - if (pu) { - pm = hpmd_alloc(mm, pu, addr, hstate); - if (pm) - hpdp = (hugepd_t *)pm; - } - - if (! 
hpdp) - return NULL; - - if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, psize)) - return NULL; - - return hugepte_offset(hpdp, addr, hstate); -} - int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) { return 0; } -static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp, - unsigned int psize) +static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift, + unsigned long start, unsigned long end, + unsigned long floor, unsigned long ceiling) { pte_t *hugepte = hugepd_page(*hpdp); + unsigned shift = hugepd_shift(*hpdp); + unsigned long pdmask = ~((1UL << pdshift) - 1); + + start &= pdmask; + if (start < floor) + return; + if (ceiling) { + ceiling &= pdmask; + if (! ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; hpdp->pd = 0; tlb->need_flush = 1; - pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, - HUGEPTE_CACHE_NUM+psize-1, - PGF_CACHENUM_MASK)); + pgtable_free_tlb(tlb, hugepte, pdshift - shift); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, - unsigned long floor, unsigned long ceiling, - unsigned int psize) + unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; @@ -291,7 +242,8 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) continue; - free_hugepte_range(tlb, (hugepd_t *)pmd, psize); + free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT, + addr, next, floor, ceiling); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -317,23 +269,19 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud_t *pud; unsigned long next; unsigned long start; - unsigned int shift; - unsigned int psize = get_slice_psize(tlb->mm, addr); - shift = mmu_psize_to_shift(psize); start = addr; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); - if (shift < PMD_SHIFT) { + if (!is_hugepd(pud)) { if (pud_none_or_clear_bad(pud)) continue; hugetlb_free_pmd_range(tlb, pud, addr, next, floor, - ceiling, psize); + ceiling); } else { - if (pud_none(*pud)) - continue; - free_hugepte_range(tlb, (hugepd_t *)pud, psize); + free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT, + addr, next, floor, ceiling); } } while (pud++, addr = next, addr != end); @@ -364,121 +312,56 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb, { pgd_t *pgd; unsigned long next; - unsigned long start; /* - * Comments below take from the normal free_pgd_range(). They - * apply here too. The tests against HUGEPD_MASK below are - * essential, because we *don't* test for this at the bottom - * level. Without them we'll attempt to free a hugepte table - * when we unmap just part of it, even if there are other - * active mappings using it. - * - * The next few lines have given us lots of grief... + * Because there are a number of different possible pagetable + * layouts for hugepage ranges, we limit knowledge of how + * things should be laid out to the allocation path + * (huge_pte_alloc(), above). Everything else works out the + * structure as it goes from information in the hugepd + * pointers. That means that we can't here use the + * optimization used in the normal page free_pgd_range(), of + * checking whether we're actually covering a large enough + * range to have to do anything at the top level of the walk + * instead of at the bottom. * - * Why are we testing HUGEPD* at this top level? 
Because - * often there will be no work to do at all, and we'd prefer - * not to go all the way down to the bottom just to discover - * that. - * - * Why all these "- 1"s? Because 0 represents both the bottom - * of the address space and the top of it (using -1 for the - * top wouldn't help much: the masks would do the wrong thing). - * The rule is that addr 0 and floor 0 refer to the bottom of - * the address space, but end 0 and ceiling 0 refer to the top - * Comparisons need to use "end - 1" and "ceiling - 1" (though - * that end 0 case should be mythical). - * - * Wherever addr is brought up or ceiling brought down, we - * must be careful to reject "the opposite 0" before it - * confuses the subsequent tests. But what about where end is - * brought down by HUGEPD_SIZE below? no, end can't go down to - * 0 there. - * - * Whereas we round start (addr) and ceiling down, by different - * masks at different levels, in order to test whether a table - * now has no other vmas using it, so can be freed, we don't - * bother to round floor or end up - the tests don't need that. + * To make sense of this, you should probably go read the big + * block comment at the top of the normal free_pgd_range(), + * too. */ - unsigned int psize = get_slice_psize(tlb->mm, addr); - - addr &= HUGEPD_MASK(psize); - if (addr < floor) { - addr += HUGEPD_SIZE(psize); - if (!addr) - return; - } - if (ceiling) { - ceiling &= HUGEPD_MASK(psize); - if (!ceiling) - return; - } - if (end - 1 > ceiling - 1) - end -= HUGEPD_SIZE(psize); - if (addr > end - 1) - return; - start = addr; pgd = pgd_offset(tlb->mm, addr); do { - psize = get_slice_psize(tlb->mm, addr); - BUG_ON(!mmu_huge_psizes[psize]); next = pgd_addr_end(addr, end); - if (mmu_psize_to_shift(psize) < PUD_SHIFT) { + if (!is_hugepd(pgd)) { if (pgd_none_or_clear_bad(pgd)) continue; hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling); } else { - if (pgd_none(*pgd)) - continue; - free_hugepte_range(tlb, (hugepd_t *)pgd, psize); + free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT, + addr, next, floor, ceiling); } } while (pgd++, addr = next, addr != end); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - if (pte_present(*ptep)) { - /* We open-code pte_clear because we need to pass the right - * argument to hpte_need_flush (huge / !huge). Might not be - * necessary anymore if we make hpte_need_flush() get the - * page size from the slices - */ - unsigned int psize = get_slice_psize(mm, addr); - unsigned int shift = mmu_psize_to_shift(psize); - unsigned long sz = ((1UL) << shift); - struct hstate *hstate = size_to_hstate(sz); - pte_update(mm, addr & hstate->mask, ptep, ~0UL, 1); - } - *ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); -} - -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1); - return __pte(old); -} - struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { pte_t *ptep; struct page *page; - unsigned int mmu_psize = get_slice_psize(mm, address); + unsigned shift; + unsigned long mask; + + ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift); /* Verify it is a huge page else bail. 
*/ - if (!mmu_huge_psizes[mmu_psize]) + if (!ptep || !shift) return ERR_PTR(-EINVAL); - ptep = huge_pte_offset(mm, address); + mask = (1UL << shift) - 1; page = pte_page(*ptep); - if (page) { - unsigned int shift = mmu_psize_to_shift(mmu_psize); - unsigned long sz = ((1UL) << shift); - page += (address % sz) / PAGE_SIZE; - } + if (page) + page += (address & mask) / PAGE_SIZE; return page; } @@ -501,6 +384,73 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + unsigned long pte_end; + struct page *head, *page; + pte_t pte; + int refs; + + pte_end = (addr + sz) & ~(sz-1); + if (pte_end < end) + end = pte_end; + + pte = *ptep; + mask = _PAGE_PRESENT | _PAGE_USER; + if (write) + mask |= _PAGE_RW; + + if ((pte_val(pte) & mask) != mask) + return 0; + + /* hugepages are never "special" */ + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + + page = head + ((addr & (sz-1)) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + + if (!page_cache_add_speculative(head, refs)) { + *nr -= refs; + return 0; + } + + if (unlikely(pte_val(pte) != pte_val(*ptep))) { + /* Could be optimized better */ + while (*nr) { + put_page(page); + (*nr)--; + } + } + + return 1; +} + +int gup_hugepd(hugepd_t *hugepd, unsigned pdshift, + unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + pte_t *ptep; + unsigned long sz = 1UL << hugepd_shift(*hugepd); + + ptep = hugepte_offset(hugepd, addr, pdshift); + do { + if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr)) + return 0; + } while (ptep++, addr += sz, addr != end); + + return 1; +} unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, @@ -509,8 +459,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *hstate = hstate_file(file); int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); - if (!mmu_huge_psizes[mmu_psize]) - return -EINVAL; return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0); } @@ -521,229 +469,46 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(psize); } -/* - * Called by asm hashtable.S for doing lazy icache flush - */ -static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags, - pte_t pte, int trap, unsigned long sz) +static int __init add_huge_page_size(unsigned long long size) { - struct page *page; - int i; - - if (!pfn_valid(pte_pfn(pte))) - return rflags; - - page = pte_page(pte); - - /* page is dirty */ - if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) { - if (trap == 0x400) { - for (i = 0; i < (sz / PAGE_SIZE); i++) - __flush_dcache_icache(page_address(page+i)); - set_bit(PG_arch_1, &page->flags); - } else { - rflags |= HPTE_R_N; - } - } - return rflags; -} + int shift = __ffs(size); + int mmu_psize; -int hash_huge_page(struct mm_struct *mm, unsigned long access, - unsigned long ea, unsigned long vsid, int local, - unsigned long trap) -{ - pte_t *ptep; - unsigned long old_pte, new_pte; - unsigned long va, rflags, pa, sz; - long slot; - int err = 1; - int ssize = user_segment_size(ea); - unsigned int mmu_psize; - int shift; - mmu_psize = get_slice_psize(mm, ea); - - if 
(!mmu_huge_psizes[mmu_psize]) - goto out; - ptep = huge_pte_offset(mm, ea); - - /* Search the Linux page table for a match with va */ - va = hpt_va(ea, vsid, ssize); + /* Check that it is a page size supported by the hardware and + * that it fits within pagetable and slice limits. */ + if (!is_power_of_2(size) + || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT)) + return -EINVAL; - /* - * If no pte found or not present, send the problem up to - * do_page_fault - */ - if (unlikely(!ptep || pte_none(*ptep))) - goto out; + if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) + return -EINVAL; - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. - */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; - /* - * At this point, we have a pte (old_pte) which can be used to build - * or update an HPTE. There are 2 cases: - * - * 1. There is a valid (present) pte with no associated HPTE (this is - * the most common case) - * 2. There is a valid (present) pte with an associated HPTE. The - * current values of the pp bits in the HPTE prevent access - * because we are doing software DIRTY bit management and the - * page is currently not DIRTY. +#ifdef CONFIG_SPU_FS_64K_LS + /* Disable support for 64K huge pages when 64K SPU local store + * support is enabled as the current implementation conflicts. */ + if (shift == PAGE_SHIFT_64K) + return -EINVAL; +#endif /* CONFIG_SPU_FS_64K_LS */ + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); - do { - old_pte = pte_val(*ptep); - if (old_pte & _PAGE_BUSY) - goto out; - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, - old_pte, new_pte)); - - rflags = 0x2 | (!(new_pte & _PAGE_RW)); - /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */ - rflags |= ((new_pte & _PAGE_EXEC) ? 
0 : HPTE_R_N); - shift = mmu_psize_to_shift(mmu_psize); - sz = ((1UL) << shift); - if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) - /* No CPU has hugepages but lacks no execute, so we - * don't need to worry about that case */ - rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte), - trap, sz); - - /* Check if pte already has an hpte (case 2) */ - if (unlikely(old_pte & _PAGE_HASHPTE)) { - /* There MIGHT be an HPTE for this pte */ - unsigned long hash, slot; - - hash = hpt_hash(va, shift, ssize); - if (old_pte & _PAGE_F_SECOND) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & _PAGE_F_GIX) >> 12; - - if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize, - ssize, local) == -1) - old_pte &= ~_PAGE_HPTEFLAGS; - } - - if (likely(!(old_pte & _PAGE_HASHPTE))) { - unsigned long hash = hpt_hash(va, shift, ssize); - unsigned long hpte_group; - - pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; - -repeat: - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - - /* clear HPTE slot informations in new PTE */ -#ifdef CONFIG_PPC_64K_PAGES - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0; -#else - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; -#endif - /* Add in WIMG bits */ - rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | - _PAGE_COHERENT | _PAGE_GUARDED)); - - /* Insert into the hash table, primary slot */ - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0, - mmu_psize, ssize); - - /* Primary is full, try the secondary */ - if (unlikely(slot == -1)) { - hpte_group = ((~hash & htab_hash_mask) * - HPTES_PER_GROUP) & ~0x7UL; - slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, - HPTE_V_SECONDARY, - mmu_psize, ssize); - if (slot == -1) { - if (mftb() & 0x1) - hpte_group = ((hash & htab_hash_mask) * - HPTES_PER_GROUP)&~0x7UL; - - ppc_md.hpte_remove(hpte_group); - goto repeat; - } - } - - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); - - new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); - } - - /* - * No need to use ldarx/stdcx here - */ - *ptep = __pte(new_pte & ~_PAGE_BUSY); + /* Return if huge page size has already been setup */ + if (size_to_hstate(size)) + return 0; - err = 0; + hugetlb_add_hstate(shift - PAGE_SHIFT); - out: - return err; -} - -static void __init set_huge_psize(int psize) -{ - /* Check that it is a page size supported by the hardware and - * that it fits within pagetable limits. */ - if (mmu_psize_defs[psize].shift && - mmu_psize_defs[psize].shift < SID_SHIFT_1T && - (mmu_psize_defs[psize].shift > MIN_HUGEPTE_SHIFT || - mmu_psize_defs[psize].shift == PAGE_SHIFT_64K || - mmu_psize_defs[psize].shift == PAGE_SHIFT_16G)) { - /* Return if huge page size has already been setup or is the - * same as the base page size. 
*/ - if (mmu_huge_psizes[psize] || - mmu_psize_defs[psize].shift == PAGE_SHIFT) - return; - if (WARN_ON(HUGEPTE_CACHE_NAME(psize) == NULL)) - return; - hugetlb_add_hstate(mmu_psize_defs[psize].shift - PAGE_SHIFT); - - switch (mmu_psize_defs[psize].shift) { - case PAGE_SHIFT_64K: - /* We only allow 64k hpages with 4k base page, - * which was checked above, and always put them - * at the PMD */ - hugepte_shift[psize] = PMD_SHIFT; - break; - case PAGE_SHIFT_16M: - /* 16M pages can be at two different levels - * of pagestables based on base page size */ - if (PAGE_SHIFT == PAGE_SHIFT_64K) - hugepte_shift[psize] = PMD_SHIFT; - else /* 4k base page */ - hugepte_shift[psize] = PUD_SHIFT; - break; - case PAGE_SHIFT_16G: - /* 16G pages are always at PGD level */ - hugepte_shift[psize] = PGDIR_SHIFT; - break; - } - hugepte_shift[psize] -= mmu_psize_defs[psize].shift; - } else - hugepte_shift[psize] = 0; + return 0; } static int __init hugepage_setup_sz(char *str) { unsigned long long size; - int mmu_psize; - int shift; size = memparse(str, &str); - shift = __ffs(size); - mmu_psize = shift_to_mmu_psize(shift); - if (mmu_psize >= 0 && mmu_psize_defs[mmu_psize].shift) - set_huge_psize(mmu_psize); - else + if (add_huge_page_size(size) != 0) printk(KERN_WARNING "Invalid huge page size specified(%llu)\n", size); return 1; @@ -752,41 +517,55 @@ __setup("hugepagesz=", hugepage_setup_sz); static int __init hugetlbpage_init(void) { - unsigned int psize; + int psize; if (!cpu_has_feature(CPU_FTR_16M_PAGE)) return -ENODEV; - /* Add supported huge page sizes. Need to change HUGE_MAX_HSTATE - * and adjust PTE_NONCACHE_NUM if the number of supported huge page - * sizes changes. - */ - set_huge_psize(MMU_PAGE_16M); - set_huge_psize(MMU_PAGE_16G); + for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { + unsigned shift; + unsigned pdshift; - /* Temporarily disable support for 64K huge pages when 64K SPU local - * store support is enabled as the current implementation conflicts. - */ -#ifndef CONFIG_SPU_FS_64K_LS - set_huge_psize(MMU_PAGE_64K); -#endif + if (!mmu_psize_defs[psize].shift) + continue; - for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { - if (mmu_huge_psizes[psize]) { - pgtable_cache[HUGE_PGTABLE_INDEX(psize)] = - kmem_cache_create( - HUGEPTE_CACHE_NAME(psize), - HUGEPTE_TABLE_SIZE(psize), - HUGEPTE_TABLE_SIZE(psize), - 0, - NULL); - if (!pgtable_cache[HUGE_PGTABLE_INDEX(psize)]) - panic("hugetlbpage_init(): could not create %s"\ - "\n", HUGEPTE_CACHE_NAME(psize)); - } + shift = mmu_psize_to_shift(psize); + + if (add_huge_page_size(1ULL << shift) < 0) + continue; + + if (shift < PMD_SHIFT) + pdshift = PMD_SHIFT; + else if (shift < PUD_SHIFT) + pdshift = PUD_SHIFT; + else + pdshift = PGDIR_SHIFT; + + pgtable_cache_add(pdshift - shift, NULL); + if (!PGT_CACHE(pdshift - shift)) + panic("hugetlbpage_init(): could not create " + "pgtable cache for %d bit pagesize\n", shift); } + /* Set default large page size. 
Currently, we pick 16M or 1M + * depending on what is available + */ + if (mmu_psize_defs[MMU_PAGE_16M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; + else if (mmu_psize_defs[MMU_PAGE_1M].shift) + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; + return 0; } module_init(hugetlbpage_init); + +void flush_dcache_icache_hugepage(struct page *page) +{ + int i; + + BUG_ON(!PageCompound(page)); + + for (i = 0; i < (1UL << compound_order(page)); i++) + __flush_dcache_icache(page_address(page+i)); +} diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 335c578b9cc..776f28d02b6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -41,6 +41,7 @@ #include <linux/module.h> #include <linux/poison.h> #include <linux/lmb.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -119,30 +120,63 @@ static void pmd_ctor(void *addr) memset(addr, 0, PMD_TABLE_SIZE); } -static const unsigned int pgtable_cache_size[2] = { - PGD_TABLE_SIZE, PMD_TABLE_SIZE -}; -static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = { -#ifdef CONFIG_PPC_64K_PAGES - "pgd_cache", "pmd_cache", -#else - "pgd_cache", "pud_pmd_cache", -#endif /* CONFIG_PPC_64K_PAGES */ -}; - -#ifdef CONFIG_HUGETLB_PAGE -/* Hugepages need an extra cache per hugepagesize, initialized in - * hugetlbpage.c. We can't put into the tables above, because HPAGE_SHIFT - * is not compile time constant. */ -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+MMU_PAGE_COUNT]; -#else -struct kmem_cache *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)]; -#endif +struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; + +/* + * Create a kmem_cache() for pagetables. This is not used for PTE + * pages - they're linked to struct page, come from the normal free + * pages pool and have a different entry size (see real_pte_t) to + * everything else. Caches created by this function are used for all + * the higher level pagetables, and for hugepage pagetables. + */ +void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) +{ + char *name; + unsigned long table_size = sizeof(void *) << shift; + unsigned long align = table_size; + + /* When batching pgtable pointers for RCU freeing, we store + * the index size in the low bits. Table alignment must be + * big enough to fit it. + * + * Likewise, hugeapge pagetable pointers contain a (different) + * shift value in the low bits. All tables must be aligned so + * as to leave enough 0 bits in the address to contain it. */ + unsigned long minalign = max(MAX_PGTABLE_INDEX_SIZE + 1, + HUGEPD_SHIFT_MASK + 1); + struct kmem_cache *new; + + /* It would be nice if this was a BUILD_BUG_ON(), but at the + * moment, gcc doesn't seem to recognize is_power_of_2 as a + * constant expression, so so much for that. 
*/ + BUG_ON(!is_power_of_2(minalign)); + BUG_ON((shift < 1) || (shift > MAX_PGTABLE_INDEX_SIZE)); + + if (PGT_CACHE(shift)) + return; /* Already have a cache of this size */ + + align = max_t(unsigned long, align, minalign); + name = kasprintf(GFP_KERNEL, "pgtable-2^%d", shift); + new = kmem_cache_create(name, table_size, align, 0, ctor); + PGT_CACHE(shift) = new; + + pr_debug("Allocated pgtable cache for order %d\n", shift); +} + void pgtable_cache_init(void) { - pgtable_cache[0] = kmem_cache_create(pgtable_cache_name[0], PGD_TABLE_SIZE, PGD_TABLE_SIZE, SLAB_PANIC, pgd_ctor); - pgtable_cache[1] = kmem_cache_create(pgtable_cache_name[1], PMD_TABLE_SIZE, PMD_TABLE_SIZE, SLAB_PANIC, pmd_ctor); + pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor); + pgtable_cache_add(PMD_INDEX_SIZE, pmd_ctor); + if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_INDEX_SIZE)) + panic("Couldn't allocate pgtable caches"); + + /* In all current configs, when the PUD index exists it's the + * same size as either the pgd or pmd index. Verify that the + * initialization above has also created a PUD cache. This + * will need re-examiniation if we add new possibilities for + * the pagetable layout. */ + BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 59736317bf0..b9b152558f9 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -32,6 +32,7 @@ #include <linux/pagemap.h> #include <linux/suspend.h> #include <linux/lmb.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/prom.h> @@ -417,18 +418,26 @@ EXPORT_SYMBOL(flush_dcache_page); void flush_dcache_icache_page(struct page *page) { +#ifdef CONFIG_HUGETLB_PAGE + if (PageCompound(page)) { + flush_dcache_icache_hugepage(page); + return; + } +#endif #ifdef CONFIG_BOOKE - void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); - __flush_dcache_icache(start); - kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + { + void *start = kmap_atomic(page, KM_PPC_SYNC_ICACHE); + __flush_dcache_icache(start); + kunmap_atomic(start, KM_PPC_SYNC_ICACHE); + } #elif defined(CONFIG_8xx) || defined(CONFIG_PPC64) /* On 8xx there is no need to kmap since highmem is not supported */ __flush_dcache_icache(page_address(page)); #else __flush_dcache_icache_phys(page_to_pfn(page) << PAGE_SHIFT); #endif - } + void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c index dbeb86ac90c..b9e4cc2c205 100644 --- a/arch/powerpc/mm/mmu_context_hash64.c +++ b/arch/powerpc/mm/mmu_context_hash64.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/idr.h> +#include <linux/module.h> #include <asm/mmu_context.h> @@ -32,7 +33,7 @@ static DEFINE_IDR(mmu_context_idr); #define NO_CONTEXT 0 #define MAX_CONTEXT ((1UL << 19) - 1) -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +int __init_new_context(void) { int index; int err; @@ -57,6 +58,18 @@ again: return -ENOMEM; } + return index; +} +EXPORT_SYMBOL_GPL(__init_new_context); + +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + int index; + + index = __init_new_context(); + if (index < 0) + return index; + /* The old code would re-promote on fork, we don't do that * when using slices as it could cause problem promoting slices * that have been forced down to 4K @@ -68,11 +81,16 @@ again: return 0; } -void destroy_context(struct mm_struct *mm) +void 
__destroy_context(int context_id) { spin_lock(&mmu_context_lock); - idr_remove(&mmu_context_idr, mm->context.id); + idr_remove(&mmu_context_idr, context_id); spin_unlock(&mmu_context_lock); +} +EXPORT_SYMBOL_GPL(__destroy_context); +void destroy_context(struct mm_struct *mm) +{ + __destroy_context(mm->context.id); mm->context.id = NO_CONTEXT; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 53040931de3..99df697c601 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -49,12 +49,12 @@ struct pte_freelist_batch { struct rcu_head rcu; unsigned int index; - pgtable_free_t tables[0]; + unsigned long tables[0]; }; #define PTE_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \ - / sizeof(pgtable_free_t)) + / sizeof(unsigned long)) static void pte_free_smp_sync(void *arg) { @@ -64,13 +64,13 @@ static void pte_free_smp_sync(void *arg) /* This is only called when we are critically out of memory * (and fail to get a page in pte_free_tlb). */ -static void pgtable_free_now(pgtable_free_t pgf) +static void pgtable_free_now(void *table, unsigned shift) { pte_freelist_forced_free++; smp_call_function(pte_free_smp_sync, NULL, 1); - pgtable_free(pgf); + pgtable_free(table, shift); } static void pte_free_rcu_callback(struct rcu_head *head) @@ -79,8 +79,12 @@ static void pte_free_rcu_callback(struct rcu_head *head) container_of(head, struct pte_freelist_batch, rcu); unsigned int i; - for (i = 0; i < batch->index; i++) - pgtable_free(batch->tables[i]); + for (i = 0; i < batch->index; i++) { + void *table = (void *)(batch->tables[i] & ~MAX_PGTABLE_INDEX_SIZE); + unsigned shift = batch->tables[i] & MAX_PGTABLE_INDEX_SIZE; + + pgtable_free(table, shift); + } free_page((unsigned long)batch); } @@ -91,25 +95,28 @@ static void pte_free_submit(struct pte_freelist_batch *batch) call_rcu(&batch->rcu, pte_free_rcu_callback); } -void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) +void pgtable_free_tlb(struct mmu_gather *tlb, void *table, unsigned shift) { /* This is safe since tlb_gather_mmu has disabled preemption */ struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); + unsigned long pgf; if (atomic_read(&tlb->mm->mm_users) < 2 || cpumask_equal(mm_cpumask(tlb->mm), cpumask_of(smp_processor_id()))){ - pgtable_free(pgf); + pgtable_free(table, shift); return; } if (*batchp == NULL) { *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC); if (*batchp == NULL) { - pgtable_free_now(pgf); + pgtable_free_now(table, shift); return; } (*batchp)->index = 0; } + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); + pgf = (unsigned long)table | shift; (*batchp)->tables[(*batchp)->index++] = pgf; if ((*batchp)->index == PTE_FREELIST_SIZE) { pte_free_submit(*batchp); diff --git a/arch/powerpc/mm/tlb_hash64.c b/arch/powerpc/mm/tlb_hash64.c index 2b2f35f6985..282d9306361 100644 --- a/arch/powerpc/mm/tlb_hash64.c +++ b/arch/powerpc/mm/tlb_hash64.c @@ -53,11 +53,6 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, i = batch->index; - /* We mask the address for the base page size. Huge pages will - * have applied their own masking already - */ - addr &= PAGE_MASK; - /* Get page size (maybe move back to caller). 
* * NOTE: when using special 64K mappings in 4K environment like @@ -75,6 +70,9 @@ void hpte_need_flush(struct mm_struct *mm, unsigned long addr, } else psize = pte_pagesize_index(mm, addr, pte); + /* Mask the address for the correct page size */ + addr &= ~((1UL << mmu_psize_defs[psize].shift) - 1); + /* Build full vaddr */ if (!is_kernel_addr(addr)) { ssize = user_segment_size(addr); diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index a6ce8056662..cd70ee1667f 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -132,7 +132,7 @@ static int cpld_pic_host_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hw) { - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, &cpld_pic, handle_level_irq); return 0; } diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index 68e4f1696d1..47802035874 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -114,7 +114,7 @@ void media5200_irq_cascade(unsigned int virq, struct irq_desc *desc) static int media5200_irq_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hw) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); pr_debug("%s: h=%p, virq=%i, hwirq=%i\n", __func__, h, virq, (int)hw); set_irq_chip_data(virq, &media5200_irq); diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index 7ee979f323d..a682331ba0f 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -107,7 +107,7 @@ static void pq2ads_pci_irq_demux(unsigned int irq, struct irq_desc *desc) static int pci_pic_host_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hw) { - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_data(virq, h->host_data); set_irq_chip_and_handler(virq, &pq2ads_pci_ic, handle_level_irq); return 0; diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c index 60edf63d015..e59920aa666 100644 --- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c +++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c @@ -245,7 +245,7 @@ static int socrates_fpga_pic_host_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hwirq) { /* All interrupts are LEVEL sensitive */ - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, &socrates_fpga_pic_chip, handle_fasteoi_irq); diff --git a/arch/powerpc/platforms/86xx/gef_pic.c b/arch/powerpc/platforms/86xx/gef_pic.c index 50d0a2b6380..978d6cb3751 100644 --- a/arch/powerpc/platforms/86xx/gef_pic.c +++ b/arch/powerpc/platforms/86xx/gef_pic.c @@ -163,7 +163,7 @@ static int gef_pic_host_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hwirq) { /* All interrupts are LEVEL sensitive */ - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, &gef_pic_chip, handle_level_irq); return 0; diff --git a/arch/powerpc/platforms/8xx/m8xx_setup.c b/arch/powerpc/platforms/8xx/m8xx_setup.c index 385acfc4839..242954c4293 100644 --- a/arch/powerpc/platforms/8xx/m8xx_setup.c +++ b/arch/powerpc/platforms/8xx/m8xx_setup.c @@ -222,7 +222,7 @@ static void cpm_cascade(unsigned int irq, struct 
irq_desc *desc) int cascade_irq; if ((cascade_irq = cpm_get_irq()) >= 0) { - struct irq_desc *cdesc = irq_desc + cascade_irq; + struct irq_desc *cdesc = irq_to_desc(cascade_irq); generic_handle_irq(cascade_irq); cdesc->chip->eoi(cascade_irq); diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 04a8061045c..56bf12692f3 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -86,6 +86,11 @@ config RTAS_ERROR_LOGGING depends on PPC_RTAS default n +config PPC_RTAS_DAEMON + bool + depends on PPC_RTAS + default n + config RTAS_PROC bool "Proc interface to RTAS" depends on PPC_RTAS diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 72254848a22..4a2bbff5769 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -136,7 +136,7 @@ static void beatic_pic_host_unmap(struct irq_host *h, unsigned int virq) static int beatic_pic_host_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hw) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); int64_t err; err = beat_construct_and_connect_irq_plug(virq, hw); diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 4e5655624ae..9dd63c5d11a 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -102,7 +102,7 @@ static void spider_ack_irq(unsigned int virq) /* Reset edge detection logic if necessary */ - if (get_irq_desc(virq)->status & IRQ_LEVEL) + if (irq_to_desc(virq)->status & IRQ_LEVEL) return; /* Only interrupts 47 to 50 can be set to edge */ @@ -119,7 +119,7 @@ static int spider_set_irq_type(unsigned int virq, unsigned int type) struct spider_pic *pic = spider_virq_to_pic(virq); unsigned int hw = irq_map[virq].hwirq; void __iomem *cfg = spider_get_irq_config(pic, hw); - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); u32 old_mask; u32 ic; diff --git a/arch/powerpc/platforms/chrp/Kconfig b/arch/powerpc/platforms/chrp/Kconfig index 37d438bd5b7..bc0b0efdc5f 100644 --- a/arch/powerpc/platforms/chrp/Kconfig +++ b/arch/powerpc/platforms/chrp/Kconfig @@ -5,6 +5,8 @@ config PPC_CHRP select PPC_I8259 select PPC_INDIRECT_PCI select PPC_RTAS + select PPC_RTAS_DAEMON + select RTAS_ERROR_LOGGING select PPC_MPC106 select PPC_UDBG_16550 select PPC_NATIVE diff --git a/arch/powerpc/platforms/chrp/setup.c b/arch/powerpc/platforms/chrp/setup.c index cd4ad9aea76..52f3df3b4ca 100644 --- a/arch/powerpc/platforms/chrp/setup.c +++ b/arch/powerpc/platforms/chrp/setup.c @@ -364,19 +364,6 @@ void __init chrp_setup_arch(void) if (ppc_md.progress) ppc_md.progress("Linux/PPC "UTS_RELEASE"\n", 0x0); } -void -chrp_event_scan(unsigned long unused) -{ - unsigned char log[1024]; - int ret = 0; - - /* XXX: we should loop until the hardware says no more error logs -- Cort */ - rtas_call(rtas_token("event-scan"), 4, 1, &ret, 0xffffffff, 0, - __pa(log), 1024); - mod_timer(&__get_cpu_var(heartbeat_timer), - jiffies + event_scan_interval); -} - static void chrp_8259_cascade(unsigned int irq, struct irq_desc *desc) { unsigned int cascade_irq = i8259_irq(); @@ -568,9 +555,6 @@ void __init chrp_init_IRQ(void) void __init chrp_init2(void) { - struct device_node *device; - const unsigned int *p = NULL; - #ifdef CONFIG_NVRAM chrp_nvram_init(); #endif @@ -582,40 +566,6 @@ chrp_init2(void) request_region(0x80,0x10,"dma page reg"); 
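(Editorial aside: the cascade handlers converted in this patch, such as cpm_cascade() above, share one shape. A minimal sketch of that shape follows, with hypothetical example_* names and an assumed example_get_irq() helper; the point is that the secondary descriptor is obtained through irq_to_desc() rather than by indexing a flat irq_desc[] array, which cannot be relied on once sparse IRQ numbering is in use.)

/* needs <linux/irq.h>; example_* identifiers are illustrative only */
static void example_cascade(unsigned int irq, struct irq_desc *desc)
{
	int cascade_irq = example_get_irq();	/* assumed: query the secondary PIC */

	if (cascade_irq >= 0) {
		struct irq_desc *cdesc = irq_to_desc(cascade_irq);

		generic_handle_irq(cascade_irq);
		cdesc->chip->eoi(cascade_irq);
	}
}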
request_region(0xc0,0x20,"dma2"); - /* Get the event scan rate for the rtas so we know how - * often it expects a heartbeat. -- Cort - */ - device = of_find_node_by_name(NULL, "rtas"); - if (device) - p = of_get_property(device, "rtas-event-scan-rate", NULL); - if (p && *p) { - /* - * Arrange to call chrp_event_scan at least *p times - * per minute. We use 59 rather than 60 here so that - * the rate will be slightly higher than the minimum. - * This all assumes we don't do hotplug CPU on any - * machine that needs the event scans done. - */ - unsigned long interval, offset; - int cpu, ncpus; - struct timer_list *timer; - - interval = HZ * 59 / *p; - offset = HZ; - ncpus = num_online_cpus(); - event_scan_interval = ncpus * interval; - for (cpu = 0; cpu < ncpus; ++cpu) { - timer = &per_cpu(heartbeat_timer, cpu); - setup_timer(timer, chrp_event_scan, 0); - timer->expires = jiffies + offset; - add_timer_on(timer, cpu); - offset += interval; - } - printk("RTAS Event Scan Rate: %u (%lu jiffies)\n", - *p, interval); - } - of_node_put(device); - if (ppc_md.progress) ppc_md.progress(" Have fun! ", 0x7777); } diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index 94f44475883..f8446ea3118 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -214,7 +214,7 @@ void __init iSeries_activate_IRQs() unsigned long flags; for_each_irq (irq) { - struct irq_desc *desc = get_irq_desc(irq); + struct irq_desc *desc = irq_to_desc(irq); if (desc && desc->chip && desc->chip->startup) { spin_lock_irqsave(&desc->lock, flags); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index d212006a5b3..484d21e55c6 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -152,12 +152,12 @@ static unsigned int pmac_startup_irq(unsigned int virq) unsigned long bit = 1UL << (src & 0x1f); int i = src >> 5; - spin_lock_irqsave(&pmac_pic_lock, flags); - if ((irq_desc[virq].status & IRQ_LEVEL) == 0) + spin_lock_irqsave(&pmac_pic_lock, flags); + if ((irq_to_desc(virq)->status & IRQ_LEVEL) == 0) out_le32(&pmac_irq_hw[i]->ack, bit); __set_bit(src, ppc_cached_irq_mask); __pmac_set_irq_mask(src, 0); - spin_unlock_irqrestore(&pmac_pic_lock, flags); + spin_unlock_irqrestore(&pmac_pic_lock, flags); return 0; } @@ -285,7 +285,7 @@ static int pmac_pic_host_match(struct irq_host *h, struct device_node *node) static int pmac_pic_host_map(struct irq_host *h, unsigned int virq, irq_hw_number_t hw) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); int level; if (hw >= max_irqs) diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index 189a25b8073..e81b028a2a4 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -34,7 +34,7 @@ #if defined(DEBUG) #define DBG udbg_printf #else -#define DBG pr_debug +#define DBG pr_devel #endif enum { diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index f0e6f28427b..27554c807fd 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -4,6 +4,7 @@ config PPC_PSERIES select MPIC select PPC_I8259 select PPC_RTAS + select PPC_RTAS_DAEMON select RTAS_ERROR_LOGGING select PPC_UDBG_16550 select PPC_NATIVE @@ -59,7 +60,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" - depends on PPC_SMLPAR && !CRASH_DUMP + depends on PPC_SMLPAR default y help Select 
this option, if you want to enable the kernel interface diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index 790c0b872d4..4b1c422b814 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -7,7 +7,7 @@ EXTRA_CFLAGS += -DDEBUG endif obj-y := lpar.o hvCall.o nvram.o reconfig.o \ - setup.o iommu.o ras.o rtasd.o \ + setup.o iommu.o ras.o \ firmware.o power.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_XICS) += xics.o diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 6567439fe78..bcdcf0ccc8d 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -229,8 +229,9 @@ static void cmm_get_mpp(void) { int rc; struct hvcall_mpp_data mpp_data; - unsigned long active_pages_target; - signed long page_loan_request; + signed long active_pages_target, page_loan_request, target; + signed long total_pages = totalram_pages + loaned_pages; + signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -238,17 +239,25 @@ static void cmm_get_mpp(void) return; page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); - loaned_pages_target = page_loan_request + loaned_pages; - if (loaned_pages_target > oom_freed_pages) - loaned_pages_target -= oom_freed_pages; + target = page_loan_request + (signed long)loaned_pages; + + if (target < 0 || total_pages < min_mem_pages) + target = 0; + + if (target > oom_freed_pages) + target -= oom_freed_pages; else - loaned_pages_target = 0; + target = 0; + + active_pages_target = total_pages - target; + + if (min_mem_pages > active_pages_target) + target = total_pages - min_mem_pages; - active_pages_target = totalram_pages + loaned_pages - loaned_pages_target; + if (target < 0) + target = 0; - if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE)) - loaned_pages_target = totalram_pages + loaned_pages - - ((min_mem_mb * 1024 * 1024) / PAGE_SIZE); + loaned_pages_target = target; cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c index 0e8db677125..ef8e4544848 100644 --- a/arch/powerpc/platforms/pseries/eeh_driver.c +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -63,22 +63,6 @@ static void print_device_node_tree(struct pci_dn *pdn, int dent) } #endif -/** - * irq_in_use - return true if this irq is being used - */ -static int irq_in_use(unsigned int irq) -{ - int rc = 0; - unsigned long flags; - struct irq_desc *desc = irq_desc + irq; - - spin_lock_irqsave(&desc->lock, flags); - if (desc->action) - rc = 1; - spin_unlock_irqrestore(&desc->lock, flags); - return rc; -} - /** * eeh_disable_irq - disable interrupt for the recovering device */ @@ -93,7 +77,7 @@ static void eeh_disable_irq(struct pci_dev *dev) if (dev->msi_enabled || dev->msix_enabled) return; - if (!irq_in_use(dev->irq)) + if (!irq_has_action(dev->irq)) return; PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index 2e2bbe120b9..5182d2b992c 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -184,7 +184,7 @@ static int pSeries_reconfig_remove_node(struct device_node *np) } /* - * /proc/ppc64/ofdt - yucky binary interface for adding and removing + * 
/proc/powerpc/ofdt - yucky binary interface for adding and removing * OF device nodes. Should be deprecated as soon as we get an * in-kernel wrapper for the RTAS ibm,configure-connector call. */ @@ -543,7 +543,7 @@ static const struct file_operations ofdt_fops = { .write = ofdt_write }; -/* create /proc/ppc64/ofdt write-only by root */ +/* create /proc/powerpc/ofdt write-only by root */ static int proc_ppc64_create_ofdt(void) { struct proc_dir_entry *ent; @@ -551,7 +551,7 @@ static int proc_ppc64_create_ofdt(void) if (!machine_is(pseries)) return 0; - ent = proc_create("ppc64/ofdt", S_IWUSR, NULL, &ofdt_fops); + ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops); if (ent) ent->size = 0; diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c index 417eca79df6..1b45c458f95 100644 --- a/arch/powerpc/platforms/pseries/scanlog.c +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -13,7 +13,7 @@ * of this data using this driver. A dump exists if the device-tree * /chosen/ibm,scan-log-data property exists. * - * This driver exports /proc/ppc64/scan-log-dump which can be read. + * This driver exports /proc/powerpc/scan-log-dump which can be read. * The driver supports only sequential reads. * * The driver looks at a write to the driver for the single word "reset". @@ -186,7 +186,7 @@ static int __init scanlog_init(void) if (!data) goto err; - ent = proc_create_data("ppc64/rtas/scan-log-dump", S_IRUSR, NULL, + ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, &scanlog_fops, data); if (!ent) goto err; diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c index b9bf0eedccf..097e8a2a3c5 100644 --- a/arch/powerpc/platforms/pseries/xics.c +++ b/arch/powerpc/platforms/pseries/xics.c @@ -157,7 +157,7 @@ static int get_irq_server(unsigned int virq, unsigned int strict_check) cpumask_t cpumask; cpumask_t tmp = CPU_MASK_NONE; - cpumask_copy(&cpumask, irq_desc[virq].affinity); + cpumask_copy(&cpumask, irq_to_desc(virq)->affinity); if (!distribute_irqs) return default_server; @@ -428,7 +428,7 @@ static int xics_host_map(struct irq_host *h, unsigned int virq, /* Insert the interrupt mapping into the radix tree for fast lookup */ irq_radix_revmap_insert(xics_host, virq, hw); - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, xics_irq_chip, handle_fasteoi_irq); return 0; } @@ -852,7 +852,7 @@ void xics_migrate_irqs_away(void) /* We need to get IPIs still. 
*/ if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) continue; - desc = get_irq_desc(virq); + desc = irq_to_desc(virq); /* We only need to migrate enabled IRQS */ if (desc == NULL || desc->chip == NULL @@ -881,7 +881,7 @@ void xics_migrate_irqs_away(void) virq, cpu); /* Reset affinity to all cpus */ - cpumask_setall(irq_desc[virq].affinity); + cpumask_setall(irq_to_desc(virq)->affinity); desc->chip->set_affinity(virq, cpu_all_mask); unlock: spin_unlock_irqrestore(&desc->lock, flags); diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index 82424cd7e12..523537300ad 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -102,7 +102,7 @@ static int cpm_pic_host_map(struct irq_host *h, unsigned int virq, { pr_debug("cpm_pic_host_map(%d, 0x%lx)\n", virq, hw); - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, &cpm_pic, handle_fasteoi_irq); return 0; } diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c index 78f1f7cca0a..722cf72e190 100644 --- a/arch/powerpc/sysdev/cpm2_pic.c +++ b/arch/powerpc/sysdev/cpm2_pic.c @@ -115,11 +115,13 @@ static void cpm2_ack(unsigned int virq) static void cpm2_end_irq(unsigned int virq) { + struct irq_desc *desc; int bit, word; unsigned int irq_nr = virq_to_hw(virq); - if (!(irq_desc[irq_nr].status & (IRQ_DISABLED|IRQ_INPROGRESS)) - && irq_desc[irq_nr].action) { + desc = irq_to_desc(irq_nr); + if (!(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)) + && desc->action) { bit = irq_to_siubit[irq_nr]; word = irq_to_siureg[irq_nr]; @@ -138,7 +140,7 @@ static void cpm2_end_irq(unsigned int virq) static int cpm2_set_irq_type(unsigned int virq, unsigned int flow_type) { unsigned int src = virq_to_hw(virq); - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); unsigned int vold, vnew, edibit; if (flow_type == IRQ_TYPE_NONE) @@ -210,7 +212,7 @@ static int cpm2_pic_host_map(struct irq_host *h, unsigned int virq, { pr_debug("cpm2_pic_host_map(%d, 0x%lx)\n", virq, hw); - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, &cpm2_pic, handle_level_irq); return 0; } diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index da38a1ff97b..7174374f90f 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -55,7 +55,7 @@ static int fsl_msi_host_map(struct irq_host *h, unsigned int virq, { struct irq_chip *chip = &fsl_msi_chip; - get_irq_desc(virq)->status |= IRQ_TYPE_EDGE_FALLING; + irq_to_desc(virq)->status |= IRQ_TYPE_EDGE_FALLING; set_irq_chip_and_handler(virq, chip, handle_edge_irq); diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index a96584ab33d..78ed945453d 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -175,12 +175,12 @@ static int i8259_host_map(struct irq_host *h, unsigned int virq, /* We block the internal cascade */ if (hw == 2) - get_irq_desc(virq)->status |= IRQ_NOREQUEST; + irq_to_desc(virq)->status |= IRQ_NOREQUEST; /* We use the level handler only for now, we might want to * be more cautious here but that works for now */ - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, &i8259_pic, handle_level_irq); return 0; } diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index cb7689c4bfb..f042c1d6900 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c 
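(Editorial aside before the ipic.c hunk: most of the sysdev changes in this patch are the same mechanical substitution of get_irq_desc()/irq_desc[] by irq_to_desc(). In an irq_host map callback the converted pattern is, schematically, the sketch below; example_pic_chip and the function name are hypothetical.)

/* needs <linux/irq.h> and <asm/irq.h>; a sketch, not a specific driver */
static int example_pic_host_map(struct irq_host *h, unsigned int virq,
				irq_hw_number_t hw)
{
	/* mark the interrupt level-sensitive via the looked-up descriptor */
	irq_to_desc(virq)->status |= IRQ_LEVEL;
	set_irq_chip_and_handler(virq, &example_pic_chip, handle_level_irq);
	return 0;
}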
@@ -605,7 +605,7 @@ static int ipic_set_irq_type(unsigned int virq, unsigned int flow_type) { struct ipic *ipic = ipic_from_irq(virq); unsigned int src = ipic_irq_to_hw(virq); - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); unsigned int vold, vnew, edibit; if (flow_type == IRQ_TYPE_NONE) diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c index 5d2d5522ef4..01179587df2 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/sysdev/mpc8xx_pic.c @@ -72,7 +72,7 @@ static void mpc8xx_end_irq(unsigned int virq) static int mpc8xx_set_irq_type(unsigned int virq, unsigned int flow_type) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL); desc->status |= flow_type & IRQ_TYPE_SENSE_MASK; diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index 30c44e6b041..4fd57ab956b 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -572,7 +572,7 @@ static int irq_choose_cpu(unsigned int virt_irq) cpumask_t mask; int cpuid; - cpumask_copy(&mask, irq_desc[virt_irq].affinity); + cpumask_copy(&mask, irq_to_desc(virt_irq)->affinity); if (cpus_equal(mask, CPU_MASK_ALL)) { static int irq_rover; static DEFINE_SPINLOCK(irq_rover_lock); @@ -621,7 +621,7 @@ static struct mpic *mpic_find(unsigned int irq) if (irq < NUM_ISA_INTERRUPTS) return NULL; - return irq_desc[irq].chip_data; + return irq_to_desc(irq)->chip_data; } /* Determine if the linux irq is an IPI */ @@ -648,14 +648,14 @@ static inline u32 mpic_physmask(u32 cpumask) /* Get the mpic structure from the IPI number */ static inline struct mpic * mpic_from_ipi(unsigned int ipi) { - return irq_desc[ipi].chip_data; + return irq_to_desc(ipi)->chip_data; } #endif /* Get the mpic structure from the irq number */ static inline struct mpic * mpic_from_irq(unsigned int irq) { - return irq_desc[irq].chip_data; + return irq_to_desc(irq)->chip_data; } /* Send an EOI */ @@ -735,7 +735,7 @@ static void mpic_unmask_ht_irq(unsigned int irq) mpic_unmask_irq(irq); - if (irq_desc[irq].status & IRQ_LEVEL) + if (irq_to_desc(irq)->status & IRQ_LEVEL) mpic_ht_end_irq(mpic, src); } @@ -745,7 +745,7 @@ static unsigned int mpic_startup_ht_irq(unsigned int irq) unsigned int src = mpic_irq_to_hw(irq); mpic_unmask_irq(irq); - mpic_startup_ht_interrupt(mpic, src, irq_desc[irq].status); + mpic_startup_ht_interrupt(mpic, src, irq_to_desc(irq)->status); return 0; } @@ -755,7 +755,7 @@ static void mpic_shutdown_ht_irq(unsigned int irq) struct mpic *mpic = mpic_from_irq(irq); unsigned int src = mpic_irq_to_hw(irq); - mpic_shutdown_ht_interrupt(mpic, src, irq_desc[irq].status); + mpic_shutdown_ht_interrupt(mpic, src, irq_to_desc(irq)->status); mpic_mask_irq(irq); } @@ -772,7 +772,7 @@ static void mpic_end_ht_irq(unsigned int irq) * latched another edge interrupt coming in anyway */ - if (irq_desc[irq].status & IRQ_LEVEL) + if (irq_to_desc(irq)->status & IRQ_LEVEL) mpic_ht_end_irq(mpic, src); mpic_eoi(mpic); } @@ -856,7 +856,7 @@ int mpic_set_irq_type(unsigned int virq, unsigned int flow_type) { struct mpic *mpic = mpic_from_irq(virq); unsigned int src = mpic_irq_to_hw(virq); - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); unsigned int vecpri, vold, vnew; DBG("mpic: set_irq_type(mpic:@%p,virq:%d,src:0x%x,type:0x%x)\n", diff --git a/arch/powerpc/sysdev/mv64x60_pic.c b/arch/powerpc/sysdev/mv64x60_pic.c index 2aa4ed066db..485b92477d7 100644 --- 
a/arch/powerpc/sysdev/mv64x60_pic.c +++ b/arch/powerpc/sysdev/mv64x60_pic.c @@ -213,7 +213,7 @@ static int mv64x60_host_map(struct irq_host *h, unsigned int virq, { int level1; - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; level1 = (hwirq & MV64x60_LEVEL1_MASK) >> MV64x60_LEVEL1_OFFSET; BUG_ON(level1 > MV64x60_LEVEL1_GPP); diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index 3faa42e03a8..fc098744ad8 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -189,7 +189,7 @@ static inline void qe_ic_write(volatile __be32 __iomem * base, unsigned int reg static inline struct qe_ic *qe_ic_from_irq(unsigned int virq) { - return irq_desc[virq].chip_data; + return irq_to_desc(virq)->chip_data; } #define virq_to_hw(virq) ((unsigned int)irq_map[virq].hwirq) @@ -263,7 +263,7 @@ static int qe_ic_host_map(struct irq_host *h, unsigned int virq, chip = &qe_ic->hc_irq; set_irq_chip_data(virq, qe_ic); - get_irq_desc(virq)->status |= IRQ_LEVEL; + irq_to_desc(virq)->status |= IRQ_LEVEL; set_irq_chip_and_handler(virq, chip, handle_level_irq); diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index cf244a419e9..02f600991dc 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -398,7 +398,7 @@ static int pci_irq_host_map(struct irq_host *h, unsigned int virq, DBG("%s(%d, 0x%lx)\n", __func__, virq, hw); if ((virq >= 1) && (virq <= 4)){ irq = virq + IRQ_PCI_INTAD_BASE - 1; - get_irq_desc(irq)->status |= IRQ_LEVEL; + irq_to_desc(irq)->status |= IRQ_LEVEL; set_irq_chip(irq, &tsi108_pci_irq); } return 0; diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c index 466ce9ace12..cf97935863c 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/sysdev/uic.c @@ -57,7 +57,7 @@ struct uic { static void uic_unmask_irq(unsigned int virq) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); struct uic *uic = get_irq_chip_data(virq); unsigned int src = uic_irq_to_hw(virq); unsigned long flags; @@ -101,7 +101,7 @@ static void uic_ack_irq(unsigned int virq) static void uic_mask_ack_irq(unsigned int virq) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); struct uic *uic = get_irq_chip_data(virq); unsigned int src = uic_irq_to_hw(virq); unsigned long flags; @@ -129,7 +129,7 @@ static int uic_set_irq_type(unsigned int virq, unsigned int flow_type) { struct uic *uic = get_irq_chip_data(virq); unsigned int src = uic_irq_to_hw(virq); - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); unsigned long flags; int trigger, polarity; u32 tr, pr, mask; diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c index 40edad52077..ab743718876 100644 --- a/arch/powerpc/sysdev/xilinx_intc.c +++ b/arch/powerpc/sysdev/xilinx_intc.c @@ -79,7 +79,7 @@ static void xilinx_intc_mask(unsigned int virq) static int xilinx_intc_set_type(unsigned int virq, unsigned int flow_type) { - struct irq_desc *desc = get_irq_desc(virq); + struct irq_desc *desc = irq_to_desc(virq); desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL); desc->status |= flow_type & IRQ_TYPE_SENSE_MASK; |
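(Editorial aside: the xilinx_intc_set_type() hunk above closes out the irq_to_desc() conversions in this section. For reference, a minimal set_type callback built on the same pattern is sketched below; the names are hypothetical and the explicit IRQ_LEVEL update is the conventional extra step some of the converted drivers also perform.)

/* needs <linux/irq.h>; a sketch, not any specific driver in this patch */
static int example_set_irq_type(unsigned int virq, unsigned int flow_type)
{
	struct irq_desc *desc = irq_to_desc(virq);

	desc->status &= ~(IRQ_TYPE_SENSE_MASK | IRQ_LEVEL);
	desc->status |= flow_type & IRQ_TYPE_SENSE_MASK;
	if (flow_type & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
		desc->status |= IRQ_LEVEL;
	return 0;
}

Routing every descriptor access through irq_to_desc() means these drivers no longer assume a statically sized irq_desc[] array, which is what lets the descriptors be allocated sparsely on platforms with many potential but few actual interrupt sources.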