From 6dbde3530850d4d8bfc1b6bd4006d92786a2787f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Jan 2009 22:15:53 +0900 Subject: percpu: add optimized generic percpu accessors It is an optimization and a cleanup, and adds the following new generic percpu methods: percpu_read() percpu_write() percpu_add() percpu_sub() percpu_and() percpu_or() percpu_xor() and implements support for them on x86. (other architectures will fall back to a default implementation) The advantage is that for example to read a local percpu variable, instead of this sequence: return __get_cpu_var(var); ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx ffffffff8102ca32: 81 ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax We can get a single instruction by using the optimized variants: return percpu_read(var); ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax I also cleaned up the x86-specific APIs and made the x86 code use these new generic percpu primitives. tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out * added percpu_and() for completeness's sake * made generic percpu ops atomic against preemption Signed-off-by: Ingo Molnar Signed-off-by: Tejun Heo --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 503c240e26c..7bc7852cc5c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info) /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) { + if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { load_cr3(swapper_pg_dir); arch_flush_lazy_cpu_mode(); } -- cgit v1.2.3-70-g09d2 From 9eb912d1aa6b8106e06a73ea6702ec3dab0d6a1a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 19 Jan 2009 00:38:57 +0900 Subject: x86-64: Move TLB state from PDA to per-cpu and consolidate with 32-bit. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/include/asm/mmu_context_64.h | 16 +++++++--------- arch/x86/include/asm/pda.h | 2 -- arch/x86/include/asm/tlbflush.h | 7 ++----- arch/x86/kernel/cpu/common.c | 2 -- arch/x86/kernel/tlb_32.c | 12 ++---------- arch/x86/kernel/tlb_64.c | 13 ++++++++----- arch/x86/xen/mmu.c | 6 +----- 7 files changed, 20 insertions(+), 38 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h index 677d36e9540..c4572505ab3 100644 --- a/arch/x86/include/asm/mmu_context_64.h +++ b/arch/x86/include/asm/mmu_context_64.h @@ -1,13 +1,11 @@ #ifndef _ASM_X86_MMU_CONTEXT_64_H #define _ASM_X86_MMU_CONTEXT_64_H -#include - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP - if (read_pda(mmu_state) == TLBSTATE_OK) - write_pda(mmu_state, TLBSTATE_LAZY); + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); #endif } @@ -19,8 +17,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* stop flush ipis for the previous mm */ cpu_clear(cpu, prev->cpu_vm_mask); #ifdef CONFIG_SMP - write_pda(mmu_state, TLBSTATE_OK); - write_pda(active_mm, next); + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + percpu_write(cpu_tlbstate.active_mm, next); #endif cpu_set(cpu, next->cpu_vm_mask); load_cr3(next->pgd); @@ -30,9 +28,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, } #ifdef CONFIG_SMP else { - write_pda(mmu_state, TLBSTATE_OK); - if (read_pda(active_mm) != next) - BUG(); + percpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); + if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h index 69a40757e21..8ee835ed10e 100644 --- a/arch/x86/include/asm/pda.h +++ b/arch/x86/include/asm/pda.h @@ -25,9 +25,7 @@ struct x8664_pda { char *irqstackptr; short nodenumber; /* number of current node (32k max) */ short in_bootmem; /* pda lives in bootmem */ - short mmu_state; short isidle; - struct mm_struct *active_mm; } ____cacheline_aligned_in_smp; DECLARE_PER_CPU(struct x8664_pda, __pda); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 17feaa9c7e7..d3539f998f8 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -148,20 +148,17 @@ void native_flush_tlb_others(const struct cpumask *cpumask, #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -#ifdef CONFIG_X86_32 struct tlb_state { struct mm_struct *active_mm; int state; - char __cacheline_padding[L1_CACHE_BYTES-8]; }; DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); -void reset_lazy_tlbstate(void); -#else static inline void reset_lazy_tlbstate(void) { + percpu_write(cpu_tlbstate.state, 0); + percpu_write(cpu_tlbstate.active_mm, &init_mm); } -#endif #endif /* SMP */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c49498d4083..3d0cc6f1711 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -897,8 +897,6 @@ void __cpuinit pda_init(int cpu) pda->irqcount = -1; pda->kernelstack = (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; if (cpu == 0) { /* others are initialized in smpboot.c */ diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index e65449d0f7d..abf0808d6fc 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -4,8 +4,8 @@ #include -DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) - ____cacheline_aligned = { &init_mm, 0, }; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; /* must come after the send_IPI functions above for inlining */ #include @@ -231,14 +231,6 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } -void reset_lazy_tlbstate(void) -{ - int cpu = raw_smp_processor_id(); - - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} - static int init_flush_cpumask(void) { alloc_cpumask_var(&flush_cpumask, GFP_KERNEL); diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 7f4141d3b66..e64a32c4882 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -18,6 +18,9 @@ #include #include +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) + = { &init_mm, 0, }; + #include /* * Smarter SMP flushing macros. @@ -62,9 +65,9 @@ static DEFINE_PER_CPU(union smp_flush_state, flush_state); */ void leave_mm(int cpu) { - if (read_pda(mmu_state) == TLBSTATE_OK) + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); + cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -142,8 +145,8 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * BUG(); */ - if (f->flush_mm == read_pda(active_mm)) { - if (read_pda(mmu_state) == TLBSTATE_OK) { + if (f->flush_mm == percpu_read(cpu_tlbstate.active_mm)) { + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else @@ -281,7 +284,7 @@ static void do_flush_tlb_all(void *info) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (read_pda(mmu_state) == TLBSTATE_LAZY) + if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7bc7852cc5c..98cb9869eb2 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1063,11 +1063,7 @@ static void drop_other_mm_ref(void *info) struct mm_struct *mm = info; struct mm_struct *active_mm; -#ifdef CONFIG_X86_64 - active_mm = read_pda(active_mm); -#else - active_mm = __get_cpu_var(cpu_tlbstate).active_mm; -#endif + active_mm = percpu_read(cpu_tlbstate.active_mm); if (active_mm == mm) leave_mm(smp_processor_id()); -- cgit v1.2.3-70-g09d2 From 319f3ba52c71630865b10ac3b99dd020440d681d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:01 -0800 Subject: xen: move remaining mmu-related stuff into mmu.c Impact: Cleanup Move remaining mmu-related stuff into mmu.c. A general cleanup, and lay the groundwork for later patches. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/enlighten.c | 731 +---------------------------------------------- arch/x86/xen/mmu.c | 700 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/mmu.h | 3 + arch/x86/xen/xen-ops.h | 8 + 4 files changed, 712 insertions(+), 730 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 6b3f7eef57e..0cd2a165f17 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -61,35 +61,6 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; - -#ifdef CONFIG_X86_64 -/* l3 pud for userspace vsyscall mapping */ -static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; -#endif /* CONFIG_X86_64 */ - -/* - * Note about cr3 (pagetable base) values: - * - * xen_cr3 contains the current logical cr3 value; it contains the - * last set cr3. This may not be the current effective cr3, because - * its update may be being lazily deferred. However, a vcpu looking - * at its own cr3 can use this value knowing that it everything will - * be self-consistent. - * - * xen_current_cr3 contains the actual vcpu cr3; it is set once the - * hypercall to set the vcpu cr3 is complete (so it may be a little - * out of date, but it will never be set early). If one vcpu is - * looking at another vcpu's cr3 value, it should use this variable. - */ -DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ - struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -237,7 +208,7 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static void xen_leave_lazy(void) +void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); @@ -598,76 +569,6 @@ static struct apic_ops xen_basic_apic_ops = { #endif -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_single(unsigned long addr) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*op)); - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -static void xen_flush_tlb_others(const struct cpumask *cpus, - struct mm_struct *mm, unsigned long va) -{ - struct { - struct mmuext_op op; - DECLARE_BITMAP(mask, NR_CPUS); - } *args; - struct multicall_space mcs; - - BUG_ON(cpumask_empty(cpus)); - BUG_ON(!mm); - - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->op.arg2.vcpumask = to_cpumask(args->mask); - - /* Remove us, and any offline CPUS. */ - cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (unlikely(cpumask_empty(to_cpumask(args->mask)))) - goto issue; - - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; - } - - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - -issue: - xen_mc_issue(PARAVIRT_LAZY_MMU); -} static void xen_clts(void) { @@ -693,21 +594,6 @@ static void xen_write_cr0(unsigned long cr0) xen_mc_issue(PARAVIRT_LAZY_CPU); } -static void xen_write_cr2(unsigned long cr2) -{ - percpu_read(xen_vcpu)->arch.cr2 = cr2; -} - -static unsigned long xen_read_cr2(void) -{ - return percpu_read(xen_vcpu)->arch.cr2; -} - -static unsigned long xen_read_cr2_direct(void) -{ - return percpu_read(xen_vcpu_info.arch.cr2); -} - static void xen_write_cr4(unsigned long cr4) { cr4 &= ~X86_CR4_PGE; @@ -716,71 +602,6 @@ static void xen_write_cr4(unsigned long cr4) native_write_cr4(cr4); } -static unsigned long xen_read_cr3(void) -{ - return percpu_read(xen_cr3); -} - -static void set_current_cr3(void *v) -{ - percpu_write(xen_current_cr3, (unsigned long)v); -} - -static void __xen_write_cr3(bool kernel, unsigned long cr3) -{ - struct mmuext_op *op; - struct multicall_space mcs; - unsigned long mfn; - - if (cr3) - mfn = pfn_to_mfn(PFN_DOWN(cr3)); - else - mfn = 0; - - WARN_ON(mfn == 0 && kernel); - - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; - op->arg1.mfn = mfn; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - if (kernel) { - percpu_write(xen_cr3, cr3); - - /* Update xen_current_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); - } -} - -static void xen_write_cr3(unsigned long cr3) -{ - BUG_ON(preemptible()); - - xen_mc_batch(); /* disables interrupts */ - - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - percpu_write(xen_cr3, cr3); - - __xen_write_cr3(true, cr3); - -#ifdef CONFIG_X86_64 - { - pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); - if (user_pgd) - __xen_write_cr3(false, __pa(user_pgd)); - else - __xen_write_cr3(false, 0); - } -#endif - - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ -} - static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) { int ret; @@ -822,185 +643,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) -{ -#ifdef CONFIG_FLATMEM - BUG_ON(mem_map); /* should only be used early */ -#endif - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} - -/* Early release_pte assumes that all pts are pinned, since there's - only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(unsigned long pfn) -{ - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); -} - -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(virt_to_page(mm->pgd))) { - SetPagePinned(page); - - vm_unmap_aliases(); - if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); - } else { - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); - } - } -} - -static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PTE); -} - -static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PMD); -} - -static int xen_pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = mm->pgd; - int ret = 0; - - BUG_ON(PagePinned(virt_to_page(pgd))); - -#ifdef CONFIG_X86_64 - { - struct page *page = virt_to_page(pgd); - pgd_t *user_pgd; - - BUG_ON(page->private != 0); - - ret = -ENOMEM; - - user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - page->private = (unsigned long)user_pgd; - - if (user_pgd != NULL) { - user_pgd[pgd_index(VSYSCALL_START)] = - __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); - ret = 0; - } - - BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); - } -#endif - - return ret; -} - -static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ -#ifdef CONFIG_X86_64 - pgd_t *user_pgd = xen_get_user_pgd(pgd); - - if (user_pgd) - free_page((unsigned long)user_pgd); -#endif -} - -/* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(unsigned long pfn, unsigned level) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(page)) { - if (!PageHighMem(page)) { - if (level == PT_PTE && USE_SPLIT_PTLOCKS) - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - } - ClearPagePinned(page); - } -} - -static void xen_release_pte(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PTE); -} - -static void xen_release_pmd(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PMD); -} - -#if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) -{ - xen_alloc_ptpage(mm, pfn, PT_PUD); -} - -static void xen_release_pud(unsigned long pfn) -{ - xen_release_ptpage(pfn, PT_PUD); -} -#endif - -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - - return kmap_atomic_prot(page, type, prot); -} -#endif - -#ifdef CONFIG_X86_32 -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) -{ - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); - - return pte; -} - -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) -{ - pte = mask_rw_pte(ptep, pte); - - xen_set_pte(ptep, pte); -} -#endif - -static __init void xen_pagetable_setup_start(pgd_t *base) -{ -} - void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1021,37 +663,6 @@ void xen_setup_shared_info(void) xen_setup_mfn_list_list(); } -static __init void xen_pagetable_setup_done(pgd_t *base) -{ - xen_setup_shared_info(); -} - -static __init void xen_post_allocator_init(void) -{ - pv_mmu_ops.set_pte = xen_set_pte; - pv_mmu_ops.set_pmd = xen_set_pmd; - pv_mmu_ops.set_pud = xen_set_pud; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; -#endif - - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - -#ifdef CONFIG_X86_64 - SetPagePinned(virt_to_page(level3_user_vsyscall)); -#endif - xen_mark_init_mm_pinned(); -} - /* This is called once we have the cpu_possible_map */ void xen_setup_vcpu_info_placement(void) { @@ -1126,49 +737,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, return ret; } -static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) -{ - pte_t pte; - - phys >>= PAGE_SHIFT; - - switch (idx) { - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: -#ifdef CONFIG_X86_F00F_BUG - case FIX_F00F_IDT: -#endif -#ifdef CONFIG_X86_32 - case FIX_WP_TEST: - case FIX_VDSO: -# ifdef CONFIG_HIGHMEM - case FIX_KMAP_BEGIN ... FIX_KMAP_END: -# endif -#else - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: -#endif -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ -#endif - pte = pfn_pte(phys, prot); - break; - - default: - pte = mfn_pte(phys, prot); - break; - } - - __native_set_fixmap(idx, pte); - -#ifdef CONFIG_X86_64 - /* Replicate changes to map the vsyscall page into the user - pagetable vsyscall mapping. */ - if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { - unsigned long vaddr = __fix_to_virt(idx); - set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); - } -#endif -} - static const struct pv_info xen_info __initdata = { .paravirt_enabled = 1, .shared_kernel_pmd = 0, @@ -1264,86 +832,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = { #endif }; -static const struct pv_mmu_ops xen_mmu_ops __initdata = { - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - - .pgd_alloc = xen_pgd_alloc, - .pgd_free = xen_pgd_free, - - .alloc_pte = xen_alloc_pte_init, - .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pte_init, - .alloc_pmd_clone = paravirt_nop, - .release_pmd = xen_release_pte_init, - -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - -#ifdef CONFIG_X86_64 - .set_pte = xen_set_pte, -#else - .set_pte = xen_set_pte_init, -#endif - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd_hyper, - - .ptep_modify_prot_start = __ptep_modify_prot_start, - .ptep_modify_prot_commit = __ptep_modify_prot_commit, - - .pte_val = xen_pte_val, - .pgd_val = xen_pgd_val, - - .make_pte = xen_make_pte, - .make_pgd = xen_make_pgd, - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .set_pte_present = xen_set_pte_at, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, -#endif /* CONFIG_X86_PAE */ - .set_pud = xen_set_pud_hyper, - - .make_pmd = xen_make_pmd, - .pmd_val = xen_pmd_val, - -#if PAGETABLE_LEVELS == 4 - .pud_val = xen_pud_val, - .make_pud = xen_make_pud, - .set_pgd = xen_set_pgd_hyper, - - .alloc_pud = xen_alloc_pte_init, - .release_pud = xen_release_pte_init, -#endif /* PAGETABLE_LEVELS == 4 */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .lazy_mode = { - .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, - }, - - .set_fixmap = xen_set_fixmap, -}; - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1386,223 +874,6 @@ static const struct machine_ops __initdata xen_machine_ops = { }; -static void __init xen_reserve_top(void) -{ -#ifdef CONFIG_X86_32 - unsigned long top = HYPERVISOR_VIRT_START; - struct xen_platform_parameters pp; - - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) - top = pp.virt_start; - - reserve_top_address(-top); -#endif /* CONFIG_X86_32 */ -} - -/* - * Like __va(), but returns address in the kernel mapping (which is - * all we have until the physical memory mapping has been set up. - */ -static void *__ka(phys_addr_t paddr) -{ -#ifdef CONFIG_X86_64 - return (void *)(paddr + __START_KERNEL_map); -#else - return __va(paddr); -#endif -} - -/* Convert a machine address to physical address */ -static unsigned long m2p(phys_addr_t maddr) -{ - phys_addr_t paddr; - - maddr &= PTE_PFN_MASK; - paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; - - return paddr; -} - -/* Convert a machine address to kernel virtual */ -static void *m2v(phys_addr_t maddr) -{ - return __ka(m2p(maddr)); -} - -static void set_page_prot(void *addr, pgprot_t prot) -{ - unsigned long pfn = __pa(addr) >> PAGE_SHIFT; - pte_t pte = pfn_pte(pfn, prot); - - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) - BUG(); -} - -static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) -{ - unsigned pmdidx, pteidx; - unsigned ident_pte; - unsigned long pfn; - - ident_pte = 0; - pfn = 0; - for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { - pte_t *pte_page; - - /* Reuse or allocate a page of ptes */ - if (pmd_present(pmd[pmdidx])) - pte_page = m2v(pmd[pmdidx].pmd); - else { - /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) - break; - - pte_page = &level1_ident_pgt[ident_pte]; - ident_pte += PTRS_PER_PTE; - - pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - } - - /* Install mappings */ - for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { - pte_t pte; - - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - - if (!pte_none(pte_page[pteidx])) - continue; - - pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); - pte_page[pteidx] = pte; - } - } - - for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) - set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); - - set_page_prot(pmd, PAGE_KERNEL_RO); -} - -#ifdef CONFIG_X86_64 -static void convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for (i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} - -/* - * Set up the inital kernel pagetable. - * - * We can construct this by grafting the Xen provided pagetable into - * head_64.S's preconstructed pagetables. We copy the Xen L2's into - * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This - * means that only the kernel has a physical mapping to start with - - * but that's enough to get __va working. We need to fill in the rest - * of the physical mapping once some sort of allocator has been set - * up. - */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) -{ - pud_t *l3; - pmd_t *l2; - - /* Zap identity mapping */ - init_level4_pgt[0] = __pgd(0); - - /* Pre-constructed entries are in pfn, so convert to mfn */ - convert_pfn_mfn(init_level4_pgt); - convert_pfn_mfn(level3_ident_pgt); - convert_pfn_mfn(level3_kernel_pgt); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); - - memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); - l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); - memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); - - /* Set up identity map */ - xen_map_identity_early(level2_ident_pgt, max_pfn); - - /* Make pagetable pieces RO */ - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); - - /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, - PFN_DOWN(__pa_symbol(init_level4_pgt))); - - /* Unpin Xen-provided one */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - /* Switch over */ - pgd = init_level4_pgt; - - /* - * At this stage there can be no user pgd, and no page - * structure to attach it to, so make sure we just set kernel - * pgd. - */ - xen_mc_batch(); - __xen_write_cr3(true, __pa(pgd)); - xen_mc_issue(PARAVIRT_LAZY_CPU); - - reserve_early(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); - - return pgd; -} -#else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; - -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, - unsigned long max_pfn) -{ - pmd_t *kernel_pmd; - - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); - - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - - xen_map_identity_early(level2_kernel_pgt, max_pfn); - - memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); - set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], - __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); - - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); - set_page_prot(empty_zero_page, PAGE_KERNEL_RO); - - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - - xen_write_cr3(__pa(swapper_pg_dir)); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); - - return swapper_pg_dir; -} -#endif /* CONFIG_X86_64 */ - /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 98cb9869eb2..94e452c0b00 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,8 @@ #include #include +#include +#include #include "multicalls.h" #include "mmu.h" @@ -114,6 +117,37 @@ static inline void check_zero(void) #endif /* CONFIG_XEN_DEBUG_FS */ + +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + +/* + * Note about cr3 (pagetable base) values: + * + * xen_cr3 contains the current logical cr3 value; it contains the + * last set cr3. This may not be the current effective cr3, because + * its update may be being lazily deferred. However, a vcpu looking + * at its own cr3 can use this value knowing that it everything will + * be self-consistent. + * + * xen_current_cr3 contains the actual vcpu cr3; it is set once the + * hypercall to set the vcpu cr3 is complete (so it may be a little + * out of date, but it will never be set early). If one vcpu is + * looking at another vcpu's cr3 value, it should use this variable. + */ +DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ +DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ + + /* * Just beyond the highest usermode address. STACK_TOP_MAX has a * redzone above it, so round it up to a PGD boundary. @@ -1152,6 +1186,672 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } +static __init void xen_pagetable_setup_start(pgd_t *base) +{ +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + xen_setup_shared_info(); +} + +static void xen_write_cr2(unsigned long cr2) +{ + percpu_read(xen_vcpu)->arch.cr2 = cr2; +} + +static unsigned long xen_read_cr2(void) +{ + return percpu_read(xen_vcpu)->arch.cr2; +} + +unsigned long xen_read_cr2_direct(void) +{ + return percpu_read(xen_vcpu_info.arch.cr2); +} + +static void xen_flush_tlb(void) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_TLB_FLUSH_LOCAL; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op *op; + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); + op = mcs.args; + op->cmd = MMUEXT_INVLPG_LOCAL; + op->arg1.linear_addr = addr & PAGE_MASK; + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +static void xen_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long va) +{ + struct { + struct mmuext_op op; + DECLARE_BITMAP(mask, NR_CPUS); + } *args; + struct multicall_space mcs; + + BUG_ON(cpumask_empty(cpus)); + BUG_ON(!mm); + + mcs = xen_mc_entry(sizeof(*args)); + args = mcs.args; + args->op.arg2.vcpumask = to_cpumask(args->mask); + + /* Remove us, and any offline CPUS. */ + cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); + if (unlikely(cpumask_empty(to_cpumask(args->mask)))) + goto issue; + + if (va == TLB_FLUSH_ALL) { + args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; + } else { + args->op.cmd = MMUEXT_INVLPG_MULTI; + args->op.arg1.linear_addr = va; + } + + MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); + +issue: + xen_mc_issue(PARAVIRT_LAZY_MMU); +} + +static unsigned long xen_read_cr3(void) +{ + return percpu_read(xen_cr3); +} + +static void set_current_cr3(void *v) +{ + percpu_write(xen_current_cr3, (unsigned long)v); +} + +static void __xen_write_cr3(bool kernel, unsigned long cr3) +{ + struct mmuext_op *op; + struct multicall_space mcs; + unsigned long mfn; + + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; + + WARN_ON(mfn == 0 && kernel); + + mcs = __xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + if (kernel) { + percpu_write(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + percpu_write(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif + + xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ +} + +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; + + BUG_ON(page->private != 0); + + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + + +/* Early in boot, while setting up the initial pagetable, assume + everything is pinned. */ +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) +{ +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +/* Early release_pte assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void xen_release_pte_init(unsigned long pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +/* This needs to make sure the new pte page is pinned iff its being + attached to a pinned pagetable. */ +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(virt_to_page(mm->pgd))) { + SetPagePinned(page); + + vm_unmap_aliases(); + if (!PageHighMem(page)) { + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); + if (level == PT_PTE && USE_SPLIT_PTLOCKS) + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + } else { + /* make sure there are no stray mappings of + this page */ + kmap_flush_unused(); + } + } +} + +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PTE); +} + +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PMD); +} + +/* This should never happen until we're OK to use struct page */ +static void xen_release_ptpage(unsigned long pfn, unsigned level) +{ + struct page *page = pfn_to_page(pfn); + + if (PagePinned(page)) { + if (!PageHighMem(page)) { + if (level == PT_PTE && USE_SPLIT_PTLOCKS) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + } + ClearPagePinned(page); + } +} + +static void xen_release_pte(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PTE); +} + +static void xen_release_pmd(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PMD); +} + +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(unsigned long pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + +void __init xen_reserve_top(void) +{ +#ifdef CONFIG_X86_32 + unsigned long top = HYPERVISOR_VIRT_START; + struct xen_platform_parameters pp; + + if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) + top = pp.virt_start; + + reserve_top_address(-top); +#endif /* CONFIG_X86_32 */ +} + +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ +#ifdef CONFIG_X86_64 + return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_PFN_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + /* Reuse or allocate a page of ptes */ + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + } + + /* Install mappings */ + for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for (i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + */ +__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Set up identity map */ + xen_map_identity_early(level2_ident_pgt, max_pfn); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); + + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); + + return pgd; +} +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + +__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, + unsigned long max_pfn) +{ + pmd_t *kernel_pmd; + + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; +} +#endif /* CONFIG_X86_64 */ + +static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) +{ + pte_t pte; + + phys >>= PAGE_SHIFT; + + switch (idx) { + case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: +#ifdef CONFIG_X86_F00F_BUG + case FIX_F00F_IDT: +#endif +#ifdef CONFIG_X86_32 + case FIX_WP_TEST: + case FIX_VDSO: +# ifdef CONFIG_HIGHMEM + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ +#endif + pte = pfn_pte(phys, prot); + break; + + default: + pte = mfn_pte(phys, prot); + break; + } + + __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif +} + +__init void xen_post_allocator_init(void) +{ + pv_mmu_ops.set_pte = xen_set_pte; + pv_mmu_ops.set_pmd = xen_set_pmd; + pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif + + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif + xen_mark_init_mm_pinned(); +} + + +const struct pv_mmu_ops xen_mmu_ops __initdata = { + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + + .read_cr2 = xen_read_cr2, + .write_cr2 = xen_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + .flush_tlb_others = xen_flush_tlb_others, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, + + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = xen_kmap_atomic_pte, +#endif + +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else + .set_pte = xen_set_pte_init, +#endif + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd_hyper, + + .ptep_modify_prot_start = __ptep_modify_prot_start, + .ptep_modify_prot_commit = __ptep_modify_prot_commit, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, + +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .lazy_mode = { + .enter = paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy, + }, + + .set_fixmap = xen_set_fixmap, +}; + + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 98d71659da5..24d1b44a337 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +unsigned long xen_read_cr2_direct(void); + +extern const struct pv_mmu_ops xen_mmu_ops; #endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c1f8faf0a2c..11913fc94c1 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -13,6 +13,7 @@ extern const char xen_failsafe_callback[]; struct trap_info; void xen_copy_trap_info(struct trap_info *traps); +DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); @@ -22,6 +23,13 @@ extern struct shared_info *HYPERVISOR_shared_info; void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_setup_machphys_mapping(void); +pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); +void xen_ident_map_ISA(void); +void xen_reserve_top(void); + +void xen_leave_lazy(void); +void xen_post_allocator_init(void); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); -- cgit v1.2.3-70-g09d2 From da5de7c22eb705be709a57e486e7475a6969b994 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 28 Jan 2009 14:35:07 -0800 Subject: x86/paravirt: use callee-saved convention for pte_val/make_pte/etc Impact: Optimization In the native case, pte_val, make_pte, etc are all just identity functions, so there's no need to clobber a lot of registers over them. (This changes the 32-bit callee-save calling convention to return both EAX and EDX so functions can return 64-bit values.) Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/paravirt.h | 78 ++++++++++++++++++++--------------------- arch/x86/kernel/paravirt.c | 53 +++++++--------------------- arch/x86/xen/mmu.c | 24 ++++++++----- 3 files changed, 67 insertions(+), 88 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index beb10ecdbe6..2d098b78bc1 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -19,7 +19,7 @@ #define CLBR_ANY ((1 << 4) - 1) #define CLBR_ARG_REGS (CLBR_EAX | CLBR_EDX | CLBR_ECX) -#define CLBR_RET_REG (CLBR_EAX) +#define CLBR_RET_REG (CLBR_EAX | CLBR_EDX) #define CLBR_SCRATCH (0) #else #define CLBR_RAX CLBR_EAX @@ -308,11 +308,11 @@ struct pv_mmu_ops { void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); - pteval_t (*pte_val)(pte_t); - pte_t (*make_pte)(pteval_t pte); + struct paravirt_callee_save pte_val; + struct paravirt_callee_save make_pte; - pgdval_t (*pgd_val)(pgd_t); - pgd_t (*make_pgd)(pgdval_t pgd); + struct paravirt_callee_save pgd_val; + struct paravirt_callee_save make_pgd; #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE @@ -327,12 +327,12 @@ struct pv_mmu_ops { void (*set_pud)(pud_t *pudp, pud_t pudval); - pmdval_t (*pmd_val)(pmd_t); - pmd_t (*make_pmd)(pmdval_t pmd); + struct paravirt_callee_save pmd_val; + struct paravirt_callee_save make_pmd; #if PAGETABLE_LEVELS == 4 - pudval_t (*pud_val)(pud_t); - pud_t (*make_pud)(pudval_t pud); + struct paravirt_callee_save pud_val; + struct paravirt_callee_save make_pud; void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); #endif /* PAGETABLE_LEVELS == 4 */ @@ -1155,13 +1155,13 @@ static inline pte_t __pte(pteval_t val) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALL2(pteval_t, - pv_mmu_ops.make_pte, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pteval_t, + pv_mmu_ops.make_pte, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pteval_t, - pv_mmu_ops.make_pte, - val); + ret = PVOP_CALLEE1(pteval_t, + pv_mmu_ops.make_pte, + val); return (pte_t) { .pte = ret }; } @@ -1171,11 +1171,11 @@ static inline pteval_t pte_val(pte_t pte) pteval_t ret; if (sizeof(pteval_t) > sizeof(long)) - ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val, - pte.pte, (u64)pte.pte >> 32); + ret = PVOP_CALLEE2(pteval_t, pv_mmu_ops.pte_val, + pte.pte, (u64)pte.pte >> 32); else - ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val, - pte.pte); + ret = PVOP_CALLEE1(pteval_t, pv_mmu_ops.pte_val, + pte.pte); return ret; } @@ -1185,11 +1185,11 @@ static inline pgd_t __pgd(pgdval_t val) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.make_pgd, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd, - val); + ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.make_pgd, + val); return (pgd_t) { ret }; } @@ -1199,11 +1199,11 @@ static inline pgdval_t pgd_val(pgd_t pgd) pgdval_t ret; if (sizeof(pgdval_t) > sizeof(long)) - ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd, (u64)pgd.pgd >> 32); + ret = PVOP_CALLEE2(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd, (u64)pgd.pgd >> 32); else - ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val, - pgd.pgd); + ret = PVOP_CALLEE1(pgdval_t, pv_mmu_ops.pgd_val, + pgd.pgd); return ret; } @@ -1267,11 +1267,11 @@ static inline pmd_t __pmd(pmdval_t val) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.make_pmd, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd, - val); + ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.make_pmd, + val); return (pmd_t) { ret }; } @@ -1281,11 +1281,11 @@ static inline pmdval_t pmd_val(pmd_t pmd) pmdval_t ret; if (sizeof(pmdval_t) > sizeof(long)) - ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd, (u64)pmd.pmd >> 32); + ret = PVOP_CALLEE2(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd, (u64)pmd.pmd >> 32); else - ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val, - pmd.pmd); + ret = PVOP_CALLEE1(pmdval_t, pv_mmu_ops.pmd_val, + pmd.pmd); return ret; } @@ -1307,11 +1307,11 @@ static inline pud_t __pud(pudval_t val) pudval_t ret; if (sizeof(pudval_t) > sizeof(long)) - ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud, - val, (u64)val >> 32); + ret = PVOP_CALLEE2(pudval_t, pv_mmu_ops.make_pud, + val, (u64)val >> 32); else - ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud, - val); + ret = PVOP_CALLEE1(pudval_t, pv_mmu_ops.make_pud, + val); return (pud_t) { ret }; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8adb6b5aa42..cea11c8e304 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -391,43 +391,12 @@ struct pv_apic_ops pv_apic_ops = { #endif }; -typedef pte_t make_pte_t(pteval_t); -typedef pmd_t make_pmd_t(pmdval_t); -typedef pud_t make_pud_t(pudval_t); -typedef pgd_t make_pgd_t(pgdval_t); - -typedef pteval_t pte_val_t(pte_t); -typedef pmdval_t pmd_val_t(pmd_t); -typedef pudval_t pud_val_t(pud_t); -typedef pgdval_t pgd_val_t(pgd_t); - - #if defined(CONFIG_X86_32) && !defined(CONFIG_X86_PAE) /* 32-bit pagetable entries */ -#define paravirt_native_make_pte (make_pte_t *)_paravirt_ident_32 -#define paravirt_native_pte_val (pte_val_t *)_paravirt_ident_32 - -#define paravirt_native_make_pmd (make_pmd_t *)_paravirt_ident_32 -#define paravirt_native_pmd_val (pmd_val_t *)_paravirt_ident_32 - -#define paravirt_native_make_pud (make_pud_t *)_paravirt_ident_32 -#define paravirt_native_pud_val (pud_val_t *)_paravirt_ident_32 - -#define paravirt_native_make_pgd (make_pgd_t *)_paravirt_ident_32 -#define paravirt_native_pgd_val (pgd_val_t *)_paravirt_ident_32 +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_32) #else /* 64-bit pagetable entries */ -#define paravirt_native_make_pte (make_pte_t *)_paravirt_ident_64 -#define paravirt_native_pte_val (pte_val_t *)_paravirt_ident_64 - -#define paravirt_native_make_pmd (make_pmd_t *)_paravirt_ident_64 -#define paravirt_native_pmd_val (pmd_val_t *)_paravirt_ident_64 - -#define paravirt_native_make_pud (make_pud_t *)_paravirt_ident_64 -#define paravirt_native_pud_val (pud_val_t *)_paravirt_ident_64 - -#define paravirt_native_make_pgd (make_pgd_t *)_paravirt_ident_64 -#define paravirt_native_pgd_val (pgd_val_t *)_paravirt_ident_64 +#define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif struct pv_mmu_ops pv_mmu_ops = { @@ -481,21 +450,23 @@ struct pv_mmu_ops pv_mmu_ops = { .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, - .pmd_val = paravirt_native_pmd_val, - .make_pmd = paravirt_native_make_pmd, + + .pmd_val = PTE_IDENT, + .make_pmd = PTE_IDENT, #if PAGETABLE_LEVELS == 4 - .pud_val = paravirt_native_pud_val, - .make_pud = paravirt_native_make_pud, + .pud_val = PTE_IDENT, + .make_pud = PTE_IDENT, + .set_pgd = native_set_pgd, #endif #endif /* PAGETABLE_LEVELS >= 3 */ - .pte_val = paravirt_native_pte_val, - .pgd_val = paravirt_native_pgd_val, + .pte_val = PTE_IDENT, + .pgd_val = PTE_IDENT, - .make_pte = paravirt_native_make_pte, - .make_pgd = paravirt_native_make_pgd, + .make_pte = PTE_IDENT, + .make_pgd = PTE_IDENT, .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 94e452c0b00..5e41f7fc6cf 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -492,28 +492,33 @@ pteval_t xen_pte_val(pte_t pte) { return pte_mfn_to_pfn(pte.pte); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); pte_t xen_make_pte(pteval_t pte) { pte = pte_pfn_to_mfn(pte); return native_make_pte(pte); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); void xen_set_pud_hyper(pud_t *ptr, pud_t val) { @@ -590,12 +595,14 @@ pmd_t xen_make_pmd(pmdval_t pmd) pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } +PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); pud_t xen_make_pud(pudval_t pud) { @@ -603,6 +610,7 @@ pud_t xen_make_pud(pudval_t pud) return native_make_pud(pud); } +PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); pgd_t *xen_get_user_pgd(pgd_t *pgd) { @@ -1813,11 +1821,11 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, - .pte_val = xen_pte_val, - .pgd_val = xen_pgd_val, + .pte_val = PV_CALLEE_SAVE(xen_pte_val), + .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), - .make_pte = xen_make_pte, - .make_pgd = xen_make_pgd, + .make_pte = PV_CALLEE_SAVE(xen_make_pte), + .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), #ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, @@ -1827,12 +1835,12 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { #endif /* CONFIG_X86_PAE */ .set_pud = xen_set_pud_hyper, - .make_pmd = xen_make_pmd, - .pmd_val = xen_pmd_val, + .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), + .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), #if PAGETABLE_LEVELS == 4 - .pud_val = xen_pud_val, - .make_pud = xen_make_pud, + .pud_val = PV_CALLEE_SAVE(xen_pud_val), + .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, .alloc_pud = xen_alloc_pte_init, -- cgit v1.2.3-70-g09d2 From 1f4f931501e9270c156d05ee76b7b872de486304 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 2 Feb 2009 13:58:06 -0800 Subject: xen: fix 32-bit build resulting from mmu move Moving the mmu code from enlighten.c to mmu.c inadvertently broke the 32-bit build. Fix it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/mmu.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 5e41f7fc6cf..d2e8ed1aff3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1396,6 +1396,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } +#ifdef CONFIG_HIGHPTE +static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) +{ + pgprot_t prot = PAGE_KERNEL; + + if (PagePinned(page)) + prot = PAGE_KERNEL_RO; + + if (0 && PageHighMem(page)) + printk("mapping highpte %lx type %d prot %s\n", + page_to_pfn(page), type, + (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); + + return kmap_atomic_prot(page, type, prot); +} +#endif + +#ifdef CONFIG_X86_32 +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + /* If there's an existing pte, then don't allow _PAGE_RW to be set */ + if (pte_val_ma(*ptep) & _PAGE_PRESENT) + pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & + pte_val_ma(pte)); + + return pte; +} + +/* Init-time set_pte while constructing initial pagetables, which + doesn't allow RO pagetable pages to be remapped RW */ +static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) +{ + pte = mask_rw_pte(ptep, pte); + + xen_set_pte(ptep, pte); +} +#endif /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -- cgit v1.2.3-70-g09d2 From 694aa960608d2976666d850bd4ef78053bbd0c84 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 12 Feb 2009 16:25:42 -0800 Subject: xen: fix xen_flush_tlb_others The commit commit 4595f9620cda8a1e973588e743cf5f8436dd20c6 Author: Rusty Russell Date: Sat Jan 10 21:58:09 2009 -0800 x86: change flush_tlb_others to take a const struct cpumask causes xen_flush_tlb_others to allocate a multicall and then issue it without initializing it in the case where the cpumask is empty, leading to: [ 8.354898] 1 multicall(s) failed: cpu 1 [ 8.354921] Pid: 2213, comm: bootclean Not tainted 2.6.29-rc3-x86_32p-xenU-tip #135 [ 8.354937] Call Trace: [ 8.354955] [] xen_mc_flush+0x133/0x1b0 [ 8.354971] [] ? xen_force_evtchn_callback+0x1a/0x30 [ 8.354988] [] xen_flush_tlb_others+0xb0/0xd0 [ 8.355003] [] flush_tlb_page+0x53/0xa0 [ 8.355018] [] do_wp_page+0x2a0/0x7c0 [ 8.355034] [] ? notify_remote_via_irq+0x3a/0x70 [ 8.355049] [] handle_mm_fault+0x7b0/0xa50 [ 8.355065] [] ? wake_up_new_task+0x8e/0xb0 [ 8.355079] [] ? do_fork+0xe5/0x320 [ 8.355095] [] do_page_fault+0xe9/0x240 [ 8.355109] [] ? do_page_fault+0x0/0x240 [ 8.355125] [] error_code+0x72/0x78 [ 8.355139] call 1/1: op=2863311530 arg=[aaaaaaaa] result=-38 xen_flush_tlb_others+0x41/0xd0 Since empty cpumasks are rare and undoing an xen_mc_entry() is tricky just issue such requests normally. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d2e8ed1aff3..319bd40a57c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1273,8 +1273,6 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, /* Remove us, and any offline CPUS. */ cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); - if (unlikely(cpumask_empty(to_cpumask(args->mask)))) - goto issue; if (va == TLB_FLUSH_ALL) { args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; @@ -1285,7 +1283,6 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); -issue: xen_mc_issue(PARAVIRT_LAZY_MMU); } -- cgit v1.2.3-70-g09d2 From 9976b39b5031bbf76f715893cf080b6a17683881 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 09:19:26 -0800 Subject: xen: deal with virtually mapped percpu data The virtually mapped percpu space causes us two problems: - for hypercalls which take an mfn, we need to do a full pagetable walk to convert the percpu va into an mfn, and - when a hypercall requires a page to be mapped RO via all its aliases, we need to make sure its RO in both the percpu mapping and in the linear mapping This primarily affects the gdt and the vcpu info structure. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Cc: Gerd Hoffmann Cc: Rusty Russell Cc: Tejun Heo Signed-off-by: Ingo Molnar --- arch/x86/include/asm/xen/page.h | 1 + arch/x86/xen/enlighten.c | 10 ++++++---- arch/x86/xen/mmu.c | 7 +++++++ arch/x86/xen/smp.c | 6 +++++- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86/xen/mmu.c') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 4bd990ee43d..1a918dde46b 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x) xmaddr_t arbitrary_virt_to_machine(void *address); +unsigned long arbitrary_virt_to_mfn(void *vaddr); void make_lowmem_page_readonly(void *vaddr); void make_lowmem_page_readwrite(void *vaddr); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 86497d5f44c..352ea683065 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu) vcpup = &per_cpu(xen_vcpu_info, cpu); - info.mfn = virt_to_mfn(vcpup); + info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", @@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); + frames[f] = arbitrary_virt_to_mfn((void *)va); + make_lowmem_page_readonly((void *)va); + make_lowmem_page_readonly(mfn_to_virt(frames[f])); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); @@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); struct multicall_space mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); @@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, break; default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); + xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); xen_mc_flush(); if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 319bd40a57c..cb6afa4ec95 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } +unsigned long arbitrary_virt_to_mfn(void *vaddr) +{ + xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); + + return PFN_DOWN(maddr.maddr); +} + xmaddr_t arbitrary_virt_to_machine(void *vaddr) { unsigned long address = (unsigned long)vaddr; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 035582ae815..8d470562ffc 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; struct desc_struct *gdt; + unsigned long gdt_mfn; if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) return 0; @@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; BUG_ON((unsigned long)gdt & ~PAGE_MASK); + + gdt_mfn = arbitrary_virt_to_mfn(gdt); make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_frames[0] = gdt_mfn; ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; -- cgit v1.2.3-70-g09d2