diff options
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/spinlock.h | 3 | ||||
-rw-r--r-- | arch/x86/kernel/alternative.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/irq.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/microcode_amd.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 30 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 13 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 5 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 118 | ||||
-rw-r--r-- | arch/x86/xen/p2m.c | 95 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 9 | ||||
-rw-r--r-- | arch/x86/xen/suspend.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 2 |
12 files changed, 154 insertions, 134 deletions
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b315a33867f..33692eaabab 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -12,8 +12,7 @@ * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. * - * These are fair FIFO ticket locks, which are currently limited to 256 - * CPUs. + * These are fair FIFO ticket locks, which support up to 2^16 CPUs. * * (the type definitions are in asm/spinlock_types.h) */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index afb7ff79a29..ced4534baed 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -165,7 +165,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] = #endif #ifdef P6_NOP1 -static const unsigned char __initconst_or_module p6nops[] = +static const unsigned char p6nops[] = { P6_NOP1, P6_NOP2, diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 7ad683d7864..d44f7829968 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -270,7 +270,7 @@ void fixup_irqs(void) if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { break_affinity = 1; - affinity = cpu_all_mask; + affinity = cpu_online_mask; } chip = irq_data_get_irq_chip(data); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 8a2ce8fd41c..82746f942cd 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -143,11 +143,12 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, unsigned int *current_size) { struct microcode_header_amd *mc_hdr; - unsigned int actual_size; + unsigned int actual_size, patch_size; u16 equiv_cpu_id; /* size of the current patch we're staring at */ - *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + patch_size = *(u32 *)(ucode_ptr + 4); + *current_size = patch_size + SECTION_HDR_SIZE; equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) @@ -174,7 +175,7 @@ static int get_matching_microcode(int cpu, const u8 *ucode_ptr, /* * now that the header looks sane, verify its size */ - actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + actual_size = verify_ucode_size(cpu, patch_size, leftover_size); if (!actual_size) return 0; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97d9a9914ba..a3b57a27be8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -475,13 +475,26 @@ register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg) return address_mask(ctxt, reg); } +static void masked_increment(ulong *reg, ulong mask, int inc) +{ + assign_masked(reg, *reg + inc, mask); +} + static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc) { + ulong mask; + if (ctxt->ad_bytes == sizeof(unsigned long)) - *reg += inc; + mask = ~0UL; else - *reg = (*reg & ~ad_mask(ctxt)) | ((*reg + inc) & ad_mask(ctxt)); + mask = ad_mask(ctxt); + masked_increment(reg, mask, inc); +} + +static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) +{ + masked_increment(&ctxt->regs[VCPU_REGS_RSP], stack_mask(ctxt), inc); } static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) @@ -1522,8 +1535,8 @@ static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + rsp_increment(ctxt, -bytes); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; return segmented_write(ctxt, addr, data, bytes); @@ -1542,13 +1555,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, int rc; struct segmented_address addr; - addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); + addr.ea = ctxt->regs[VCPU_REGS_RSP] & stack_mask(ctxt); addr.seg = VCPU_SREG_SS; rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], len); + rsp_increment(ctxt, len); return rc; } @@ -1688,8 +1701,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt) while (reg >= VCPU_REGS_RAX) { if (reg == VCPU_REGS_RSP) { - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], - ctxt->op_bytes); + rsp_increment(ctxt, ctxt->op_bytes); --reg; } @@ -2825,7 +2837,7 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], ctxt->src.val); + rsp_increment(ctxt, ctxt->src.val); return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 01ca0042393..7fbd0d273ea 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4113,16 +4113,21 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) LIST_HEAD(invalid_list); /* + * Never scan more than sc->nr_to_scan VM instances. + * Will not hit this condition practically since we do not try + * to shrink more than one VM and it is very unlikely to see + * !n_used_mmu_pages so many times. + */ + if (!nr_to_scan--) + break; + /* * n_used_mmu_pages is accessed without holding kvm->mmu_lock * here. We may skip a VM instance errorneosly, but we do not * want to shrink a VM that only started to populate its MMU * anyway. */ - if (kvm->arch.n_used_mmu_pages > 0) { - if (!nr_to_scan--) - break; + if (!kvm->arch.n_used_mmu_pages) continue; - } idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 42bce48f692..148ed666e31 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -806,7 +806,7 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * kvm-specific. Those are put in the beginning of the list. */ -#define KVM_SAVE_MSRS_BEGIN 9 +#define KVM_SAVE_MSRS_BEGIN 10 static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, @@ -2000,6 +2000,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_KVM_STEAL_TIME: data = vcpu->arch.st.msr_val; break; + case MSR_KVM_PV_EOI_EN: + data = vcpu->arch.pv_eoi.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bf4bda6d3e9..9642d4a3860 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,7 +31,6 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/memblock.h> -#include <linux/syscore_ops.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -1470,130 +1469,38 @@ asmlinkage void __init xen_start_kernel(void) #endif } -#ifdef CONFIG_XEN_PVHVM -/* - * The pfn containing the shared_info is located somewhere in RAM. This - * will cause trouble if the current kernel is doing a kexec boot into a - * new kernel. The new kernel (and its startup code) can not know where - * the pfn is, so it can not reserve the page. The hypervisor will - * continue to update the pfn, and as a result memory corruption occours - * in the new kernel. - * - * One way to work around this issue is to allocate a page in the - * xen-platform pci device's BAR memory range. But pci init is done very - * late and the shared_info page is already in use very early to read - * the pvclock. So moving the pfn from RAM to MMIO is racy because some - * code paths on other vcpus could access the pfn during the small - * window when the old pfn is moved to the new pfn. There is even a - * small window were the old pfn is not backed by a mfn, and during that - * time all reads return -1. - * - * Because it is not known upfront where the MMIO region is located it - * can not be used right from the start in xen_hvm_init_shared_info. - * - * To minimise trouble the move of the pfn is done shortly before kexec. - * This does not eliminate the race because all vcpus are still online - * when the syscore_ops will be called. But hopefully there is no work - * pending at this point in time. Also the syscore_op is run last which - * reduces the risk further. - */ - -static struct shared_info *xen_hvm_shared_info; - -static void xen_hvm_connect_shared_info(unsigned long pfn) +void __ref xen_hvm_init_shared_info(void) { + int cpu; struct xen_add_to_physmap xatp; + static struct shared_info *shared_info_page = 0; + if (!shared_info_page) + shared_info_page = (struct shared_info *) + extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = pfn; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); -} -static void xen_hvm_set_shared_info(struct shared_info *sip) -{ - int cpu; - - HYPERVISOR_shared_info = sip; + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM. - * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_set_shared_info is run at resume time too and + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and * in that case multiple vcpus might be online. */ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -/* Reconnect the shared_info pfn to a mfn */ -void xen_hvm_resume_shared_info(void) -{ - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); -} - -#ifdef CONFIG_KEXEC -static struct shared_info *xen_hvm_shared_info_kexec; -static unsigned long xen_hvm_shared_info_pfn_kexec; - -/* Remember a pfn in MMIO space for kexec reboot */ -void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) -{ - xen_hvm_shared_info_kexec = sip; - xen_hvm_shared_info_pfn_kexec = pfn; -} - -static void xen_hvm_syscore_shutdown(void) -{ - struct xen_memory_reservation reservation = { - .domid = DOMID_SELF, - .nr_extents = 1, - }; - unsigned long prev_pfn; - int rc; - - if (!xen_hvm_shared_info_kexec) - return; - - prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; - set_xen_guest_handle(reservation.extent_start, &prev_pfn); - - /* Move pfn to MMIO, disconnects previous pfn from mfn */ - xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); - - /* Update pointers, following hypercall is also a memory barrier */ - xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); - - /* Allocate new mfn for previous pfn */ - do { - rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); - if (rc == 0) - msleep(123); - } while (rc == 0); - - /* Make sure the previous pfn is really connected to a (new) mfn */ - BUG_ON(rc != 1); -} - -static struct syscore_ops xen_hvm_syscore_ops = { - .shutdown = xen_hvm_syscore_shutdown, -}; -#endif - -/* Use a pfn in RAM, may move to MMIO before kexec. */ -static void __init xen_hvm_init_shared_info(void) -{ - /* Remember pointer for resume */ - xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); - xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); - xen_hvm_set_shared_info(xen_hvm_shared_info); -} - +#ifdef CONFIG_XEN_PVHVM static void __init init_hvm_pv_info(void) { int major, minor; @@ -1644,9 +1551,6 @@ static void __init xen_hvm_guest_init(void) init_hvm_pv_info(); xen_hvm_init_shared_info(); -#ifdef CONFIG_KEXEC - register_syscore_ops(&xen_hvm_syscore_ops); -#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b2e91d40a4c..d4b25546325 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -196,9 +196,11 @@ RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3); /* When we populate back during bootup, the amount of pages can vary. The * max we have is seen is 395979, but that does not mean it can't be more. - * But some machines can have 3GB I/O holes even. So lets reserve enough - * for 4GB of I/O and E820 holes. */ -RESERVE_BRK(p2m_populated, PMD_SIZE * 4); + * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle + * it can re-use Xen provided mfn_list array, so we only need to allocate at + * most three P2M top nodes. */ +RESERVE_BRK(p2m_populated, PAGE_SIZE * 3); + static inline unsigned p2m_top_index(unsigned long pfn) { BUG_ON(pfn >= MAX_P2M_PFN); @@ -575,12 +577,99 @@ static bool __init early_alloc_p2m(unsigned long pfn) } return true; } + +/* + * Skim over the P2M tree looking at pages that are either filled with + * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and + * replace the P2M leaf with a p2m_missing or p2m_identity. + * Stick the old page in the new P2M tree location. + */ +bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +{ + unsigned topidx; + unsigned mididx; + unsigned ident_pfns; + unsigned inv_pfns; + unsigned long *p2m; + unsigned long *mid_mfn_p; + unsigned idx; + unsigned long pfn; + + /* We only look when this entails a P2M middle layer */ + if (p2m_index(set_pfn)) + return false; + + for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { + topidx = p2m_top_index(pfn); + + if (!p2m_top[topidx]) + continue; + + if (p2m_top[topidx] == p2m_mid_missing) + continue; + + mididx = p2m_mid_index(pfn); + p2m = p2m_top[topidx][mididx]; + if (!p2m) + continue; + + if ((p2m == p2m_missing) || (p2m == p2m_identity)) + continue; + + if ((unsigned long)p2m == INVALID_P2M_ENTRY) + continue; + + ident_pfns = 0; + inv_pfns = 0; + for (idx = 0; idx < P2M_PER_PAGE; idx++) { + /* IDENTITY_PFNs are 1:1 */ + if (p2m[idx] == IDENTITY_FRAME(pfn + idx)) + ident_pfns++; + else if (p2m[idx] == INVALID_P2M_ENTRY) + inv_pfns++; + else + break; + } + if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE)) + goto found; + } + return false; +found: + /* Found one, replace old with p2m_identity or p2m_missing */ + p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); + /* And the other for save/restore.. */ + mid_mfn_p = p2m_top_mfn_p[topidx]; + /* NOTE: Even if it is a p2m_identity it should still be point to + * a page filled with INVALID_P2M_ENTRY entries. */ + mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); + + /* Reset where we want to stick the old page in. */ + topidx = p2m_top_index(set_pfn); + mididx = p2m_mid_index(set_pfn); + + /* This shouldn't happen */ + if (WARN_ON(p2m_top[topidx] == p2m_mid_missing)) + early_alloc_p2m(set_pfn); + + if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing)) + return false; + + p2m_init(p2m); + p2m_top[topidx][mididx] = p2m; + mid_mfn_p = p2m_top_mfn_p[topidx]; + mid_mfn_p[mididx] = virt_to_mfn(p2m); + + return true; +} bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(!__set_phys_to_machine(pfn, mfn))) { if (!early_alloc_p2m(pfn)) return false; + if (early_can_reuse_p2m_middle(pfn, mfn)) + return __set_phys_to_machine(pfn, mfn); + if (!early_alloc_p2m_middle(pfn, false /* boundary crossover OK!*/)) return false; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index ead85576d54..d11ca11d14f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -78,9 +78,16 @@ static void __init xen_add_extra_mem(u64 start, u64 size) memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); + for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + if (WARN(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn)) + continue; + WARN(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n", + pfn, mfn); - for (pfn = PFN_DOWN(start); pfn <= xen_max_p2m_pfn; pfn++) __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } } static unsigned long __init xen_do_chunk(unsigned long start, diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index ae8a00c39de..45329c8c226 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_resume_shared_info(); + xen_hvm_init_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e4329e04e0..202d4c15015 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_resume_shared_info(void); +void xen_hvm_init_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); |