From db023ea595015058270be6a62fe60a7b6b5c50d7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 14 Sep 2012 19:05:46 +0200 Subject: uprobes: Move clear_thread_flag(TIF_UPROBE) to uprobe_notify_resume() Move clear_thread_flag(TIF_UPROBE) from do_notify_resume() to uprobe_notify_resume() for !CONFIG_UPROBES case. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/signal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908a376..0041e5a5293 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -785,10 +785,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) mce_notify_process(); #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ - if (thread_info_flags & _TIF_UPROBE) { - clear_thread_flag(TIF_UPROBE); + if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); - } /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) -- cgit v1.2.3-70-g09d2 From 1396adc3c2bdc556d4cdd1cf107aa0b6d59fbb1e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 1 Oct 2012 14:34:42 -0700 Subject: x86, suspend: Correct the restore of CR4, EFER; skip computing EFLAGS.ID The patch: 73201dbe x86, suspend: On wakeup always initialize cr4 and EFER ... was incorrectly committed in an intermediate (unfinished) form. - We need to test CF, not ZF, for a bit test with btl. - We don't actually need to compute the existence of EFLAGS.ID, since we set a flag at suspend time if CR4 should be restored. Signed-off-by: H. Peter Anvin Cc: Rafael J. Wysocki Link: http://lkml.kernel.org/r/1348529239-17943-1-git-send-email-hpa@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/realmode/rm/wakeup_asm.S | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index e56479e5805..9e7e14797a7 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -74,18 +74,9 @@ ENTRY(wakeup_start) lidtl wakeup_idt - /* Clear the EFLAGS but remember if we have EFLAGS.ID */ - movl $X86_EFLAGS_ID, %ecx - pushl %ecx - popfl - pushfl - popl %edi + /* Clear the EFLAGS */ pushl $0 popfl - pushfl - popl %edx - xorl %edx, %edi - andl %ecx, %edi /* %edi is zero iff CPUID & %cr4 are missing */ /* Check header signature... */ movl signature, %eax @@ -120,12 +111,12 @@ ENTRY(wakeup_start) movl %eax, %cr3 btl $WAKEUP_BEHAVIOR_RESTORE_CR4, %edi - jz 1f + jnc 1f movl pmode_cr4, %eax movl %eax, %cr4 1: btl $WAKEUP_BEHAVIOR_RESTORE_EFER, %edi - jz 1f + jnc 1f movl pmode_efer, %eax movl pmode_efer + 4, %edx movl $MSR_EFER, %ecx -- cgit v1.2.3-70-g09d2 From fcd8af585f587741c051f7124b8dee6c73c8629b Mon Sep 17 00:00:00 2001 From: David Hooper Date: Tue, 2 Oct 2012 15:26:53 +0100 Subject: x86/reboot: Remove quirk entry for SBC FITPC Remove the quirk for the SBC FITPC. It seems to have been required when the default was kbd reboot, but is no longer required now that the default is acpi reboot. Furthermore, BIOS reboot no longer works for this board as of 2.6.39 or any of the 3.x kernels.
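[ Note: on a board that does regress without this quirk, the reboot method can still be forced from the kernel command line instead of a DMI table entry -- e.g. "reboot=bios" or "reboot=acpi" -- so a quirk entry is only needed where no such override can be configured. ]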
Signed-off-by: David Hooper Signed-off-by: Alan Cox Link: http://lkml.kernel.org/r/20121002142635.17403.59959.stgit@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 52190a938b4..4e8ba39eaf0 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -358,14 +358,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, - { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */ - .callback = set_bios_reboot, - .ident = "CompuLab SBC-FITPC2", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"), - DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), - }, - }, { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, .ident = "ASUS P4S800", -- cgit v1.2.3-70-g09d2 From 961c79761dda351b5fb263a0654b98daac130b7a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 2 Oct 2012 11:34:09 +0300 Subject: x86/cache_info: Use ARRAY_SIZE() in amd_l3_attrs() Using ARRAY_SIZE() is more readable. Signed-off-by: Dan Carpenter Reviewed-by: Srivatsa S. Bhat Cc: Shai Fultheim Cc: Greg Kroah-Hartman Cc: Andreas Herrmann Link: http://lkml.kernel.org/r/20121002083409.GM12398@elgon.mountain Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 9a7c90d80bc..93c5451bdd5 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -991,7 +991,7 @@ static struct attribute ** __cpuinit amd_l3_attrs(void) if (attrs) return attrs; - n = sizeof (default_attrs) / sizeof (struct attribute *); + n = ARRAY_SIZE(default_attrs); if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) n += 2; -- cgit v1.2.3-70-g09d2 From b64b9c937a533f0bfbfc9f6ac93d3c3e2f97ab02 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 29 Sep 2012 21:31:08 +0200 Subject: uprobes/x86: Only rep+nop can be emulated correctly __skip_sstep() correctly detects the "nontrivial" nop insns, but since it doesn't update regs->ip we can not really skip "0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0", the probed application is killed by SIGILL'ed handle_swbp(). Remove these additional checks. If we want to implement this correctly we need to know the full insn length to update ->ip. rep* + nop is fine even without updating ->ip. Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- arch/x86/kernel/uprobes.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 9538f00827a..aafa5557b39 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -651,31 +651,19 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) /* * Skip these instructions as per the currently known x86 ISA. 
- * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 } + * rep=0x66*; nop=0x90 */ static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) { int i; for (i = 0; i < MAX_UINSN_BYTES; i++) { - if ((auprobe->insn[i] == 0x66)) + if (auprobe->insn[i] == 0x66) continue; if (auprobe->insn[i] == 0x90) return true; - if (i == (MAX_UINSN_BYTES - 1)) - break; - - if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x1f)) - return true; - - if ((auprobe->insn[i] == 0x0f) && (auprobe->insn[i+1] == 0x19)) - return true; - - if ((auprobe->insn[i] == 0x87) && (auprobe->insn[i+1] == 0xc0)) - return true; - break; } return false; -- cgit v1.2.3-70-g09d2 From 44009105081b51417f311f4c3be0061870b6b8ed Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Oct 2012 10:18:35 +0300 Subject: oprofile, x86: Fix wrapping bug in op_x86_get_ctrl() The "event" variable is a u16 so the shift will always wrap to zero making the line a no-op. Signed-off-by: Dan Carpenter Cc: v2.6.32.. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 26b8a8514ee..48768df2471 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -55,7 +55,7 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, val |= counter_config->extra; event &= model->event_mask ? model->event_mask : 0xFF; val |= event & 0xFF; - val |= (event & 0x0F00) << 24; + val |= (u64)(event & 0x0F00) << 24; return val; } -- cgit v1.2.3-70-g09d2 From 3ce9e53e788881da0d5f3912f80e0dd6b501f304 Mon Sep 17 00:00:00 2001 From: Michal Marek Date: Mon, 15 Oct 2012 21:16:56 +0200 Subject: kbuild: Fix accidental revert in commit fe04ddf Commit fe04ddf7c291 ("kbuild: Do not package /boot and /lib in make tar-pkg") accidentally reverted two previous kbuild commits. I don't know what I was thinking. This brings back changes made by commits 24cc7fb69a5b ("x86/kbuild: archscripts depends on scripts_basic") and c1c1a59e37da ("firmware: fix directory creation rule matching with make 3.80") Reported-by: Jan Beulich Cc: Signed-off-by: Michal Marek Signed-off-by: Linus Torvalds --- arch/x86/Makefile | 2 +- scripts/Makefile.fwinst | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 58790bd85c1..05afcca66de 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -142,7 +142,7 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) -archscripts: +archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs ### diff --git a/scripts/Makefile.fwinst b/scripts/Makefile.fwinst index c3f69ae275d..4d908d16c03 100644 --- a/scripts/Makefile.fwinst +++ b/scripts/Makefile.fwinst @@ -27,7 +27,7 @@ endif installed-mod-fw := $(addprefix $(INSTALL_FW_PATH)/,$(mod-fw)) installed-fw := $(addprefix $(INSTALL_FW_PATH)/,$(fw-shipped-all)) -installed-fw-dirs := $(sort $(dir $(installed-fw))) $(INSTALL_FW_PATH)/. +installed-fw-dirs := $(sort $(dir $(installed-fw))) $(INSTALL_FW_PATH)/./ # Workaround for make < 3.81, where .SECONDEXPANSION doesn't work. 
PHONY += $(INSTALL_FW_PATH)/$$(%) install-all-dirs @@ -42,7 +42,7 @@ quiet_cmd_install = INSTALL $(subst $(srctree)/,,$@) $(installed-fw-dirs): $(call cmd,mkdir) -$(installed-fw): $(INSTALL_FW_PATH)/%: $(obj)/% | $$(dir $(INSTALL_FW_PATH)/%) +$(installed-fw): $(INSTALL_FW_PATH)/%: $(obj)/% | $(INSTALL_FW_PATH)/$$(dir %) $(call cmd,install) PHONY += __fw_install __fw_modinst FORCE -- cgit v1.2.3-70-g09d2 From 20b279ddb38ca42f8863cec07b4d45ec24589f13 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Sep 2012 14:59:14 -0600 Subject: perf: Require exclude_guest to use PEBS - kernel side enforcement Intel PEBS in VT-x context uses the DS address as a guest linear address, even though it's programmed by the host as a host linear address. This either results in guest memory corruption and/or the hardware faulting and 'crashing' the virtual machine. Therefore we have to disable PEBS on VT-x enter and re-enable on VT-x exit, enforcing a strict exclude_guest. This patch enforces exclude_guest kernel side. Signed-off-by: Peter Zijlstra Cc: Avi Kivity Cc: David Ahern Cc: Gleb Natapov Cc: Ingo Molnar Cc: Robert Richter Link: http://lkml.kernel.org/r/1347569955-54626-3-git-send-email-dsahern@gmail.com Signed-off-by: David Ahern Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 915b876edd1..3373f84d139 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -338,6 +338,9 @@ int x86_setup_perfctr(struct perf_event *event) /* BTS is currently only allowed for user-mode. */ if (!attr->exclude_kernel) return -EOPNOTSUPP; + + if (!attr->exclude_guest) + return -EOPNOTSUPP; } hwc->config |= config; @@ -380,6 +383,9 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip) { int precise = 0; + if (!event->attr.exclude_guest) + return -EOPNOTSUPP; + /* Support for constant skid */ if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; -- cgit v1.2.3-70-g09d2 From 1bbbbe779aabe1f0768c2bf8f8c0a5583679b54a Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Thu, 20 Oct 2011 16:15:26 -0500 Subject: x86: Exclude E820_RESERVED regions and memory holes above 4 GB from direct mapping. On systems with very large memory (1 TB in our case), BIOS may report a reserved region or a hole in the E820 map, even above the 4 GB range. Exclude these from the direct mapping. [ hpa: this should be done not just for > 4 GB but for everything above the legacy region (1 MB), at the very least. That, however, turns out to require significant restructuring. That work is well underway, but is not suitable for rc/stable. ] Cc: stable@kernel.org # > 2.6.32 Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/1319145326-13902-1-git-send-email-jacob.shin@amd.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80e1b9..198e774498e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -919,8 +919,21 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<<PAGE_SHIFT); + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->addr + ei->size <= 1UL << 32) + continue; + + if (ei->type == E820_RESERVED) + continue; + + max_pfn_mapped = init_memory_mapping( + ei->addr < 1UL << 32 ?
1UL << 32 : ei->addr, + ei->addr + ei->size); + } + /* can we preseve max_low_pfn ?*/ max_low_pfn = max_pfn; } -- cgit v1.2.3-70-g09d2 From 21c5e50e15b1abd797e62f18fd7f90b9cc004cbd Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 1 Oct 2012 14:42:05 +0800 Subject: x86, amd, mce: Avoid NULL pointer reference on CPU northbridge lookup When booting on a federated multi-server system (NumaScale), the processor Northbridge lookup returns NULL; add guards to prevent this from causing an oops. On those systems, the northbridge is accessed through MMIO and the "normal" northbridge enumeration in amd_nb.c doesn't work since we're generating the northbridge ID from the initial APIC ID and the latter is not unique on those systems. Long story short, we end up without northbridge descriptors. Signed-off-by: Daniel J Blueman Cc: stable@vger.kernel.org # 3.6 Link: http://lkml.kernel.org/r/1349073725-14093-1-git-send-email-daniel@numascale-asia.com [ Boris: beef up commit message ] Signed-off-by: Borislav Petkov Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index c4e916d7737..698b6ec12e0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -576,12 +576,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int err = 0; if (shared_bank[bank]) { - nb = node_to_amd_nb(amd_get_nb_id(cpu)); - WARN_ON(!nb); /* threshold descriptor already initialized on this node? */ - if (nb->bank4) { + if (nb && nb->bank4) { /* yes, use it */ b = nb->bank4; err = kobject_add(b->kobj, &dev->kobj, name); @@ -615,8 +613,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) atomic_set(&b->cpus, 1); /* nb is already initialized, see above */ - WARN_ON(nb->bank4); - nb->bank4 = b; + if (nb) { + WARN_ON(nb->bank4); + nb->bank4 = b; + } } err = allocate_threshold_blocks(cpu, bank, 0, -- cgit v1.2.3-70-g09d2 From 32bec973a8435afc0b032da22174701f836549b2 Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Thu, 18 Oct 2012 23:24:57 +0300 Subject: crypto: aesni - fix XTS mode on x86-32, add wrapper function for asmlinkage aesni_enc() Calling convention for internal functions and 'asmlinkage' functions is different on x86-32. Therefore do not directly cast aesni_enc as the XTS tweak function, but use a wrapper function in between. Fixes crash with "XTS + aesni_intel + x86-32" combination. Cc: stable@vger.kernel.org Reported-by: Krzysztof Kolasa Signed-off-by: Jussi Kivilinna Acked-by: David S.
Miller Signed-off-by: Linus Torvalds --- arch/x86/crypto/aesni-intel_glue.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 7c04d0da709..1b9c22bea8a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -515,6 +515,11 @@ static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, } +static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) +{ + aesni_enc(ctx, out, in); +} + static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { @@ -525,7 +530,7 @@ static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, .tbuflen = sizeof(buf), .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = XTS_TWEAK_CAST(aesni_enc), + .tweak_fn = aesni_xts_tweak, .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), .crypt_fn = lrw_xts_encrypt_callback, }; @@ -550,7 +555,7 @@ static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, .tbuflen = sizeof(buf), .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), - .tweak_fn = XTS_TWEAK_CAST(aesni_enc), + .tweak_fn = aesni_xts_tweak, .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), .crypt_fn = lrw_xts_decrypt_callback, }; -- cgit v1.2.3-70-g09d2 From 5bc66170dc486556a1e36fd384463536573f4b82 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Oct 2012 15:10:56 +0200 Subject: x86, MCE: Remove bios_cmci_threshold sysfs attribute 450cc201038f3 ("x86/mce: Provide boot argument to honour bios-set CMCI threshold") added the bios_cmci_threshold sysfs attribute which was supposed to communicate to userspace tools that BIOS CMCI threshold has been honoured. However, this info is not of any importance to userspace - it should rather get the actual error count it has been thresholded already from MCi_STATUS[38:52]. So drop this before it becomes a used interface (good thing we caught this early in 3.7-rc1, right after the merge window closed). Cc: Naveen N. Rao Acked-by: Tony Luck Link: http://lkml.kernel.org/r/20121017105940.GA14590@x1.osrc.amd.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 29e87d3b284..46cbf868969 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2209,11 +2209,6 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = { &mce_cmci_disabled }; -static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { - __ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), - &mce_bios_cmci_threshold -}; - static struct device_attribute *mce_device_attrs[] = { &dev_attr_tolerant.attr, &dev_attr_check_interval.attr, @@ -2222,7 +2217,6 @@ static struct device_attribute *mce_device_attrs[] = { &dev_attr_dont_log_ce.attr, &dev_attr_ignore_ce.attr, &dev_attr_cmci_disabled.attr, - &dev_attr_bios_cmci_threshold.attr, NULL }; -- cgit v1.2.3-70-g09d2 From c2103b7ef7ecb4d17138b6ced4aad1b576affc4e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 7 Oct 2012 22:06:47 +0800 Subject: xen/x86: remove duplicated include from enlighten.c Remove duplicated include. dpatch engine is used to auto generate this patch. 
(https://github.com/weiyj/dpatch) CC: stable@vger.kernel.org Signed-off-by: Wei Yongjun Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4466feb4c69..783522b66b0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -81,8 +81,6 @@ #include "smp.h" #include "multicalls.h" -#include - EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -- cgit v1.2.3-70-g09d2 From 37ea0fcb6a3f3318bf45888e624722a2945cec04 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 17 Oct 2012 09:39:10 +0100 Subject: xen: sysfs: fix build warning. Define PRI macros for xen_ulong_t and xen_pfn_t and use to fix: drivers/xen/sys-hypervisor.c:288:4: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'xen_ulong_t' [-Wformat] Ideally this would use PRIx64 on ARM but these (or equivalent) don't seem to be available in the kernel. Acked-by: Stefano Stabellini Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/arm/include/asm/xen/interface.h | 2 ++ arch/x86/include/asm/xen/interface.h | 2 ++ drivers/xen/sys-hypervisor.c | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index ae05e56dd17..62160f259b0 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -31,7 +31,9 @@ /* Explicitly size integers that represent pfns in the interface with * Xen so that we can have one ABI that works for 32 and 64 bit guests. */ typedef uint64_t xen_pfn_t; +#define PRI_xen_pfn "llx" typedef uint64_t xen_ulong_t; +#define PRI_xen_ulong "llx" /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 28fc6211a79..c5b13e76d73 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -51,7 +51,9 @@ * with Xen so that on ARM we can have one ABI that works for 32 and 64 * bit guests. */ typedef unsigned long xen_pfn_t; +#define PRI_xen_pfn "lx" typedef unsigned long xen_ulong_t; +#define PRI_xen_ulong "lx" /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 66a0a1475ac..96453f8a85c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -285,7 +285,8 @@ static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) ret = HYPERVISOR_xen_version(XENVER_platform_parameters, parms); if (!ret) - ret = sprintf(buffer, "%lx\n", parms->virt_start); + ret = sprintf(buffer, "%"PRI_xen_ulong"\n", + parms->virt_start); kfree(parms); } -- cgit v1.2.3-70-g09d2 From ef32f89298c094b6ed76c0c4981b7a51e939cb71 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 17 Oct 2012 09:39:14 +0100 Subject: xen: grant: use xen_pfn_t type for frame_list. This correctly sizes it as 64 bit on ARM but leaves it as unsigned long on x86 (therefore no intended change on x86). The long and ulong guest handles are now unused (and a bit dangerous) so remove them. 
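To illustrate the underlying ABI point, a stand-alone user-space sketch (not part of the patch; the typedef mirrors the ARM interface header above):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t xen_pfn_t;	/* explicitly sized, as on ARM */

int main(void)
{
	/* On a 32-bit ARM guest this prints 4 vs 8: a frame list built from
	 * "ulong" elements would not match the layout a 64-bit-capable
	 * hypervisor interface expects, hence the fixed-width type. */
	printf("unsigned long: %zu bytes\n", sizeof(unsigned long));
	printf("xen_pfn_t:     %zu bytes\n", sizeof(xen_pfn_t));
	return 0;
}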
Acked-by: Stefano Stabellini Signed-off-by: Ian Campbell Signed-off-by: Konrad Rzeszutek Wilk --- arch/arm/include/asm/xen/interface.h | 2 -- arch/arm/xen/grant-table.c | 2 +- arch/x86/include/asm/xen/interface.h | 2 -- drivers/xen/grant-table.c | 8 ++++---- include/xen/grant_table.h | 2 +- include/xen/interface/grant_table.h | 2 +- 6 files changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index 62160f259b0..1d6ef9c2d1d 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -37,10 +37,8 @@ typedef uint64_t xen_ulong_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); DEFINE_GUEST_HANDLE(char); DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); diff --git a/arch/arm/xen/grant-table.c b/arch/arm/xen/grant-table.c index dbd1330c019..859a9bb002d 100644 --- a/arch/arm/xen/grant-table.c +++ b/arch/arm/xen/grant-table.c @@ -33,7 +33,7 @@ #include #include -int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, +int arch_gnttab_map_shared(xen_pfn_t *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, void **__shared) { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index c5b13e76d73..ca9487d4f17 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -57,10 +57,8 @@ typedef unsigned long xen_ulong_t; /* Guest handles for primitive C types. */ __DEFINE_GUEST_HANDLE(uchar, unsigned char); __DEFINE_GUEST_HANDLE(uint, unsigned int); -__DEFINE_GUEST_HANDLE(ulong, unsigned long); DEFINE_GUEST_HANDLE(char); DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); DEFINE_GUEST_HANDLE(uint32_t); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3a567b15600..39aefa89011 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -84,7 +84,7 @@ struct gnttab_ops { * nr_gframes is the number of frames to map grant table. Returning * GNTST_okay means success and negative value means failure. */ - int (*map_frames)(unsigned long *frames, unsigned int nr_gframes); + int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); /* * Release a list of frames which are mapped in map_frames for grant * entry status. 
@@ -958,7 +958,7 @@ static unsigned nr_status_frames(unsigned nr_grant_frames) return (nr_grant_frames * GREFS_PER_GRANT_FRAME + SPP - 1) / SPP; } -static int gnttab_map_frames_v1(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) { int rc; @@ -975,7 +975,7 @@ static void gnttab_unmap_frames_v1(void) arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); } -static int gnttab_map_frames_v2(unsigned long *frames, unsigned int nr_gframes) +static int gnttab_map_frames_v2(xen_pfn_t *frames, unsigned int nr_gframes) { uint64_t *sframes; unsigned int nr_sframes; @@ -1027,7 +1027,7 @@ static void gnttab_unmap_frames_v2(void) static int gnttab_map(unsigned int start_idx, unsigned int end_idx) { struct gnttab_setup_table setup; - unsigned long *frames; + xen_pfn_t *frames; unsigned int nr_gframes = end_idx + 1; int rc; diff --git a/include/xen/grant_table.h b/include/xen/grant_table.h index ba0d77529a2..962b7df5eab 100644 --- a/include/xen/grant_table.h +++ b/include/xen/grant_table.h @@ -170,7 +170,7 @@ gnttab_set_unmap_op(struct gnttab_unmap_grant_ref *unmap, phys_addr_t addr, unmap->dev_bus_addr = 0; } -int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, +int arch_gnttab_map_shared(xen_pfn_t *frames, unsigned long nr_gframes, unsigned long max_nr_gframes, void **__shared); int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes, diff --git a/include/xen/interface/grant_table.h b/include/xen/interface/grant_table.h index f9f8b975ae7..e40fae9bf11 100644 --- a/include/xen/interface/grant_table.h +++ b/include/xen/interface/grant_table.h @@ -310,7 +310,7 @@ struct gnttab_setup_table { uint32_t nr_frames; /* OUT parameters. */ int16_t status; /* GNTST_* */ - GUEST_HANDLE(ulong) frame_list; + GUEST_HANDLE(xen_pfn_t) frame_list; }; DEFINE_GUEST_HANDLE_STRUCT(gnttab_setup_table); -- cgit v1.2.3-70-g09d2 From a349e23d1cf746f8bdc603dcc61fae9ee4a695f6 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Fri, 19 Oct 2012 17:29:07 +0100 Subject: xen/x86: don't corrupt %eip when returning from a signal handler In 32 bit guests, if a userspace process has %eax == -ERESTARTSYS (-512) or -ERESTARTNOINTR (-513) when it is interrupted by an event /and/ the process has a pending signal then %eip (and %eax) are corrupted when returning to the main process after handling the signal. The application may then crash with SIGSEGV or SIGILL or it may have subtly incorrect behaviour (depending on what instruction it returned to). This occurs because handle_signal() incorrectly thinks that there is a system call that needs to be restarted, so it adjusts %eip and %eax to re-execute the system call instruction (even though user space had not done a system call). If %eax == -ERESTARTNOHAND (-514) or -ERESTART_RESTARTBLOCK (-516), then handle_signal() only corrupted %eax (by setting it to -EINTR). This may cause the application to crash or have incorrect behaviour. handle_signal() assumes that regs->orig_ax >= 0 means a system call, so any kernel entry point that is not for a system call must push a negative value for orig_ax. For example, for physical interrupts on bare metal the inverse of the vector is pushed and page_fault() sets regs->orig_ax to -1, overwriting the hardware provided error code. xen_hypervisor_callback() was incorrectly pushing 0 for orig_ax instead of -1.
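A user-space model of the restart test involved (a simplified sketch assuming the 3.x-era logic and a 2-byte syscall instruction; not the actual kernel code):

#include <stdio.h>

#define ERESTARTSYS	512

/* Toy model: if orig_ax >= 0 the kernel believes it interrupted a system
 * call and rewinds ip over the (2-byte) syscall instruction. */
static void handle_signal_model(long orig_ax, long *ax, long *ip)
{
	if (orig_ax >= 0 && *ax == -ERESTARTSYS) {
		*ax = orig_ax;	/* re-arm the syscall */
		*ip -= 2;	/* back up over "int $0x80" */
	}
}

int main(void)
{
	long ax = -ERESTARTSYS, ip = 0x1000;

	handle_signal_model(-1, &ax, &ip);	/* orig_ax = -1: left untouched */
	printf("orig_ax=-1: ip=%#lx ax=%ld\n", ip, ax);

	handle_signal_model(0, &ax, &ip);	/* orig_ax = 0: ip/ax corrupted */
	printf("orig_ax= 0: ip=%#lx ax=%ld\n", ip, ax);
	return 0;
}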
Classic Xen kernels pushed %eax which works as %eax cannot be both non-negative and -RESTARTSYS (etc.), but using -1 is consistent with other non-system call entry points and avoids some of the tests in handle_signal(). There were similar bugs in xen_failsafe_callback() of both 32 and 64-bit guests. If the fault was corrected and the normal return path was used then 0 was incorrectly pushed as the value for orig_ax. Signed-off-by: David Vrabel Acked-by: Jan Beulich Acked-by: Ian Campbell Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/kernel/entry_32.S | 8 +++++--- arch/x86/kernel/entry_64.S | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f2883747..8f8e8eea908 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1016,7 +1016,7 @@ ENTRY(xen_sysenter_target) ENTRY(xen_hypervisor_callback) CFI_STARTPROC - pushl_cfi $0 + pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL TRACE_IRQS_OFF @@ -1058,14 +1058,16 @@ ENTRY(xen_failsafe_callback) 2: mov 8(%esp),%es 3: mov 12(%esp),%fs 4: mov 16(%esp),%gs + /* EAX == 0 => Category 1 (Bad segment) EAX != 0 => Category 2 (Bad IRET) */ testl %eax,%eax popl_cfi %eax lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f addl $16,%esp - jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) -5: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment) + jmp iret_exc +5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp ret_from_exception CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8c834..dcdd0ea33a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1363,7 +1363,7 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - pushq_cfi $0 + pushq_cfi $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL jmp error_exit CFI_ENDPROC -- cgit v1.2.3-70-g09d2 From a05123bdd1b9ba961ed262864924a5b3ee81afe8 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 21 Aug 2012 17:08:37 +0800 Subject: perf/x86: Disable uncore on virtualized CPUs Initializing the uncore PMU on a virtualized CPU may hang the kernel. This is because kvm does not emulate the entire hardware. There are lots of uncore-related MSRs, and making kvm enumerate them all is a non-trivial task. So just disable uncore on virtualized CPUs.
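For reference, a stand-alone sketch of what the cpu_has_hypervisor test reduces to -- CPUID leaf 1, ECX bit 31, which is reserved (clear) on bare metal and set by hypervisors for their guests (illustrative only; assumes GCC/Clang on x86):

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang builtin wrapper */

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;
	/* Bit 31 of ECX: the "hypervisor present" bit. */
	printf("running under a hypervisor: %s\n",
	       (ecx & (1u << 31)) ? "yes" : "no");
	return 0;
}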
Signed-off-by: Yan, Zheng Tested-by: Pekka Enberg Cc: a.p.zijlstra@chello.nl Cc: eranian@google.com Cc: andi@firstfloor.org Cc: avi@redhat.com Link: http://lkml.kernel.org/r/1345540117-14164-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 99d96a4978b..5df8d32ba91 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2926,6 +2926,9 @@ static int __init intel_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -ENODEV; + if (cpu_has_hypervisor) + return -ENODEV; + ret = uncore_pci_init(); if (ret) goto fail; -- cgit v1.2.3-70-g09d2 From f3ac1a4b667eeffcedf779f45529c95d66ddc71a Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 16 Oct 2012 20:07:03 +0800 Subject: KVM: MMU: fix release noslot pfn We can not directly call kvm_release_pfn_clean to release the pfn since we can meet noslot pfn which is used to cache mmio info into spte Signed-off-by: Xiao Guangrong Cc: stable@vger.kernel.org Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +-- virt/kvm/kvm_main.c | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d289fee1ffb..6f85fe0bf95 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2497,8 +2497,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (!is_error_pfn(pfn)) - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(pfn); } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c353b4599ce..a65bc02a375 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1322,9 +1322,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(pfn_t pfn) { - WARN_ON(is_error_pfn(pfn)); - - if (!kvm_is_mmio_pfn(pfn)) + if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) put_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); -- cgit v1.2.3-70-g09d2 From 7f46ddbd487e0d0528d89534fdfb31d885977804 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 14 Oct 2012 13:08:58 +0200 Subject: KVM: apic: fix LDR calculation in x2apic mode Signed-off-by: Gleb Natapov Reviewed-by: Chegu Vinod Tested-by: Chegu Vinod Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c6e6b721b6e..43e9fadca5d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1311,7 +1311,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if (apic_x2apic_mode(apic)) { u32 id = kvm_apic_id(apic); - u32 ldr = ((id & ~0xf) << 16) | (1 << (id & 0xf)); + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); kvm_apic_set_ldr(apic, ldr); } apic->base_address = apic->vcpu->arch.apic_base & -- cgit v1.2.3-70-g09d2 From c5e015d4949aa665c486cae6884beb00b97e3dea Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 19 Oct 2012 12:11:55 -0400 Subject: KVM guest: exit idleness when handling KVM_PV_REASON_PAGE_NOT_PRESENT KVM_PV_REASON_PAGE_NOT_PRESENT kicks cpu out of idleness, but we haven't marked that spot as an exit from idleness. 
Not doing so can cause RCU warnings such as: [ 732.788386] =============================== [ 732.789803] [ INFO: suspicious RCU usage. ] [ 732.790032] 3.7.0-rc1-next-20121019-sasha-00002-g6d8d02d-dirty #63 Tainted: G W [ 732.790032] ------------------------------- [ 732.790032] include/linux/rcupdate.h:738 rcu_read_lock() used illegally while idle! [ 732.790032] [ 732.790032] other info that might help us debug this: [ 732.790032] [ 732.790032] [ 732.790032] RCU used illegally from idle CPU! [ 732.790032] rcu_scheduler_active = 1, debug_locks = 1 [ 732.790032] RCU used illegally from extended quiescent state! [ 732.790032] 2 locks held by trinity-child31/8252: [ 732.790032] #0: (&rq->lock){-.-.-.}, at: [] __schedule+0x178/0x8f0 [ 732.790032] #1: (rcu_read_lock){.+.+..}, at: [] cpuacct_charge+0xe/0x200 [ 732.790032] [ 732.790032] stack backtrace: [ 732.790032] Pid: 8252, comm: trinity-child31 Tainted: G W 3.7.0-rc1-next-20121019-sasha-00002-g6d8d02d-dirty #63 [ 732.790032] Call Trace: [ 732.790032] [] lockdep_rcu_suspicious+0x10b/0x120 [ 732.790032] [] cpuacct_charge+0x90/0x200 [ 732.790032] [] ? cpuacct_charge+0xe/0x200 [ 732.790032] [] update_curr+0x1a3/0x270 [ 732.790032] [] dequeue_entity+0x2a/0x210 [ 732.790032] [] dequeue_task_fair+0x45/0x130 [ 732.790032] [] dequeue_task+0x89/0xa0 [ 732.790032] [] deactivate_task+0x1e/0x20 [ 732.790032] [] __schedule+0x879/0x8f0 [ 732.790032] [] ? trace_hardirqs_off+0xd/0x10 [ 732.790032] [] ? kvm_async_pf_task_wait+0x1d5/0x2b0 [ 732.790032] [] schedule+0x55/0x60 [ 732.790032] [] kvm_async_pf_task_wait+0x1f4/0x2b0 [ 732.790032] [] ? abort_exclusive_wait+0xb0/0xb0 [ 732.790032] [] ? prepare_to_wait+0x25/0x90 [ 732.790032] [] do_async_page_fault+0x56/0xa0 [ 732.790032] [] async_page_fault+0x28/0x30 Signed-off-by: Sasha Levin Acked-by: Gleb Natapov Acked-by: Paul E. McKenney Signed-off-by: Avi Kivity --- arch/x86/kernel/kvm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b3e5e51bc90..4180a874c76 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -247,7 +247,10 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ + rcu_irq_enter(); + exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); + rcu_irq_exit(); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); -- cgit v1.2.3-70-g09d2 From bffd5fc26043cce33158d4e027576e79fab2f7bb Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Tue, 9 Oct 2012 17:38:35 +0200 Subject: x86/perf: Fix virtualization sanity check In check_hw_exists() we try to detect non-emulated MSR accesses by writing an arbitrary value into one of the PMU registers and check if it's value after a readout is still the same. This algorithm silently assumes that the register does not contain the magic value already, which is wrong in at least one situation. Fix the algorithm to really do a read-modify-write cycle. This fixes a warning under Xen under some circumstances on AMD family 10h CPUs. The reasons in more details actually sound like a story from Believe It or Not!: First you need an AMD family 10h/12h CPU. These do not reset the PERF_CTR registers on a reboot. Now you boot bare metal Linux, which goes successfully through this check, but leaves the magic value of 0xabcd in the register. You don't use the performance counters, but do a reboot (warm reset). Then you choose to boot Xen. 
The check will be triggered with a recent Linux kernel as Dom0 again, trying to write 0xabcd into the MSR. Xen silently drops the write (expected), but the subsequent read will return the value in the register, which just happens to be the expected magic value. Thus the test misleadingly succeeds, leaving the kernel in the belief that the PMU is available. This will trigger the following message: [ 0.020294] ------------[ cut here ]------------ [ 0.020311] WARNING: at arch/x86/xen/enlighten.c:730 xen_apic_write+0x15/0x17() [ 0.020318] Hardware name: empty [ 0.020323] Modules linked in: [ 0.020334] Pid: 1, comm: swapper/0 Not tainted 3.3.8 #7 [ 0.020340] Call Trace: [ 0.020354] [] warn_slowpath_common+0x80/0x98 [ 0.020369] [] warn_slowpath_null+0x15/0x17 [ 0.020378] [] xen_apic_write+0x15/0x17 [ 0.020392] [] perf_events_lapic_init+0x2e/0x30 [ 0.020410] [] init_hw_perf_events+0x250/0x407 [ 0.020419] [] ? check_bugs+0x2d/0x2d [ 0.020430] [] do_one_initcall+0x7a/0x131 [ 0.020444] [] kernel_init+0x91/0x15d [ 0.020456] [] kernel_thread_helper+0x4/0x10 [ 0.020471] [] ? retint_restore_args+0x5/0x6 [ 0.020481] [] ? gs_change+0x13/0x13 [ 0.020500] ---[ end trace a7919e7f17c0a725 ]--- The new code flips each of the 16 low bits of the value read from the register and tries to write and read back that modified number from the MSR. Signed-off-by: Andre Przywara Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Avi Kivity Link: http://lkml.kernel.org/r/1349797115-28346-2-git-send-email-andre.przywara@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3373f84d139..4a3374e61a9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -208,12 +208,14 @@ static bool check_hw_exists(void) } /* - * Now write a value and read it back to see if it matches, - * this is needed to detect certain hardware emulators (qemu/kvm) - * that don't trap on the MSR access and always return 0s. + * Read the current value, change it and read it back to see if it + * matches, this is needed to detect certain hardware emulators + * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - val = 0xabcdUL; reg = x86_pmu_event_addr(0); + if (rdmsrl_safe(reg, &val)) + goto msr_fail; + val ^= 0xffffUL; ret = wrmsrl_safe(reg, val); ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) -- cgit v1.2.3-70-g09d2 From 7b16bbf97375d9fb7fc107b3f80afeb94a204e44 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 18 Oct 2012 14:33:23 +0800 Subject: Revert "x86/mm: Fix the size calculation of mapping tables" Commit 722bc6b16771 ("x86/mm: Fix the size calculation of mapping tables") tried to address the issue that the first 2/4M should use 4k pages if PSE is enabled, but the extra counts should only be valid for x86_32. This commit caused a kdump regression: the kdump kernel hangs. Work is in progress to fundamentally fix the various page table initialization issues that we have, via the design suggested by H. Peter Anvin, but it's not ready yet to be merged. So, to get a working kdump, revert to the last known working version, which is the revert of this commit and of a followup fix (which was incomplete): bd2753b2dda7 ("x86/mm: Only add extra pages count for the first memory range during pre-allocation"). Tested kdump on physical and virtual machines.
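For background, the kind of estimate find_early_table_space() makes can be sketched in user space (simplified to the pure-4K case; the real code also accounts for the PMD/PUD levels, PSE and gbpages):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PTRS_PER_PTE	512	/* 8-byte PTEs per 4K table page */

int main(void)
{
	uint64_t end = 1ULL << 32;	/* map the first 4 GB */
	uint64_t ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
	uint64_t tables = (ptes + PTRS_PER_PTE - 1) / PTRS_PER_PTE;

	/* 1048576 PTEs in 2048 table pages = 8192 KB reserved for 4 GB */
	printf("%llu PTEs in %llu tables (%llu KB)\n",
	       (unsigned long long)ptes, (unsigned long long)tables,
	       (unsigned long long)(tables * PAGE_SIZE / 1024));
	return 0;
}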
Signed-off-by: Dave Young Acked-by: Yinghai Lu Acked-by: Cong Wang Acked-by: Flavio Leitner Tested-by: Flavio Leitner Cc: Dan Carpenter Cc: Cong Wang Cc: Flavio Leitner Cc: Tejun Heo Cc: ianfang.cn@gmail.com Cc: Vivek Goyal Cc: Linus Torvalds Cc: Andrew Morton Cc: Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ab1f6a93b52..8653b3a722b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,14 +29,8 @@ int direct_gbpages #endif ; -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - -static void __init find_early_table_space(struct map_range *mr, unsigned long end, - int use_pse, int use_gbpages) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; phys_addr_t base; @@ -61,10 +55,6 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en #ifdef CONFIG_X86_32 extra += PMD_SIZE; #endif - /* The first 2/4M doesn't use large pages. */ - if (mr->start < PMD_SIZE) - extra += mr->end - mr->start; - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -95,6 +85,12 @@ void __init native_pagetable_reserve(u64 start, u64 end) memblock_reserve(start, end - start); } +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -267,7 +263,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(&mr[0], end, use_pse, use_gbpages); + find_early_table_space(end, use_pse, use_gbpages); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, -- cgit v1.2.3-70-g09d2 From 7991c9ca40d3127dd2ffa3a9c1e33f7d4005495a Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:30:01 -0400 Subject: perf/x86: Fix P6 FP_ASSIST event constraint According to Intel SDM Volume 3B, FP_ASSIST is limited to Counter 1 only, not Counter 0. Tested on a Pentium II. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191728570.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index e4dd0f7a045..0ff5f7fb64c 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -34,7 +34,7 @@ static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ - INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ -- cgit v1.2.3-70-g09d2 From e09df47885d767e418902067ce1885aafa3b27db Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:31:54 -0400 Subject: perf/x86: Update/fix generic events on P6 PMU This patch updates the generic events on p6, including some new extended cache events. 
Values for these events were taken from the equivalent PAPI predefined events. Tested on a Pentium II. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191730080.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 111 +++++++++++++++++++++++++++++++++--- 1 file changed, 104 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 0ff5f7fb64c..9582fcbcd8e 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -8,13 +8,106 @@ */ static const u64 p6_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */ + +}; + +static __initconst u64 p6_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */ + [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [
C(RESULT_MISS) ] = -1, + }, + }, }; static u64 p6_pmu_event_map(int hw_event) @@ -158,5 +251,9 @@ __init int p6_pmu_init(void) x86_pmu = p6_pmu; + memcpy(hw_cache_event_ids, p6_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; } -- cgit v1.2.3-70-g09d2 From 58e9eaf06f5476cb2192ec1d012674ce5e79dd21 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 19 Oct 2012 17:33:38 -0400 Subject: perf/x86: Remove P6 cpuc->enabled check Between 2.6.33 and 2.6.34 the PMU code was made modular. The x86_pmu_enable() call was extended to disable cpuc->enabled and iterate the counters, enabling one at a time, before calling enable_all() at the end, followed by re-enabling cpuc->enabled. Since cpuc->enabled was set to 0, that change effectively caused the "val |= ARCH_PERFMON_EVENTSEL_ENABLE;" code in p6_pmu_enable_event() and p6_pmu_disable_event() to be dead code that was never called. This change removes this code (which was confusing) and adds some extra commentary to make it more clear what is going on. Signed-off-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210191732000.14552@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_p6.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 9582fcbcd8e..7d0270bd793 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -157,25 +157,25 @@ static void p6_pmu_enable_all(int added) static inline void p6_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val = P6_NOP_EVENT; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + /* + * p6 only has a global event enable, set on PerfEvtSel0 + * We "disable" events by programming P6_NOP_EVENT + * and we rely on p6_pmu_enable_all() being called + * to actually enable the events. + */ (void)wrmsrl_safe(hwc->config_base, val); } -- cgit v1.2.3-70-g09d2 From 876ee61aadf01aa0db981b5d249cbdd53dc28b5e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 4 Oct 2012 14:48:10 +0100 Subject: x86-64: Fix page table accounting Commit 20167d3421a089a1bf1bd680b150dc69c9506810 ("x86-64: Fix accounting in kernel_physical_mapping_init()") went a little too far by entirely removing the counting of pre-populated page tables: this should be done at boot time (to cover the page tables set up in early boot code), but shouldn't be done during memory hot add. Hence, re-add the removed increments of "pages", but make them and the one in phys_pte_init() conditional upon !after_bootmem. 
Reported-Acked-and-Tested-by: Hugh Dickins Signed-off-by: Jan Beulich Cc: Link: http://lkml.kernel.org/r/506DAFBA020000780009FA8C@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 2b6b4a3c8be..3baff255ada 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -386,7 +386,8 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * these mappings are more intelligent. */ if (pte_val(*pte)) { - pages++; + if (!after_bootmem) + pages++; continue; } @@ -451,6 +452,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_2M)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } @@ -526,6 +529,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, * attributes. */ if (page_size_mask & (1 << PG_LEVEL_1G)) { + if (!after_bootmem) + pages++; last_map_addr = next; continue; } -- cgit v1.2.3-70-g09d2 From 032c3851f51141e30de02ed0bc50a7743dfd776d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 24 Oct 2012 16:42:20 +0800 Subject: perf/x86/uncore: Handle pci_read_config_dword() errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This, beyond handling corner cases, also fixes some build warnings: arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_disable_box’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:124:9: warning: ‘config’ is used uninitialized in this function [-Wuninitialized] arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_enable_box’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:135:9: warning: ‘config’ is used uninitialized in this function [-Wuninitialized] arch/x86/kernel/cpu/perf_event_intel_uncore.c: In function ‘snbep_uncore_pci_read_counter’: arch/x86/kernel/cpu/perf_event_intel_uncore.c:164:2: warning: ‘count’ is used uninitialized in this function [-Wuninitialized] Signed-off-by: Yan, Zheng Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/1351068140-13456-1-git-send-email-zheng.z.yan@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 43 +++++++++++++++++---------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 5df8d32ba91..6f874771ee6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -118,22 +118,24 @@ static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config |= SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) { struct pci_dev *pdev = box->pci_dev; int box_ctl = uncore_pci_box_ctl(box); - u32 config; + u32 config = 0; - pci_read_config_dword(pdev, box_ctl, &config); - config &= ~SNBEP_PMON_BOX_CTL_FRZ; - pci_write_config_dword(pdev, box_ctl, config); + if (!pci_read_config_dword(pdev, box_ctl, &config)) { + config 
&= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); + } } static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) @@ -156,7 +158,7 @@ static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct pe { struct pci_dev *pdev = box->pci_dev; struct hw_perf_event *hwc = &event->hw; - u64 count; + u64 count = 0; pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); @@ -603,11 +605,12 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static void snbep_pci2phy_map_init(void) +static int snbep_pci2phy_map_init(void) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid; - u32 config; + int err = 0; + u32 config = 0; while (1) { /* find the UBOX device */ @@ -618,10 +621,14 @@ static void snbep_pci2phy_map_init(void) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, 0x40, &config); + if (err) + break; nodeid = config; /* get the Node ID mapping */ - pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, 0x54, &config); + if (err) + break; /* * every three bits in the Node ID mapping register maps * to a particular node. @@ -633,7 +640,11 @@ static void snbep_pci2phy_map_init(void) } } }; - return; + + if (ubox_dev) + pci_dev_put(ubox_dev); + + return err ? pcibios_err_to_errno(err) : 0; } /* end of Sandy Bridge-EP uncore support */ @@ -2578,9 +2589,11 @@ static int __init uncore_pci_init(void) switch (boot_cpu_data.x86_model) { case 45: /* Sandy Bridge-EP */ + ret = snbep_pci2phy_map_init(); + if (ret) + return ret; pci_uncores = snbep_pci_uncores; uncore_pci_driver = &snbep_uncore_pci_driver; - snbep_pci2phy_map_init(); break; default: return 0; -- cgit v1.2.3-70-g09d2 From ae5ba47a990a18c869d66916fd72fb334c45cf91 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:03:21 -0400 Subject: perf/x86: Make Intel KNC use full 40-bit width of counters Early versions of Intel KNC chips have a bug where bits above 32 were not properly set. We worked around this by only using the bottom 32 bits (out of 40 that should be available). It turns out this workaround breaks overflow handling. The buggy silicon will in theory never be used in production systems, so remove this workaround so we get proper overflow support. 
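To see why the understated width broke overflow handling, here is a small stand-alone sketch (illustrative, not the driver code) of how a sampling period is armed: the counter is programmed with (-period) masked to the declared width, so it overflows after `period` counts -- unless the declared width is narrower than the real 40-bit counter:

#include <stdio.h>
#include <stdint.h>

#define REAL_WIDTH 40	/* actual KNC counter width */

/* Events the hardware must count before the counter overflows and the
 * PMI fires, given the value the driver programs: (-period) masked to
 * the width it believes in. */
static uint64_t events_until_overflow(uint64_t period, int believed_bits)
{
	uint64_t mask  = (1ULL << believed_bits) - 1;
	uint64_t start = (0 - period) & mask;	/* value written to the counter */

	return (1ULL << REAL_WIDTH) - start;
}

int main(void)
{
	uint64_t period = (1ULL << 31) - 1;	/* the old max_period */

	printf("believed 40 bits: overflow after %llu events\n",
	       (unsigned long long)events_until_overflow(period, 40));	/* 2147483647 */
	printf("believed 32 bits: overflow after %llu events\n",
	       (unsigned long long)events_until_overflow(period, 32));	/* ~2^40, PMI far too late */
	return 0;
}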
Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171302140.23243@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 7c46bfdbc37..73bcfbdedd5 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -226,12 +226,11 @@ static __initconst struct x86_pmu knc_pmu = { .event_map = knc_pmu_event_map, .max_events = ARRAY_SIZE(knc_perfmon_event_map), .apic = 1, - .max_period = (1ULL << 31) - 1, + .max_period = (1ULL << 39) - 1, .version = 0, .num_counters = 2, - /* in theory 40 bits, early silicon is buggy though */ - .cntval_bits = 32, - .cntval_mask = (1ULL << 32) - 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = knc_event_constraints, .format_attrs = intel_knc_formats_attr, -- cgit v1.2.3-70-g09d2 From 7d011962afbaa6e572cd8e0dbb7abf773e166e64 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:04:33 -0400 Subject: perf/x86: Remove cpuc->enable check on Intel KNC event enable/disable x86_pmu.enable() is called from x86_pmu_enable() with cpuc->enabled set to 0. This means we weren't re-enabling the counters after a context switch. This patch just removes the check, as it shouldn't be necessary (and the equivalent x86_ generic code does not have the checks). The origin of this problem is the KNC driver being based on the P6 one. The P6 driver also has this issue, but works anyway due to various lucky accidents.
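[ A heavily simplified, hypothetical model of the ordering problem described above — standalone C, not the actual perf core; only the flag/hook ordering is the point: ]

#include <stdint.h>
#include <stdio.h>

#define EVENTSEL_ENABLE (1ULL << 22)	/* stand-in for ARCH_PERFMON_EVENTSEL_ENABLE */

struct cpu_hw { int enabled; };

/* The guarded write this patch removes: it does nothing while enabled == 0. */
static uint64_t enable_event(const struct cpu_hw *cpuc, uint64_t config)
{
	if (cpuc->enabled)
		config |= EVENTSEL_ENABLE;
	return config;
}

int main(void)
{
	struct cpu_hw cpuc = { .enabled = 0 };	/* state while the hooks run */
	uint64_t config = enable_event(&cpuc, 0);

	cpuc.enabled = 1;			/* set only after the hooks ran */
	printf("enable bit set: %s\n",
	       (config & EVENTSEL_ENABLE) ? "yes" : "no");	/* prints "no" */
	return 0;
}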
Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171303290.23243@vincent-weaver-1.um.maine.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index 73bcfbdedd5..f3a2af41552 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -173,26 +173,22 @@ static void knc_pmu_enable_all(int added) static inline void knc_pmu_disable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } static void knc_pmu_enable_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct hw_perf_event *hwc = &event->hw; u64 val; val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } -- cgit v1.2.3-70-g09d2 From e4074b3049f99c6ad6e1a33e6d93d8ec0652e2c1 Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Wed, 17 Oct 2012 13:05:45 -0400 Subject: perf/x86: Enable overflow on Intel KNC with a custom knc_pmu_handle_irq() Although based on the Intel P6 design, the interrupt mechanism for KNC more closely resembles the Intel architectural perfmon one. We can't just re-use that code though, because KNC has different MSR numbers for the status and ack registers. In this case we just cut-and-paste from perf_event_intel.c with some minor changes, as it looks like it would not be worth the trouble to change that code to be MSR-configurable. Signed-off-by: Vince Weaver Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: eranian@gmail.com Cc: Meadows Lawrence F Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1210171304410.23243@vincent-weaver-1.um.maine.edu [ Small stylistic edits.
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_knc.c | 78 +++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c index f3a2af41552..4b7731bf23a 100644 --- a/arch/x86/kernel/cpu/perf_event_knc.c +++ b/arch/x86/kernel/cpu/perf_event_knc.c @@ -3,6 +3,8 @@ #include <linux/perf_event.h> #include <linux/types.h> +#include <asm/hardirq.h> + #include "perf_event.h" static const u64 knc_perfmon_event_map[] = @@ -193,6 +195,80 @@ static void knc_pmu_enable_event(struct perf_event *event) (void)wrmsrl_safe(hwc->config_base + hwc->idx, val); } +static inline u64 knc_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void knc_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack); +} + +static int knc_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int handled = 0; + int bit, loops; + u64 status; + + cpuc = &__get_cpu_var(cpu_hw_events); + + knc_pmu_disable_all(); + + status = knc_pmu_get_status(); + if (!status) { + knc_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + knc_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perf: irq loop stuck!\n"); + perf_event_print_debug(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = knc_pmu_get_status(); + if (status) + goto again; + +done: + knc_pmu_enable_all(0); + + return handled; +} + + PMU_FORMAT_ATTR(event, "config:0-7" ); PMU_FORMAT_ATTR(umask, "config:8-15" ); PMU_FORMAT_ATTR(edge, "config:18" ); @@ -210,7 +286,7 @@ static struct attribute *intel_knc_formats_attr[] = { static __initconst struct x86_pmu knc_pmu = { .name = "knc", - .handle_irq = x86_pmu_handle_irq, + .handle_irq = knc_pmu_handle_irq, .disable_all = knc_pmu_disable_all, .enable_all = knc_pmu_enable_all, .enable = knc_pmu_enable_event, -- cgit v1.2.3-70-g09d2 From 3e8fa263a97079c74880675c451587bb6899e661 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 19 Oct 2012 13:25:46 +0100 Subject: x86/efi: Fix oops caused by incorrect set_memory_uc() usage Calling __pa() with an ioremap'd address is invalid. If we encounter an efi_memory_desc_t without EFI_MEMORY_WB set in ->attribute we currently call set_memory_uc(), which in turn calls __pa() on a potentially ioremap'd address.
On CONFIG_X86_32 this results in the following oops: BUG: unable to handle kernel paging request at f7f22280 IP: [] reserve_ram_pages_type+0x89/0x210 *pdpt = 0000000001978001 *pde = 0000000001ffb067 *pte = 0000000000000000 Oops: 0000 [#1] PREEMPT SMP Modules linked in: Pid: 0, comm: swapper Not tainted 3.0.0-acpi-efi-0805 #3 EIP: 0060:[] EFLAGS: 00010202 CPU: 0 EIP is at reserve_ram_pages_type+0x89/0x210 EAX: 0070e280 EBX: 38714000 ECX: f7814000 EDX: 00000000 ESI: 00000000 EDI: 38715000 EBP: c189fef0 ESP: c189fea8 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 Process swapper (pid: 0, ti=c189e000 task=c18bbe60 task.ti=c189e000) Stack: 80000200 ff108000 00000000 c189ff00 00038714 00000000 00000000 c189fed0 c104f8ca 00038714 00000000 00038715 00000000 00000000 00038715 00000000 00000010 38715000 c189ff48 c1025aff 38715000 00000000 00000010 00000000 Call Trace: [] ? page_is_ram+0x1a/0x40 [] reserve_memtype+0xdf/0x2f0 [] set_memory_uc+0x49/0xa0 [] efi_enter_virtual_mode+0x1c2/0x3aa [] start_kernel+0x291/0x2f2 [] ? loglevel+0x1b/0x1b [] i386_start_kernel+0xbf/0xc8 The only time we can call set_memory_uc() for a memory region is when it is part of the direct kernel mapping. For the case where we ioremap a memory region we must leave it alone. This patch reimplements the fix from e8c7106280a3 ("x86, efi: Calling __pa() with an ioremap()ed address is invalid") which was reverted in e1ad783b12ec because it caused a regression on some MacBooks (they hung at boot). The regression was caused because the commit only marked EFI_RUNTIME_SERVICES_DATA as E820_RESERVED_EFI, when it should have marked all regions that have the EFI_MEMORY_RUNTIME attribute. Despite first impressions, it's not possible to use ioremap_cache() to map all cached memory regions on CONFIG_X86_64 because of the way that the memory map might be configured as detailed in the following bug report, https://bugzilla.redhat.com/show_bug.cgi?id=748516 e.g. some of the EFI memory regions *need* to be mapped as part of the direct kernel mapping. 
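[ The rule being enforced, as a hedged sketch — map_efi_region(), its wb/in_direct_map parameters and the page-count math are illustrative assumptions, not the kernel's actual helper: ]

/*
 * set_memory_uc()/__va() are only valid for physical ranges covered by the
 * direct kernel mapping; anything else must be ioremap'd, and an ioremap'd
 * address must never be handed to __pa() or set_memory_uc().
 */
static void __iomem *map_efi_region(u64 phys, u64 size, bool wb, bool in_direct_map)
{
	if (in_direct_map) {
		void *va = __va(phys);		/* direct-map virtual address */

		if (!wb)			/* EFI_MEMORY_WB not set */
			set_memory_uc((unsigned long)va, PFN_UP(size));
		return (void __iomem *)va;
	}
	/* not in the direct map: choose the mapping type by cacheability */
	return wb ? ioremap_cache(phys, size) : ioremap_nocache(phys, size);
}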
Signed-off-by: Matt Fleming Cc: Matthew Garrett Cc: Zhang Rui Cc: Huang Ying Cc: Keith Packard Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1350649546-23541-1-git-send-email-matt@console-pimps.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 5 +++-- arch/x86/platform/efi/efi.c | 29 ++++++++++++++++++----------- arch/x86/platform/efi/efi_64.c | 7 +++++-- 3 files changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c9dcc181d4d..36ff332da13 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -35,7 +35,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size, type) ioremap_cache(addr, size) +#define efi_ioremap(addr, size, type, attr) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -89,7 +89,7 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, - u32 type); + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ @@ -98,6 +98,7 @@ extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_memory_uc(u64 addr, unsigned long size); #ifndef CONFIG_EFI /* diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index aded2a91162..cb34839c97c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -810,6 +810,16 @@ void __iomem *efi_lookup_mapped_addr(u64 phys_addr) return NULL; } +void efi_memory_uc(u64 addr, unsigned long size) +{ + unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; + u64 npages; + + npages = round_up(size, page_shift) / page_shift; + memrange_efi_to_native(&addr, &npages); + set_memory_uc(addr, npages); +} + /* * This function will switch the EFI runtime services to virtual mode. 
* Essentially, look through the EFI memmap and map every region that @@ -823,7 +833,7 @@ void __init efi_enter_virtual_mode(void) efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; - u64 end, systab, addr, npages, end_pfn; + u64 end, systab, end_pfn; void *p, *va, *new_memmap = NULL; int count = 0; @@ -879,10 +889,14 @@ void __init efi_enter_virtual_mode(void) end_pfn = PFN_UP(end); if (end_pfn <= max_low_pfn_mapped || (end_pfn > (1UL << (32 - PAGE_SHIFT)) - && end_pfn <= max_pfn_mapped)) + && end_pfn <= max_pfn_mapped)) { va = __va(md->phys_addr); - else - va = efi_ioremap(md->phys_addr, size, md->type); + + if (!(md->attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)va, size); + } else + va = efi_ioremap(md->phys_addr, size, + md->type, md->attribute); md->virt_addr = (u64) (unsigned long) va; @@ -892,13 +906,6 @@ void __init efi_enter_virtual_mode(void) continue; } - if (!(md->attribute & EFI_MEMORY_WB)) { - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_uc(addr, npages); - } - systab = (u64) (unsigned long) efi_phys.systab; if (md->phys_addr <= systab && systab < end) { systab += md->virt_addr - md->phys_addr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac3aa54e265..95fd505dfeb 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -82,7 +82,7 @@ void __init efi_call_phys_epilog(void) } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, - u32 type) + u32 type, u64 attribute) { unsigned long last_map_pfn; @@ -92,8 +92,11 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { unsigned long top = last_map_pfn << PAGE_SHIFT; - efi_ioremap(top, size - (top - phys_addr), type); + efi_ioremap(top, size - (top - phys_addr), type, attribute); } + if (!(attribute & EFI_MEMORY_WB)) + efi_memory_uc((u64)(unsigned long)__va(phys_addr), size); + return (void __iomem *)__va(phys_addr); } -- cgit v1.2.3-70-g09d2 From 64dfab8e83644902ad2fd559a56c411b47e3ef3c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 22 Oct 2012 16:51:38 +0800 Subject: perf/x86: Remove unused variable in nhmex_rbox_alter_er() The variable port is initialized but never used otherwise, so remove the unused variable. dpatch engine is used to auto generate this patch. 
(https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Cc: Yan, Zheng Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/CAPgLHd8NZkYSkZm22FpZxiEh6HcA0q-V%3D29vdnheiDhgrJZ%2Byw@mail.gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 6f874771ee6..3cf3d97cce3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -1558,7 +1558,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct hw_perf_event_extra *reg1 = &hwc->extra_reg; - int port; /* adjust the main event selector and extra register index */ if (reg1->idx % 2) { @@ -1570,7 +1569,6 @@ void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) } /* adjust extra register config */ - port = reg1->idx / 6 + box->pmu->pmu_idx * 4; switch (reg1->idx % 6) { case 2: /* shift the 8~15 bits to the 0~7 bits */ -- cgit v1.2.3-70-g09d2 From 94777fc51b3ad85ff9f705ddf7cdd0eb3bbad5a6 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Tue, 16 Oct 2012 07:50:21 -0500 Subject: x86/irq/ioapic: Check for valid irq_cfg pointer in smp_irq_move_cleanup_interrupt Posting this patch to fix an issue concerning sparse irqs that I raised a while back. There was discussion about adding refcounting to sparse irqs (to fix other potential race conditions), but that does not appear to have been addressed yet. This covers the only issue of this type that I've encountered in this area. A NULL pointer dereference can occur in smp_irq_move_cleanup_interrupt() if we haven't yet set up the irq_cfg pointer in the irq_desc.irq_data.chip_data. In create_irq_nr() there is a window where we have set vector_irq in __assign_irq_vector(), but not yet called irq_set_chip_data() to set the irq_cfg pointer. Should an IRQ_MOVE_CLEANUP_VECTOR hit the cpu in question during this time, smp_irq_move_cleanup_interrupt() will attempt to process the aforementioned irq, but panic when accessing irq_cfg. Only continue processing the irq if irq_cfg is non-NULL. Signed-off-by: Dimitri Sivanich Cc: Suresh Siddha Cc: Joerg Roedel Cc: Yinghai Lu Cc: Alexander Gordeev Link: http://lkml.kernel.org/r/20121016125021.GA22935@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c265593ec2c..1817fa91102 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2257,6 +2257,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; cfg = irq_cfg(irq); + if (!cfg) + continue; + raw_spin_lock(&desc->lock); /* -- cgit v1.2.3-70-g09d2 From 6ede1fd3cb404c0016de6ac529df46d561bd558b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Trim memory in memblock to be page aligned We will not map partial pages, so we need to make sure memblock allocation will not allocate those bytes out. Also we will use for_each_mem_pfn_range() to loop over and map the memory ranges, to keep them consistent. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H.
Peter Anvin Cc: --- arch/x86/kernel/e820.c | 3 +++ include/linux/memblock.h | 1 + mm/memblock.c | 24 ++++++++++++++++++++++++ 3 files changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index ed858e9e9a7..df06ade26be 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1077,6 +1077,9 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } + /* throw away partial pages */ + memblock_trim_memory(PAGE_SIZE); + memblock_dump_all(); } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 569d67d4243..d452ee19106 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -57,6 +57,7 @@ int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); +void memblock_trim_memory(phys_addr_t align); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, diff --git a/mm/memblock.c b/mm/memblock.c index 931eef145af..625905523c2 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -930,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } +void __init_memblock memblock_trim_memory(phys_addr_t align) +{ + int i; + phys_addr_t start, end, orig_start, orig_end; + struct memblock_type *mem = &memblock.memory; + + for (i = 0; i < mem->cnt; i++) { + orig_start = mem->regions[i].base; + orig_end = mem->regions[i].base + mem->regions[i].size; + start = round_up(orig_start, align); + end = round_down(orig_end, align); + + if (start == orig_start && end == orig_end) + continue; + + if (start < end) { + mem->regions[i].base = start; + mem->regions[i].size = end - start; + } else { + memblock_remove_region(mem, i); + i--; + } + } +} void __init_memblock memblock_set_current_limit(phys_addr_t limit) { -- cgit v1.2.3-70-g09d2 From 1f2ff682ac951ed82cc043cf140d2851084512df Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Oct 2012 16:35:18 -0700 Subject: x86, mm: Use memblock memory loop instead of e820_RAM We need to handle E820_RAM and E820_RESERVED_KERNEL at the same time. Also memblock has page aligned range for ram, so we could avoid mapping partial pages. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQVZirvaBMFYRfXMmWEcHbKSicQEHz4VAwUv0xFCk51ZNw@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/setup.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 468e98dfd44..5d888af8682 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -921,18 +921,19 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + unsigned long start, end; + unsigned long start_pfn, end_pfn; - if (ei->addr + ei->size <= 1UL << 32) - continue; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, + NULL) { - if (ei->type == E820_RESERVED) + end = PFN_PHYS(end_pfn); + if (end <= (1UL<<32)) continue; + start = PFN_PHYS(start_pfn); max_pfn_mapped = init_memory_mapping( - ei->addr < 1UL << 32 ? 
1UL << 32 : ei->addr, - ei->addr + ei->size); + max((1UL<<32), start), end); } /* can we preseve max_low_pfn ?*/ -- cgit v1.2.3-70-g09d2 From 844ab6f993b1d32eb40512503d35ff6ad0c57030 Mon Sep 17 00:00:00 2001 From: Jacob Shin Date: Wed, 24 Oct 2012 14:24:44 -0500 Subject: x86, mm: Find_early_table_space based on ranges that are actually being mapped Current logic finds enough space for direct mapping page tables from 0 to end. Instead, we only need to find enough space to cover mr[0].start to mr[nr_range - 1].end -- the range that is actually being mapped by init_memory_mapping() This is needed after 1bbbbe779aabe1f0768c2bf8f8c0a5583679b54a, to address the panic reported here: https://lkml.org/lkml/2012/10/20/160 https://lkml.org/lkml/2012/10/21/157 Signed-off-by: Jacob Shin Link: http://lkml.kernel.org/r/20121024195311.GB11779@jshin-Toonie Tested-by: Tom Rini Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 70 ++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 8653b3a722b..bc287d62bf1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -29,36 +29,54 @@ int direct_gbpages #endif ; -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; + +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. + */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) { - unsigned long puds, pmds, ptes, tables, start = 0, good_end = end; + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0, good_end; phys_addr_t base; - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; - if (use_gbpages) { - unsigned long extra; + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } - if (use_pse) { - unsigned long extra; - - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); #ifdef CONFIG_X86_32 - extra += PMD_SIZE; + extra += PMD_SIZE; #endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* The first 2/4M doesn't use large pages.
*/ + if (mr[i].start < PMD_SIZE) + extra += range; + + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); #ifdef CONFIG_X86_32 @@ -76,7 +94,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx]\n", - end - 1, pgt_buf_start << PAGE_SHIFT, + mr[nr_range - 1].end - 1, pgt_buf_start << PAGE_SHIFT, (pgt_buf_top << PAGE_SHIFT) - 1); } @@ -85,12 +103,6 @@ void __init native_pagetable_reserve(u64 start, u64 end) memblock_reserve(start, end - start); } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -263,7 +275,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(end, use_pse, use_gbpages); + find_early_table_space(mr, nr_range); for (i = 0; i < nr_range; i++) ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, -- cgit v1.2.3-70-g09d2 From 5189c2a7c7769ee9d037d76c1a7b8550ccf3481c Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Wed, 24 Oct 2012 10:00:44 -0700 Subject: x86: efi: Turn off efi_enabled after setup on mixed fw/kernel When 32-bit EFI is used with 64-bit kernel (or vice versa), turn off efi_enabled once setup is done. Beyond setup, it is normally used to determine if runtime services are available and we will have none. This will resolve issues stemming from efivars modprobe panicking on a 32/64-bit setup, as well as some reboot issues on similar setups. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=45991 Reported-by: Marko Kohtala Reported-by: Maxim Kammerer Signed-off-by: Olof Johansson Acked-by: Maarten Lankhorst Cc: stable@kernel.org # 3.4 - 3.6 Cc: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 + arch/x86/kernel/setup.c | 12 ++++++++++++ arch/x86/platform/efi/efi.c | 18 ++++++++++-------- 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index c9dcc181d4d..029189db84d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -98,6 +98,7 @@ extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_unmap_memmap(void); #ifndef CONFIG_EFI /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a2bb18e0283..6fd7752cee9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1035,6 +1035,18 @@ void __init setup_arch(char **cmdline_p) arch_init_ideal_nops(); register_refined_jiffies(CLOCK_TICK_RATE); + +#ifdef CONFIG_EFI + /* Once setup is done above, disable efi_enabled on mismatched + * firmware/kernel archtectures since there is no support for + * runtime services. 
+ */ + if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) { + pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n"); + efi_unmap_memmap(); + efi_enabled = 0; + } +#endif } #ifdef CONFIG_X86_32 diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index aded2a91162..9bae3b73fcc 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,11 +70,15 @@ EXPORT_SYMBOL(efi); struct efi_memory_map memmap; bool efi_64bit; -static bool efi_native; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; +static inline bool efi_is_native(void) +{ + return IS_ENABLED(CONFIG_X86_64) == efi_64bit; +} + static int __init setup_noefi(char *arg) { efi_enabled = 0; @@ -420,7 +424,7 @@ void __init efi_reserve_boot_services(void) } } -static void __init efi_unmap_memmap(void) +void __init efi_unmap_memmap(void) { if (memmap.map) { early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); @@ -432,7 +436,7 @@ void __init efi_free_boot_services(void) { void *p; - if (!efi_native) + if (!efi_is_native()) return; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -684,12 +688,10 @@ void __init efi_init(void) return; } efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; - efi_native = !efi_64bit; #else efi_phys.systab = (efi_system_table_t *) (boot_params.efi_info.efi_systab | ((__u64)boot_params.efi_info.efi_systab_hi<<32)); - efi_native = efi_64bit; #endif if (efi_systab_init(efi_phys.systab)) { @@ -723,7 +725,7 @@ void __init efi_init(void) * that doesn't match the kernel 32/64-bit mode. */ - if (!efi_native) + if (!efi_is_native()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else if (efi_runtime_init()) { efi_enabled = 0; @@ -735,7 +737,7 @@ void __init efi_init(void) return; } #ifdef CONFIG_X86_32 - if (efi_native) { + if (efi_is_native()) { x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; } @@ -834,7 +836,7 @@ void __init efi_enter_virtual_mode(void) * non-native EFI */ - if (!efi_native) { + if (!efi_is_native()) { efi_unmap_memmap(); return; } -- cgit v1.2.3-70-g09d2 From f82f64dd9f485e13f29f369772d4a0e868e5633a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 25 Oct 2012 15:45:26 -0700 Subject: x86, mm: Undo incorrect revert in arch/x86/mm/init.c Commit 844ab6f9, "x86, mm: Find_early_table_space based on ranges that are actually being mapped", wrongly added back some lines that had been removed in commit 7b16bbf97, Revert "x86/mm: Fix the size calculation of mapping tables". Remove them again. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/CAE9FiQW_vuaYQbmagVnxT2DGsYc=9tNeAbdBq53sYkitPOwxSQ@mail.gmail.com Acked-by: Jacob Shin Signed-off-by: H. Peter Anvin --- arch/x86/mm/init.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc287d62bf1..d7aea41563b 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -65,10 +65,6 @@ static void __init find_early_table_space(struct map_range *mr, int nr_range) #ifdef CONFIG_X86_32 extra += PMD_SIZE; #endif - /* The first 2/4M doesn't use large pages.
*/ - if (mr[i].start < PMD_SIZE) - extra += range; - ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else { ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; -- cgit v1.2.3-70-g09d2 From 87da7e66a40532b743cd50972fcf85a1f15b14ea Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 24 Oct 2012 14:07:59 +0800 Subject: KVM: x86: fix vcpu->mmio_fragments overflow After commit b3356bf0dbb349 (KVM: emulator: optimize "rep ins" handling), the pieces of io data can be collected and written to the guest memory or MMIO together. Unfortunately, kvm splits the mmio access into 8-byte pieces and stores them in vcpu->mmio_fragments. If the guest uses "rep ins" to move large data, vcpu->mmio_fragments will overflow. The bug can be exposed by isapc (-M isapc): [23154.818733] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC [ ......] [23154.858083] Call Trace: [23154.859874] [] kvm_get_cr8+0x1d/0x28 [kvm] [23154.861677] [] kvm_arch_vcpu_ioctl_run+0xcda/0xe45 [kvm] [23154.863604] [] ? kvm_arch_vcpu_load+0x17b/0x180 [kvm] Actually, we can use one mmio_fragment to store a large mmio access, then split it when we pass the mmio-exit-info to userspace. After that, we only need two entries to store mmio info for a cross-page mmio access. Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 60 +++++++++++++++++++++++++++--------------------- include/linux/kvm_host.h | 15 ++---------- 2 files changed, 36 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1eefebe5d72..224a7e78cb6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3779,7 +3779,7 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, { struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; - memcpy(vcpu->run->mmio.data, frag->data, frag->len); + memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); return X86EMUL_CONTINUE; } @@ -3832,18 +3832,11 @@ mmio: bytes -= handled; val += handled; - while (bytes) { - unsigned now = min(bytes, 8U); - - frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; - frag->gpa = gpa; - frag->data = val; - frag->len = now; - - gpa += now; - val += now; - bytes -= now; - } + WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); + frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; + frag->gpa = gpa; + frag->data = val; + frag->len = bytes; return X86EMUL_CONTINUE; } @@ -3890,7 +3883,7 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, vcpu->mmio_needed = 1; vcpu->mmio_cur_fragment = 0; - vcpu->run->mmio.len = vcpu->mmio_fragments[0].len; + vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = gpa; @@ -5522,28 +5515,44 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu) * * read: * for each fragment - * write gpa, len - * exit - * copy data + * for each mmio piece in the fragment + * write gpa, len + * exit + * copy data * execute insn * * write: * for each fragment - * write gpa, len - * copy data - * exit + * for each mmio piece in the fragment + * write gpa, len + * copy data + * exit */ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; struct kvm_mmio_fragment *frag; + unsigned len; BUG_ON(!vcpu->mmio_needed); /* Complete previous fragment */ - frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++]; + frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; + len =
min(8u, frag->len); if (!vcpu->mmio_is_write) - memcpy(frag->data, run->mmio.data, frag->len); + memcpy(frag->data, run->mmio.data, len); + + if (frag->len <= 8) { + /* Switch to the next fragment. */ + frag++; + vcpu->mmio_cur_fragment++; + } else { + /* Go forward to the next mmio piece. */ + frag->data += len; + frag->gpa += len; + frag->len -= len; + } + if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) { vcpu->mmio_needed = 0; if (vcpu->mmio_is_write) @@ -5551,13 +5560,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) vcpu->mmio_read_completed = 1; return complete_emulated_io(vcpu); } - /* Initiate next fragment */ - ++frag; + run->exit_reason = KVM_EXIT_MMIO; run->mmio.phys_addr = frag->gpa; if (vcpu->mmio_is_write) - memcpy(run->mmio.data, frag->data, frag->len); - run->mmio.len = frag->len; + memcpy(run->mmio.data, frag->data, min(8u, frag->len)); + run->mmio.len = min(8u, frag->len); run->mmio.is_write = vcpu->mmio_is_write; vcpu->arch.complete_userspace_io = complete_emulated_mmio; return 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 93bfc9f9815..ecc554374e4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -42,19 +42,8 @@ */ #define KVM_MEMSLOT_INVALID (1UL << 16) -/* - * If we support unaligned MMIO, at most one fragment will be split into two: - */ -#ifdef KVM_UNALIGNED_MMIO -# define KVM_EXTRA_MMIO_FRAGMENTS 1 -#else -# define KVM_EXTRA_MMIO_FRAGMENTS 0 -#endif - -#define KVM_USER_MMIO_SIZE 8 - -#define KVM_MAX_MMIO_FRAGMENTS \ - (KVM_MMIO_SIZE / KVM_USER_MMIO_SIZE + KVM_EXTRA_MMIO_FRAGMENTS) +/* Two fragments for cross MMIO pages. */ +#define KVM_MAX_MMIO_FRAGMENTS 2 /* * For the normal pfn, the highest 12 bits should be zero, -- cgit v1.2.3-70-g09d2
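[ To make the per-piece walk concrete: a small standalone model of consuming one oversized fragment in 8-byte exits — hypothetical types mirroring the complete_emulated_mmio() logic above, not KVM code: ]

#include <stdio.h>

struct frag { unsigned long gpa; unsigned len; };

/*
 * Return the size of the current piece and advance, as the patch does:
 * a fragment longer than 8 bytes is stepped through in place; the final
 * piece moves us on to the next fragment.
 */
static unsigned next_piece(struct frag *f, unsigned *cur_fragment)
{
	unsigned len = f->len < 8 ? f->len : 8;	/* at most 8 bytes per exit */

	if (f->len <= 8) {
		(*cur_fragment)++;		/* fragment fully consumed */
	} else {
		f->gpa += len;			/* step within the same fragment */
		f->len -= len;
	}
	return len;
}

int main(void)
{
	struct frag f = { .gpa = 0x1000, .len = 20 };	/* e.g. one "rep ins" burst */
	unsigned cur = 0;

	while (cur == 0) {	/* 20 bytes -> three exits: 8 + 8 + 4 */
		unsigned long gpa = f.gpa;
		unsigned len = next_piece(&f, &cur);

		printf("exit: gpa=%#lx len=%u\n", gpa, len);
	}
	return 0;
}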