From 4febd95a8a85dd38b1a71fcf9726e19c7fd20039 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Thu, 7 Mar 2013 15:48:16 +1100
Subject: Select VIRT_TO_BUS directly where needed

In commit 887cbce0adea ("arch Kconfig: centralise ARCH_NO_VIRT_TO_BUS")
I introduced the config symbol HAVE_VIRT_TO_BUS and selected that where
needed.  I am not sure what I was thinking.  Instead, just directly
select VIRT_TO_BUS where it is needed.

Signed-off-by: Stephen Rothwell
Signed-off-by: Linus Torvalds
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a4f24f5b121..70c0f3da047 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -112,7 +112,7 @@ config X86
 	select GENERIC_STRNLEN_USER
 	select HAVE_CONTEXT_TRACKING if X86_64
 	select HAVE_IRQ_TIME_ACCOUNTING
-	select HAVE_VIRT_TO_BUS
+	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL if X86_32
 	select MODULES_USE_ELF_RELA if X86_64
 	select CLONE_BACKWARDS if X86_32
--
cgit v1.2.3-70-g09d2

From 1d9d8639c063caf6efc2447f5f26aa637f844ff6 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Fri, 15 Mar 2013 14:26:07 +0100
Subject: perf,x86: fix kernel crash with PEBS/BTS after suspend/resume

This patch fixes a kernel crash when using precise sampling (PEBS)
after a suspend/resume.  It turns out the CPU notifier code is not
invoked on CPU0 (BP).  Therefore, the DS_AREA (used by PEBS) is not
restored properly by the kernel and keeps its power-on/resume value of
0, causing any PEBS measurement to crash when running on CPU0.

The workaround is to add a hook in the actual resume code to restore
the DS Area MSR value.  It is invoked for all CPUs.  So for all but
CPU0, the DS_AREA will be restored twice, but this is harmless.

Reported-by: Linus Torvalds
Signed-off-by: Stephane Eranian
Signed-off-by: Linus Torvalds
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++++++
 arch/x86/power/cpu.c                      | 2 ++
 include/linux/perf_event.h                | 2 ++
 3 files changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 826054a4f2e..0e9bdd3cb01 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -729,3 +729,11 @@ void intel_ds_init(void)
 		}
 	}
 }
+
+void perf_restore_debug_store(void)
+{
+	if (!x86_pmu.bts && !x86_pmu.pebs)
+		return;
+
+	init_debug_store_on_cpu(smp_processor_id());
+}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 120cee1c3f8..3c68768d7a7 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -228,6 +229,7 @@ static void __restore_processor_state(struct saved_context *ctxt)
 	do_fpu_end();
 	x86_platform.restore_sched_clock_state();
 	mtrr_bp_restore();
+	perf_restore_debug_store();
 }
 
 /* Needed by apm.c */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e47ee462c2f..71caed8626b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -758,6 +758,7 @@ extern void perf_event_enable(struct perf_event *event);
 extern void perf_event_disable(struct perf_event *event);
 extern int __perf_event_disable(void *info);
 extern void perf_event_task_tick(void);
+extern void perf_restore_debug_store(void);
 #else
 static inline void
 perf_event_task_sched_in(struct task_struct *prev,
@@ -797,6 +798,7 @@ static inline void perf_event_enable(struct perf_event *event)		{ }
 static inline void perf_event_disable(struct perf_event *event)	{ }
 static inline int __perf_event_disable(void *info)			{ return -1; }
 static inline void perf_event_task_tick(void)				{ }
+static inline void perf_restore_debug_store(void)			{ }
 #endif
 
 #define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))
--
cgit v1.2.3-70-g09d2
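The shape of this fix can be illustrated with a short, self-contained C sketch of the commit message's argument: the notifier path misses CPU0, the resume hook covers every CPU, and writing the same value a second time is harmless. The arrays and function names below are invented stand-ins for MSR_IA32_DS_AREA and the real hooks, not kernel APIs.

#include <stdio.h>

#define NR_CPUS 4

/* stand-ins for the per-CPU MSR_IA32_DS_AREA value and its saved copy */
static unsigned long ds_area[NR_CPUS];
static unsigned long saved_ds_area[NR_CPUS] = { 0x1000, 0x2000, 0x3000, 0x4000 };

static void restore_debug_store(int cpu)
{
        ds_area[cpu] = saved_ds_area[cpu];      /* idempotent: safe to repeat */
}

int main(void)
{
        int cpu;

        /* CPU notifier path: runs only for the non-boot CPUs (the bug) */
        for (cpu = 1; cpu < NR_CPUS; cpu++)
                restore_debug_store(cpu);

        /* resume hook added by this patch: runs for every CPU, including CPU0 */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
                restore_debug_store(cpu);

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d DS_AREA = %#lx\n", cpu, ds_area[cpu]);
        return 0;
}

Without the second loop CPU0 would be left at 0, which is exactly the state that made PEBS crash after resume.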
From 2a6e06b2aed6995af401dcd4feb5e79a0c7ea554 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 17 Mar 2013 15:44:43 -0700
Subject: perf,x86: fix wrmsr_on_cpu() warning on suspend/resume

Commit 1d9d8639c063 ("perf,x86: fix kernel crash with PEBS/BTS after
suspend/resume") fixed a crash when doing PEBS performance profiling
after resuming, but in using init_debug_store_on_cpu() to restore the
DS_AREA MSR it also resulted in a new WARN_ON() triggering.

init_debug_store_on_cpu() uses "wrmsr_on_cpu()", which in turn uses CPU
cross-calls to do the MSR update.  That is not really valid at the early
resume stage, and the warning is quite reasonable.  Now, it all happens
to _work_, for the simple reason that smp_call_function_single() ends up
just doing the call directly on the CPU when the CPU number matches, but
we really should just do the wrmsr() directly instead.

This duplicates the wrmsr() logic, but hopefully we can just remove the
wrmsr_on_cpu() version eventually.

Reported-and-tested-by: Parag Warudkar
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 0e9bdd3cb01..b05a575d56f 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -732,8 +732,10 @@ void intel_ds_init(void)
 
 void perf_restore_debug_store(void)
 {
+	struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+
 	if (!x86_pmu.bts && !x86_pmu.pebs)
 		return;
 
-	init_debug_store_on_cpu(smp_processor_id());
+	wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds);
 }
--
cgit v1.2.3-70-g09d2
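The distinction drawn above, that a cross-call degenerates to a direct call when the target is the current CPU but the early resume path should not lean on the cross-call machinery at all, can be sketched in ordinary user-space C. Everything here is a stand-in: run_on_cpu() loosely models smp_call_function_single(), and the callback models the direct wrmsrl() of MSR_IA32_DS_AREA.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

/* models wrmsrl(MSR_IA32_DS_AREA, ds) executed on the current CPU */
static void write_ds_area(void *ds)
{
        printf("cpu%d: DS_AREA <- %p\n", sched_getcpu(), ds);
}

/*
 * Models smp_call_function_single(): if the target is the current CPU the
 * function is simply called; otherwise an IPI would be required, which is
 * exactly what is not allowed in the early resume path.
 */
static int run_on_cpu(int cpu, void (*fn)(void *), void *arg, int ipis_allowed)
{
        if (cpu == sched_getcpu()) {
                fn(arg);                /* direct call, always safe */
                return 0;
        }
        if (!ipis_allowed)
                return -1;              /* would need a cross-call: refuse */
        fn(arg);                        /* real code would IPI the other CPU */
        return 0;
}

int main(void)
{
        void *ds = (void *)0x1000;

        /* early resume: cross-calls forbidden, but the direct path still works */
        if (run_on_cpu(sched_getcpu(), write_ds_area, ds, 0))
                printf("refused: would have needed an IPI\n");
        return 0;
}

The sketch only shows the control flow: when the CPU numbers match no IPI is ever needed, which is why the fix writes the MSR directly.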
From 9a556ab998071457e79b319f2527642dd6e50617 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu
Date: Thu, 14 Mar 2013 20:52:43 +0900
Subject: kprobes/x86: Check Interrupt Flag modifier when registering probe

Currently kprobes checks whether the copied instruction modifies IF
(the interrupt flag) on each probe hit.  This not only introduces
overhead but also pulls inat_get_opcode_attribute() into the kprobes
hot path, and it can cause an infinite recursive call (and a kernel
panic in the end).

Since the copied instruction itself is never modified in the buffer,
there is no need to analyze the instruction on every probe hit.  To fix
this issue, check it only once when registering the probe and store the
result in ainsn->if_modifier.

Reported-by: Timo Juhani Lindfors
Signed-off-by: Masami Hiramatsu
Acked-by: Ananth N Mavinakayanahalli
Cc: yrl.pp-manager.tt@hitachi.com
Cc: Steven Rostedt
Cc: David S. Miller
Cc: Linus Torvalds
Link: http://lkml.kernel.org/r/20130314115242.19690.33573.stgit@mhiramat-M0-7522
Signed-off-by: Ingo Molnar
---
 arch/x86/include/asm/kprobes.h | 1 +
 arch/x86/kernel/kprobes/core.c | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index d3ddd17405d..5a6d2873f80 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -77,6 +77,7 @@ struct arch_specific_insn {
 	 * a post_handler or break_handler).
 	 */
 	int boostable;
+	bool if_modifier;
 };
 
 struct arch_optimized_insn {
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 3f06e614998..7bfe318d3d8 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -375,6 +375,9 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
 	else
 		p->ainsn.boostable = -1;
 
+	/* Check whether the instruction modifies Interrupt Flag or not */
+	p->ainsn.if_modifier = is_IF_modifier(p->ainsn.insn);
+
 	/* Also, displacement change doesn't affect the first byte */
 	p->opcode = p->ainsn.insn[0];
 }
@@ -434,7 +437,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 	__this_cpu_write(current_kprobe, p);
 	kcb->kprobe_saved_flags = kcb->kprobe_old_flags
 		= (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
-	if (is_IF_modifier(p->ainsn.insn))
+	if (p->ainsn.if_modifier)
 		kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
 }
--
cgit v1.2.3-70-g09d2
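The pattern of the fix, decode the instruction once at registration time, cache the answer in a flag, and consult only the flag on the hot path, is easy to see in a stand-alone sketch. The opcode test below is deliberately crude (single-byte cli, sti, popf and iret only) and every name is illustrative; the kernel's is_IF_modifier() also handles prefixes and other cases.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_kprobe {
        const uint8_t *insn;    /* copied instruction bytes */
        bool if_modifier;       /* cached when the probe is registered */
};

/* crude check: cli (0xfa), sti (0xfb), popf (0x9d), iret (0xcf) */
static bool modifies_if(const uint8_t *insn)
{
        switch (insn[0]) {
        case 0xfa: case 0xfb: case 0x9d: case 0xcf:
                return true;
        default:
                return false;
        }
}

static void register_probe(struct toy_kprobe *p, const uint8_t *insn)
{
        p->insn = insn;
        p->if_modifier = modifies_if(insn);     /* decoded exactly once */
}

static void probe_hit(const struct toy_kprobe *p)
{
        /* hot path: no instruction decoding, just the cached flag */
        printf("probe hit: instruction %s IF\n",
               p->if_modifier ? "modifies" : "preserves");
}

int main(void)
{
        static const uint8_t cli[] = { 0xfa };
        static const uint8_t nop[] = { 0x90 };
        struct toy_kprobe a, b;

        register_probe(&a, cli);
        register_probe(&b, nop);
        probe_hit(&a);
        probe_hit(&b);
        return 0;
}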
From fd4a5aef002bb57e8a35ed34d8a878034b9bde94 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Sun, 17 Mar 2013 14:49:57 +0100
Subject: perf/x86: Add SNB/SNB-EP scheduling constraints for cycle_activity event

Add scheduling constraints for the SNB/SNB-EP CYCLE_ACTIVITY event as
defined by the SDM, Jan 2013 edition.  The STALLS umasks are
combinations with the NO_DISPATCH umask.

Signed-off-by: Stephane Eranian
Cc: peterz@infradead.org
Cc: ak@linux.intel.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/20130317134957.GA8550@quad
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/cpu/perf_event_intel.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 529c8931fc0..dab7580c47a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -101,6 +101,10 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 	FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
 	FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
 	FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
+	INTEL_UEVENT_CONSTRAINT(0x05a3, 0xf), /* CYCLE_ACTIVITY.STALLS_L2_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+	INTEL_UEVENT_CONSTRAINT(0x06a3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
 	INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
--
cgit v1.2.3-70-g09d2

From c09664bb44184b3846e8c5254db4eae4b932682a Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Mon, 18 Mar 2013 13:54:32 -0300
Subject: KVM: x86: fix deadlock in clock-in-progress request handling

There is a deadlock in pvclock handling:

cpu0:                                   cpu1:
kvm_gen_update_masterclock()
                                        kvm_guest_time_update()
spin_lock(pvclock_gtod_sync_lock)
                                         local_irq_save(flags)
                                         spin_lock(pvclock_gtod_sync_lock)
kvm_make_mclock_inprogress_request(kvm)
 make_all_cpus_request()
  smp_call_function_many()

Now if smp_call_function_many() called by cpu0 tries to call the
function on cpu1 there will be a deadlock: cpu1 is spinning on the lock
with interrupts disabled, so it can never service the cross-call, and
cpu0 never releases the lock.

Fix this by moving the pvclock_gtod_sync_lock protected section outside
the irq-disabled section.

Analyzed by Gleb Natapov
Acked-by: Gleb Natapov
Reported-and-Tested-by: Yongjie Ren
Signed-off-by: Marcelo Tosatti
---
 arch/x86/kvm/x86.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f71500af1f8..f7c850b3691 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1416,15 +1416,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	kernel_ns = 0;
 	host_tsc = 0;
-	/* Keep irq disabled to prevent changes to the clock */
-	local_irq_save(flags);
-	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
-	if (unlikely(this_tsc_khz == 0)) {
-		local_irq_restore(flags);
-		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
-		return 1;
-	}
-
 	/*
 	 * If the host uses TSC clock, then passthrough TSC as stable
 	 * to the guest.
 	 */
@@ -1436,6 +1427,15 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		kernel_ns = ka->master_kernel_ns;
 	}
 	spin_unlock(&ka->pvclock_gtod_sync_lock);
+
+	/* Keep irq disabled to prevent changes to the clock */
+	local_irq_save(flags);
+	this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+	if (unlikely(this_tsc_khz == 0)) {
+		local_irq_restore(flags);
+		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
+		return 1;
+	}
 	if (!use_master_clock) {
 		host_tsc = native_read_tsc();
 		kernel_ns = get_kernel_ns();
--
cgit v1.2.3-70-g09d2
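Read against the usual encoding of these constraint macros (event select in the low byte of the code, umask in the next byte, and the second argument a bitmap of allowed general-purpose counters; that encoding is an assumption here, not something the patch states), the four new entries say the NO_DISPATCH and STALLS_L2 umasks may run on counters 0-3 while the L1D_PENDING umasks are restricted to counter 2. A tiny decoder makes that concrete:

#include <stdio.h>

/* decode the (code, counter-mask) pairs used in the constraint table above */
static void decode(unsigned int code, unsigned int cntr_mask, const char *name)
{
        printf("%-36s event=0x%02x umask=0x%02x counters:",
               name, code & 0xff, (code >> 8) & 0xff);
        for (int c = 0; c < 8; c++)
                if (cntr_mask & (1u << c))
                        printf(" %d", c);
        printf("\n");
}

int main(void)
{
        decode(0x04a3, 0xf, "CYCLE_ACTIVITY.CYCLES_NO_DISPATCH");
        decode(0x05a3, 0xf, "CYCLE_ACTIVITY.STALLS_L2_PENDING");
        decode(0x02a3, 0x4, "CYCLE_ACTIVITY.CYCLES_L1D_PENDING");
        decode(0x06a3, 0x4, "CYCLE_ACTIVITY.STALLS_L1D_PENDING");
        return 0;
}

The umask values also show the combination the message mentions: 0x05 = 0x04 | 0x01 and 0x06 = 0x04 | 0x02, i.e. STALLS = NO_DISPATCH plus the respective PENDING umask.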
From c300aa64ddf57d9c5d9c898a64b36877345dd4a9 Mon Sep 17 00:00:00 2001
From: Andy Honig
Date: Mon, 11 Mar 2013 09:34:52 -0700
Subject: KVM: x86: fix for buffer overflow in handling of MSR_KVM_SYSTEM_TIME (CVE-2013-1796)

If the guest sets the GPA of the time_page so that the request to
update the time straddles a page, then KVM will write onto an incorrect
page.  The write is done by using kmap_atomic to get a pointer to the
page for the time structure and then performing a memcpy to that page
starting at an offset that the guest controls.  Well behaved guests
always provide a 32-byte aligned address; however, a malicious guest
could use this to corrupt host kernel memory.

Tested: Tested against kvmclock unit test.

Signed-off-by: Andrew Honig
Signed-off-by: Marcelo Tosatti
---
 arch/x86/kvm/x86.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f7c850b3691..2ade60c2540 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1959,6 +1959,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 
 		/* ...but clean it before doing the actual write */
 		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
 
+		/* Check that the address is 32-byte aligned. */
+		if (vcpu->arch.time_offset &
+				(sizeof(struct pvclock_vcpu_time_info) - 1))
+			break;
+
 		vcpu->arch.time_page =
 				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
--
cgit v1.2.3-70-g09d2
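Why a 32-byte alignment check is enough to rule out a straddling write: struct pvclock_vcpu_time_info is 32 bytes and 4096 is a multiple of 32, so an aligned update always stays within a single page. A small stand-alone check of that arithmetic (the structure size is taken from the pvclock ABI; everything else here is illustrative):

#include <stdio.h>

#define PAGE_SIZE   4096u
#define PVCLOCK_SZ  32u             /* sizeof(struct pvclock_vcpu_time_info) */

/* does a PVCLOCK_SZ-byte write starting at this page offset cross a page? */
static int straddles_page(unsigned int offset)
{
        return (offset / PAGE_SIZE) != ((offset + PVCLOCK_SZ - 1) / PAGE_SIZE);
}

int main(void)
{
        unsigned int bad = 0, offset;

        for (offset = 0; offset < PAGE_SIZE; offset++) {
                int rejected = offset & (PVCLOCK_SZ - 1);   /* the patch's test */
                if (!rejected && straddles_page(offset))
                        bad++;
        }
        printf("aligned offsets that straddle a page: %u\n", bad);  /* 0 */
        printf("unaligned offset 4080 straddles: %d\n",
               straddles_page(4080));                               /* 1 */
        return 0;
}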
From 0b79459b482e85cb7426aa7da683a9f2c97aeae1 Mon Sep 17 00:00:00 2001
From: Andy Honig
Date: Wed, 20 Feb 2013 14:48:10 -0800
Subject: KVM: x86: Convert MSR_KVM_SYSTEM_TIME to use gfn_to_hva_cache functions (CVE-2013-1797)

There is a potential use after free issue with the handling of
MSR_KVM_SYSTEM_TIME.  If the guest specifies a GPA in a movable or
removable memory such as frame buffers then KVM might continue to write
to that address even after it's removed via KVM_SET_USER_MEMORY_REGION.
KVM pins the page in memory so it's unlikely to cause an issue, but if
the user space component re-purposes the memory previously used for the
guest, then the guest will be able to corrupt that memory.

Tested: Tested against kvmclock unit test

Signed-off-by: Andrew Honig
Signed-off-by: Marcelo Tosatti
---
 arch/x86/include/asm/kvm_host.h |  4 ++--
 arch/x86/kvm/x86.c              | 47 ++++++++++++++++++-----------------------
 2 files changed, 22 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 635a74d2240..4979778cc7f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -414,8 +414,8 @@ struct kvm_vcpu_arch {
 	gpa_t time;
 	struct pvclock_vcpu_time_info hv_clock;
 	unsigned int hw_tsc_khz;
-	unsigned int time_offset;
-	struct page *time_page;
+	struct gfn_to_hva_cache pv_time;
+	bool pv_time_enabled;
 
 	/* set guest stopped flag in pvclock flags field */
 	bool pvclock_set_guest_stopped_request;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2ade60c2540..f19ac0aca60 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1406,10 +1406,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	unsigned long flags, this_tsc_khz;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	struct kvm_arch *ka = &v->kvm->arch;
-	void *shared_kaddr;
 	s64 kernel_ns, max_kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info *guest_hv_clock;
+	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
 	bool use_master_clock;
 
@@ -1463,7 +1462,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->time_page)
+	if (!vcpu->pv_time_enabled)
 		return 0;
 
 	/*
@@ -1525,12 +1524,12 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	 */
 	vcpu->hv_clock.version += 2;
 
-	shared_kaddr = kmap_atomic(vcpu->time_page);
-
-	guest_hv_clock = shared_kaddr + vcpu->time_offset;
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return 0;
 
 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
 
 	if (vcpu->pvclock_set_guest_stopped_request) {
 		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
@@ -1543,12 +1542,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
-	       sizeof(vcpu->hv_clock));
-
-	kunmap_atomic(shared_kaddr);
-
-	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
 
 	return 0;
 }
@@ -1837,10 +1833,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
 static void kvmclock_reset(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.time_page) {
-		kvm_release_page_dirty(vcpu->arch.time_page);
-		vcpu->arch.time_page = NULL;
-	}
+	vcpu->arch.pv_time_enabled = false;
 }
 
 static void accumulate_steal_time(struct kvm_vcpu *vcpu)
 {
@@ -1947,6 +1940,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		break;
 	case MSR_KVM_SYSTEM_TIME_NEW:
 	case MSR_KVM_SYSTEM_TIME: {
+		u64 gpa_offset;
 		kvmclock_reset(vcpu);
 
 		vcpu->arch.time = data;
@@ -1956,19 +1950,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!(data & 1))
 			break;
 
-		/* ...but clean it before doing the actual write */
-		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+		gpa_offset = data & ~(PAGE_MASK | 1);
 
 		/* Check that the address is 32-byte aligned. */
-		if (vcpu->arch.time_offset &
-				(sizeof(struct pvclock_vcpu_time_info) - 1))
+		if (gpa_offset & (sizeof(struct pvclock_vcpu_time_info) - 1))
 			break;
 
-		vcpu->arch.time_page =
-				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
-
-		if (is_error_page(vcpu->arch.time_page))
-			vcpu->arch.time_page = NULL;
+		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+		      &vcpu->arch.pv_time, data & ~1ULL))
+			vcpu->arch.pv_time_enabled = false;
+		else
+			vcpu->arch.pv_time_enabled = true;
 		break;
 	}
@@ -2972,7 +2964,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
  */
 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 {
-	if (!vcpu->arch.time_page)
+	if (!vcpu->arch.pv_time_enabled)
 		return -EINVAL;
 	vcpu->arch.pvclock_set_guest_stopped_request = true;
 	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -6723,6 +6715,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		goto fail_free_wbinvd_dirty_mask;
 
 	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
+	vcpu->arch.pv_time_enabled = false;
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
--
cgit v1.2.3-70-g09d2
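The conversion replaces a pinned struct page pointer with KVM's gfn_to_hva_cache helpers, which revalidate the cached translation against the current memory slots before each access. Below is a stripped-down user-space model of that generation-checked cache; every name is invented, and only the revalidation pattern is meant to mirror kvm_gfn_to_hva_cache_init() and kvm_write_guest_cached().

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t guest_mem_a[4096];       /* original backing memory */
static uint8_t guest_mem_b[4096];       /* backing after the region is remapped */
static uint8_t *current_backing = guest_mem_a;
static uint64_t memslot_generation = 1; /* bumped on every memory-map change */

struct hva_cache {
        uint64_t generation;            /* generation the cache was built for */
        uint8_t *hva;                   /* cached host pointer */
        size_t offset;
};

static void cache_init(struct hva_cache *c, size_t offset)
{
        c->generation = memslot_generation;
        c->hva = current_backing + offset;
        c->offset = offset;
}

static void cached_write(struct hva_cache *c, const void *data, size_t len)
{
        if (c->generation != memslot_generation)
                cache_init(c, c->offset);       /* stale: re-translate first */
        memcpy(c->hva, data, len);
}

int main(void)
{
        struct hva_cache c;

        cache_init(&c, 64);
        cached_write(&c, "clock v1", 8);

        /* user space changes the memory map (KVM_SET_USER_MEMORY_REGION) */
        current_backing = guest_mem_b;
        memslot_generation++;

        cached_write(&c, "clock v2", 8);        /* lands in the new backing */
        printf("old backing: %.8s, new backing: %.8s\n",
               (char *)guest_mem_a + 64, (char *)guest_mem_b + 64);
        return 0;
}

With a pinned page pointer the second write would have gone to the old backing store, which is exactly the stale-write problem the patch removes.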