From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001
From: Markus Metzger
Date: Wed, 11 Feb 2009 15:10:27 +0100
Subject: x86, ptrace, mm: fix double-free on race

Ptrace_detach() races with __ptrace_unlink() if the traced task is
reaped while detaching. This might cause a double-free of the BTS
buffer.

Change the ptrace_detach() path to only do the memory accounting in
ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace()
which will be called from __ptrace_unlink().

The fix follows a proposal from Oleg Nesterov.

Reported-by: Oleg Nesterov
Signed-off-by: Markus Metzger
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/ptrace.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a5df5f82fb..5a4c23d8989 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child)
 
 static void ptrace_bts_detach(struct task_struct *child)
 {
-	if (unlikely(child->bts)) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-
-		ptrace_bts_free_buffer(child);
-	}
+	/*
+	 * Ptrace_detach() races with ptrace_untrace() in case
+	 * the child dies and is reaped by another thread.
+	 *
+	 * We only do the memory accounting at this point and
+	 * leave the buffer deallocation and the bts tracer
+	 * release to ptrace_bts_untrace() which will be called
+	 * later on with tasklist_lock held.
+	 */
+	release_locked_buffer(child->bts_buffer, child->bts_size);
 }
 #else
 static inline void ptrace_bts_fork(struct task_struct *tsk) {}
--
cgit v1.2.3-70-g09d2

From d85cf93da66977dbc645352be1b2084a659d8a0b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge
Date: Thu, 12 Feb 2009 10:02:56 -0800
Subject: x86/paravirt: make arch_flush_lazy_mmu/cpu disable preemption

Impact: avoid access to percpu vars in preemptible context

These functions are intended to be used whenever there's the
possibility that some stale state is about to be overwritten with a
queued update, or to force a state change when we may be in lazy
mode. Either way, we could end up calling them with preemption
enabled, so wrap the functions in their own little preempt-disable
section so they can be safely called in any context (though
preemption should never be enabled if we're actually in a lazy
state).

(Move out of line to avoid #include dependencies.)
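[Editorial aside, not part of the patch: a minimal sketch of the wrapper pattern this change introduces. The helpers this_cpu_in_lazy_mode(), leave_lazy_mode() and enter_lazy_mode() are hypothetical stand-ins for the real paravirt hooks.]

	/*
	 * Sketch only: reading a per-CPU "lazy mode" flag and flushing it
	 * must both happen on the same CPU, so the check-and-flush pair is
	 * wrapped in its own preempt-disable section.
	 */
	void flush_lazy_state(void)
	{
		preempt_disable();		/* stay on this CPU */

		if (this_cpu_in_lazy_mode()) {	/* hypothetical per-CPU query */
			leave_lazy_mode();	/* flush any queued updates */
			enter_lazy_mode();	/* re-arm lazy batching */
		}

		preempt_enable();
	}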
Signed-off-by: Jeremy Fitzhardinge
Signed-off-by: Thomas Gleixner
---
 arch/x86/include/asm/paravirt.h | 17 ++---------------
 arch/x86/kernel/paravirt.c      | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aed..a660eceaa27 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1352,14 +1352,7 @@ static inline void arch_leave_lazy_cpu_mode(void)
 	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
 }
 
-static inline void arch_flush_lazy_cpu_mode(void)
-{
-	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
-		arch_leave_lazy_cpu_mode();
-		arch_enter_lazy_cpu_mode();
-	}
-}
-
+void arch_flush_lazy_cpu_mode(void);
 
 #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 static inline void arch_enter_lazy_mmu_mode(void)
@@ -1372,13 +1365,7 @@ static inline void arch_leave_lazy_mmu_mode(void)
 	PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
 }
 
-static inline void arch_flush_lazy_mmu_mode(void)
-{
-	if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
-		arch_leave_lazy_mmu_mode();
-		arch_enter_lazy_mmu_mode();
-	}
-}
+void arch_flush_lazy_mmu_mode(void);
 
 static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 				unsigned long phys, pgprot_t flags)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb60887..dcba6c567a2 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,6 +268,30 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 	return __get_cpu_var(paravirt_lazy_mode);
 }
 
+void arch_flush_lazy_mmu_mode(void)
+{
+	preempt_disable();
+
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		arch_leave_lazy_mmu_mode();
+		arch_enter_lazy_mmu_mode();
+	}
+
+	preempt_enable();
+}
+
+void arch_flush_lazy_cpu_mode(void)
+{
+	preempt_disable();
+
+	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+		arch_leave_lazy_cpu_mode();
+		arch_enter_lazy_cpu_mode();
+	}
+
+	preempt_enable();
+}
+
 struct pv_info pv_info = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
--
cgit v1.2.3-70-g09d2

From 34b0900d323122113683685b200aae9f9b75e63b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 12 Feb 2009 21:30:48 +0100
Subject: x86: warn if arch_flush_lazy_mmu_cpu is called in preemptible context

Impact: Catch cases where lazy MMU state is active in a preemptible context

arch_flush_lazy_mmu_cpu() has been changed to disable preemption so
the checks in enter/leave will never trigger. Put the preemptible()
check into arch_flush_lazy_mmu_cpu() to catch such cases.
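[Editorial aside, not part of the patch below: why WARN_ON(preempt_count() == 1) identifies a preemptible caller. The flush function itself calls preempt_disable(), which raises the count by one; a count of exactly 1 inside the function therefore means the caller had preemption fully enabled while lazy mode was active. A sketch, with in_lazy_mmu_mode() as a hypothetical stand-in for the real per-CPU check:]

	void flush_with_check(void)
	{
		preempt_disable();		/* count goes 0 -> 1 for a preemptible caller */

		if (in_lazy_mmu_mode()) {	/* hypothetical helper */
			/* lazy mode active although the caller could have been preempted */
			WARN_ON(preempt_count() == 1);
			/* ... leave and re-enter lazy mode ... */
		}

		preempt_enable();
	}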
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/paravirt.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index dcba6c567a2..c6520a4e85d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -273,6 +273,7 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_disable();
 
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
+		WARN_ON(preempt_count() == 1);
 		arch_leave_lazy_mmu_mode();
 		arch_enter_lazy_mmu_mode();
 	}
@@ -285,6 +286,7 @@ void arch_flush_lazy_cpu_mode(void)
 	preempt_disable();
 
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+		WARN_ON(preempt_count() == 1);
 		arch_leave_lazy_cpu_mode();
 		arch_enter_lazy_cpu_mode();
 	}
--
cgit v1.2.3-70-g09d2

From b13e24644c138d0ddbc451403c30a96b09bfd556 Mon Sep 17 00:00:00 2001
From: john stultz
Date: Thu, 12 Feb 2009 18:48:53 -0800
Subject: x86, hpet: fix for LS21 + HPET = boot hang

Between 2.6.23 and 2.6.24-rc1 a change was made that broke IBM LS21
systems that had the HPET enabled in the BIOS, resulting in boot
hangs for x86_64.

Specifically commit b8ce33590687888ebb900d09557b8807c4539022, which
merges the i386 and x86_64 HPET code.

Prior to this commit, when we set up the HPET timers in x86_64, we
did the following:

	hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
		    HPET_TN_32BIT, HPET_T0_CFG);

However after the i386/x86_64 HPET merge, we do the following:

	cfg = hpet_readl(HPET_Tn_CFG(timer));
	cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
	       HPET_TN_SETVAL | HPET_TN_32BIT;
	hpet_writel(cfg, HPET_Tn_CFG(timer));

However on LS21s with HPET enabled in the BIOS, the HPET_T0_CFG
register boots with Level triggered interrupts (HPET_TN_LEVEL)
enabled. This causes the periodic interrupt to be not so periodic,
and that results in the boot time hang I reported earlier in the
delay calibration.

My fix: Always disable HPET_TN_LEVEL when setting up periodic mode.

Signed-off-by: John Stultz
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/hpet.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 64d5ad0b8ad..5c8da2c2c18 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -269,6 +269,8 @@ static void hpet_set_mode(enum clock_event_mode mode,
 		now = hpet_readl(HPET_COUNTER);
 		cmp = now + (unsigned long) delta;
 		cfg = hpet_readl(HPET_Tn_CFG(timer));
+		/* Make sure we use edge triggered interrupts */
+		cfg &= ~HPET_TN_LEVEL;
 		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
 		       HPET_TN_SETVAL | HPET_TN_32BIT;
 		hpet_writel(cfg, HPET_Tn_CFG(timer));
--
cgit v1.2.3-70-g09d2

From e49590b6dd356f8ef10ba3531a29e5086f6f2e3a Mon Sep 17 00:00:00 2001
From: Chris Ball
Date: Fri, 13 Feb 2009 20:56:18 -0500
Subject: x86, olpc: fix model detection without OFW

Impact: fix "garbled display, laptop is unusable" bug

Commit e51a1ac2dfca9ad869471e88f828281db7e810c0 ("x86, olpc: fix
endian bug in openfirmware workaround") breaks model comparison on
OLPC; the value 0xc2 needs to be scaled up by olpc_board().

The pre-patch version was wrong, but accidentally worked anyway
(big-endian 0xc2 is big enough to satisfy all other board revisions,
but little endian 0xc2 is not).
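[Editorial aside, not part of the patch below: the olpc_board() definition used here is made up purely to illustrate the comparison; the real macro lives in the OLPC headers. The point is that an unscaled 0xc2 compares below the scaled board revisions, while the old byte-swapped value happened to compare above all of them.]

	#include <stdio.h>
	#include <stdint.h>

	#define olpc_board(rev)	((uint32_t)(rev) << 8)	/* hypothetical scaling */

	int main(void)
	{
		uint32_t raw     = 0xc2;		/* unscaled, after the endian fix */
		uint32_t swapped = 0xc2000000;		/* roughly what the old code stored */
		uint32_t wanted  = olpc_board(0xc2);	/* what model checks compare against */

		printf("raw     >= wanted: %d\n", raw >= wanted);		/* 0: detection breaks */
		printf("swapped >= wanted: %d\n", swapped >= wanted);		/* 1: old bug was masked */
		printf("scaled  >= wanted: %d\n", olpc_board(0xc2) >= wanted);	/* 1: fixed */
		return 0;
	}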
Signed-off-by: Chris Ball
Cc: Andrew Morton
Acked-by: Andres Salomon
Cc: Harvey Harrison
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/olpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 7a13fac63a1..4006c522adc 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
 
 static void __init platform_detect(void)
 {
 	/* stopgap until OFW support is added to the kernel */
-	olpc_platform_info.boardrev = 0xc2;
+	olpc_platform_info.boardrev = olpc_board(0xc2);
 }
 #endif
--
cgit v1.2.3-70-g09d2

From be716615fe596ee117292dc615e95f707fb67fd1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 13 Jan 2009 23:36:34 +0100
Subject: x86, vm86: fix preemption bug

Commit 3d2a71a596bd9c761c8487a2178e95f8a61da083 ("x86, traps:
converge do_debug handlers") changed the preemption disable logic of
do_debug() so vm86_handle_trap() is called with preemption disabled
resulting in:

 BUG: sleeping function called from invalid context at include/linux/kernel.h:155
 in_atomic(): 1, irqs_disabled(): 0, pid: 3005, name: dosemu.bin
 Pid: 3005, comm: dosemu.bin Tainted: G W 2.6.29-rc1 #51
 Call Trace:
  [] copy_to_user+0x33/0x108
  [] save_v86_state+0x65/0x149
  [] handle_vm86_trap+0x20/0x8f
  [] do_debug+0x15b/0x1a4
  [] debug_stack_correct+0x27/0x2c
  [] sysenter_do_call+0x12/0x2f
 BUG: scheduling while atomic: dosemu.bin/3005/0x10000001

Restore the original calling convention and reenable preemption
before calling handle_vm86_trap().

Reported-by: Michal Suchanek
Cc: stable@kernel.org
Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/traps.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7932338d7cb..a9e7548e179 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -99,6 +99,12 @@ static inline void preempt_conditional_sti(struct pt_regs *regs)
 		local_irq_enable();
 }
 
+static inline void conditional_cli(struct pt_regs *regs)
+{
+	if (regs->flags & X86_EFLAGS_IF)
+		local_irq_disable();
+}
+
 static inline void preempt_conditional_cli(struct pt_regs *regs)
 {
 	if (regs->flags & X86_EFLAGS_IF)
@@ -626,8 +632,10 @@ clear_dr7:
 
 #ifdef CONFIG_X86_32
 debug_vm86:
+	/* reenable preemption: handle_vm86_trap() might sleep */
+	dec_preempt_count();
 	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
-	preempt_conditional_cli(regs);
+	conditional_cli(regs);
 	return;
 #endif
 
--
cgit v1.2.3-70-g09d2

From a0abd520fd69295f4a3735e29a9448a32e101d47 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Mon, 16 Feb 2009 17:31:58 -0600
Subject: cpumask: fix powernow-k8: partial revert of 2fdf66b491ac706657946442789ec644cc317e1a

Impact: fix powernow-k8 when acpi=off (or other error).

There was a spurious change introduced into powernow-k8 by that
patch, so that we try to "restore" a cpus_allowed mask we never
saved. We revert that file.

See lkml "[PATCH] x86/powernow: fix cpus_allowed brokage when
acpi=off" from Yinghai for the bug report.
Cc: Mike Travis
Cc: Yinghai Lu
Signed-off-by: Rusty Russell
Acked-by: Ingo Molnar
---
 arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index fb039cd345d..6428aa17b40 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1157,8 +1157,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 	data->cpu = pol->cpu;
 	data->currpstate = HW_PSTATE_INVALID;
 
-	rc = powernow_k8_cpu_init_acpi(data);
-	if (rc) {
+	if (powernow_k8_cpu_init_acpi(data)) {
 		/*
 		 * Use the PSB BIOS structure. This is only availabe on
 		 * an UP version, and is deprecated by AMD.
@@ -1176,17 +1175,20 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 			"ACPI maintainers and complain to your BIOS "
 			"vendor.\n");
 #endif
-		goto err_out;
+		kfree(data);
+		return -ENODEV;
 	}
 	if (pol->cpu != 0) {
 		printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
 			"CPU other than CPU0. Complain to your BIOS "
 			"vendor.\n");
-		goto err_out;
+		kfree(data);
+		return -ENODEV;
 	}
 	rc = find_psb_table(data);
 	if (rc) {
-		goto err_out;
+		kfree(data);
+		return -ENODEV;
 	}
 	/* Take a crude guess here.
 	 * That guess was in microseconds, so multiply with 1000 */
--
cgit v1.2.3-70-g09d2

From bf51935f3e988e0ed6f34b55593e5912f990750a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney"
Date: Tue, 17 Feb 2009 06:01:30 -0800
Subject: x86, rcu: fix strange load average and ksoftirqd behavior

Damien Wyart reported high ksoftirqd CPU usage (20%) on an otherwise
idle system.

The function-graph trace Damien provided:

> 799.521187 | 1) -0 | | rcu_check_callbacks() {
> 799.521371 | 1) -0 | | rcu_check_callbacks() {
> 799.521555 | 1) -0 | | rcu_check_callbacks() {
> 799.521738 | 1) -0 | | rcu_check_callbacks() {
> 799.521934 | 1) -0 | | rcu_check_callbacks() {
> 799.522068 | 1) ksoftir-2324 | | rcu_check_callbacks() {
> 799.522208 | 1) -0 | | rcu_check_callbacks() {
> 799.522392 | 1) -0 | | rcu_check_callbacks() {
> 799.522575 | 1) -0 | | rcu_check_callbacks() {
> 799.522759 | 1) -0 | | rcu_check_callbacks() {
> 799.522956 | 1) -0 | | rcu_check_callbacks() {
> 799.523074 | 1) ksoftir-2324 | | rcu_check_callbacks() {
> 799.523214 | 1) -0 | | rcu_check_callbacks() {
> 799.523397 | 1) -0 | | rcu_check_callbacks() {
> 799.523579 | 1) -0 | | rcu_check_callbacks() {
> 799.523762 | 1) -0 | | rcu_check_callbacks() {
> 799.523960 | 1) -0 | | rcu_check_callbacks() {
> 799.524079 | 1) ksoftir-2324 | | rcu_check_callbacks() {
> 799.524220 | 1) -0 | | rcu_check_callbacks() {
> 799.524403 | 1) -0 | | rcu_check_callbacks() {
> 799.524587 | 1) -0 | | rcu_check_callbacks() {
> 799.524770 | 1) -0 | | rcu_check_callbacks() {
> [ . . . ]

Shows rcu_check_callbacks() being invoked way too often. It should be
called once per jiffy, and here it is called no less than 22 times in
about 3.5 milliseconds, meaning one call every 160 microseconds or so.

Why do we need to call rcu_pending() and rcu_check_callbacks() from
the idle loop of 32-bit x86, especially given that no other
architecture does this?

The following patch removes the call to rcu_pending() and
rcu_check_callbacks() from the x86 32-bit idle loop in order to
reduce the softirq load on idle systems.

Reported-by: Damien Wyart
Signed-off-by: Paul E. McKenney
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/process_32.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b..bd4da2af08a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -104,9 +104,6 @@ void cpu_idle(void)
 			check_pgt_cache();
 			rmb();
 
-			if (rcu_pending(cpu))
-				rcu_check_callbacks(cpu, 0);
-
 			if (cpu_is_offline(cpu))
 				play_dead();
 
--
cgit v1.2.3-70-g09d2

From 6ec68bff3c81e776a455f6aca95c8c5f1d630198 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 12 Feb 2009 13:39:26 +0100
Subject: x86, mce: reinitialize per cpu features on resume

Impact: Bug fix

This fixes a long-standing bug in the machine check code. On resume
the boot CPU wouldn't get its vendor-specific state, like thermal
handling, reinitialized. This means the boot CPU wouldn't ever get
any thermal events reported again.

Call the respective initialization functions on resume.

v2: Remove ancient init because they don't have a resume device
    anyways. Pointed out by Thomas Gleixner.
v3: Now fix the Subject too to reflect the v2 change

Signed-off-by: Andi Kleen
Acked-by: Thomas Gleixner
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 1c838032fd3..1f184efb6bc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -734,6 +734,7 @@ __setup("mce=", mcheck_enable);
 static int mce_resume(struct sys_device *dev)
 {
 	mce_init(NULL);
+	mce_cpu_features(&current_cpu_data);
 	return 0;
 }
 
--
cgit v1.2.3-70-g09d2

From 380851bc6b1b4107c61dfa2997f9095dcf779336 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 12 Feb 2009 13:39:33 +0100
Subject: x86, mce: use force_sig_info to kill process in machine check

Impact: bug fix (with tolerant == 3)

do_exit cannot be called directly from the exception handler because
it can sleep and the exception handler runs on the exception stack.
Use force_sig() instead.

Based on an earlier patch by Ying Huang who debugged the problem.

Signed-off-by: Andi Kleen
Acked-by: Thomas Gleixner
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 1f184efb6bc..25cf624eccb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -295,11 +295,11 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 	 * If we know that the error was in user space, send a
 	 * SIGBUS. Otherwise, panic if tolerance is low.
 	 *
-	 * do_exit() takes an awful lot of locks and has a slight
+	 * force_sig() takes an awful lot of locks and has a slight
 	 * risk of deadlocking.
 	 */
 	if (user_space) {
-		do_exit(SIGBUS);
+		force_sig(SIGBUS, current);
 	} else if (panic_on_oops || tolerant < 2) {
 		mce_panic("Uncorrected machine check",
 			&panicm, mcestart);
--
cgit v1.2.3-70-g09d2

From 07db1c140eb233971341396e492cc73d4280e698 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Thu, 12 Feb 2009 13:39:35 +0100
Subject: x86, mce: fix ifdef for 64bit thermal apic vector clear on shutdown

Impact: Bugfix

The ifdef for the apic clear on shutdown for the 64bit intel thermal
vector was incorrect and never triggered. Fix that.
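[Editorial aside, not part of the patch below: a standalone illustration of why the original test never fired. Kconfig generates macros with a CONFIG_ prefix, so a bare X86_MCE_INTEL is never defined anywhere and defined(X86_MCE_INTEL) is always false.]

	#include <stdio.h>

	#define CONFIG_X86_MCE_INTEL 1		/* pretend the option is enabled */

	int main(void)
	{
	#if defined(X86_MCE_INTEL)		/* the buggy test: never true */
		puts("buggy test fired");
	#endif
	#if defined(CONFIG_X86_MCE_INTEL)	/* the fixed test: true when the option is set */
		puts("fixed test fired");
	#endif
		return 0;
	}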
Signed-off-by: Andi Kleen
Acked-by: Thomas Gleixner
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 115449f869e..570f36e44e5 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -862,7 +862,7 @@ void clear_local_APIC(void)
 	}
 
 	/* lets not touch this if we didn't frob it */
-#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
 		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
--
cgit v1.2.3-70-g09d2

From 48ffc70b675aa7798a52a2e92e20f6cce9140b3d Mon Sep 17 00:00:00 2001
From: Alok N Kataria
Date: Wed, 18 Feb 2009 12:33:55 -0800
Subject: x86, vmi: TSC going backwards check in vmi clocksource

Impact: fix time warps under vmware

Similar to the check for TSC going backwards in the TSC clocksource,
we also need this check for VMI clocksource.

Signed-off-by: Alok N Kataria
Cc: Zachary Amsden
Signed-off-by: Ingo Molnar
Cc: stable@kernel.org
---
 arch/x86/kernel/vmiclock_32.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index c4c1f9e0940..bde106cae0a 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -283,10 +283,13 @@ void __devinit vmi_time_ap_init(void)
 #endif
 
 /** vmi clocksource */
+static struct clocksource clocksource_vmi;
 
 static cycle_t read_real_cycles(void)
 {
-	return vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+	cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
+	return ret >= clocksource_vmi.cycle_last ?
+		ret : clocksource_vmi.cycle_last;
 }
 
 static struct clocksource clocksource_vmi = {
--
cgit v1.2.3-70-g09d2

From cc3ca22063784076bd240fda87217387a8f2ae92 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin"
Date: Fri, 20 Feb 2009 23:35:51 -0800
Subject: x86, mce: remove incorrect __cpuinit for mce_cpu_features()

Impact: Bug fix on UP

Checkin 6ec68bff3c81e776a455f6aca95c8c5f1d630198:
x86, mce: reinitialize per cpu features on resume

introduced a call to mce_cpu_features() in the resume path, in order
for the MCE machinery to get properly reinitialized after a resume.
However, this function (and its successors) was flagged __cpuinit,
which becomes __init on UP configurations (on SMP suspend/resume
requires CPU hotplug and so this would not be seen.)

Remove the offending __cpuinit annotations for mce_cpu_features()
and its successor functions.

Cc: Andi Kleen
Cc: Linus Torvalds
Signed-off-by: H. Peter Anvin
---
 arch/x86/kernel/cpu/mcheck/mce_64.c       | 2 +-
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c   | 2 +-
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 25cf624eccb..fe79985ce0f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -490,7 +490,7 @@ static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 }
 
 
-static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
+static void mce_cpu_features(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094..f2ee0ae29bd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -121,7 +121,7 @@ static long threshold_restart_bank(void *_tr)
 }
 
 /* cpu init entry point, called from mce.c with preempt off */
-void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c)
+void mce_amd_feature_init(struct cpuinfo_x86 *c)
 {
 	unsigned int bank, block;
 	unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd3..f44c3662436 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -30,7 +30,7 @@ asmlinkage void smp_thermal_interrupt(void)
 	irq_exit();
 }
 
-static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
+static void intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int tm2 = 0;
@@ -84,7 +84,7 @@ static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
-void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
+void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 }
--
cgit v1.2.3-70-g09d2

From e6bd6760c92dc8475c79c4c4a8a16ac313c0b93d Mon Sep 17 00:00:00 2001
From: Jiri Slaby
Date: Sun, 15 Feb 2009 22:45:49 +0100
Subject: x86_64: acpi/wakeup_64 cleanup

- remove %ds re-set, it's already set in wakeup_long64
- remove double labels and alignment (ENTRY already adds both)
- use meaningful resume point labelname
- skip alignment while jumping from wakeup_long64 to the resume point
- remove .size, .type and unused labels

[v2]
- added ENDPROCs

Signed-off-by: Jiri Slaby
Acked-by: Cyrill Gorcunov
Acked-by: Pavel Machek
Signed-off-by: Rafael J. Wysocki
Signed-off-by: Len Brown
---
 arch/x86/kernel/acpi/wakeup_64.S | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index bcc293423a7..b5dee6a0de3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -13,7 +13,6 @@
  * Hooray, we are in Long 64-bit mode (but still running in low memory)
  */
ENTRY(wakeup_long64)
-wakeup_long64:
	movq	saved_magic, %rax
	movq	$0x123456789abcdef0, %rdx
	cmpq	%rdx, %rax
@@ -34,16 +33,12 @@ wakeup_long64:
 
	movq	saved_rip, %rax
	jmp	*%rax
+ENDPROC(wakeup_long64)
 
bogus_64_magic:
	jmp	bogus_64_magic
 
-	.align 2
-	.p2align 4,,15
-.globl do_suspend_lowlevel
-	.type	do_suspend_lowlevel,@function
-do_suspend_lowlevel:
-.LFB5:
+ENTRY(do_suspend_lowlevel)
	subq	$8, %rsp
	xorl	%eax, %eax
	call	save_processor_state
@@ -67,7 +62,7 @@ do_suspend_lowlevel:
	pushfq
	popq	pt_regs_flags(%rax)
 
-	movq	$.L97, saved_rip(%rip)
+	movq	$resume_point, saved_rip(%rip)
 
	movq	%rsp, saved_rsp
	movq	%rbp, saved_rbp
@@ -79,13 +74,9 @@ do_suspend_lowlevel:
	movl	$3, %edi
	xorl	%eax, %eax
	jmp	acpi_enter_sleep_state
-.L97:
-	.p2align 4,,7
-.L99:
-	.align 4
-	movl	$24, %eax
-	movw	%ax, %ds
 
+	.align 4
+resume_point:
	/* We don't restore %rax, it must be 0 anyway */
	movq	$saved_context, %rax
	movq	saved_context_cr4(%rax), %rbx
@@ -117,12 +108,9 @@ do_suspend_lowlevel:
	xorl	%eax, %eax
	addq	$8, %rsp
	jmp	restore_processor_state
-.LFE5:
-.Lfe5:
-	.size	do_suspend_lowlevel, .Lfe5-do_suspend_lowlevel
-
+ENDPROC(do_suspend_lowlevel)
+
	.data
-ALIGN
ENTRY(saved_rbp)	.quad	0
ENTRY(saved_rsi)	.quad	0
ENTRY(saved_rdi)	.quad	0
--
cgit v1.2.3-70-g09d2

From 6defa2fe2019f3729933516fba5cfd75eecd07de Mon Sep 17 00:00:00 2001
From: Jiri Slaby
Date: Sun, 15 Feb 2009 22:46:45 +0100
Subject: x86_64: Fix S3 fail path

As acpi_enter_sleep_state can fail, take this into account in
do_suspend_lowlevel and don't return to do_suspend_lowlevel's caller.
This would break (currently) fpu status and preempt count.

Technically, this means use `call' instead of `jmp' and `jmp' to the
`resume_point' after the `call' (i.e. if acpi_enter_sleep_state
returns=fails). `resume_point' will handle the restore of fpu and
preempt count gracefully.

Signed-off-by: Jiri Slaby
Signed-off-by: Rafael J. Wysocki
Signed-off-by: Len Brown
---
 arch/x86/kernel/acpi/wakeup_64.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index b5dee6a0de3..96258d9dc97 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -73,7 +73,9 @@ ENTRY(do_suspend_lowlevel)
	addq	$8, %rsp
	movl	$3, %edi
	xorl	%eax, %eax
-	jmp	acpi_enter_sleep_state
+	call	acpi_enter_sleep_state
+	/* in case something went wrong, restore the machine status and go on */
+	jmp	resume_point
 
	.align 4
resume_point:
--
cgit v1.2.3-70-g09d2

From 936577c61d0c10b8929608a92c98d839b22053bc Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 22 Feb 2009 10:27:49 -0800
Subject: x86: Add IRQF_TIMER to legacy x86 timer interrupt descriptors

Right now nobody cares, but the suspend/resume code will eventually
want to suspend device interrupts without suspending the timer, and
will depend on this flag to know.

The modern x86 timer infrastructure uses the local APIC timers and
never shows up as a device interrupt at all, so it isn't affected
and doesn't need any of this.
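[Editorial aside, not part of the patch below: this commit only marks the legacy timer irqactions; the suspend-side consumer of the flag arrived later. The loop and helper name below are illustrative, not the eventual kernel implementation, and only sketch how suspend code could use IRQF_TIMER to keep the tick running while other device interrupts are parked.]

	static void suspend_device_irqs_sketch(struct irq_desc *descs, unsigned int nr)
	{
		unsigned int irq;

		for (irq = 0; irq < nr; irq++) {
			struct irqaction *action = descs[irq].action;

			if (action && (action->flags & IRQF_TIMER))
				continue;		/* leave the timer tick alive */

			disable_irq(irq);		/* schematic: park everything else */
		}
	}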
Cc: Rafael J. Wysocki
Cc: Ingo Molnar
Signed-off-by: Linus Torvalds
---
 arch/x86/kernel/time_64.c     | 2 +-
 arch/x86/kernel/vmiclock_32.c | 2 +-
 arch/x86/mach-default/setup.c | 2 +-
 arch/x86/mach-voyager/setup.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e6e695acd72..241ec3923f6 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -115,7 +115,7 @@ unsigned long __init calibrate_cpu(void)
 
 static struct irqaction irq0 = {
 	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
+	.flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
 	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index bde106cae0a..e5b088fffa4 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -202,7 +202,7 @@ static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
 
 static struct irqaction vmi_clock_action = {
 	.name = "vmi-timer",
 	.handler = vmi_timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_NOBALANCING,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
 	.mask = CPU_MASK_ALL,
 };
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index a265a7c6319..50b59187112 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -96,7 +96,7 @@ void __init trap_init_hook(void)
 
 static struct irqaction irq0  = {
 	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
 	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c
index d914a7996a6..8e5118371f0 100644
--- a/arch/x86/mach-voyager/setup.c
+++ b/arch/x86/mach-voyager/setup.c
@@ -56,7 +56,7 @@ void __init trap_init_hook(void)
 
 static struct irqaction irq0 = {
 	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL,
+	.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
 	.mask = CPU_MASK_NONE,
 	.name = "timer"
 };
--
cgit v1.2.3-70-g09d2

From 770824bdc421ff58a64db608294323571c949f4c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Sun, 22 Feb 2009 18:38:50 +0100
Subject: PM: Split up sysdev_[suspend|resume] from device_power_[down|up]

Move the sysdev_suspend/resume from the callee to the callers, with
no real change in semantics, so that we can rework the disabling of
interrupts during suspend/hibernation.

This is based on an earlier patch from Linus.

Signed-off-by: Rafael J. Wysocki
Signed-off-by: Linus Torvalds
---
 arch/x86/kernel/apm_32.c  |  4 ++++
 drivers/base/base.h       |  2 --
 drivers/base/power/main.c |  3 ---
 drivers/xen/manage.c      |  8 ++++++++
 include/linux/pm.h        |  2 ++
 kernel/kexec.c            |  7 +++++++
 kernel/power/disk.c       | 11 +++++++++++
 kernel/power/main.c       |  8 ++++++--
 8 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 98807bb095a..266ec6c18b6 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1192,6 +1192,7 @@ static int suspend(int vetoable)
 	device_suspend(PMSG_SUSPEND);
 
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+	sysdev_suspend(PMSG_SUSPEND);
 
 	local_irq_enable();
@@ -1208,6 +1209,7 @@ static int suspend(int vetoable)
 	if (err != APM_SUCCESS)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 	device_resume(PMSG_RESUME);
@@ -1228,6 +1230,7 @@ static void standby(void)
 
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+	sysdev_suspend(PMSG_SUSPEND);
 	local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
@@ -1235,6 +1238,7 @@ static void standby(void)
 		apm_error("standby", err);
 
 	local_irq_disable();
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 }
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0a5f055dffb..9f50f1b545d 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -88,8 +88,6 @@ extern void driver_detach(struct device_driver *drv);
 extern int driver_probe_device(struct device_driver *drv, struct device *dev);
 
 extern void sysdev_shutdown(void);
-extern int sysdev_suspend(pm_message_t state);
-extern int sysdev_resume(void);
 
 extern char *make_class_name(const char *name, struct kobject *kobj);
 
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 670c9d6c140..2d14f4ae6c0 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -333,7 +333,6 @@ static void dpm_power_up(pm_message_t state)
  */
 void device_power_up(pm_message_t state)
 {
-	sysdev_resume();
 	dpm_power_up(state);
 }
 EXPORT_SYMBOL_GPL(device_power_up);
@@ -577,8 +576,6 @@ int device_power_down(pm_message_t state)
 		}
 		dev->power.status = DPM_OFF_IRQ;
 	}
-	if (!error)
-		error = sysdev_suspend(state);
 	if (error)
 		dpm_power_up(resume_event(state));
 	return error;
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 9b91617b958..56892a142ee 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -45,6 +45,13 @@ static int xen_suspend(void *data)
 			err);
 		return err;
 	}
+	err = sysdev_suspend(PMSG_SUSPEND);
+	if (err) {
+		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
+			err);
+		device_power_up(PMSG_RESUME);
+		return err;
+	}
 
 	xen_mm_pin_all();
 	gnttab_suspend();
@@ -61,6 +68,7 @@ static int xen_suspend(void *data)
 
 	gnttab_resume();
 	xen_mm_unpin_all();
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 
 	if (!*cancelled) {
diff --git a/include/linux/pm.h b/include/linux/pm.h
index de2e0a8f672..24ba5f67b3a 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -381,10 +381,12 @@ struct dev_pm_info {
 
 #ifdef CONFIG_PM_SLEEP
 extern void device_pm_lock(void);
+extern int sysdev_resume(void);
 extern void device_power_up(pm_message_t state);
 extern void device_resume(pm_message_t state);
 
 extern void device_pm_unlock(void);
+extern int sysdev_suspend(pm_message_t state);
 extern int device_power_down(pm_message_t state);
 extern int device_suspend(pm_message_t state);
 extern int device_prepare_suspend(pm_message_t state);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864..48389957825 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1465,6 +1465,11 @@ int kernel_kexec(void)
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
 			goto Enable_irqs;
+
+		/* Suspend system devices */
+		error = sysdev_suspend(PMSG_FREEZE);
+		if (error)
+			goto Power_up_devices;
 	} else
 #endif
 	{
@@ -1477,6 +1482,8 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
+		sysdev_resume();
+ Power_up_devices:
 		device_power_up(PMSG_RESTORE);
  Enable_irqs:
 		local_irq_enable();
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 7b40e94b1d4..4a4a206b197 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -227,6 +227,12 @@ static int create_image(int platform_mode)
 			"aborting hibernation\n");
 		goto Enable_irqs;
 	}
+	sysdev_suspend(PMSG_FREEZE);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down, "
+			"aborting hibernation\n");
+		goto Power_up_devices;
+	}
 
 	if (hibernation_test(TEST_CORE))
 		goto Power_up;
@@ -242,9 +248,11 @@ static int create_image(int platform_mode)
 	if (!in_suspend)
 		platform_leave(platform_mode);
 Power_up:
+	sysdev_resume();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
+ Power_up_devices:
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 Enable_irqs:
@@ -335,6 +343,7 @@ static int resume_target_kernel(void)
 			"aborting resume\n");
 		goto Enable_irqs;
 	}
+	sysdev_suspend(PMSG_QUIESCE);
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
 	error = restore_highmem();
@@ -357,6 +366,7 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
+	sysdev_resume();
 	device_power_up(PMSG_RECOVER);
 Enable_irqs:
 	local_irq_enable();
@@ -440,6 +450,7 @@ int hibernation_platform_enter(void)
 	local_irq_disable();
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
+		sysdev_suspend(PMSG_HIBERNATE);
 		hibernation_ops->enter();
 		/* We should never get here */
 		while (1);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b4d219016b6..c9632f841f6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -298,8 +298,12 @@ static int suspend_enter(suspend_state_t state)
 		goto Done;
 	}
 
-	if (!suspend_test(TEST_CORE))
-		error = suspend_ops->enter(state);
+	error = sysdev_suspend(PMSG_SUSPEND);
+	if (!error) {
+		if (!suspend_test(TEST_CORE))
+			error = suspend_ops->enter(state);
+		sysdev_resume();
+	}
 
 	device_power_up(PMSG_RESUME);
 Done:
--
cgit v1.2.3-70-g09d2
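[Editorial aside, not part of the patch series: the calling convention the last patch establishes, condensed into one schematic function. Callers now pair sysdev_suspend()/sysdev_resume() themselves, bracketed by device_power_down()/device_power_up(). example_enter_sleep() below is illustrative only and modelled loosely on suspend_enter().]

	static int example_enter_sleep(pm_message_t state)
	{
		int error;

		error = device_power_down(state);	/* late driver suspend, irqs off */
		if (error)
			return error;

		error = sysdev_suspend(state);		/* system devices: now the caller's job */
		if (!error) {
			/* ... enter the platform sleep state here ... */
			sysdev_resume();
		}

		device_power_up(PMSG_RESUME);
		return error;
	}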