From 15d8791cae75dca27bfda8ecfe87dca9379d6bb0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 16 Feb 2012 09:15:04 -0800
Subject: i387: fix x86-64 preemption-unsafe user stack save/restore

Commit 5b1cbac37798 ("i387: make irq_fpu_usable() tests more robust")
added a sanity check to the #NM handler to verify that we never cause
the "Device Not Available" exception in kernel mode.

However, that check actually pinpointed a (fundamental) race where we do
cause that exception as part of the signal stack FPU state save/restore
code.

Because we use the floating point instructions themselves to save and
restore state directly from user mode, we cannot do that atomically with
testing the TS_USEDFPU bit: the user mode access itself may cause a page
fault, which causes a task switch, which saves and restores the FP/MMX
state from the kernel buffers.

This kind of "recursive" FP state save is fine per se, but it means that
when the signal stack save/restore gets restarted, it will now take the
'#NM' exception we originally tried to avoid.  With preemption this can
happen even without the page fault - but because of the user access, we
cannot just disable preemption around the save/restore instruction.

There are various ways to solve this, including using the
"enable/disable_page_fault()" helpers to not allow page faults at all
during the sequence, and fall back to copying things by hand without the
use of the native FP state save/restore instructions.

However, the simplest thing to do is to just allow the #NM from kernel
space, but fix the race in setting and clearing CR0.TS that this all
exposed: the TS bit changes and the TS_USEDFPU bit absolutely have to be
atomic wrt scheduling, so while the actual state save/restore can be
interrupted and restarted, the act of actually clearing/setting CR0.TS
and the TS_USEDFPU bit together must not.

Instead of just adding random "preempt_disable/enable()" calls to what
is already excessively ugly code, this introduces some helper functions
that mostly mirror the "kernel_fpu_begin/end()" functionality, just for
the user state instead.

Those helper functions should probably eventually replace the other
ad-hoc CR0.TS and TS_USEDFPU tests too, but I'll need to think about it
some more: the task switching functionality in particular needs to
expose the difference between the 'prev' and 'next' threads, while the
new helper functions intentionally were written to only work with
'current'.
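The arch/x86/include/asm/i387.h side of this patch falls outside this
xsave.c-limited view, but the new pair can be pieced together from the
'-' lines of the follow-up diff below. A sketch of what this commit adds
(not the verbatim patch):

	static inline int user_has_fpu(void)
	{
		return current_thread_info()->status & TS_USEDFPU;
	}

	static inline void user_fpu_begin(void)
	{
		preempt_disable();
		if (!user_has_fpu()) {
			clts();		/* clear CR0.TS: FP insns won't fault */
			current_thread_info()->status |= TS_USEDFPU;
		}
		preempt_enable();
	}

	static inline void user_fpu_end(void)
	{
		preempt_disable();
		current_thread_info()->status &= ~TS_USEDFPU;
		stts();			/* set CR0.TS: next FP use traps #NM */
		preempt_enable();
	}

Because the TS_USEDFPU update and the CR0.TS change sit inside one
preempt_disable()/preempt_enable() pair, a task switch can never observe
the two out of sync, which is exactly the race described above.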
Signed-off-by: Linus Torvalds
---
 arch/x86/kernel/xsave.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kernel/xsave.c')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a3911343976..86f1f09a738 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -168,7 +168,7 @@ int save_i387_xstate(void __user *buf)
 	if (!used_math())
 		return 0;
 
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+	if (user_has_fpu()) {
 		if (use_xsave())
 			err = xsave_user(buf);
 		else
@@ -176,8 +176,7 @@
 
 		if (err)
 			return err;
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
+		user_fpu_end();
 	} else {
 		sanitize_i387_state(tsk);
 		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
@@ -292,10 +291,7 @@ int restore_i387_xstate(void __user *buf)
 			return err;
 	}
 
-	if (!(task_thread_info(current)->status & TS_USEDFPU)) {
-		clts();
-		task_thread_info(current)->status |= TS_USEDFPU;
-	}
+	user_fpu_begin();
 	if (use_xsave())
 		err = restore_user_xstate(buf);
 	else
--
cgit v1.2.3-70-g09d2


From 6d59d7a9f5b723a7ac1925c136e93ec83c0c3043 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Thu, 16 Feb 2012 13:33:12 -0800
Subject: i387: don't ever touch TS_USEDFPU directly, use helper functions

This creates three helper functions that do the TS_USEDFPU accesses, and
makes everybody that used to do it by hand use those helpers instead.

In addition, there's a couple of helper functions for the "change both
CR0.TS and TS_USEDFPU at the same time" case, and the places that do
that together have been changed to use those.  That means that we have
fewer random places that open-code this situation.

The intent is partly to clarify the code without actually changing any
semantics yet (since we clearly still have some hard to reproduce bug in
this area), but also to make it much easier to use another approach
entirely to caching the CR0.TS bit for software accesses.

Right now we use a bit in the thread-info 'status' variable (this patch
does not change that), but we might want to make it a full field of its
own or even make it a per-cpu variable.

Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/i387.h | 75 +++++++++++++++++++++++++++++++++------------
 arch/x86/kernel/traps.c     |  2 +-
 arch/x86/kernel/xsave.c     |  2 +-
 arch/x86/kvm/vmx.c          |  2 +-
 4 files changed, 58 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kernel/xsave.c')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 1e12c2d087e..548b2c07ac9 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -279,6 +279,47 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
 	return fpu_restore_checking(&tsk->thread.fpu);
 }
 
+/*
+ * Software FPU state helpers. Careful: these need to
+ * be preemption protection *and* they need to be
+ * properly paired with the CR0.TS changes!
+ */
+static inline int __thread_has_fpu(struct thread_info *ti)
+{
+	return ti->status & TS_USEDFPU;
+}
+
+/* Must be paired with an 'stts' after! */
+static inline void __thread_clear_has_fpu(struct thread_info *ti)
+{
+	ti->status &= ~TS_USEDFPU;
+}
+
+/* Must be paired with a 'clts' before! */
+static inline void __thread_set_has_fpu(struct thread_info *ti)
+{
+	ti->status |= TS_USEDFPU;
+}
+
+/*
+ * Encapsulate the CR0.TS handling together with the
+ * software flag.
+ *
+ * These generally need preemption protection to work,
+ * do try to avoid using these on their own.
+ */
+static inline void __thread_fpu_end(struct thread_info *ti)
+{
+	__thread_clear_has_fpu(ti);
+	stts();
+}
+
+static inline void __thread_fpu_begin(struct thread_info *ti)
+{
+	clts();
+	__thread_set_has_fpu(ti);
+}
+
 /*
  * Signal frame handlers...
  */
@@ -287,23 +328,21 @@ extern int restore_i387_xstate(void __user *buf);
 
 static inline void __unlazy_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+	if (__thread_has_fpu(task_thread_info(tsk))) {
 		__save_init_fpu(tsk);
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
+		__thread_fpu_end(task_thread_info(tsk));
 	} else
 		tsk->fpu_counter = 0;
 }
 
 static inline void __clear_fpu(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+	if (__thread_has_fpu(task_thread_info(tsk))) {
 		/* Ignore delayed exceptions from user space */
 		asm volatile("1: fwait\n"
 			     "2:\n"
 			     _ASM_EXTABLE(1b, 2b));
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
+		__thread_fpu_end(task_thread_info(tsk));
 	}
 }
 
@@ -311,14 +350,14 @@ static inline void __clear_fpu(struct task_struct *tsk)
  * Were we in an interrupt that interrupted kernel mode?
  *
  * We can do a kernel_fpu_begin/end() pair *ONLY* if that
- * pair does nothing at all: TS_USEDFPU must be clear (so
+ * pair does nothing at all: the thread must not have fpu (so
  * that we don't try to save the FPU state), and TS must
  * be set (so that the clts/stts pair does nothing that is
  * visible in the interrupted kernel thread).
  */
 static inline bool interrupted_kernel_fpu_idle(void)
 {
-	return !(current_thread_info()->status & TS_USEDFPU) &&
+	return !__thread_has_fpu(current_thread_info()) &&
 	       (read_cr0() & X86_CR0_TS);
 }
 
@@ -356,9 +395,9 @@ static inline void kernel_fpu_begin(void)
 
 	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
-	if (me->status & TS_USEDFPU) {
+	if (__thread_has_fpu(me)) {
 		__save_init_fpu(me->task);
-		me->status &= ~TS_USEDFPU;
+		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
 	} else
 		clts();
@@ -422,24 +461,21 @@ static inline void irq_ts_restore(int TS_state)
  */
 static inline int user_has_fpu(void)
 {
-	return current_thread_info()->status & TS_USEDFPU;
+	return __thread_has_fpu(current_thread_info());
 }
 
 static inline void user_fpu_end(void)
 {
 	preempt_disable();
-	current_thread_info()->status &= ~TS_USEDFPU;
-	stts();
+	__thread_fpu_end(current_thread_info());
 	preempt_enable();
 }
 
 static inline void user_fpu_begin(void)
 {
 	preempt_disable();
-	if (!user_has_fpu()) {
-		clts();
-		current_thread_info()->status |= TS_USEDFPU;
-	}
+	if (!user_has_fpu())
+		__thread_fpu_begin(current_thread_info());
 	preempt_enable();
 }
 
@@ -448,11 +484,10 @@ static inline void user_fpu_begin(void)
  */
 static inline void save_init_fpu(struct task_struct *tsk)
 {
-	WARN_ON_ONCE(!(task_thread_info(tsk)->status & TS_USEDFPU));
+	WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk)));
 	preempt_disable();
 	__save_init_fpu(tsk);
-	task_thread_info(tsk)->status &= ~TS_USEDFPU;
-	stts();
+	__thread_fpu_end(task_thread_info(tsk));
 	preempt_enable();
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 982433b5da3..fc676e44c77 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -588,7 +588,7 @@ void __math_state_restore(void)
 		return;
 	}
 
-	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
+	__thread_set_has_fpu(thread);	/* clts in caller! */
 
 	tsk->fpu_counter++;
 }
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 86f1f09a738..a0bcd0dbc95 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
 	if (!fx)
 		return;
 
-	BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
+	BUG_ON(__thread_has_fpu(task_thread_info(tsk)));
 
 	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index d29216c462b..36091dd04b4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-	if (current_thread_info()->status & TS_USEDFPU)
+	if (__thread_has_fpu(current_thread_info()))
 		clts();
 	load_gdt(&__get_cpu_var(host_gdt));
 }
--
cgit v1.2.3-70-g09d2


From f94edacf998516ac9d849f7bc6949a703977a7f3 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 17 Feb 2012 21:48:54 -0800
Subject: i387: move TS_USEDFPU flag from thread_info to task_struct

This moves the bit that indicates whether a thread has ownership of the
FPU from the TS_USEDFPU bit in thread_info->status to a word of its own
(called 'has_fpu') in task_struct->thread.has_fpu.

This fixes two independent bugs at the same time:

 - changing 'thread_info->status' from the scheduler causes nasty
   problems for the other users of that variable, since it is defined
   to be thread-synchronous (that's what the "TS_" part of the naming
   was supposed to indicate).

   So perfectly valid code could (and did) do

	ti->status |= TS_RESTORE_SIGMASK;

   and the compiler was free to do that as separate load, or and store
   instructions.  Which can cause problems with preemption, since a task
   switch could happen in between, and change the TS_USEDFPU bit.  The
   change to TS_USEDFPU would be overwritten by the final store.

   In practice, this seldom happened, though, because the 'status' field
   was seldom used more than once, so gcc would generally tend to
   generate code that used a read-modify-write instruction and thus
   happened to avoid this problem - RMW instructions are naturally low
   fat and preemption-safe.

 - On x86-32, the current_thread_info() pointer would, during interrupts
   and softirqs, point to a *copy* of the real thread_info, because
   x86-32 uses %esp to calculate the thread_info address, and thus the
   separate irq (and softirq) stacks would cause these kinds of odd
   thread_info copy aliases.

   This is normally not a problem, since interrupts aren't supposed to
   look at thread information anyway (what thread is running at
   interrupt time really isn't very well-defined), but it confused the
   heck out of irq_fpu_usable() and the code that tried to squirrel
   away the FPU state.

   (It also caused untold confusion for us poor kernel developers).

It also turns out that using 'task_struct' is actually much more natural
for most of the call sites that care about the FPU state, since they tend
to work with the task struct for other reasons anyway (ie scheduling).
And the FPU data that we are going to save/restore is found there too.

Thanks to Arjan Van De Ven for pointing us to the %esp issue.
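To make the first bug concrete: if the compiler expands the OR as three
separate instructions, there is a preemption window between the load and
the store. An illustrative expansion only, not code from the patch:

	unsigned long tmp;

	tmp = ti->status;		/* load: TS_USEDFPU happens to be set */
	tmp |= TS_RESTORE_SIGMASK;	/* modify, in a register */
	/* preempted here: switch_to() clears TS_USEDFPU in memory */
	ti->status = tmp;		/* store: stale TS_USEDFPU comes back */

A single memory-operand RMW instruction cannot be split by the interrupt
that drives preemption, so it has no such window - which is why gcc's
usual code generation made the bug so rare in practice.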
Cc: Arjan van de Ven
Reported-and-tested-by: Raphael Prevost
Acked-and-tested-by: Suresh Siddha
Tested-by: Peter Anvin
Signed-off-by: Linus Torvalds
---
 arch/x86/include/asm/i387.h        | 44 +++++++++++++++++++-------------------
 arch/x86/include/asm/processor.h   |  1 +
 arch/x86/include/asm/thread_info.h |  2 --
 arch/x86/kernel/traps.c            | 11 +++++-----
 arch/x86/kernel/xsave.c            |  2 +-
 arch/x86/kvm/vmx.c                 |  2 +-
 6 files changed, 30 insertions(+), 32 deletions(-)

(limited to 'arch/x86/kernel/xsave.c')

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 01b115d8677..f5376676f89 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -264,21 +264,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
  * be preemption protection *and* they need to be
  * properly paired with the CR0.TS changes!
  */
-static inline int __thread_has_fpu(struct thread_info *ti)
+static inline int __thread_has_fpu(struct task_struct *tsk)
 {
-	return ti->status & TS_USEDFPU;
+	return tsk->thread.has_fpu;
 }
 
 /* Must be paired with an 'stts' after! */
-static inline void __thread_clear_has_fpu(struct thread_info *ti)
+static inline void __thread_clear_has_fpu(struct task_struct *tsk)
 {
-	ti->status &= ~TS_USEDFPU;
+	tsk->thread.has_fpu = 0;
 }
 
 /* Must be paired with a 'clts' before! */
-static inline void __thread_set_has_fpu(struct thread_info *ti)
+static inline void __thread_set_has_fpu(struct task_struct *tsk)
 {
-	ti->status |= TS_USEDFPU;
+	tsk->thread.has_fpu = 1;
 }
 
 /*
@@ -288,16 +288,16 @@ static inline void __thread_set_has_fpu(struct thread_info *ti)
  * These generally need preemption protection to work,
  * do try to avoid using these on their own.
  */
-static inline void __thread_fpu_end(struct thread_info *ti)
+static inline void __thread_fpu_end(struct task_struct *tsk)
 {
-	__thread_clear_has_fpu(ti);
+	__thread_clear_has_fpu(tsk);
 	stts();
 }
 
-static inline void __thread_fpu_begin(struct thread_info *ti)
+static inline void __thread_fpu_begin(struct task_struct *tsk)
 {
 	clts();
-	__thread_set_has_fpu(ti);
+	__thread_set_has_fpu(tsk);
 }
 
 /*
@@ -308,21 +308,21 @@ extern int restore_i387_xstate(void __user *buf);
 
 static inline void __unlazy_fpu(struct task_struct *tsk)
 {
-	if (__thread_has_fpu(task_thread_info(tsk))) {
+	if (__thread_has_fpu(tsk)) {
 		__save_init_fpu(tsk);
-		__thread_fpu_end(task_thread_info(tsk));
+		__thread_fpu_end(tsk);
 	} else
 		tsk->fpu_counter = 0;
 }
 
 static inline void __clear_fpu(struct task_struct *tsk)
 {
-	if (__thread_has_fpu(task_thread_info(tsk))) {
+	if (__thread_has_fpu(tsk)) {
 		/* Ignore delayed exceptions from user space */
 		asm volatile("1: fwait\n"
 			     "2:\n"
 			     _ASM_EXTABLE(1b, 2b));
-		__thread_fpu_end(task_thread_info(tsk));
+		__thread_fpu_end(tsk);
 	}
 }
 
@@ -337,7 +337,7 @@ static inline void __clear_fpu(struct task_struct *tsk)
  */
 static inline bool interrupted_kernel_fpu_idle(void)
 {
-	return !__thread_has_fpu(current_thread_info()) &&
+	return !__thread_has_fpu(current) &&
 	       (read_cr0() & X86_CR0_TS);
 }
 
@@ -371,12 +371,12 @@
 
 static inline void kernel_fpu_begin(void)
 {
-	struct thread_info *me = current_thread_info();
+	struct task_struct *me = current;
 
 	WARN_ON_ONCE(!irq_fpu_usable());
 	preempt_disable();
 	if (__thread_has_fpu(me)) {
-		__save_init_fpu(me->task);
+		__save_init_fpu(me);
 		__thread_clear_has_fpu(me);
 		/* We do 'stts()' in kernel_fpu_end() */
 	} else
@@ -441,13 +441,13 @@ static inline void irq_ts_restore(int TS_state)
  */
 static inline int user_has_fpu(void)
 {
-	return __thread_has_fpu(current_thread_info());
+	return __thread_has_fpu(current);
 }
 
 static inline void user_fpu_end(void)
 {
 	preempt_disable();
-	__thread_fpu_end(current_thread_info());
+	__thread_fpu_end(current);
 	preempt_enable();
 }
 
@@ -455,7 +455,7 @@ static inline void user_fpu_begin(void)
 {
 	preempt_disable();
 	if (!user_has_fpu())
-		__thread_fpu_begin(current_thread_info());
+		__thread_fpu_begin(current);
 	preempt_enable();
 }
 
@@ -464,10 +464,10 @@ static inline void user_fpu_begin(void)
  */
 static inline void save_init_fpu(struct task_struct *tsk)
 {
-	WARN_ON_ONCE(!__thread_has_fpu(task_thread_info(tsk)));
+	WARN_ON_ONCE(!__thread_has_fpu(tsk));
 	preempt_disable();
 	__save_init_fpu(tsk);
-	__thread_fpu_end(task_thread_info(tsk));
+	__thread_fpu_end(tsk);
 	preempt_enable();
 }
 
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index aa9088c2693..f7c89e231c6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -454,6 +454,7 @@ struct thread_struct {
 	unsigned long		trap_no;
 	unsigned long		error_code;
 	/* floating point and extended processor state */
+	unsigned long		has_fpu;
 	struct fpu		fpu;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index bc817cd8b44..cfd8144d552 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -247,8 +247,6 @@ static inline struct thread_info *current_thread_info(void)
  * ever touches our thread-synchronous status, so we don't
  * have to worry about atomic accesses.
  */
-#define TS_USEDFPU		0x0001	/* FPU was used by this task
-					   this quantum (SMP) */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
 #define TS_POLLING		0x0004	/* idle task polling need_resched,
 					   skip sending interrupt */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4d42300dcd2..ad25e51f40c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -582,12 +582,11 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
  */
 void math_state_restore(void)
 {
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = thread->task;
+	struct task_struct *tsk = current;
 
 	/* We need a safe address that is cheap to find and that is already
-	   in L1. We just brought in "thread->task", so use that */
-#define safe_address (thread->task)
+	   in L1. We're just bringing in "tsk->thread.has_fpu", so use that */
+#define safe_address (tsk->thread.has_fpu)
 
 	if (!tsk_used_math(tsk)) {
 		local_irq_enable();
@@ -604,7 +603,7 @@ void math_state_restore(void)
 		local_irq_disable();
 	}
 
-	__thread_fpu_begin(thread);
+	__thread_fpu_begin(tsk);
 
 	/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is
 	   pending. Clear the x87 state here by setting it to fixed
@@ -620,7 +619,7 @@
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
 	if (unlikely(restore_fpu_checking(tsk))) {
-		__thread_fpu_end(thread);
+		__thread_fpu_end(tsk);
 		force_sig(SIGSEGV, tsk);
 		return;
 	}
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a0bcd0dbc95..71109111411 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -47,7 +47,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
 	if (!fx)
 		return;
 
-	BUG_ON(__thread_has_fpu(task_thread_info(tsk)));
+	BUG_ON(__thread_has_fpu(tsk));
 
 	xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 36091dd04b4..3b4c8d8ad90 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1457,7 +1457,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
-	if (__thread_has_fpu(current_thread_info()))
+	if (__thread_has_fpu(current))
 		clts();
 	load_gdt(&__get_cpu_var(host_gdt));
 }
--
cgit v1.2.3-70-g09d2
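For reference on the second bug above: the x86-32 current_thread_info()
of this era derived the pointer from the stack, roughly like this (a
simplified sketch of the then-current arch/x86/include/asm/thread_info.h):

	register unsigned long current_stack_pointer asm("esp") __used;

	static inline struct thread_info *current_thread_info(void)
	{
		/* thread_info sits at the base of the current stack */
		return (struct thread_info *)
			(current_stack_pointer & ~(THREAD_SIZE - 1));
	}

On a dedicated irq or softirq stack, masking %esp finds the thread_info
copy at the base of *that* stack rather than the interrupted task's real
one, so any status bits touched from interrupt context could alias the
wrong structure. The task_struct, reached through the per-cpu 'current'
pointer, has no such aliasing, which is what makes 'has_fpu' safe for
irq_fpu_usable() and the FPU save/restore paths to look at.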