From 6dbde3530850d4d8bfc1b6bd4006d92786a2787f Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@elte.hu> Date: Thu, 15 Jan 2009 22:15:53 +0900 Subject: percpu: add optimized generic percpu accessors It is an optimization and a cleanup, and adds the following new generic percpu methods: percpu_read() percpu_write() percpu_add() percpu_sub() percpu_and() percpu_or() percpu_xor() and implements support for them on x86. (other architectures will fall back to a default implementation) The advantage is that for example to read a local percpu variable, instead of this sequence: return __get_cpu_var(var); ffffffff8102ca2b: 48 8b 14 fd 80 09 74 mov -0x7e8bf680(,%rdi,8),%rdx ffffffff8102ca32: 81 ffffffff8102ca33: 48 c7 c0 d8 59 00 00 mov $0x59d8,%rax ffffffff8102ca3a: 48 8b 04 10 mov (%rax,%rdx,1),%rax We can get a single instruction by using the optimized variants: return percpu_read(var); ffffffff8102ca3f: 65 48 8b 05 91 8f fd mov %gs:0x7efd8f91(%rip),%rax I also cleaned up the x86-specific APIs and made the x86 code use these new generic percpu primitives. tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out * added percpu_and() for completeness's sake * made generic percpu ops atomic against preemption Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Tejun Heo <tj@kernel.org> --- arch/x86/kernel/process_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a546f55c77b..77d546817d9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -591,7 +591,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) loadsegment(gs, next->gs); - x86_write_percpu(current_task, next_p); + percpu_write(current_task, next_p); return prev_p; } -- cgit v1.2.3-70-g09d2 From ea9279066de44053d0c20ea855bc9f4706652d84 Mon Sep 17 00:00:00 2001 From: Brian Gerst <brgerst@gmail.com> Date: Mon, 19 Jan 2009 00:38:58 +0900 Subject: x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit. tj: moved cpu_number definition out of CONFIG_HAVE_SETUP_PER_CPU_AREA for voyager. Signed-off-by: Brian Gerst <brgerst@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- arch/x86/include/asm/pda.h | 2 +- arch/x86/include/asm/smp.h | 4 +--- arch/x86/kernel/asm-offsets_64.c | 1 - arch/x86/kernel/cpu/common.c | 1 - arch/x86/kernel/process_32.c | 3 --- arch/x86/kernel/setup_percpu.c | 10 ++++++++++ arch/x86/kernel/smpcommon.c | 2 -- 7 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h index 09965f7a216..668d5a5b6f7 100644 --- a/arch/x86/include/asm/pda.h +++ b/arch/x86/include/asm/pda.h @@ -16,7 +16,7 @@ struct x8664_pda { unsigned long kernelstack; /* 16 top of kernel stack for current */ unsigned long oldrsp; /* 24 user rsp for system call */ int irqcount; /* 32 Irq nesting counter. Starts -1 */ - unsigned int cpunumber; /* 36 Logical CPU number */ + unsigned int unused6; /* 36 was cpunumber */ #ifdef CONFIG_CC_STACKPROTECTOR unsigned long stack_canary; /* 40 stack canary value */ /* gcc-ABI: this canary MUST be at diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index c7bbbbe65d3..68636e767a9 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -25,9 +25,7 @@ extern unsigned int num_processors; DECLARE_PER_CPU(cpumask_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); -#ifdef CONFIG_X86_32 DECLARE_PER_CPU(int, cpu_number); -#endif static inline struct cpumask *cpu_sibling_mask(int cpu) { @@ -164,7 +162,7 @@ extern unsigned disabled_cpus __cpuinitdata; extern int safe_smp_processor_id(void); #elif defined(CONFIG_X86_64_SMP) -#define raw_smp_processor_id() read_pda(cpunumber) +#define raw_smp_processor_id() (percpu_read(cpu_number)) #define stack_smp_processor_id() \ ({ \ diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 5b821fbdaf7..cae6697c099 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -53,7 +53,6 @@ int main(void) ENTRY(oldrsp); ENTRY(pcurrent); ENTRY(irqcount); - ENTRY(cpunumber); DEFINE(pda_size, sizeof(struct x8664_pda)); BLANK(); #undef ENTRY diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b6d7eec0be7..4221e920886 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -899,7 +899,6 @@ void __cpuinit pda_init(int cpu) load_pda_offset(cpu); - pda->cpunumber = cpu; pda->irqcount = -1; pda->kernelstack = (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 77d546817d9..2c00a57ccb9 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(int, cpu_number); -EXPORT_PER_CPU_SYMBOL(cpu_number); - /* * Return saved PC of a blocked thread. */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 8b53ef83c61..258497f93f4 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -22,6 +22,15 @@ # define DBG(x...) #endif +/* + * Could be inside CONFIG_HAVE_SETUP_PER_CPU_AREA with other stuff but + * voyager wants cpu_number too. + */ +#ifdef CONFIG_SMP +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); +#endif + #ifdef CONFIG_X86_LOCAL_APIC unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; @@ -193,6 +202,7 @@ void __init setup_per_cpu_areas(void) memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start); per_cpu_offset(cpu) = ptr - __per_cpu_start; per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); + per_cpu(cpu_number, cpu) = cpu; #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = (char *)per_cpu(irq_stack, cpu) + IRQ_STACK_SIZE - 64; diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 7e157810062..add36b4e37c 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -28,7 +28,5 @@ __cpuinit void init_gdt(int cpu) write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); - - per_cpu(cpu_number, cpu) = cpu; } #endif -- cgit v1.2.3-70-g09d2 From 03d2989df9c1c7df5b33c7a87e0790465461836a Mon Sep 17 00:00:00 2001 From: Brian Gerst <brgerst@gmail.com> Date: Fri, 23 Jan 2009 11:03:28 +0900 Subject: x86: remove idle_timestamp from 32bit irq_cpustat_t Impact: bogus irq_cpustat field removed idle_timestamp is left over from the removed irqbalance code. Signed-off-by: Brian Gerst <brgerst@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- arch/x86/include/asm/hardirq_32.h | 1 - arch/x86/kernel/process_32.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index d4b5d731073..a70ed050fde 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -6,7 +6,6 @@ typedef struct { unsigned int __softirq_pending; - unsigned long idle_timestamp; unsigned int __nmi_count; /* arch dependent */ unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq0_irqs; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 2c00a57ccb9..1a1ae8edc40 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -108,7 +108,6 @@ void cpu_idle(void) play_dead(); local_irq_disable(); - __get_cpu_var(irq_stat).idle_timestamp = jiffies; /* Don't trace irqs off for idle */ stop_critical_timings(); pm_idle(); -- cgit v1.2.3-70-g09d2 From d9a89a26e02ef9ed03f74a755a8b4d8f3a066622 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 9 Feb 2009 22:17:40 +0900 Subject: x86: add %gs accessors for x86_32 Impact: cleanup On x86_32, %gs is handled lazily. It's not saved and restored on kernel entry/exit but only when necessary which usually is during task switch but there are few other places. Currently, it's done by calling savesegment() and loadsegment() explicitly. Define get_user_gs(), set_user_gs() and task_user_gs() and use them instead. While at it, clean up register access macros in signal.c. This cleans up code a bit and will help future changes. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/include/asm/a.out-core.h | 2 +- arch/x86/include/asm/elf.h | 2 +- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/system.h | 9 +++++++++ arch/x86/kernel/process_32.c | 6 +++--- arch/x86/kernel/ptrace.c | 14 ++++++------- arch/x86/kernel/signal.c | 41 +++++++++++++++----------------------- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/math-emu/get_address.c | 6 ++---- 9 files changed, 41 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h index 3c601f8224b..bb70e397aa8 100644 --- a/arch/x86/include/asm/a.out-core.h +++ b/arch/x86/include/asm/a.out-core.h @@ -55,7 +55,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) dump->regs.ds = (u16)regs->ds; dump->regs.es = (u16)regs->es; dump->regs.fs = (u16)regs->fs; - savesegment(gs, dump->regs.gs); + dump->regs.gs = get_user_gs(regs); dump->regs.orig_ax = regs->orig_ax; dump->regs.ip = regs->ip; dump->regs.cs = (u16)regs->cs; diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f51a3ddde01..39b0aac1675 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -124,7 +124,7 @@ do { \ pr_reg[7] = regs->ds & 0xffff; \ pr_reg[8] = regs->es & 0xffff; \ pr_reg[9] = regs->fs & 0xffff; \ - savesegment(gs, pr_reg[10]); \ + pr_reg[10] = get_user_gs(regs); \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs & 0xffff; \ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 52948df9cd1..4955165682c 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -79,7 +79,7 @@ do { \ #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ - loadsegment(gs, 0); \ + set_user_gs(task_pt_regs(tsk), 0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2fcc70bc85f..70c74b8db87 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -182,6 +182,15 @@ extern void native_load_gs_index(unsigned); #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#endif + static inline unsigned long get_limit(unsigned long segment) { unsigned long __limit; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 1a1ae8edc40..d58a340e1be 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -131,7 +131,7 @@ void __show_regs(struct pt_regs *regs, int all) if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; - savesegment(gs, gs); + gs = get_user_gs(regs); } else { sp = (unsigned long) (®s->sp); savesegment(ss, ss); @@ -304,7 +304,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs, p->thread.gs); + task_user_gs(p) = get_user_gs(regs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -342,7 +342,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { - __asm__("movl %0, %%gs" : : "r"(0)); + set_user_gs(regs, 0); regs->fs = 0; set_fs(USER_DS); regs->ds = __USER_DS; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a5df5f82fb..508b6b57d0c 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -90,9 +90,10 @@ static u16 get_segment_reg(struct task_struct *task, unsigned long offset) if (offset != offsetof(struct user_regs_struct, gs)) retval = *pt_regs_access(task_pt_regs(task), offset); else { - retval = task->thread.gs; if (task == current) - savesegment(gs, retval); + retval = get_user_gs(task_pt_regs(task)); + else + retval = task_user_gs(task); } return retval; } @@ -126,13 +127,10 @@ static int set_segment_reg(struct task_struct *task, break; case offsetof(struct user_regs_struct, gs): - task->thread.gs = value; if (task == current) - /* - * The user-mode %gs is not affected by - * kernel entry, so we must update the CPU. - */ - loadsegment(gs, value); + set_user_gs(task_pt_regs(task), value); + else + task_user_gs(task) = value; } return 0; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 7fc78b01981..8562387c75a 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -50,27 +50,23 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -#define COPY(x) { \ - get_user_ex(regs->x, &sc->x); \ -} +#define COPY(x) do { \ + get_user_ex(regs->x, &sc->x); \ +} while (0) -#define COPY_SEG(seg) { \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - regs->seg = tmp; \ -} +#define GET_SEG(seg) ({ \ + unsigned short tmp; \ + get_user_ex(tmp, &sc->seg); \ + tmp; \ +}) -#define COPY_SEG_CPL3(seg) { \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - regs->seg = tmp | 3; \ -} +#define COPY_SEG(seg) do { \ + regs->seg = GET_SEG(seg); \ +} while (0) -#define GET_SEG(seg) { \ - unsigned short tmp; \ - get_user_ex(tmp, &sc->seg); \ - loadsegment(seg, tmp); \ -} +#define COPY_SEG_CPL3(seg) do { \ + regs->seg = GET_SEG(seg) | 3; \ +} while (0) static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, @@ -86,7 +82,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, get_user_try { #ifdef CONFIG_X86_32 - GET_SEG(gs); + set_user_gs(regs, GET_SEG(gs)); COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); @@ -138,12 +134,7 @@ setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_try { #ifdef CONFIG_X86_32 - { - unsigned int tmp; - - savesegment(gs, tmp); - put_user_ex(tmp, (unsigned int __user *)&sc->gs); - } + put_user_ex(get_user_gs(regs), (unsigned int __user *)&sc->gs); put_user_ex(regs->fs, (unsigned int __user *)&sc->fs); put_user_ex(regs->es, (unsigned int __user *)&sc->es); put_user_ex(regs->ds, (unsigned int __user *)&sc->ds); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 4eeb5cf9720..55ea30d2a3d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -158,7 +158,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) ret = KVM86->regs32; ret->fs = current->thread.saved_fs; - loadsegment(gs, current->thread.saved_gs); + set_user_gs(ret, current->thread.saved_gs); return ret; } @@ -323,7 +323,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs32->ax = 0; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; - savesegment(gs, tsk->thread.saved_gs); + tsk->thread.saved_gs = get_user_gs(info->regs32); tss = &per_cpu(init_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 420b3b6e391..6ef5e99380f 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -150,11 +150,9 @@ static long pm_address(u_char FPU_modrm, u_char segment, #endif /* PARANOID */ switch (segment) { - /* gs isn't used by the kernel, so it still has its - user-space value. */ case PREFIX_GS_ - 1: - /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ - savesegment(gs, addr->selector); + /* user gs handling can be lazy, use special accessors */ + addr->selector = get_user_gs(FPU_info->regs); break; default: addr->selector = PM_REG_(segment); -- cgit v1.2.3-70-g09d2 From ccbeed3a05908d201b47b6c3dd1a373138bba566 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 9 Feb 2009 22:17:40 +0900 Subject: x86: make lazy %gs optional on x86_32 Impact: pt_regs changed, lazy gs handling made optional, add slight overhead to SAVE_ALL, simplifies error_code path a bit On x86_32, %gs hasn't been used by kernel and handled lazily. pt_regs doesn't have place for it and gs is saved/loaded only when necessary. In preparation for stack protector support, this patch makes lazy %gs handling optional by doing the followings. * Add CONFIG_X86_32_LAZY_GS and place for gs in pt_regs. * Save and restore %gs along with other registers in entry_32.S unless LAZY_GS. Note that this unfortunately adds "pushl $0" on SAVE_ALL even when LAZY_GS. However, it adds no overhead to common exit path and simplifies entry path with error code. * Define different user_gs accessors depending on LAZY_GS and add lazy_save_gs() and lazy_load_gs() which are noop if !LAZY_GS. The lazy_*_gs() ops are used to save, load and clear %gs lazily. * Define ELF_CORE_COPY_KERNEL_REGS() which always read %gs directly. xen and lguest changes need to be verified. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Jeremy Fitzhardinge <jeremy@xensource.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 4 ++ arch/x86/include/asm/elf.h | 15 ++++- arch/x86/include/asm/mmu_context.h | 2 +- arch/x86/include/asm/ptrace.h | 4 +- arch/x86/include/asm/system.h | 12 +++- arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/entry_32.S | 132 +++++++++++++++++++++++++++++++------ arch/x86/kernel/process_32.c | 4 +- arch/x86/kernel/ptrace.c | 5 +- arch/x86/lguest/boot.c | 2 +- arch/x86/xen/enlighten.c | 17 ++--- 11 files changed, 158 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c8e353c112..5bcdede71ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -207,6 +207,10 @@ config X86_TRAMPOLINE depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) default y +config X86_32_LAZY_GS + def_bool y + depends on X86_32 + config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 39b0aac1675..83c1bc8d2e8 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -112,7 +112,7 @@ extern unsigned int vdso_enabled; * now struct_user_regs, they are different) */ -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ +#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \ do { \ pr_reg[0] = regs->bx; \ pr_reg[1] = regs->cx; \ @@ -124,7 +124,6 @@ do { \ pr_reg[7] = regs->ds & 0xffff; \ pr_reg[8] = regs->es & 0xffff; \ pr_reg[9] = regs->fs & 0xffff; \ - pr_reg[10] = get_user_gs(regs); \ pr_reg[11] = regs->orig_ax; \ pr_reg[12] = regs->ip; \ pr_reg[13] = regs->cs & 0xffff; \ @@ -133,6 +132,18 @@ do { \ pr_reg[16] = regs->ss & 0xffff; \ } while (0); +#define ELF_CORE_COPY_REGS(pr_reg, regs) \ +do { \ + ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ + pr_reg[10] = get_user_gs(regs); \ +} while (0); + +#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \ +do { \ + ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\ + savesegment(gs, pr_reg[10]); \ +} while (0); + #define ELF_PLATFORM (utsname()->machine) #define set_personality_64bit() do { } while (0) diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4955165682c..f923203dc39 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -79,7 +79,7 @@ do { \ #ifdef CONFIG_X86_32 #define deactivate_mm(tsk, mm) \ do { \ - set_user_gs(task_pt_regs(tsk), 0); \ + lazy_load_gs(0); \ } while (0) #else #define deactivate_mm(tsk, mm) \ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 6d34d954c22..e304b66abee 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -28,7 +28,7 @@ struct pt_regs { int xds; int xes; int xfs; - /* int gs; */ + int xgs; long orig_eax; long eip; int xcs; @@ -50,7 +50,7 @@ struct pt_regs { unsigned long ds; unsigned long es; unsigned long fs; - /* int gs; */ + unsigned long gs; unsigned long orig_ax; unsigned long ip; unsigned long cs; diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 70c74b8db87..79b98e5b96f 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -186,10 +186,20 @@ extern void native_load_gs_index(unsigned); * x86_32 user gs accessors. */ #ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS #define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) #define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) #define task_user_gs(tsk) ((tsk)->thread.gs) -#endif +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ static inline unsigned long get_limit(unsigned long segment) { diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ee4df08feee..fbf2f33e308 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -75,6 +75,7 @@ void foo(void) OFFSET(PT_DS, pt_regs, ds); OFFSET(PT_ES, pt_regs, es); OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_GS, pt_regs, gs); OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); OFFSET(PT_EIP, pt_regs, ip); OFFSET(PT_CS, pt_regs, cs); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c461925d3b6..82e6868bee4 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -30,12 +30,13 @@ * 1C(%esp) - %ds * 20(%esp) - %es * 24(%esp) - %fs - * 28(%esp) - orig_eax - * 2C(%esp) - %eip - * 30(%esp) - %cs - * 34(%esp) - %eflags - * 38(%esp) - %oldesp - * 3C(%esp) - %oldss + * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS + * 2C(%esp) - orig_eax + * 30(%esp) - %eip + * 34(%esp) - %cs + * 38(%esp) - %eflags + * 3C(%esp) - %oldesp + * 40(%esp) - %oldss * * "current" is in register %ebx during any slow entries. */ @@ -101,8 +102,99 @@ #define resume_userspace_sig resume_userspace #endif +/* + * User gs save/restore + * + * %gs is used for userland TLS and kernel only uses it for stack + * canary which is required to be at %gs:20 by gcc. Read the comment + * at the top of stackprotector.h for more info. + * + * Local labels 98 and 99 are used. + */ +#ifdef CONFIG_X86_32_LAZY_GS + + /* unfortunately push/pop can't be no-op */ +.macro PUSH_GS + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 +.endm +.macro POP_GS pop=0 + addl $(4 + \pop), %esp + CFI_ADJUST_CFA_OFFSET -(4 + \pop) +.endm +.macro POP_GS_EX +.endm + + /* all the rest are no-op */ +.macro PTGS_TO_GS +.endm +.macro PTGS_TO_GS_EX +.endm +.macro GS_TO_REG reg +.endm +.macro REG_TO_PTGS reg +.endm +.macro SET_KERNEL_GS reg +.endm + +#else /* CONFIG_X86_32_LAZY_GS */ + +.macro PUSH_GS + pushl %gs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET gs, 0*/ +.endm + +.macro POP_GS pop=0 +98: popl %gs + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_RESTORE gs*/ + .if \pop <> 0 + add $\pop, %esp + CFI_ADJUST_CFA_OFFSET -\pop + .endif +.endm +.macro POP_GS_EX +.pushsection .fixup, "ax" +99: movl $0, (%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro PTGS_TO_GS +98: mov PT_GS(%esp), %gs +.endm +.macro PTGS_TO_GS_EX +.pushsection .fixup, "ax" +99: movl $0, PT_GS(%esp) + jmp 98b +.section __ex_table, "a" + .align 4 + .long 98b, 99b +.popsection +.endm + +.macro GS_TO_REG reg + movl %gs, \reg + /*CFI_REGISTER gs, \reg*/ +.endm +.macro REG_TO_PTGS reg + movl \reg, PT_GS(%esp) + /*CFI_REL_OFFSET gs, PT_GS*/ +.endm +.macro SET_KERNEL_GS reg + xorl \reg, \reg + movl \reg, %gs +.endm + +#endif /* CONFIG_X86_32_LAZY_GS */ + .macro SAVE_ALL cld + PUSH_GS pushl %fs CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET fs, 0;*/ @@ -138,6 +230,7 @@ movl %edx, %es movl $(__KERNEL_PERCPU), %edx movl %edx, %fs + SET_KERNEL_GS %edx .endm .macro RESTORE_INT_REGS @@ -164,7 +257,7 @@ CFI_RESTORE eax .endm -.macro RESTORE_REGS +.macro RESTORE_REGS pop=0 RESTORE_INT_REGS 1: popl %ds CFI_ADJUST_CFA_OFFSET -4 @@ -175,6 +268,7 @@ 3: popl %fs CFI_ADJUST_CFA_OFFSET -4 /*CFI_RESTORE fs;*/ + POP_GS \pop .pushsection .fixup, "ax" 4: movl $0, (%esp) jmp 1b @@ -188,6 +282,7 @@ .long 2b, 5b .long 3b, 6b .popsection + POP_GS_EX .endm .macro RING0_INT_FRAME @@ -368,6 +463,7 @@ sysenter_exit: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs + PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT #ifdef CONFIG_AUDITSYSCALL @@ -416,6 +512,7 @@ sysexit_audit: .align 4 .long 1b,2b .popsection + PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) # system call handler stub @@ -458,8 +555,7 @@ restore_all: restore_nocheck: TRACE_IRQS_IRET restore_nocheck_notrace: - RESTORE_REGS - addl $4, %esp # skip orig_eax/error_code + RESTORE_REGS 4 # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 irq_return: INTERRUPT_RETURN @@ -1078,7 +1174,10 @@ ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: - /* the function address is in %fs's slot on the stack */ + /* the function address is in %gs's slot on the stack */ + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ pushl %es CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET es, 0*/ @@ -1107,20 +1206,15 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %fs - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET fs, 0*/ movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_FS(%esp), %edi # get the function address + GS_TO_REG %ecx + movl PT_GS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart - mov %ecx, PT_FS(%esp) - /*CFI_REL_OFFSET fs, ES*/ + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d58a340e1be..86122fa2a1b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -539,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * used %fs or %gs (it does not today), or if the kernel is * running inside of a hypervisor layer. */ - savesegment(gs, prev->gs); + lazy_save_gs(prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -585,7 +585,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Restore %gs if needed (which is common) */ if (prev->gs | next->gs) - loadsegment(gs, next->gs); + lazy_load_gs(next->gs); percpu_write(current_task, next_p); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 508b6b57d0c..7ec39ab37a2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value) static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); - regno >>= 2; - if (regno > FS) - --regno; - return ®s->bx + regno; + return ®s->bx + (regno >> 2); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 19e33b6cd59..da2e314f61b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -283,7 +283,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) /* There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear * the GS register here: if it's needed it'll be reloaded anyway. */ - loadsegment(gs, 0); + lazy_load_gs(0); lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 37230342c2c..95ff6a0e942 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -323,13 +323,14 @@ static void load_TLS_descriptor(struct thread_struct *t, static void xen_load_tls(struct thread_struct *t, unsigned int cpu) { /* - * XXX sleazy hack: If we're being called in a lazy-cpu zone, - * it means we're in a context switch, and %gs has just been - * saved. This means we can zero it out to prevent faults on - * exit from the hypervisor if the next process has no %gs. - * Either way, it has been saved, and the new value will get - * loaded properly. This will go away as soon as Xen has been - * modified to not save/restore %gs for normal hypercalls. + * XXX sleazy hack: If we're being called in a lazy-cpu zone + * and lazy gs handling is enabled, it means we're in a + * context switch, and %gs has just been saved. This means we + * can zero it out to prevent faults on exit from the + * hypervisor if the next process has no %gs. Either way, it + * has been saved, and the new value will get loaded properly. + * This will go away as soon as Xen has been modified to not + * save/restore %gs for normal hypercalls. * * On x86_64, this hack is not used for %gs, because gs points * to KERNEL_GS_BASE (and uses it for PDA references), so we @@ -341,7 +342,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) */ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { #ifdef CONFIG_X86_32 - loadsegment(gs, 0); + lazy_load_gs(0); #else loadsegment(fs, 0); #endif -- cgit v1.2.3-70-g09d2 From 60a5317ff0f42dd313094b88f809f63041568b08 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 9 Feb 2009 22:17:40 +0900 Subject: x86: implement x86_32 stack protector Impact: stack protector for x86_32 Implement stack protector for x86_32. GDT entry 28 is used for it. It's set to point to stack_canary-20 and have the length of 24 bytes. CONFIG_CC_STACKPROTECTOR turns off CONFIG_X86_32_LAZY_GS and sets %gs to the stack canary segment on entry. As %gs is otherwise unused by the kernel, the canary can be anywhere. It's defined as a percpu variable. x86_32 exception handlers take register frame on stack directly as struct pt_regs. With -fstack-protector turned on, gcc copies the whole structure after the stack canary and (of course) doesn't copy back on return thus losing all changed. For now, -fno-stack-protector is added to all files which contain those functions. We definitely need something better. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/Kconfig | 3 +- arch/x86/include/asm/processor.h | 4 ++ arch/x86/include/asm/segment.h | 9 ++- arch/x86/include/asm/stackprotector.h | 91 +++++++++++++++++++++++++++++-- arch/x86/include/asm/system.h | 21 +++++++ arch/x86/kernel/Makefile | 18 ++++++ arch/x86/kernel/cpu/common.c | 17 ++++-- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/head_32.S | 20 ++++++- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/setup_percpu.c | 2 + scripts/gcc-x86_32-has-stack-protector.sh | 8 +++ 12 files changed, 180 insertions(+), 16 deletions(-) create mode 100644 scripts/gcc-x86_32-has-stack-protector.sh (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5bcdede71ba..f760a22f95d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -209,7 +209,7 @@ config X86_TRAMPOLINE config X86_32_LAZY_GS def_bool y - depends on X86_32 + depends on X86_32 && !CC_STACKPROTECTOR config KTIME_SCALAR def_bool X86_32 @@ -1356,7 +1356,6 @@ config CC_STACKPROTECTOR_ALL config CC_STACKPROTECTOR bool "Enable -fstack-protector buffer overflow detection (EXPERIMENTAL)" - depends on X86_64 select CC_STACKPROTECTOR_ALL help This option turns on the -fstack-protector GCC feature. This diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 9763eb70013..5a947210425 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -396,7 +396,11 @@ DECLARE_PER_CPU(union irq_stack_union, irq_stack_union); DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_PER_CPU(char *, irq_stack_ptr); +#else /* X86_64 */ +#ifdef CONFIG_CC_STACKPROTECTOR +DECLARE_PER_CPU(unsigned long, stack_canary); #endif +#endif /* X86_64 */ extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int xstate_size; diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 1dc1b51ac62..14e0ed86a6f 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -61,7 +61,7 @@ * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - unused + * 28 - stack_canary-20 [ for stack protector ] * 29 - unused * 30 - unused * 31 - TSS for double fault handler @@ -95,6 +95,13 @@ #define __KERNEL_PERCPU 0 #endif +#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) +#ifdef CONFIG_CC_STACKPROTECTOR +#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) +#else +#define __KERNEL_STACK_CANARY 0 +#endif + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index ee275e9f48a..fa7e5bd6fbe 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -1,3 +1,35 @@ +/* + * GCC stack protector support. + * + * Stack protector works by putting predefined pattern at the start of + * the stack frame and verifying that it hasn't been overwritten when + * returning from the function. The pattern is called stack canary + * and unfortunately gcc requires it to be at a fixed offset from %gs. + * On x86_64, the offset is 40 bytes and on x86_32 20 bytes. x86_64 + * and x86_32 use segment registers differently and thus handles this + * requirement differently. + * + * On x86_64, %gs is shared by percpu area and stack canary. All + * percpu symbols are zero based and %gs points to the base of percpu + * area. The first occupant of the percpu area is always + * irq_stack_union which contains stack_canary at offset 40. Userland + * %gs is always saved and restored on kernel entry and exit using + * swapgs, so stack protector doesn't add any complexity there. + * + * On x86_32, it's slightly more complicated. As in x86_64, %gs is + * used for userland TLS. Unfortunately, some processors are much + * slower at loading segment registers with different value when + * entering and leaving the kernel, so the kernel uses %fs for percpu + * area and manages %gs lazily so that %gs is switched only when + * necessary, usually during task switch. + * + * As gcc requires the stack canary at %gs:20, %gs can't be managed + * lazily if stack protector is enabled, so the kernel saves and + * restores userland %gs on kernel entry and exit. This behavior is + * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in + * system.h to hide the details. + */ + #ifndef _ASM_STACKPROTECTOR_H #define _ASM_STACKPROTECTOR_H 1 @@ -6,8 +38,18 @@ #include <asm/tsc.h> #include <asm/processor.h> #include <asm/percpu.h> +#include <asm/system.h> +#include <asm/desc.h> #include <linux/random.h> +/* + * 24 byte read-only segment initializer for stack canary. Linker + * can't handle the address bit shifting. Address will be set in + * head_32 for boot CPU and setup_per_cpu_areas() for others. + */ +#define GDT_STACK_CANARY_INIT \ + [GDT_ENTRY_STACK_CANARY] = { { { 0x00000018, 0x00409000 } } }, + /* * Initialize the stackprotector canary value. * @@ -19,12 +61,9 @@ static __always_inline void boot_init_stack_canary(void) u64 canary; u64 tsc; - /* - * Build time only check to make sure the stack_canary is at - * offset 40 in the pda; this is a gcc ABI requirement - */ +#ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); - +#endif /* * We both use the random pool and the current TSC as a source * of randomness. The TSC only matters for very early init, @@ -36,7 +75,49 @@ static __always_inline void boot_init_stack_canary(void) canary += tsc + (tsc << 32UL); current->stack_canary = canary; +#ifdef CONFIG_X86_64 percpu_write(irq_stack_union.stack_canary, canary); +#else + percpu_write(stack_canary, canary); +#endif +} + +static inline void setup_stack_canary_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); + struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); + struct desc_struct desc; + + desc = gdt_table[GDT_ENTRY_STACK_CANARY]; + desc.base0 = canary & 0xffff; + desc.base1 = (canary >> 16) & 0xff; + desc.base2 = (canary >> 24) & 0xff; + write_gdt_entry(gdt_table, GDT_ENTRY_STACK_CANARY, &desc, DESCTYPE_S); +#endif +} + +static inline void load_stack_canary_segment(void) +{ +#ifdef CONFIG_X86_32 + asm("mov %0, %%gs" : : "r" (__KERNEL_STACK_CANARY) : "memory"); +#endif +} + +#else /* CC_STACKPROTECTOR */ + +#define GDT_STACK_CANARY_INIT + +/* dummy boot_init_stack_canary() is defined in linux/stackprotector.h */ + +static inline void setup_stack_canary_segment(int cpu) +{ } + +static inline void load_stack_canary_segment(void) +{ +#ifdef CONFIG_X86_32 + asm volatile ("mov %0, %%gs" : : "r" (0)); +#endif } #endif /* CC_STACKPROTECTOR */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 79b98e5b96f..2692ee8ef03 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -23,6 +23,22 @@ struct task_struct *__switch_to(struct task_struct *prev, #ifdef CONFIG_X86_32 +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl "__percpu_arg([current_task])",%%ebx\n\t" \ + "movl %P[task_canary](%%ebx),%%ebx\n\t" \ + "movl %%ebx,"__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (per_cpu_var(stack_canary)) +#define __switch_canary_iparam \ + , [current_task] "m" (per_cpu_var(current_task)) \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -46,6 +62,7 @@ do { \ "pushl %[next_ip]\n\t" /* restore EIP */ \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ + __switch_canary \ "popl %%ebp\n\t" /* restore EBP */ \ "popfl\n" /* restore flags */ \ \ @@ -58,6 +75,8 @@ do { \ "=b" (ebx), "=c" (ecx), "=d" (edx), \ "=S" (esi), "=D" (edi) \ \ + __switch_canary_oparam \ + \ /* input parameters: */ \ : [next_sp] "m" (next->thread.sp), \ [next_ip] "m" (next->thread.ip), \ @@ -66,6 +85,8 @@ do { \ [prev] "a" (prev), \ [next] "d" (next) \ \ + __switch_canary_iparam \ + \ : /* reloaded segment registers */ \ "memory"); \ } while (0) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 37fa30bada1..b1f8be33300 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -24,6 +24,24 @@ CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) +# +# On x86_32, register frame is passed verbatim on stack as struct +# pt_regs. gcc considers the parameter to belong to the callee and +# with -fstack-protector it copies pt_regs to the callee's stack frame +# to put the structure after the stack canary causing changes made by +# the exception handlers to be lost. Turn off stack protector for all +# files containing functions which take struct pt_regs from register +# frame. +# +# The proper way to fix this is to teach gcc that the argument belongs +# to the caller for these functions, oh well... +# +ifdef CONFIG_X86_32 +CFLAGS_process_32.o := $(nostackp) +CFLAGS_vm86_32.o := $(nostackp) +CFLAGS_signal.o := $(nostackp) +CFLAGS_traps.o := $(nostackp) +endif obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 41b0de6df87..260fe4cb2c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -39,6 +39,7 @@ #include <asm/sections.h> #include <asm/setup.h> #include <asm/hypervisor.h> +#include <asm/stackprotector.h> #include "cpu.h" @@ -122,6 +123,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } }, + GDT_STACK_CANARY_INIT #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); @@ -261,6 +263,7 @@ void load_percpu_segment(int cpu) loadsegment(gs, 0); wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); #endif + load_stack_canary_segment(); } /* Current gdt points %fs at the "master" per-cpu area: after this, @@ -946,16 +949,21 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); -#else +#else /* x86_64 */ + +#ifdef CONFIG_CC_STACKPROTECTOR +DEFINE_PER_CPU(unsigned long, stack_canary); +#endif -/* Make sure %fs is initialized properly in idle threads */ +/* Make sure %fs and %gs are initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); regs->fs = __KERNEL_PERCPU; + regs->gs = __KERNEL_STACK_CANARY; return regs; } -#endif +#endif /* x86_64 */ /* * cpu_init() initializes state that is per-CPU. Some data is already @@ -1120,9 +1128,6 @@ void __cpuinit cpu_init(void) __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %gs. */ - asm volatile ("mov %0, %%gs" : : "r" (0)); - /* Clear all 6 debug registers: */ set_debugreg(0, 0); set_debugreg(0, 1); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 82e6868bee4..5f5bd22adcd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -186,7 +186,7 @@ /*CFI_REL_OFFSET gs, PT_GS*/ .endm .macro SET_KERNEL_GS reg - xorl \reg, \reg + movl $(__KERNEL_STACK_CANARY), \reg movl \reg, %gs .endm diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 24c0e5cd71e..924e31615fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -19,6 +19,7 @@ #include <asm/asm-offsets.h> #include <asm/setup.h> #include <asm/processor-flags.h> +#include <asm/percpu.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -437,8 +438,25 @@ is386: movl $2,%ecx # set MP movl $(__KERNEL_PERCPU), %eax movl %eax,%fs # set this cpu's percpu - xorl %eax,%eax # Clear GS and LDT +#ifdef CONFIG_CC_STACKPROTECTOR + /* + * The linker can't handle this by relocation. Manually set + * base address in stack canary segment descriptor. + */ + cmpb $0,ready + jne 1f + movl $per_cpu__gdt_page,%eax + movl $per_cpu__stack_canary,%ecx + movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) + shrl $16, %ecx + movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) + movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) +1: +#endif + movl $(__KERNEL_STACK_CANARY),%eax movl %eax,%gs + + xorl %eax,%eax # Clear LDT lldt %ax cld # gcc2 wants the direction flag cleared at all times diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 86122fa2a1b..9a62383e7c3 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -212,6 +212,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.ds = __USER_DS; regs.es = __USER_DS; regs.fs = __KERNEL_PERCPU; + regs.gs = __KERNEL_STACK_CANARY; regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef91747bbed..d992e6cff73 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -16,6 +16,7 @@ #include <asm/proto.h> #include <asm/cpumask.h> #include <asm/cpu.h> +#include <asm/stackprotector.h> #ifdef CONFIG_DEBUG_PER_CPU_MAPS # define DBG(x...) printk(KERN_DEBUG x) @@ -95,6 +96,7 @@ void __init setup_per_cpu_areas(void) per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); per_cpu(cpu_number, cpu) = cpu; setup_percpu_segment(cpu); + setup_stack_canary_segment(cpu); /* * Copy data used in early init routines from the * initial arrays to the per cpu data areas. These diff --git a/scripts/gcc-x86_32-has-stack-protector.sh b/scripts/gcc-x86_32-has-stack-protector.sh new file mode 100644 index 00000000000..4fdf6ce1b06 --- /dev/null +++ b/scripts/gcc-x86_32-has-stack-protector.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -fstack-protector - -o - 2> /dev/null | grep -q "%gs" +if [ "$?" -eq "0" ] ; then + echo y +else + echo n +fi -- cgit v1.2.3-70-g09d2 From 5c79d2a517a9905599d192db8ce77ab5f1a2faca Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 11 Feb 2009 16:31:00 +0900 Subject: x86: fix x86_32 stack protector bugs Impact: fix x86_32 stack protector Brian Gerst found out that %gs was being initialized to stack_canary instead of stack_canary - 20, which basically gave the same canary value for all threads. Fixing this also exposed the following bugs. * cpu_idle() didn't call boot_init_stack_canary() * stack canary switching in switch_to() was being done too late making the initial run of a new thread use the old stack canary value. Fix all of them and while at it update comment in cpu_idle() about calling boot_init_stack_canary(). Reported-by: Brian Gerst <brgerst@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/include/asm/stackprotector.h | 2 +- arch/x86/include/asm/system.h | 8 +++----- arch/x86/kernel/head_32.S | 1 + arch/x86/kernel/process_32.c | 10 ++++++++++ arch/x86/kernel/process_64.c | 11 +++++------ 5 files changed, 20 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index fa7e5bd6fbe..c2d742c6e15 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -85,7 +85,7 @@ static __always_inline void boot_init_stack_canary(void) static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 - unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); + unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20; struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); struct desc_struct desc; diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2692ee8ef03..7a80f72bec4 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -25,13 +25,11 @@ struct task_struct *__switch_to(struct task_struct *prev, #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ - "movl "__percpu_arg([current_task])",%%ebx\n\t" \ - "movl %P[task_canary](%%ebx),%%ebx\n\t" \ - "movl %%ebx,"__percpu_arg([stack_canary])"\n\t" + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" #define __switch_canary_oparam \ , [stack_canary] "=m" (per_cpu_var(stack_canary)) #define __switch_canary_iparam \ - , [current_task] "m" (per_cpu_var(current_task)) \ , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) #else /* CC_STACKPROTECTOR */ #define __switch_canary @@ -60,9 +58,9 @@ do { \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ - __switch_canary \ "popl %%ebp\n\t" /* restore EBP */ \ "popfl\n" /* restore flags */ \ \ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 924e31615fb..cf21fd0cf6a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -447,6 +447,7 @@ is386: movl $2,%ecx # set MP jne 1f movl $per_cpu__gdt_page,%eax movl $per_cpu__stack_canary,%ecx + subl $20, %ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9a62383e7c3..b50604bb1e4 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -11,6 +11,7 @@ #include <stdarg.h> +#include <linux/stackprotector.h> #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> @@ -91,6 +92,15 @@ void cpu_idle(void) { int cpu = smp_processor_id(); + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); + current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8eb169e4558..836ef6575f0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -120,12 +120,11 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; /* - * If we're the non-boot CPU, nothing set the PDA stack - * canary up for us - and if we are the boot CPU we have - * a 0 stack canary. This is a good place for updating - * it, as we wont ever return from this function (so the - * invalid canaries already on the stack wont ever - * trigger): + * If we're the non-boot CPU, nothing set the stack canary up + * for us. CPU0 already has it initialized but no harm in + * doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). */ boot_init_stack_canary(); -- cgit v1.2.3-70-g09d2 From 253f29a4ae9cc6cdc7b94f96517f27a93885a6ce Mon Sep 17 00:00:00 2001 From: Brian Gerst <brgerst@gmail.com> Date: Tue, 10 Feb 2009 09:51:46 -0500 Subject: x86: pass in pt_regs pointer for syscalls that need it Some syscalls need to access the pt_regs structure, either to copy user register state or to modifiy it. This patch adds stubs to load the address of the pt_regs struct into the %eax register, and changes the syscalls to regparm(1) to receive the pt_regs pointer as the first argument. Signed-off-by: Brian Gerst <brgerst@gmail.com> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/include/asm/linkage.h | 7 +++++++ arch/x86/include/asm/syscalls.h | 25 +++++++++++++++---------- arch/x86/kernel/entry_32.S | 20 ++++++++++++++++++++ arch/x86/kernel/ioport.c | 4 +--- arch/x86/kernel/process_32.c | 35 ++++++++++++++--------------------- arch/x86/kernel/signal.c | 35 +++++++---------------------------- arch/x86/kernel/syscall_table_32.S | 20 ++++++++++---------- arch/x86/kernel/vm86_32.c | 15 +++++++-------- 8 files changed, 81 insertions(+), 80 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 5d98d0b68ff..2fd5926fb97 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -17,6 +17,13 @@ */ #define asmregparm __attribute__((regparm(3))) +/* + * For syscalls that need a pointer to the pt_regs struct (ie. fork). + * The regs pointer is passed in %eax as the first argument. The + * remaining function arguments remain on the stack. + */ +#define ptregscall __attribute__((regparm(1))) + /* * Make sure the compiler doesn't do anything stupid with the * arguments on the stack - they are owned by the *caller*, not diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index c0b0bda754e..617295255a1 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -29,21 +29,26 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ -asmlinkage int sys_fork(struct pt_regs); -asmlinkage int sys_clone(struct pt_regs); -asmlinkage int sys_vfork(struct pt_regs); -asmlinkage int sys_execve(struct pt_regs); +ptregscall int sys_fork(struct pt_regs *); +ptregscall int sys_clone(struct pt_regs *, unsigned long, + unsigned long, int __user *, + unsigned long, int __user *); +ptregscall int sys_vfork(struct pt_regs *); +ptregscall int sys_execve(struct pt_regs *, char __user *, + char __user * __user *, + char __user * __user *); /* kernel/signal_32.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); -asmlinkage int sys_sigaltstack(unsigned long); -asmlinkage unsigned long sys_sigreturn(unsigned long); -asmlinkage int sys_rt_sigreturn(unsigned long); +ptregscall int sys_sigaltstack(struct pt_regs *, const stack_t __user *, + stack_t __user *); +ptregscall unsigned long sys_sigreturn(struct pt_regs *); +ptregscall int sys_rt_sigreturn(struct pt_regs *); /* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned long); +ptregscall long sys_iopl(struct pt_regs *, unsigned int); /* kernel/sys_i386_32.c */ asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, @@ -59,8 +64,8 @@ struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ -asmlinkage int sys_vm86old(struct pt_regs); -asmlinkage int sys_vm86(struct pt_regs); +ptregscall int sys_vm86old(struct pt_regs *, struct vm86_struct __user *); +ptregscall int sys_vm86(struct pt_regs *, unsigned long, unsigned long); #else /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5f5bd22adcd..3de7b5710dc 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -697,6 +697,26 @@ syscall_badsys: END(syscall_badsys) CFI_ENDPROC +/* + * System calls that need a pt_regs pointer. + */ +#define PTREGSCALL(name) \ + ALIGN; \ +ptregs_##name: \ + leal 4(%esp),%eax; \ + jmp sys_##name; + +PTREGSCALL(iopl) +PTREGSCALL(fork) +PTREGSCALL(clone) +PTREGSCALL(vfork) +PTREGSCALL(execve) +PTREGSCALL(sigaltstack) +PTREGSCALL(sigreturn) +PTREGSCALL(rt_sigreturn) +PTREGSCALL(vm86) +PTREGSCALL(vm86old) + .macro FIXUP_ESPFIX_STACK /* since we are on a wrong stack, we cant make it a C code :( */ PER_CPU(gdt_page, %ebx) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index b12208f4dfe..7ec14864631 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -131,10 +131,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) } #ifdef CONFIG_X86_32 -asmlinkage long sys_iopl(unsigned long regsp) +ptregscall long sys_iopl(struct pt_regs *regs, unsigned int level) { - struct pt_regs *regs = (struct pt_regs *)®sp; - unsigned int level = regs->bx; struct thread_struct *t = ¤t->thread; int rc; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index b50604bb1e4..5a9dcfb01f7 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -603,24 +603,18 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -asmlinkage int sys_fork(struct pt_regs regs) +ptregscall int sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } -asmlinkage int sys_clone(struct pt_regs regs) +ptregscall int sys_clone(struct pt_regs *regs, unsigned long clone_flags, + unsigned long newsp, int __user *parent_tidptr, + unsigned long unused, int __user *child_tidptr) { - unsigned long clone_flags; - unsigned long newsp; - int __user *parent_tidptr, *child_tidptr; - - clone_flags = regs.bx; - newsp = regs.cx; - parent_tidptr = (int __user *)regs.dx; - child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.sp; - return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } /* @@ -633,27 +627,26 @@ asmlinkage int sys_clone(struct pt_regs regs) * do not have enough call-clobbered registers to hold all * the information you need. */ -asmlinkage int sys_vfork(struct pt_regs regs) +ptregscall int sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(struct pt_regs regs) +ptregscall int sys_execve(struct pt_regs *regs, char __user *u_filename, + char __user * __user *argv, + char __user * __user *envp) { int error; char *filename; - filename = getname((char __user *) regs.bx); + filename = getname(u_filename); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve(filename, - (char __user * __user *) regs.cx, - (char __user * __user *) regs.dx, - ®s); + error = do_execve(filename, argv, envp, regs); if (error == 0) { /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 8562387c75a..d7a158367e3 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -549,39 +549,28 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, #endif /* CONFIG_X86_32 */ #ifdef CONFIG_X86_32 -asmlinkage int sys_sigaltstack(unsigned long bx) -{ - /* - * This is needed to make gcc realize it doesn't own the - * "struct pt_regs" - */ - struct pt_regs *regs = (struct pt_regs *)&bx; - const stack_t __user *uss = (const stack_t __user *)bx; - stack_t __user *uoss = (stack_t __user *)regs->cx; - - return do_sigaltstack(uss, uoss, regs->sp); -} +ptregscall int +sys_sigaltstack(struct pt_regs *regs, const stack_t __user *uss, + stack_t __user *uoss) #else /* !CONFIG_X86_32 */ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) +#endif /* CONFIG_X86_32 */ { return do_sigaltstack(uss, uoss, regs->sp); } -#endif /* CONFIG_X86_32 */ /* * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -asmlinkage unsigned long sys_sigreturn(unsigned long __unused) +ptregscall unsigned long sys_sigreturn(struct pt_regs *regs) { struct sigframe __user *frame; - struct pt_regs *regs; unsigned long ax; sigset_t set; - regs = (struct pt_regs *) &__unused; frame = (struct sigframe __user *)(regs->sp - 8); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -640,23 +629,13 @@ badframe: } #ifdef CONFIG_X86_32 -/* - * Note: do not pass in pt_regs directly as with tail-call optimization - * GCC will incorrectly stomp on the caller's frame and corrupt user-space - * register state: - */ -asmlinkage int sys_rt_sigreturn(unsigned long __unused) -{ - struct pt_regs *regs = (struct pt_regs *)&__unused; - - return do_rt_sigreturn(regs); -} +ptregscall int sys_rt_sigreturn(struct pt_regs *regs) #else /* !CONFIG_X86_32 */ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +#endif /* CONFIG_X86_32 */ { return do_rt_sigreturn(regs); } -#endif /* CONFIG_X86_32 */ /* * OK, we're invoking a handler: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index e2e86a08f31..3bdb64829b8 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -1,7 +1,7 @@ ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ .long sys_exit - .long sys_fork + .long ptregs_fork .long sys_read .long sys_write .long sys_open /* 5 */ @@ -10,7 +10,7 @@ ENTRY(sys_call_table) .long sys_creat .long sys_link .long sys_unlink /* 10 */ - .long sys_execve + .long ptregs_execve .long sys_chdir .long sys_time .long sys_mknod @@ -109,17 +109,17 @@ ENTRY(sys_call_table) .long sys_newlstat .long sys_newfstat .long sys_uname - .long sys_iopl /* 110 */ + .long ptregs_iopl /* 110 */ .long sys_vhangup .long sys_ni_syscall /* old "idle" system call */ - .long sys_vm86old + .long ptregs_vm86old .long sys_wait4 .long sys_swapoff /* 115 */ .long sys_sysinfo .long sys_ipc .long sys_fsync - .long sys_sigreturn - .long sys_clone /* 120 */ + .long ptregs_sigreturn + .long ptregs_clone /* 120 */ .long sys_setdomainname .long sys_newuname .long sys_modify_ldt @@ -165,14 +165,14 @@ ENTRY(sys_call_table) .long sys_mremap .long sys_setresuid16 .long sys_getresuid16 /* 165 */ - .long sys_vm86 + .long ptregs_vm86 .long sys_ni_syscall /* Old sys_query_module */ .long sys_poll .long sys_nfsservctl .long sys_setresgid16 /* 170 */ .long sys_getresgid16 .long sys_prctl - .long sys_rt_sigreturn + .long ptregs_rt_sigreturn .long sys_rt_sigaction .long sys_rt_sigprocmask /* 175 */ .long sys_rt_sigpending @@ -185,11 +185,11 @@ ENTRY(sys_call_table) .long sys_getcwd .long sys_capget .long sys_capset /* 185 */ - .long sys_sigaltstack + .long ptregs_sigaltstack .long sys_sendfile .long sys_ni_syscall /* reserved for streams1 */ .long sys_ni_syscall /* reserved for streams2 */ - .long sys_vfork /* 190 */ + .long ptregs_vfork /* 190 */ .long sys_getrlimit .long sys_mmap2 .long sys_truncate64 diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 55ea30d2a3d..8fa6ba7c923 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,9 +197,8 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -asmlinkage int sys_vm86old(struct pt_regs regs) +ptregscall int sys_vm86old(struct pt_regs *regs, struct vm86_struct __user *v86) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -218,7 +217,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs) if (tmp) goto out; memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); - info.regs32 = ®s; + info.regs32 = regs; tsk->thread.vm86_info = v86; do_sys_vm86(&info, tsk); ret = 0; /* we never return here */ @@ -227,7 +226,7 @@ out: } -asmlinkage int sys_vm86(struct pt_regs regs) +ptregscall int sys_vm86(struct pt_regs *regs, unsigned long cmd, unsigned long arg) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -239,12 +238,12 @@ asmlinkage int sys_vm86(struct pt_regs regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs.bx) { + switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); + ret = do_vm86_irq_handling(cmd, (int)arg); goto out; case VM86_PLUS_INSTALL_CHECK: /* @@ -261,14 +260,14 @@ asmlinkage int sys_vm86(struct pt_regs regs) ret = -EPERM; if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs.cx; + v86 = (struct vm86plus_struct __user *)arg; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); ret = -EFAULT; if (tmp) goto out; - info.regs32 = ®s; + info.regs32 = regs; info.vm86plus.is_vm86pus = 1; tsk->thread.vm86_info = (struct vm86_struct __user *)v86; do_sys_vm86(&info, tsk); -- cgit v1.2.3-70-g09d2 From b12bdaf11f935d7be030207e3c77faeaeab8ded3 Mon Sep 17 00:00:00 2001 From: Brian Gerst <brgerst@gmail.com> Date: Wed, 11 Feb 2009 16:43:58 -0500 Subject: x86: use regparm(3) for passed-in pt_regs pointer Some syscalls need to access the pt_regs structure, either to copy user register state or to modifiy it. This patch adds stubs to load the address of the pt_regs struct into the %eax register, and changes the syscalls to take the pointer as an argument instead of relying on the assumption that the pt_regs structure overlaps the function arguments. Drop the use of regparm(1) due to concern about gcc bugs, and to move in the direction of the eventual removal of regparm(0) for asmlinkage. Signed-off-by: Brian Gerst <brgerst@gmail.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> --- arch/x86/include/asm/linkage.h | 7 ------- arch/x86/include/asm/syscalls.h | 25 ++++++++++--------------- arch/x86/kernel/ioport.c | 3 ++- arch/x86/kernel/process_32.c | 27 +++++++++++++++++---------- arch/x86/kernel/signal.c | 21 ++++++++++++++------- arch/x86/kernel/vm86_32.c | 11 ++++++----- 6 files changed, 49 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 2fd5926fb97..5d98d0b68ff 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -17,13 +17,6 @@ */ #define asmregparm __attribute__((regparm(3))) -/* - * For syscalls that need a pointer to the pt_regs struct (ie. fork). - * The regs pointer is passed in %eax as the first argument. The - * remaining function arguments remain on the stack. - */ -#define ptregscall __attribute__((regparm(1))) - /* * Make sure the compiler doesn't do anything stupid with the * arguments on the stack - they are owned by the *caller*, not diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 617295255a1..77bb31a88ba 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -29,26 +29,21 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 /* kernel/process_32.c */ -ptregscall int sys_fork(struct pt_regs *); -ptregscall int sys_clone(struct pt_regs *, unsigned long, - unsigned long, int __user *, - unsigned long, int __user *); -ptregscall int sys_vfork(struct pt_regs *); -ptregscall int sys_execve(struct pt_regs *, char __user *, - char __user * __user *, - char __user * __user *); +int sys_fork(struct pt_regs *); +int sys_clone(struct pt_regs *); +int sys_vfork(struct pt_regs *); +int sys_execve(struct pt_regs *); /* kernel/signal_32.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); -ptregscall int sys_sigaltstack(struct pt_regs *, const stack_t __user *, - stack_t __user *); -ptregscall unsigned long sys_sigreturn(struct pt_regs *); -ptregscall int sys_rt_sigreturn(struct pt_regs *); +int sys_sigaltstack(struct pt_regs *); +unsigned long sys_sigreturn(struct pt_regs *); +int sys_rt_sigreturn(struct pt_regs *); /* kernel/ioport.c */ -ptregscall long sys_iopl(struct pt_regs *, unsigned int); +long sys_iopl(struct pt_regs *); /* kernel/sys_i386_32.c */ asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, @@ -64,8 +59,8 @@ struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ -ptregscall int sys_vm86old(struct pt_regs *, struct vm86_struct __user *); -ptregscall int sys_vm86(struct pt_regs *, unsigned long, unsigned long); +int sys_vm86old(struct pt_regs *); +int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 7ec14864631..e41980a373a 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -131,8 +131,9 @@ static int do_iopl(unsigned int level, struct pt_regs *regs) } #ifdef CONFIG_X86_32 -ptregscall long sys_iopl(struct pt_regs *regs, unsigned int level) +long sys_iopl(struct pt_regs *regs) { + unsigned int level = regs->bx; struct thread_struct *t = ¤t->thread; int rc; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5a9dcfb01f7..fec79ad85dc 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -603,15 +603,21 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -ptregscall int sys_fork(struct pt_regs *regs) +int sys_fork(struct pt_regs *regs) { return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } -ptregscall int sys_clone(struct pt_regs *regs, unsigned long clone_flags, - unsigned long newsp, int __user *parent_tidptr, - unsigned long unused, int __user *child_tidptr) +int sys_clone(struct pt_regs *regs) { + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + + clone_flags = regs->bx; + newsp = regs->cx; + parent_tidptr = (int __user *)regs->dx; + child_tidptr = (int __user *)regs->di; if (!newsp) newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); @@ -627,7 +633,7 @@ ptregscall int sys_clone(struct pt_regs *regs, unsigned long clone_flags, * do not have enough call-clobbered registers to hold all * the information you need. */ -ptregscall int sys_vfork(struct pt_regs *regs) +int sys_vfork(struct pt_regs *regs) { return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } @@ -635,18 +641,19 @@ ptregscall int sys_vfork(struct pt_regs *regs) /* * sys_execve() executes a new program. */ -ptregscall int sys_execve(struct pt_regs *regs, char __user *u_filename, - char __user * __user *argv, - char __user * __user *envp) +int sys_execve(struct pt_regs *regs) { int error; char *filename; - filename = getname(u_filename); + filename = getname((char __user *) regs->bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve(filename, argv, envp, regs); + error = do_execve(filename, + (char __user * __user *) regs->cx, + (char __user * __user *) regs->dx, + regs); if (error == 0) { /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d7a158367e3..ccfb27412f0 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -549,23 +549,27 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, #endif /* CONFIG_X86_32 */ #ifdef CONFIG_X86_32 -ptregscall int -sys_sigaltstack(struct pt_regs *regs, const stack_t __user *uss, - stack_t __user *uoss) +int sys_sigaltstack(struct pt_regs *regs) +{ + const stack_t __user *uss = (const stack_t __user *)regs->bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; + + return do_sigaltstack(uss, uoss, regs->sp); +} #else /* !CONFIG_X86_32 */ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) -#endif /* CONFIG_X86_32 */ { return do_sigaltstack(uss, uoss, regs->sp); } +#endif /* CONFIG_X86_32 */ /* * Do a signal return; undo the signal stack. */ #ifdef CONFIG_X86_32 -ptregscall unsigned long sys_sigreturn(struct pt_regs *regs) +unsigned long sys_sigreturn(struct pt_regs *regs) { struct sigframe __user *frame; unsigned long ax; @@ -629,13 +633,16 @@ badframe: } #ifdef CONFIG_X86_32 -ptregscall int sys_rt_sigreturn(struct pt_regs *regs) +int sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} #else /* !CONFIG_X86_32 */ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) -#endif /* CONFIG_X86_32 */ { return do_rt_sigreturn(regs); } +#endif /* CONFIG_X86_32 */ /* * OK, we're invoking a handler: diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 8fa6ba7c923..d7ac84e7fc1 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,8 +197,9 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); -ptregscall int sys_vm86old(struct pt_regs *regs, struct vm86_struct __user *v86) +int sys_vm86old(struct pt_regs *regs) { + struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -226,7 +227,7 @@ out: } -ptregscall int sys_vm86(struct pt_regs *regs, unsigned long cmd, unsigned long arg) +int sys_vm86(struct pt_regs *regs) { struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. @@ -238,12 +239,12 @@ ptregscall int sys_vm86(struct pt_regs *regs, unsigned long cmd, unsigned long a struct vm86plus_struct __user *v86; tsk = current; - switch (cmd) { + switch (regs->bx) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(cmd, (int)arg); + ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); goto out; case VM86_PLUS_INSTALL_CHECK: /* @@ -260,7 +261,7 @@ ptregscall int sys_vm86(struct pt_regs *regs, unsigned long cmd, unsigned long a ret = -EPERM; if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)arg; + v86 = (struct vm86plus_struct __user *)regs->cx; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); -- cgit v1.2.3-70-g09d2 From db949bba3c7cf2e664ac12e237c6d4c914f0c69d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Fri, 27 Feb 2009 13:25:21 -0800 Subject: x86-32: use non-lazy io bitmap context switching Impact: remove 32-bit optimization to prepare unification x86-32 and -64 differ in the way they context-switch tasks with io permission bitmaps. x86-64 simply copies the next tasks io bitmap into place (if any) on context switch. x86-32 invalidates the bitmap on context switch, so that the next IO instruction will fault; at that point it installs the appropriate IO bitmap. This makes context switching IO-bitmap-using tasks a bit more less expensive, at the cost of making the next IO instruction slower due to the extra fault. This tradeoff only makes sense if IO-bitmap-using processes are relatively common, but they don't actually use IO instructions very often. However, in a typical desktop system, the only process likely to be using IO bitmaps is the X server, and nothing at all on a server. Therefore the lazy context switch doesn't really win all that much, and its just a gratuitious difference from 64-bit code. This patch removes the lazy context switch, with a view to unifying this code in a later change. Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/include/asm/processor.h | 6 ------ arch/x86/kernel/ioport.c | 11 ---------- arch/x86/kernel/process_32.c | 36 ++++++++----------------------- arch/x86/kernel/traps.c | 46 ---------------------------------------- 4 files changed, 9 insertions(+), 90 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c7a98f73821..76139506c3e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -248,7 +248,6 @@ struct x86_hw_tss { #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) #define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) #define INVALID_IO_BITMAP_OFFSET 0x8000 -#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000 struct tss_struct { /* @@ -263,11 +262,6 @@ struct tss_struct { * be within the limit. */ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; - /* - * Cache the current maximum and the last task that used the bitmap: - */ - unsigned long io_bitmap_max; - struct thread_struct *io_bitmap_owner; /* * .. and then another 0x100 bytes for the emergency kernel stack: diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index e41980a373a..99c4d308f16 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -85,19 +85,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) t->io_bitmap_max = bytes; -#ifdef CONFIG_X86_32 - /* - * Sets the lazy trigger so that the next I/O operation will - * reload the correct bitmap. - * Reset the owner so that a process switch will not set - * tss->io_bitmap_base to IO_BITMAP_OFFSET. - */ - tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; - tss->io_bitmap_owner = NULL; -#else /* Update the TSS: */ memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); -#endif put_cpu(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 646da41a620..a59314e877f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -248,11 +248,8 @@ void exit_thread(void) /* * Careful, clear this in the TSS too: */ - memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; - tss->io_bitmap_owner = NULL; - tss->io_bitmap_max = 0; - tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } @@ -458,34 +455,19 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* - * Disable the bitmap via an invalid offset. We still cache - * the previous bitmap owner and the IO bitmap contents: + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: */ - tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; - return; - } - - if (likely(next == tss->io_bitmap_owner)) { + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { /* - * Previous owner of the bitmap (hence the bitmap content) - * matches the next task, we dont have to do anything but - * to set a valid offset in the TSS: + * Clear any possible leftover bits: */ - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - return; + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } - /* - * Lazy TSS's I/O bitmap copy. We set an invalid offset here - * and we let the task to get a GPF in case an I/O instruction - * is performed. The handler of the GPF will verify that the - * faulting task has a valid I/O bitmap and, it true, does the - * real copy and restart the instruction. This will save us - * redundant copies when the currently switched task does not - * perform any I/O during its timeslice. - */ - tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; } /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c05430ac1b4..a1d288327ff 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -118,47 +118,6 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) if (!user_mode_vm(regs)) die(str, regs, err); } - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - - return 1; - } - put_cpu(); - - return 0; -} #endif static void __kprobes @@ -309,11 +268,6 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); #ifdef CONFIG_X86_32 - if (lazy_iobitmap_copy()) { - /* restart the faulting instruction */ - return; - } - if (regs->flags & X86_VM_MASK) goto gp_in_vm86; #endif -- cgit v1.2.3-70-g09d2 From 389d1fb11e5f2a16b5e34c547756f0c4dec641f7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge <jeremy@goop.org> Date: Fri, 27 Feb 2009 13:25:28 -0800 Subject: x86: unify chunks of kernel/process*.c With x86-32 and -64 using the same mechanism for managing the tss io permissions bitmap, large chunks of process*.c are trivially unifyable, including: - exit_thread - flush_thread - __switch_to_xtra (along with tsc enable/disable) and as bonus pickups: - sys_fork - sys_vfork (Note: asmlinkage expands to empty on x86-64) Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/x86/include/asm/system.h | 2 + arch/x86/kernel/process.c | 191 +++++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/process_32.c | 172 ------------------------------------- arch/x86/kernel/process_64.c | 188 ----------------------------------------- 4 files changed, 192 insertions(+), 361 deletions(-) (limited to 'arch/x86/kernel/process_32.c') diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index c00bfdbdd45..1a7bf39f72d 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -20,6 +20,8 @@ struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 87b69d4fac1..6afa5232dbb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,8 +1,8 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <asm/idle.h> #include <linux/smp.h> +#include <linux/prctl.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/module.h> @@ -11,6 +11,9 @@ #include <linux/ftrace.h> #include <asm/system.h> #include <asm/apic.h> +#include <asm/idle.h> +#include <asm/uaccess.h> +#include <asm/i387.h> unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -55,6 +58,192 @@ void arch_task_cache_init(void) SLAB_PANIC, NULL); } +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + struct task_struct *me = current; + struct thread_struct *t = &me->thread; + + if (me->thread.io_bitmap_ptr) { + struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, t->io_bitmap_max); + t->io_bitmap_max = 0; + put_cpu(); + } + + ds_exit_thread(current); +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_X86_64 + if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { + clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); + if (test_tsk_thread_flag(tsk, TIF_IA32)) { + clear_tsk_thread_flag(tsk, TIF_IA32); + } else { + set_tsk_thread_flag(tsk, TIF_IA32); + current_thread_info()->status |= TS_COMPAT; + } + } +#endif + + clear_tsk_thread_flag(tsk, TIF_DEBUG); + + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + /* + * Forget coprocessor state.. + */ + tsk->fpu_counter = 0; + clear_fpu(tsk); + clear_used_math(); +} + +static void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} + +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} + +static void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} + +static void enable_TSC(void) +{ + preempt_disable(); + if (test_and_clear_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_enable_TSC(); + preempt_enable(); +} + +int get_tsc_mode(unsigned long adr) +{ + unsigned int val; + + if (test_thread_flag(TIF_NOTSC)) + val = PR_TSC_SIGSEGV; + else + val = PR_TSC_ENABLE; + + return put_user(val, (unsigned int __user *)adr); +} + +int set_tsc_mode(unsigned int val) +{ + if (val == PR_TSC_SIGSEGV) + disable_TSC(); + else if (val == PR_TSC_ENABLE) + enable_TSC(); + else + return -EINVAL; + + return 0; +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + + prev = &prev_p->thread; + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || + test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) + ds_switch_to(prev_p, next_p); + else if (next->debugctlmsr != prev->debugctlmsr) + update_debugctlmsr(next->debugctlmsr); + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); + } + + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } + + if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Copy the relevant range of the IO bitmap. + * Normally this is 128 bytes or less: + */ + memcpy(tss->io_bitmap, next->io_bitmap_ptr, + max(prev->io_bitmap_max, next->io_bitmap_max)); + } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + /* + * Clear any possible leftover bits: + */ + memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); + } +} + +int sys_fork(struct pt_regs *regs) +{ + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +int sys_vfork(struct pt_regs *regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, + NULL, NULL); +} + + /* * Idle related variables and functions */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a59314e877f..14014d766ca 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -230,52 +230,6 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) } EXPORT_SYMBOL(kernel_thread); -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - /* The process may have allocated an io port bitmap... nuke it. */ - if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { - struct task_struct *tsk = current; - struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - } - - ds_exit_thread(current); -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - clear_tsk_thread_flag(tsk, TIF_DEBUG); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); -} - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); @@ -363,112 +317,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); -static void hard_disable_TSC(void) -{ - write_cr4(read_cr4() | X86_CR4_TSD); -} - -void disable_TSC(void) -{ - preempt_disable(); - if (!test_and_set_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. - */ - hard_disable_TSC(); - preempt_enable(); -} - -static void hard_enable_TSC(void) -{ - write_cr4(read_cr4() & ~X86_CR4_TSD); -} - -static void enable_TSC(void) -{ - preempt_disable(); - if (test_and_clear_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. - */ - hard_enable_TSC(); - preempt_enable(); -} - -int get_tsc_mode(unsigned long adr) -{ - unsigned int val; - - if (test_thread_flag(TIF_NOTSC)) - val = PR_TSC_SIGSEGV; - else - val = PR_TSC_ENABLE; - - return put_user(val, (unsigned int __user *)adr); -} - -int set_tsc_mode(unsigned int val) -{ - if (val == PR_TSC_SIGSEGV) - disable_TSC(); - else if (val == PR_TSC_ENABLE) - enable_TSC(); - else - return -EINVAL; - - return 0; -} - -static noinline void -__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; - - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); - - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg0, 0); - set_debugreg(next->debugreg1, 1); - set_debugreg(next->debugreg2, 2); - set_debugreg(next->debugreg3, 3); - /* no 4 and 5 */ - set_debugreg(next->debugreg6, 6); - set_debugreg(next->debugreg7, 7); - } - - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); - } - - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } -} /* * switch_to(x,yn) should switch tasks from x to y. @@ -582,11 +430,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) return prev_p; } -int sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} - int sys_clone(struct pt_regs *regs) { unsigned long clone_flags; @@ -602,21 +445,6 @@ int sys_clone(struct pt_regs *regs) return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr); } -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -int sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} - /* * sys_execve() executes a new program. */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 836ef6575f0..abb7e6a7f0c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -237,61 +237,6 @@ void show_regs(struct pt_regs *regs) show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -/* - * Free current thread data structures etc.. - */ -void exit_thread(void) -{ - struct task_struct *me = current; - struct thread_struct *t = &me->thread; - - if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); - - kfree(t->io_bitmap_ptr); - t->io_bitmap_ptr = NULL; - clear_thread_flag(TIF_IO_BITMAP); - /* - * Careful, clear this in the TSS too: - */ - memset(tss->io_bitmap, 0xff, t->io_bitmap_max); - t->io_bitmap_max = 0; - put_cpu(); - } - - ds_exit_thread(current); -} - -void flush_thread(void) -{ - struct task_struct *tsk = current; - - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } - clear_tsk_thread_flag(tsk, TIF_DEBUG); - - tsk->thread.debugreg0 = 0; - tsk->thread.debugreg1 = 0; - tsk->thread.debugreg2 = 0; - tsk->thread.debugreg3 = 0; - tsk->thread.debugreg6 = 0; - tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - /* - * Forget coprocessor state.. - */ - tsk->fpu_counter = 0; - clear_fpu(tsk); - clear_used_math(); -} - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { @@ -425,118 +370,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) } EXPORT_SYMBOL_GPL(start_thread); -static void hard_disable_TSC(void) -{ - write_cr4(read_cr4() | X86_CR4_TSD); -} - -void disable_TSC(void) -{ - preempt_disable(); - if (!test_and_set_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. - */ - hard_disable_TSC(); - preempt_enable(); -} - -static void hard_enable_TSC(void) -{ - write_cr4(read_cr4() & ~X86_CR4_TSD); -} - -static void enable_TSC(void) -{ - preempt_disable(); - if (test_and_clear_thread_flag(TIF_NOTSC)) - /* - * Must flip the CPU state synchronously with - * TIF_NOTSC in the current running context. - */ - hard_enable_TSC(); - preempt_enable(); -} - -int get_tsc_mode(unsigned long adr) -{ - unsigned int val; - - if (test_thread_flag(TIF_NOTSC)) - val = PR_TSC_SIGSEGV; - else - val = PR_TSC_ENABLE; - - return put_user(val, (unsigned int __user *)adr); -} - -int set_tsc_mode(unsigned int val) -{ - if (val == PR_TSC_SIGSEGV) - disable_TSC(); - else if (val == PR_TSC_ENABLE) - enable_TSC(); - else - return -EINVAL; - - return 0; -} - -/* - * This special macro can be used to load a debugging register - */ -#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) - -static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p, - struct tss_struct *tss) -{ - struct thread_struct *prev, *next; - - prev = &prev_p->thread, - next = &next_p->thread; - - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); - - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); - /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); - } - - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); - } - - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { - /* - * Copy the relevant range of the IO bitmap. - * Normally this is 128 bytes or less: - */ - memcpy(tss->io_bitmap, next->io_bitmap_ptr, - max(prev->io_bitmap_max, next->io_bitmap_max)); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { - /* - * Clear any possible leftover bits: - */ - memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); - } -} - /* * switch_to(x,y) should switch tasks from x to y. * @@ -694,11 +527,6 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -asmlinkage long sys_fork(struct pt_regs *regs) -{ - return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); -} - asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) @@ -708,22 +536,6 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } -/* - * This is trivial, and on the face of it looks like it - * could equally well be done in user mode. - * - * Not so, for quite unobvious reasons - register pressure. - * In user mode vfork() cannot have a stack frame, and if - * done by calling the "clone()" system call directly, you - * do not have enough call-clobbered registers to hold all - * the information you need. - */ -asmlinkage long sys_vfork(struct pt_regs *regs) -{ - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, - NULL, NULL); -} - unsigned long get_wchan(struct task_struct *p) { unsigned long stack; -- cgit v1.2.3-70-g09d2