diff options
Diffstat (limited to 'arch')
319 files changed, 24046 insertions, 24407 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index de211ac3853..77201d3f747 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -91,6 +91,11 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index bef47725d4a..5a41e75ae1f 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -42,6 +42,11 @@ config MMU config SWIOTLB bool +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_XCHGADD_ALGORITHM bool default y @@ -75,6 +80,9 @@ config GENERIC_TIME_VSYSCALL bool default y +config ARCH_SETS_UP_PER_CPU_AREA + def_bool y + config DMI bool default y diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c index 3e35987af45..4f0c30c38e9 100644 --- a/arch/ia64/ia32/binfmt_elf32.c +++ b/arch/ia64/ia32/binfmt_elf32.c @@ -222,7 +222,8 @@ elf32_set_personality (void) } static unsigned long -elf32_map (struct file *filep, unsigned long addr, struct elf_phdr *eppnt, int prot, int type) +elf32_map(struct file *filep, unsigned long addr, struct elf_phdr *eppnt, + int prot, int type, unsigned long unused) { unsigned long pgoff = (eppnt->p_vaddr) & ~IA32_PAGE_MASK; diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 196287928ba..e699eb6c44b 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -947,7 +947,7 @@ percpu_modcopy (void *pcpudst, const void *src, unsigned long size) { unsigned int i; for_each_possible_cpu(i) { - memcpy(pcpudst + __per_cpu_offset[i], src, size); + memcpy(pcpudst + per_cpu_offset(i), src, size); } } #endif /* CONFIG_SMP */ diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index ab9a264cb19..f7237c5f531 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -235,6 +235,11 @@ config IRAM_SIZE # Define implied options from the CPU selection here # +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool depends on M32R diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 6b0f85f02c7..4fad0a34b99 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -694,6 +694,11 @@ source "arch/mips/vr41xx/Kconfig" endmenu +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool default y diff --git a/arch/mips/kernel/i8253.c b/arch/mips/kernel/i8253.c index c2d497ceffd..fc4aa07b6d3 100644 --- a/arch/mips/kernel/i8253.c +++ b/arch/mips/kernel/i8253.c @@ -24,9 +24,7 @@ DEFINE_SPINLOCK(i8253_lock); static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); + spin_lock(&i8253_lock); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -55,7 +53,7 @@ static void init_pit_timer(enum clock_event_mode mode, /* Nothing to do here */ break; } - spin_unlock_irqrestore(&i8253_lock, flags); + spin_unlock(&i8253_lock); } /* @@ -65,12 +63,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); + spin_lock(&i8253_lock); outb_p(delta & 0xff , PIT_CH0); /* LSB */ outb(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); + spin_unlock(&i8253_lock); return 0; } diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index b8ef1787a19..2b649c46631 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -19,6 +19,11 @@ config MMU config STACK_GROWSUP def_bool y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK def_bool y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 232c298c933..fb85f6b72fc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -42,6 +42,9 @@ config GENERIC_HARDIRQS bool default y +config ARCH_SETS_UP_PER_CPU_AREA + def_bool PPC64 + config IRQ_PER_CPU bool default y @@ -53,6 +56,11 @@ config RWSEM_XCHGADD_ALGORITHM bool default y +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config ARCH_HAS_ILOG2_U32 bool default y diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 3e17d154d0d..8b056d2295c 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -256,7 +256,7 @@ static int set_evrregs(struct task_struct *task, unsigned long *data) #endif /* CONFIG_SPE */ -static void set_single_step(struct task_struct *task) +void user_enable_single_step(struct task_struct *task) { struct pt_regs *regs = task->thread.regs; @@ -271,7 +271,7 @@ static void set_single_step(struct task_struct *task) set_tsk_thread_flag(task, TIF_SINGLESTEP); } -static void clear_single_step(struct task_struct *task) +void user_disable_single_step(struct task_struct *task) { struct pt_regs *regs = task->thread.regs; @@ -313,7 +313,7 @@ static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, void ptrace_disable(struct task_struct *child) { /* make sure the single step bit is not set. */ - clear_single_step(child); + user_disable_single_step(child); } /* @@ -445,52 +445,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: { /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_single_step(child); - wake_up_process(child); - ret = 0; - break; - } - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: { - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_single_step(child); - wake_up_process(child); - break; - } - - case PTRACE_SINGLESTEP: { /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - set_single_step(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - } - case PTRACE_GET_DEBUGREG: { ret = -EINVAL; /* We only support one DABR and no IABRS at the moment */ diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 10b212a1f9f..26f5791baa3 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -66,6 +66,9 @@ config AUDIT_ARCH bool default y +config ARCH_SETS_UP_PER_CPU_AREA + def_bool y + config ARCH_NO_VIRT_TO_BUS def_bool y @@ -200,6 +203,11 @@ config US2E_FREQ If in doubt, say N. # Global things across all Sun machines. +config GENERIC_LOCKBREAK + bool + default y + depends on SMP && PREEMPT + config RWSEM_GENERIC_SPINLOCK bool diff --git a/arch/um/kernel/ksyms.c b/arch/um/kernel/ksyms.c index 1b388b41d95..7c7142ba3bd 100644 --- a/arch/um/kernel/ksyms.c +++ b/arch/um/kernel/ksyms.c @@ -71,10 +71,10 @@ EXPORT_SYMBOL(dump_thread); /* required for SMP */ -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void __write_lock_failed(rwlock_t *rw); EXPORT_SYMBOL(__write_lock_failed); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +extern void __read_lock_failed(rwlock_t *rw); EXPORT_SYMBOL(__read_lock_failed); #endif diff --git a/arch/um/sys-i386/signal.c b/arch/um/sys-i386/signal.c index 0147227ce18..19053d46cb6 100644 --- a/arch/um/sys-i386/signal.c +++ b/arch/um/sys-i386/signal.c @@ -3,10 +3,10 @@ * Licensed under the GPL */ -#include "linux/ptrace.h" -#include "asm/unistd.h" -#include "asm/uaccess.h" -#include "asm/ucontext.h" +#include <linux/ptrace.h> +#include <asm/unistd.h> +#include <asm/uaccess.h> +#include <asm/ucontext.h> #include "frame_kern.h" #include "skas.h" @@ -18,17 +18,17 @@ void copy_sc(struct uml_pt_regs *regs, void *from) REGS_FS(regs->gp) = sc->fs; REGS_ES(regs->gp) = sc->es; REGS_DS(regs->gp) = sc->ds; - REGS_EDI(regs->gp) = sc->edi; - REGS_ESI(regs->gp) = sc->esi; - REGS_EBP(regs->gp) = sc->ebp; - REGS_SP(regs->gp) = sc->esp; - REGS_EBX(regs->gp) = sc->ebx; - REGS_EDX(regs->gp) = sc->edx; - REGS_ECX(regs->gp) = sc->ecx; - REGS_EAX(regs->gp) = sc->eax; - REGS_IP(regs->gp) = sc->eip; + REGS_EDI(regs->gp) = sc->di; + REGS_ESI(regs->gp) = sc->si; + REGS_EBP(regs->gp) = sc->bp; + REGS_SP(regs->gp) = sc->sp; + REGS_EBX(regs->gp) = sc->bx; + REGS_EDX(regs->gp) = sc->dx; + REGS_ECX(regs->gp) = sc->cx; + REGS_EAX(regs->gp) = sc->ax; + REGS_IP(regs->gp) = sc->ip; REGS_CS(regs->gp) = sc->cs; - REGS_EFLAGS(regs->gp) = sc->eflags; + REGS_EFLAGS(regs->gp) = sc->flags; REGS_SS(regs->gp) = sc->ss; } @@ -229,18 +229,18 @@ static int copy_sc_to_user(struct sigcontext __user *to, sc.fs = REGS_FS(regs->regs.gp); sc.es = REGS_ES(regs->regs.gp); sc.ds = REGS_DS(regs->regs.gp); - sc.edi = REGS_EDI(regs->regs.gp); - sc.esi = REGS_ESI(regs->regs.gp); - sc.ebp = REGS_EBP(regs->regs.gp); - sc.esp = sp; - sc.ebx = REGS_EBX(regs->regs.gp); - sc.edx = REGS_EDX(regs->regs.gp); - sc.ecx = REGS_ECX(regs->regs.gp); - sc.eax = REGS_EAX(regs->regs.gp); - sc.eip = REGS_IP(regs->regs.gp); + sc.di = REGS_EDI(regs->regs.gp); + sc.si = REGS_ESI(regs->regs.gp); + sc.bp = REGS_EBP(regs->regs.gp); + sc.sp = sp; + sc.bx = REGS_EBX(regs->regs.gp); + sc.dx = REGS_EDX(regs->regs.gp); + sc.cx = REGS_ECX(regs->regs.gp); + sc.ax = REGS_EAX(regs->regs.gp); + sc.ip = REGS_IP(regs->regs.gp); sc.cs = REGS_CS(regs->regs.gp); - sc.eflags = REGS_EFLAGS(regs->regs.gp); - sc.esp_at_signal = regs->regs.gp[UESP]; + sc.flags = REGS_EFLAGS(regs->regs.gp); + sc.sp_at_signal = regs->regs.gp[UESP]; sc.ss = regs->regs.gp[SS]; sc.cr2 = fi->cr2; sc.err = fi->error_code; diff --git a/arch/um/sys-x86_64/signal.c b/arch/um/sys-x86_64/signal.c index 1778d33808f..7457436b433 100644 --- a/arch/um/sys-x86_64/signal.c +++ b/arch/um/sys-x86_64/signal.c @@ -4,11 +4,11 @@ * Licensed under the GPL */ -#include "linux/personality.h" -#include "linux/ptrace.h" -#include "asm/unistd.h" -#include "asm/uaccess.h" -#include "asm/ucontext.h" +#include <linux/personality.h> +#include <linux/ptrace.h> +#include <asm/unistd.h> +#include <asm/uaccess.h> +#include <asm/ucontext.h> #include "frame_kern.h" #include "skas.h" @@ -27,16 +27,16 @@ void copy_sc(struct uml_pt_regs *regs, void *from) GETREG(regs, R13, sc, r13); GETREG(regs, R14, sc, r14); GETREG(regs, R15, sc, r15); - GETREG(regs, RDI, sc, rdi); - GETREG(regs, RSI, sc, rsi); - GETREG(regs, RBP, sc, rbp); - GETREG(regs, RBX, sc, rbx); - GETREG(regs, RDX, sc, rdx); - GETREG(regs, RAX, sc, rax); - GETREG(regs, RCX, sc, rcx); - GETREG(regs, RSP, sc, rsp); - GETREG(regs, RIP, sc, rip); - GETREG(regs, EFLAGS, sc, eflags); + GETREG(regs, RDI, sc, di); + GETREG(regs, RSI, sc, si); + GETREG(regs, RBP, sc, bp); + GETREG(regs, RBX, sc, bx); + GETREG(regs, RDX, sc, dx); + GETREG(regs, RAX, sc, ax); + GETREG(regs, RCX, sc, cx); + GETREG(regs, RSP, sc, sp); + GETREG(regs, RIP, sc, ip); + GETREG(regs, EFLAGS, sc, flags); GETREG(regs, CS, sc, cs); #undef GETREG @@ -61,16 +61,16 @@ static int copy_sc_from_user(struct pt_regs *regs, err |= GETREG(regs, R13, from, r13); err |= GETREG(regs, R14, from, r14); err |= GETREG(regs, R15, from, r15); - err |= GETREG(regs, RDI, from, rdi); - err |= GETREG(regs, RSI, from, rsi); - err |= GETREG(regs, RBP, from, rbp); - err |= GETREG(regs, RBX, from, rbx); - err |= GETREG(regs, RDX, from, rdx); - err |= GETREG(regs, RAX, from, rax); - err |= GETREG(regs, RCX, from, rcx); - err |= GETREG(regs, RSP, from, rsp); - err |= GETREG(regs, RIP, from, rip); - err |= GETREG(regs, EFLAGS, from, eflags); + err |= GETREG(regs, RDI, from, di); + err |= GETREG(regs, RSI, from, si); + err |= GETREG(regs, RBP, from, bp); + err |= GETREG(regs, RBX, from, bx); + err |= GETREG(regs, RDX, from, dx); + err |= GETREG(regs, RAX, from, ax); + err |= GETREG(regs, RCX, from, cx); + err |= GETREG(regs, RSP, from, sp); + err |= GETREG(regs, RIP, from, ip); + err |= GETREG(regs, EFLAGS, from, flags); err |= GETREG(regs, CS, from, cs); if (err) return 1; @@ -108,19 +108,19 @@ static int copy_sc_to_user(struct sigcontext __user *to, __put_user((regs)->regs.gp[(regno) / sizeof(unsigned long)], \ &(sc)->regname) - err |= PUTREG(regs, RDI, to, rdi); - err |= PUTREG(regs, RSI, to, rsi); - err |= PUTREG(regs, RBP, to, rbp); + err |= PUTREG(regs, RDI, to, di); + err |= PUTREG(regs, RSI, to, si); + err |= PUTREG(regs, RBP, to, bp); /* * Must use orignal RSP, which is passed in, rather than what's in * the pt_regs, because that's already been updated to point at the * signal frame. */ - err |= __put_user(sp, &to->rsp); - err |= PUTREG(regs, RBX, to, rbx); - err |= PUTREG(regs, RDX, to, rdx); - err |= PUTREG(regs, RCX, to, rcx); - err |= PUTREG(regs, RAX, to, rax); + err |= __put_user(sp, &to->sp); + err |= PUTREG(regs, RBX, to, bx); + err |= PUTREG(regs, RDX, to, dx); + err |= PUTREG(regs, RCX, to, cx); + err |= PUTREG(regs, RAX, to, ax); err |= PUTREG(regs, R8, to, r8); err |= PUTREG(regs, R9, to, r9); err |= PUTREG(regs, R10, to, r10); @@ -135,8 +135,8 @@ static int copy_sc_to_user(struct sigcontext __user *to, err |= __put_user(fi->error_code, &to->err); err |= __put_user(fi->trap_no, &to->trapno); - err |= PUTREG(regs, RIP, to, rip); - err |= PUTREG(regs, EFLAGS, to, eflags); + err |= PUTREG(regs, RIP, to, ip); + err |= PUTREG(regs, EFLAGS, to, flags); #undef PUTREG err |= __put_user(mask, &to->oldmask); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80b7ba4056d..fb3eea3e38e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,81 +17,69 @@ config X86_64 ### Arch settings config X86 - bool - default y + def_bool y + +config GENERIC_LOCKBREAK + def_bool n config GENERIC_TIME - bool - default y + def_bool y config GENERIC_CMOS_UPDATE - bool - default y + def_bool y config CLOCKSOURCE_WATCHDOG - bool - default y + def_bool y config GENERIC_CLOCKEVENTS - bool - default y + def_bool y config GENERIC_CLOCKEVENTS_BROADCAST - bool - default y + def_bool y depends on X86_64 || (X86_32 && X86_LOCAL_APIC) config LOCKDEP_SUPPORT - bool - default y + def_bool y config STACKTRACE_SUPPORT - bool - default y + def_bool y config SEMAPHORE_SLEEPERS - bool - default y + def_bool y config MMU - bool - default y + def_bool y config ZONE_DMA - bool - default y + def_bool y config QUICKLIST - bool - default X86_32 + def_bool X86_32 config SBUS bool config GENERIC_ISA_DMA - bool - default y + def_bool y config GENERIC_IOMAP - bool - default y + def_bool y config GENERIC_BUG - bool - default y + def_bool y depends on BUG config GENERIC_HWEIGHT - bool - default y + def_bool y + +config GENERIC_GPIO + def_bool n config ARCH_MAY_HAVE_PC_FDC - bool - default y + def_bool y config DMI - bool - default y + def_bool y config RWSEM_GENERIC_SPINLOCK def_bool !X86_XADD @@ -112,6 +100,9 @@ config GENERIC_TIME_VSYSCALL bool default X86_64 +config HAVE_SETUP_PER_CPU_AREA + def_bool X86_64 + config ARCH_SUPPORTS_OPROFILE bool default y @@ -144,9 +135,17 @@ config GENERIC_PENDING_IRQ config X86_SMP bool - depends on X86_32 && SMP && !X86_VOYAGER + depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) default y +config X86_32_SMP + def_bool y + depends on X86_32 && SMP + +config X86_64_SMP + def_bool y + depends on X86_64 && SMP + config X86_HT bool depends on SMP @@ -292,6 +291,18 @@ config X86_ES7000 Only choose this option if you have such a system, otherwise you should say N here. +config X86_RDC321X + bool "RDC R-321x SoC" + depends on X86_32 + select M486 + select X86_REBOOTFIXUPS + select GENERIC_GPIO + select LEDS_GPIO + help + This option is needed for RDC R-321x system-on-chip, also known + as R-8610-(G). + If you don't have one of these chips, you should say N here. + config X86_VSMP bool "Support for ScaleMP vSMP" depends on X86_64 && PCI @@ -303,8 +314,8 @@ config X86_VSMP endchoice config SCHED_NO_NO_OMIT_FRAME_POINTER - bool "Single-depth WCHAN output" - default y + def_bool y + prompt "Single-depth WCHAN output" depends on X86_32 help Calculate simpler /proc/<PID>/wchan values. If this option @@ -314,18 +325,8 @@ config SCHED_NO_NO_OMIT_FRAME_POINTER If in doubt, say "Y". -config PARAVIRT - bool - depends on X86_32 && !(X86_VISWS || X86_VOYAGER) - help - This changes the kernel so it can modify itself when it is run - under a hypervisor, potentially improving performance significantly - over full virtualization. However, when run without a hypervisor - the kernel is theoretically slower and slightly larger. - menuconfig PARAVIRT_GUEST bool "Paravirtualized guest support" - depends on X86_32 help Say Y here to get to see options related to running Linux under various hypervisors. This option alone does not add any kernel code. @@ -339,6 +340,7 @@ source "arch/x86/xen/Kconfig" config VMI bool "VMI Guest support" select PARAVIRT + depends on X86_32 depends on !(X86_VISWS || X86_VOYAGER) help VMI provides a paravirtualized interface to the VMware ESX server @@ -348,40 +350,43 @@ config VMI source "arch/x86/lguest/Kconfig" +config PARAVIRT + bool "Enable paravirtualization code" + depends on !(X86_VISWS || X86_VOYAGER) + help + This changes the kernel so it can modify itself when it is run + under a hypervisor, potentially improving performance significantly + over full virtualization. However, when run without a hypervisor + the kernel is theoretically slower and slightly larger. + endif config ACPI_SRAT - bool - default y + def_bool y depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH) select ACPI_NUMA config HAVE_ARCH_PARSE_SRAT - bool - default y - depends on ACPI_SRAT + def_bool y + depends on ACPI_SRAT config X86_SUMMIT_NUMA - bool - default y + def_bool y depends on X86_32 && NUMA && (X86_SUMMIT || X86_GENERICARCH) config X86_CYCLONE_TIMER - bool - default y + def_bool y depends on X86_32 && X86_SUMMIT || X86_GENERICARCH config ES7000_CLUSTERED_APIC - bool - default y + def_bool y depends on SMP && X86_ES7000 && MPENTIUMIII source "arch/x86/Kconfig.cpu" config HPET_TIMER - bool + def_bool X86_64 prompt "HPET Timer Support" if X86_32 - default X86_64 help Use the IA-PC HPET (High Precision Event Timer) to manage time in preference to the PIT and RTC, if a HPET is @@ -399,9 +404,8 @@ config HPET_TIMER Choose N to continue using the legacy 8254 timer. config HPET_EMULATE_RTC - bool - depends on HPET_TIMER && RTC=y - default y + def_bool y + depends on HPET_TIMER && (RTC=y || RTC=m) # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. @@ -441,8 +445,8 @@ config CALGARY_IOMMU If unsure, say Y. config CALGARY_IOMMU_ENABLED_BY_DEFAULT - bool "Should Calgary be enabled by default?" - default y + def_bool y + prompt "Should Calgary be enabled by default?" depends on CALGARY_IOMMU help Should Calgary be enabled by default? if you choose 'y', Calgary @@ -486,9 +490,9 @@ config SCHED_SMT N here. config SCHED_MC - bool "Multi-core scheduler support" + def_bool y + prompt "Multi-core scheduler support" depends on (X86_64 && SMP) || (X86_32 && X86_HT) - default y help Multi-core scheduler support improves the CPU scheduler's decision making when dealing with multi-core CPU chips at a cost of slightly @@ -522,19 +526,16 @@ config X86_UP_IOAPIC an IO-APIC, then the kernel will still run with no slowdown at all. config X86_LOCAL_APIC - bool + def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER) || X86_GENERICARCH)) - default y config X86_IO_APIC - bool + def_bool y depends on X86_64 || (X86_32 && (X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER)) || X86_GENERICARCH)) - default y config X86_VISWS_APIC - bool + def_bool y depends on X86_32 && X86_VISWS - default y config X86_MCE bool "Machine Check Exception" @@ -554,17 +555,17 @@ config X86_MCE the 386 and 486, so nearly everyone can say Y here. config X86_MCE_INTEL - bool "Intel MCE features" + def_bool y + prompt "Intel MCE features" depends on X86_64 && X86_MCE && X86_LOCAL_APIC - default y help Additional support for intel specific MCE features such as the thermal monitor. config X86_MCE_AMD - bool "AMD MCE features" + def_bool y + prompt "AMD MCE features" depends on X86_64 && X86_MCE && X86_LOCAL_APIC - default y help Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -637,9 +638,9 @@ config I8K Say N otherwise. config X86_REBOOTFIXUPS - bool "Enable X86 board specific fixups for reboot" + def_bool n + prompt "Enable X86 board specific fixups for reboot" depends on X86_32 && X86 - default n ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -648,7 +649,7 @@ config X86_REBOOTFIXUPS system. Currently, the only fixup is for the Geode machines using - CS5530A and CS5536 chipsets. + CS5530A and CS5536 chipsets and the RDC R-321x SoC. Say Y if you want to enable the fixup. Currently, it's safe to enable this option even if you don't need it. @@ -672,9 +673,8 @@ config MICROCODE module will be called microcode. config MICROCODE_OLD_INTERFACE - bool + def_bool y depends on MICROCODE - default y config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" @@ -798,13 +798,12 @@ config PAGE_OFFSET depends on X86_32 config HIGHMEM - bool + def_bool y depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) - default y config X86_PAE - bool "PAE (Physical Address Extension) Support" - default n + def_bool n + prompt "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G select RESOURCES_64BIT help @@ -836,10 +835,10 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) config K8_NUMA - bool "Old style AMD Opteron NUMA detection" - depends on X86_64 && NUMA && PCI - default y - help + def_bool y + prompt "Old style AMD Opteron NUMA detection" + depends on X86_64 && NUMA && PCI + help Enable K8 NUMA node topology detection. You should say Y here if you have a multi processor AMD K8 system. This uses an old method to read the NUMA configuration directly from the builtin @@ -847,10 +846,10 @@ config K8_NUMA instead, which also takes priority if both are compiled in. config X86_64_ACPI_NUMA - bool "ACPI NUMA detection" + def_bool y + prompt "ACPI NUMA detection" depends on X86_64 && NUMA && ACPI && PCI select ACPI_NUMA - default y help Enable ACPI SRAT based node topology detection. @@ -864,52 +863,53 @@ config NUMA_EMU config NODES_SHIFT int + range 1 15 if X86_64 default "6" if X86_64 default "4" if X86_NUMAQ default "3" depends on NEED_MULTIPLE_NODES config HAVE_ARCH_BOOTMEM_NODE - bool + def_bool y depends on X86_32 && NUMA - default y config ARCH_HAVE_MEMORY_PRESENT - bool + def_bool y depends on X86_32 && DISCONTIGMEM - default y config NEED_NODE_MEMMAP_SIZE - bool + def_bool y depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) - default y config HAVE_ARCH_ALLOC_REMAP - bool + def_bool y depends on X86_32 && NUMA - default y config ARCH_FLATMEM_ENABLE def_bool y - depends on (X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC) || (X86_64 && !NUMA) + depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y - depends on NUMA + depends on NUMA && X86_32 config ARCH_DISCONTIGMEM_DEFAULT def_bool y - depends on NUMA + depends on NUMA && X86_32 + +config ARCH_SPARSEMEM_DEFAULT + def_bool y + depends on X86_64 config ARCH_SPARSEMEM_ENABLE def_bool y - depends on NUMA || (EXPERIMENTAL && (X86_PC || X86_64)) + depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 config ARCH_SELECT_MEMORY_MODEL def_bool y - depends on X86_32 && ARCH_SPARSEMEM_ENABLE + depends on ARCH_SPARSEMEM_ENABLE config ARCH_MEMORY_PROBE def_bool X86_64 @@ -987,42 +987,32 @@ config MTRR See <file:Documentation/mtrr.txt> for more information. config EFI - bool "Boot from EFI support" - depends on X86_32 && ACPI - default n + def_bool n + prompt "EFI runtime service support" + depends on ACPI ---help--- - This enables the kernel to boot on EFI platforms using - system configuration information passed to it from the firmware. - This also enables the kernel to use any EFI runtime services that are + This enables the kernel to use EFI runtime services that are available (such as the EFI variable services). - This option is only useful on systems that have EFI firmware - and will result in a kernel image that is ~8k larger. In addition, - you must use the latest ELILO loader available at - <http://elilo.sourceforge.net> in order to take advantage of - kernel initialization using EFI information (neither GRUB nor LILO know - anything about EFI). However, even with this option, the resultant - kernel should continue to boot on existing non-EFI platforms. + This option is only useful on systems that have EFI firmware. + In addition, you should use the latest ELILO loader available + at <http://elilo.sourceforge.net> in order to take advantage + of EFI runtime services. However, even with this option, the + resultant kernel should continue to boot on existing non-EFI + platforms. config IRQBALANCE - bool "Enable kernel irq balancing" + def_bool y + prompt "Enable kernel irq balancing" depends on X86_32 && SMP && X86_IO_APIC - default y help The default yes will allow the kernel to do irq load balancing. Saying no will keep the kernel from doing irq load balancing. -# turning this on wastes a bunch of space. -# Summit needs it only when NUMA is on -config BOOT_IOREMAP - bool - depends on X86_32 && (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) - default y - config SECCOMP - bool "Enable seccomp to safely compute untrusted bytecode" + def_bool y + prompt "Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS - default y help This kernel feature is useful for number crunching applications that may need to compute untrusted bytecode during their @@ -1189,11 +1179,11 @@ config HOTPLUG_CPU suspend. config COMPAT_VDSO - bool "Compat VDSO support" - default y - depends on X86_32 + def_bool y + prompt "Compat VDSO support" + depends on X86_32 || IA32_EMULATION help - Map the VDSO to the predictable old-style address too. + Map the 32-bit VDSO to the predictable old-style address too. ---help--- Say N here if you are running a sufficiently recent glibc version (2.3.3 or later), to remove the high-mapped @@ -1207,30 +1197,26 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) -config MEMORY_HOTPLUG_RESERVE - def_bool X86_64 - depends on (MEMORY_HOTPLUG && DISCONTIGMEM) - config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA -config OUT_OF_LINE_PFN_TO_PAGE - def_bool X86_64 - depends on DISCONTIGMEM - menu "Power management options" depends on !X86_VOYAGER config ARCH_HIBERNATION_HEADER - bool + def_bool y depends on X86_64 && HIBERNATION - default y source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +config X86_APM_BOOT + bool + default y + depends on APM || APM_MODULE + menuconfig APM tristate "APM (Advanced Power Management) BIOS support" depends on X86_32 && PM_SLEEP && !X86_VISWS @@ -1371,7 +1357,7 @@ menu "Bus options (PCI etc.)" config PCI bool "PCI support" if !X86_VISWS depends on !X86_VOYAGER - default y if X86_VISWS + default y select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) help Find out whether you have a PCI motherboard. PCI is the name of a @@ -1418,25 +1404,21 @@ config PCI_GOANY endchoice config PCI_BIOS - bool + def_bool y depends on X86_32 && !X86_VISWS && PCI && (PCI_GOBIOS || PCI_GOANY) - default y # x86-64 doesn't support PCI BIOS access from long mode so always go direct. config PCI_DIRECT - bool + def_bool y depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY) || X86_VISWS) - default y config PCI_MMCONFIG - bool + def_bool y depends on X86_32 && PCI && ACPI && (PCI_GOMMCONFIG || PCI_GOANY) - default y config PCI_DOMAINS - bool + def_bool y depends on PCI - default y config PCI_MMCONFIG bool "Support mmconfig PCI config space access" @@ -1453,9 +1435,9 @@ config DMAR remapping devices. config DMAR_GFX_WA - bool "Support for Graphics workaround" + def_bool y + prompt "Support for Graphics workaround" depends on DMAR - default y help Current Graphics drivers tend to use physical address for DMA and avoid using DMA APIs. Setting this config @@ -1464,9 +1446,8 @@ config DMAR_GFX_WA to use physical addresses for DMA. config DMAR_FLOPPY_WA - bool + def_bool y depends on DMAR - default y help Floppy disk drivers are know to bypass DMA API calls thereby failing to work when IOMMU is enabled. This @@ -1479,8 +1460,7 @@ source "drivers/pci/Kconfig" # x86_64 have no ISA slots, but do have ISA-style DMA. config ISA_DMA_API - bool - default y + def_bool y if X86_32 @@ -1546,9 +1526,9 @@ config SCx200HR_TIMER other workaround is idle=poll boot option. config GEODE_MFGPT_TIMER - bool "Geode Multi-Function General Purpose Timer (MFGPT) events" + def_bool y + prompt "Geode Multi-Function General Purpose Timer (MFGPT) events" depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS - default y help This driver provides a clock event source based on the MFGPT timer(s) in the CS5535 and CS5536 companion chip for the geode. @@ -1575,6 +1555,7 @@ source "fs/Kconfig.binfmt" config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 + select COMPAT_BINFMT_ELF help Include code to run 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're 100% sure that you don't have any @@ -1587,18 +1568,16 @@ config IA32_AOUT Support old a.out binaries in the 32bit emulation. config COMPAT - bool + def_bool y depends on IA32_EMULATION - default y config COMPAT_FOR_U64_ALIGNMENT def_bool COMPAT depends on X86_64 config SYSVIPC_COMPAT - bool + def_bool y depends on X86_64 && COMPAT && SYSVIPC - default y endmenu diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index c30162202dc..e09a6b73a1a 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -219,10 +219,10 @@ config MGEODEGX1 Select this for a Geode GX1 (Cyrix MediaGX) chip. config MGEODE_LX - bool "Geode GX/LX" + bool "Geode GX/LX" depends on X86_32 - help - Select this for AMD Geode GX and LX processors. + help + Select this for AMD Geode GX and LX processors. config MCYRIXIII bool "CyrixIII/VIA-C3" @@ -258,7 +258,7 @@ config MPSC Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey Xeon CPUs with Intel 64bit which is compatible with x86-64. Note that the latest Xeons (Xeon 51xx and 53xx) are not based on the - Netburst core and shouldn't use this option. You can distinguish them + Netburst core and shouldn't use this option. You can distinguish them using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. @@ -317,81 +317,75 @@ config X86_L1_CACHE_SHIFT default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 config X86_XADD - bool + def_bool y depends on X86_32 && !M386 - default y config X86_PPRO_FENCE - bool + bool "PentiumPro memory ordering errata workaround" depends on M686 || M586MMX || M586TSC || M586 || M486 || M386 || MGEODEGX1 - default y + help + Old PentiumPro multiprocessor systems had errata that could cause memory + operations to violate the x86 ordering standard in rare cases. Enabling this + option will attempt to work around some (but not all) occurances of + this problem, at the cost of much heavier spinlock and memory barrier + operations. + + If unsure, say n here. Even distro kernels should think twice before enabling + this: there are few systems, and an unlikely bug. config X86_F00F_BUG - bool + def_bool y depends on M586MMX || M586TSC || M586 || M486 || M386 - default y config X86_WP_WORKS_OK - bool + def_bool y depends on X86_32 && !M386 - default y config X86_INVLPG - bool + def_bool y depends on X86_32 && !M386 - default y config X86_BSWAP - bool + def_bool y depends on X86_32 && !M386 - default y config X86_POPAD_OK - bool + def_bool y depends on X86_32 && !M386 - default y config X86_ALIGNMENT_16 - bool + def_bool y depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 - default y config X86_GOOD_APIC - bool + def_bool y depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64 - default y config X86_INTEL_USERCOPY - bool + def_bool y depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 - default y config X86_USE_PPRO_CHECKSUM - bool + def_bool y depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 - default y config X86_USE_3DNOW - bool + def_bool y depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML - default y config X86_OOSTORE - bool + def_bool y depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR - default y config X86_TSC - bool + def_bool y depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 - default y # this should be set for all -march=.. options where the compiler # generates cmov. config X86_CMOV - bool + def_bool y depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) - default y config X86_MINIMUM_CPU_FAMILY int @@ -399,3 +393,6 @@ config X86_MINIMUM_CPU_FAMILY default "4" if X86_32 && (X86_XADD || X86_CMPXCHG || X86_BSWAP || X86_WP_WORKS_OK) default "3" +config X86_DEBUGCTLMSR + def_bool y + depends on !(M586MMX || M586TSC || M586 || M486 || M386) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 761ca7b5f12..2e1e3af28c3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -6,7 +6,7 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" config EARLY_PRINTK - bool "Early printk" if EMBEDDED && DEBUG_KERNEL && X86_32 + bool "Early printk" if EMBEDDED default y help Write kernel log output directly into the VGA buffer or to a serial @@ -40,22 +40,49 @@ comment "Page alloc debug is incompatible with Software Suspend on i386" config DEBUG_PAGEALLOC bool "Debug page memory allocations" - depends on DEBUG_KERNEL && !HIBERNATION && !HUGETLBFS - depends on X86_32 + depends on DEBUG_KERNEL && X86_32 help Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types of memory corruptions. +config DEBUG_PER_CPU_MAPS + bool "Debug access to per_cpu maps" + depends on DEBUG_KERNEL + depends on X86_64_SMP + default n + help + Say Y to verify that the per_cpu map being accessed has + been setup. Adds a fair amount of code to kernel memory + and decreases performance. + + Say N if unsure. + config DEBUG_RODATA bool "Write protect kernel read-only data structures" + default y depends on DEBUG_KERNEL help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const - data. This option may have a slight performance impact because a - portion of the kernel code won't be covered by a 2MB TLB anymore. - If in doubt, say "N". + data. This is recommended so that we can catch kernel bugs sooner. + If in doubt, say "Y". + +config DEBUG_RODATA_TEST + bool "Testcase for the DEBUG_RODATA feature" + depends on DEBUG_RODATA + help + This option enables a testcase for the DEBUG_RODATA + feature as well as for the change_page_attr() infrastructure. + If in doubt, say "N" + +config DEBUG_NX_TEST + tristate "Testcase for the NX non-executable stack feature" + depends on DEBUG_KERNEL && m + help + This option enables a testcase for the CPU NX capability + and the software setup of this feature. + If in doubt, say "N" config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" @@ -75,8 +102,7 @@ config X86_FIND_SMP_CONFIG config X86_MPPARSE def_bool y - depends on X86_LOCAL_APIC && !X86_VISWS - depends on X86_32 + depends on (X86_32 && (X86_LOCAL_APIC && !X86_VISWS)) || X86_64 config DOUBLEFAULT default y @@ -112,4 +138,91 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +# +# IO delay types: +# + +config IO_DELAY_TYPE_0X80 + int + default "0" + +config IO_DELAY_TYPE_0XED + int + default "1" + +config IO_DELAY_TYPE_UDELAY + int + default "2" + +config IO_DELAY_TYPE_NONE + int + default "3" + +choice + prompt "IO delay type" + default IO_DELAY_0XED + +config IO_DELAY_0X80 + bool "port 0x80 based port-IO delay [recommended]" + help + This is the traditional Linux IO delay used for in/out_p. + It is the most tested hence safest selection here. + +config IO_DELAY_0XED + bool "port 0xed based port-IO delay" + help + Use port 0xed as the IO delay. This frees up port 0x80 which is + often used as a hardware-debug port. + +config IO_DELAY_UDELAY + bool "udelay based port-IO delay" + help + Use udelay(2) as the IO delay method. This provides the delay + while not having any side-effect on the IO port space. + +config IO_DELAY_NONE + bool "no port-IO delay" + help + No port-IO delay. Will break on old boxes that require port-IO + delay for certain operations. Should work on most new machines. + +endchoice + +if IO_DELAY_0X80 +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0X80 +endif + +if IO_DELAY_0XED +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_0XED +endif + +if IO_DELAY_UDELAY +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_UDELAY +endif + +if IO_DELAY_NONE +config DEFAULT_IO_DELAY_TYPE + int + default IO_DELAY_TYPE_NONE +endif + +config DEBUG_BOOT_PARAMS + bool "Debug boot parameters" + depends on DEBUG_KERNEL + depends on DEBUG_FS + help + This option will cause struct boot_params to be exported via debugfs. + +config CPA_DEBUG + bool "CPA self test code" + depends on DEBUG_KERNEL + help + Do change_page_attr self tests at boot. + endmenu diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 7aa1dc6d67c..b08f18261df 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,13 +7,252 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -# No need to remake these files -$(srctree)/arch/x86/Makefile%: ; +# BITS is used as extension for files which are available in a 32 bit +# and a 64 bit version to simplify shared Makefiles. +# e.g.: obj-y += foo_$(BITS).o +export BITS ifeq ($(CONFIG_X86_32),y) + BITS := 32 UTS_MACHINE := i386 - include $(srctree)/arch/x86/Makefile_32 + CHECKFLAGS += -D__i386__ + + biarch := $(call cc-option,-m32) + KBUILD_AFLAGS += $(biarch) + KBUILD_CFLAGS += $(biarch) + + ifdef CONFIG_RELOCATABLE + LDFLAGS_vmlinux := --emit-relocs + endif + + KBUILD_CFLAGS += -msoft-float -mregparm=3 -freg-struct-return + + # prevent gcc from keeping the stack 16 byte aligned + KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) + + # Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use + # a lot more stack due to the lack of sharing of stacklots: + KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ + echo $(call cc-option,-fno-unit-at-a-time); fi ;) + + # CPU-specific tuning. Anything which can be shared with UML should go here. + include $(srctree)/arch/x86/Makefile_32.cpu + KBUILD_CFLAGS += $(cflags-y) + + # temporary until string.h is fixed + KBUILD_CFLAGS += -ffreestanding else + BITS := 64 UTS_MACHINE := x86_64 - include $(srctree)/arch/x86/Makefile_64 + CHECKFLAGS += -D__x86_64__ -m64 + + KBUILD_AFLAGS += -m64 + KBUILD_CFLAGS += -m64 + + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) + cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) + + cflags-$(CONFIG_MCORE2) += \ + $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) + cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) + KBUILD_CFLAGS += $(cflags-y) + + KBUILD_CFLAGS += -mno-red-zone + KBUILD_CFLAGS += -mcmodel=kernel + + # -funit-at-a-time shrinks the kernel .text considerably + # unfortunately it makes reading oopses harder. + KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) + + # this works around some issues with generating unwind tables in older gccs + # newer gccs do it by default + KBUILD_CFLAGS += -maccumulate-outgoing-args + + stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh + stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ + "$(CC)" -fstack-protector ) + stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ + "$(CC)" -fstack-protector-all ) + + KBUILD_CFLAGS += $(stackp-y) +endif + +# Stackpointer is addressed different for 32 bit and 64 bit x86 +sp-$(CONFIG_X86_32) := esp +sp-$(CONFIG_X86_64) := rsp + +# do binutils support CFI? +cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) +# is .cfi_signal_frame supported too? +cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) + +LDFLAGS := -m elf_$(UTS_MACHINE) +OBJCOPYFLAGS := -O binary -R .note -R .comment -S + +# Speed up the build +KBUILD_CFLAGS += -pipe +# Workaround for a gcc prelease that unfortunately was shipped in a suse release +KBUILD_CFLAGS += -Wno-sign-compare +# +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables +# prevent gcc from generating any FP code by mistake +KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) + +### +# Sub architecture support +# fcore-y is linked before mcore-y files. + +# Default subarch .c files +mcore-y := arch/x86/mach-default/ + +# Voyager subarch support +mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager +mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ + +# VISWS subarch support +mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws +mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws/ + +# NUMAQ subarch support +mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq +mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default/ + +# BIGSMP subarch support +mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp +mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default/ + +#Summit subarch support +mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit +mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default/ + +# generic subarchitecture +mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic +fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ +mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ + + +# ES7000 subarch support +mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000 +fcore-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ +mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default/ + +# RDC R-321x subarch support +mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x +mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default +core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/ + +# default subarch .h files +mflags-y += -Iinclude/asm-x86/mach-default + +# 64 bit does not support subarch support - clear sub arch variables +fcore-$(CONFIG_X86_64) := +mcore-$(CONFIG_X86_64) := +mflags-$(CONFIG_X86_64) := + +KBUILD_CFLAGS += $(mflags-y) +KBUILD_AFLAGS += $(mflags-y) + +### +# Kernel objects + +head-y := arch/x86/kernel/head_$(BITS).o +head-$(CONFIG_X86_64) += arch/x86/kernel/head64.o +head-y += arch/x86/kernel/init_task.o + +libs-y += arch/x86/lib/ + +# Sub architecture files that needs linking first +core-y += $(fcore-y) + +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/x86/xen/ + +# lguest paravirtualization support +core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ + +core-y += arch/x86/kernel/ +core-y += arch/x86/mm/ + +# Remaining sub architecture files +core-y += $(mcore-y) + +core-y += arch/x86/crypto/ +core-y += arch/x86/vdso/ +core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ + +# drivers-y are linked after core-y +drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ +drivers-$(CONFIG_PCI) += arch/x86/pci/ + +# must be linked after kernel/ +drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ + +ifeq ($(CONFIG_X86_32),y) +drivers-$(CONFIG_PM) += arch/x86/power/ +drivers-$(CONFIG_FB) += arch/x86/video/ endif + +#### +# boot loader support. Several targets are kept for legacy purposes + +boot := arch/x86/boot + +PHONY += zImage bzImage compressed zlilo bzlilo \ + zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install + +# Default kernel to build +all: bzImage + +# KBUILD_IMAGE specify target image being built + KBUILD_IMAGE := $(boot)/bzImage +zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage + +zImage bzImage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) + $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot + $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/bzImage + +compressed: zImage + +zlilo bzlilo: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo + +zdisk bzdisk: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk + +fdimage fdimage144 fdimage288 isoimage: vmlinux + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ + +install: vdso_install + $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install + +PHONY += vdso_install +vdso_install: + $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + +archclean: + $(Q)rm -rf $(objtree)/arch/i386 + $(Q)rm -rf $(objtree)/arch/x86_64 + $(Q)$(MAKE) $(clean)=$(boot) + +define archhelp + echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' + echo ' install - Install kernel using' + echo ' (your) ~/bin/installkernel or' + echo ' (distribution) /sbin/installkernel or' + echo ' install to $$(INSTALL_PATH) and run lilo' + echo ' fdimage - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage144 - Create 1.4MB boot floppy image (arch/x86/boot/fdimage)' + echo ' fdimage288 - Create 2.8MB boot floppy image (arch/x86/boot/fdimage)' + echo ' isoimage - Create a boot CD-ROM image (arch/x86/boot/image.iso)' + echo ' bzdisk/fdimage*/isoimage also accept:' + echo ' FDARGS="..." arguments for the booted kernel' + echo ' FDINITRD=file initrd for the booted kernel' +endef + +CLEAN_FILES += arch/x86/boot/fdimage \ + arch/x86/boot/image.iso \ + arch/x86/boot/mtools.conf diff --git a/arch/x86/Makefile_32 b/arch/x86/Makefile_32 deleted file mode 100644 index 50394da2f6c..00000000000 --- a/arch/x86/Makefile_32 +++ /dev/null @@ -1,175 +0,0 @@ -# -# i386 Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to do have actions -# for "archclean" cleaning up for this architecture. -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@geocities.com> -# Added '-march' and '-mpreferred-stack-boundary' support -# -# 20050320 Kianusch Sayah Karadji <kianusch@sk-tech.net> -# Added support for GEODE CPU - -# BITS is used as extension for files which are available in a 32 bit -# and a 64 bit version to simplify shared Makefiles. -# e.g.: obj-y += foo_$(BITS).o -BITS := 32 -export BITS - -HAS_BIARCH := $(call cc-option-yn, -m32) -ifeq ($(HAS_BIARCH),y) -AS := $(AS) --32 -LD := $(LD) -m elf_i386 -CC := $(CC) -m32 -endif - -LDFLAGS := -m elf_i386 -OBJCOPYFLAGS := -O binary -R .note -R .comment -S -ifdef CONFIG_RELOCATABLE -LDFLAGS_vmlinux := --emit-relocs -endif -CHECKFLAGS += -D__i386__ - -KBUILD_CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return - -# prevent gcc from keeping the stack 16 byte aligned -KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) - -# CPU-specific tuning. Anything which can be shared with UML should go here. -include $(srctree)/arch/x86/Makefile_32.cpu - -# temporary until string.h is fixed -cflags-y += -ffreestanding - -# this works around some issues with generating unwind tables in older gccs -# newer gccs do it by default -cflags-y += -maccumulate-outgoing-args - -# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use -# a lot more stack due to the lack of sharing of stacklots: -KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then echo $(call cc-option,-fno-unit-at-a-time); fi ;) - -# do binutils support CFI? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset esp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) - -# is .cfi_signal_frame supported too? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) - -KBUILD_CFLAGS += $(cflags-y) - -# Default subarch .c files -mcore-y := arch/x86/mach-default - -# Voyager subarch support -mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager -mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager - -# VISWS subarch support -mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-x86/mach-visws -mcore-$(CONFIG_X86_VISWS) := arch/x86/mach-visws - -# NUMAQ subarch support -mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-x86/mach-numaq -mcore-$(CONFIG_X86_NUMAQ) := arch/x86/mach-default - -# BIGSMP subarch support -mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-x86/mach-bigsmp -mcore-$(CONFIG_X86_BIGSMP) := arch/x86/mach-default - -#Summit subarch support -mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-x86/mach-summit -mcore-$(CONFIG_X86_SUMMIT) := arch/x86/mach-default - -# generic subarchitecture -mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-x86/mach-generic -mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default -core-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ - -# ES7000 subarch support -mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-x86/mach-es7000 -mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default -core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ - -# Xen paravirtualization support -core-$(CONFIG_XEN) += arch/x86/xen/ - -# lguest paravirtualization support -core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ - -# default subarch .h files -mflags-y += -Iinclude/asm-x86/mach-default - -head-y := arch/x86/kernel/head_32.o arch/x86/kernel/init_task.o - -libs-y += arch/x86/lib/ -core-y += arch/x86/kernel/ \ - arch/x86/mm/ \ - $(mcore-y)/ \ - arch/x86/crypto/ -drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ -drivers-$(CONFIG_PCI) += arch/x86/pci/ -# must be linked after kernel/ -drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ -drivers-$(CONFIG_PM) += arch/x86/power/ -drivers-$(CONFIG_FB) += arch/x86/video/ - -KBUILD_CFLAGS += $(mflags-y) -KBUILD_AFLAGS += $(mflags-y) - -boot := arch/x86/boot - -PHONY += zImage bzImage compressed zlilo bzlilo \ - zdisk bzdisk fdimage fdimage144 fdimage288 isoimage install - -all: bzImage - -# KBUILD_IMAGE specify target image being built - KBUILD_IMAGE := $(boot)/bzImage -zImage zlilo zdisk: KBUILD_IMAGE := arch/x86/boot/zImage - -zImage bzImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) - $(Q)mkdir -p $(objtree)/arch/i386/boot - $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/i386/boot/bzImage - -compressed: zImage - -zlilo bzlilo: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo - -zdisk bzdisk: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk - -fdimage fdimage144 fdimage288 isoimage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@ - -install: - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install - -archclean: - $(Q)rm -rf $(objtree)/arch/i386/boot - $(Q)$(MAKE) $(clean)=arch/x86/boot - -define archhelp - echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' bzdisk - Create a boot floppy in /dev/fd0' - echo ' fdimage - Create a boot floppy image' - echo ' isoimage - Create a boot CD-ROM image' -endef - -CLEAN_FILES += arch/x86/boot/fdimage \ - arch/x86/boot/image.iso \ - arch/x86/boot/mtools.conf diff --git a/arch/x86/Makefile_64 b/arch/x86/Makefile_64 deleted file mode 100644 index a804860022e..00000000000 --- a/arch/x86/Makefile_64 +++ /dev/null @@ -1,144 +0,0 @@ -# -# x86_64 Makefile -# -# This file is included by the global makefile so that you can add your own -# architecture-specific flags and dependencies. Remember to do have actions -# for "archclean" and "archdep" for cleaning up and making dependencies for -# this architecture -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# -# Copyright (C) 1994 by Linus Torvalds -# -# 19990713 Artur Skawina <skawina@geocities.com> -# Added '-march' and '-mpreferred-stack-boundary' support -# 20000913 Pavel Machek <pavel@suse.cz> -# Converted for x86_64 architecture -# 20010105 Andi Kleen, add IA32 compiler. -# ....and later removed it again.... -# -# $Id: Makefile,v 1.31 2002/03/22 15:56:07 ak Exp $ - -# BITS is used as extension for files which are available in a 32 bit -# and a 64 bit version to simplify shared Makefiles. -# e.g.: obj-y += foo_$(BITS).o -BITS := 64 -export BITS - -LDFLAGS := -m elf_x86_64 -OBJCOPYFLAGS := -O binary -R .note -R .comment -S -LDFLAGS_vmlinux := -CHECKFLAGS += -D__x86_64__ -m64 - -cflags-y := -cflags-kernel-y := -cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) -cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) -# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it -# will eventually. Use -mtune=generic as fallback -cflags-$(CONFIG_MCORE2) += \ - $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) -cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) - -cflags-y += -m64 -cflags-y += -mno-red-zone -cflags-y += -mcmodel=kernel -cflags-y += -pipe -cflags-y += -Wno-sign-compare -cflags-y += -fno-asynchronous-unwind-tables -ifneq ($(CONFIG_DEBUG_INFO),y) -# -fweb shrinks the kernel a bit, but the difference is very small -# it also messes up debugging, so don't use it for now. -#cflags-y += $(call cc-option,-fweb) -endif -# -funit-at-a-time shrinks the kernel .text considerably -# unfortunately it makes reading oopses harder. -cflags-y += $(call cc-option,-funit-at-a-time) -# prevent gcc from generating any FP code by mistake -cflags-y += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) -# this works around some issues with generating unwind tables in older gccs -# newer gccs do it by default -cflags-y += -maccumulate-outgoing-args - -# do binutils support CFI? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_rel_offset rsp${comma}0\n.cfi_endproc,-DCONFIG_AS_CFI=1,) - -# is .cfi_signal_frame supported too? -cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) -KBUILD_AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) - -cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector ) -cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector-all ) - -KBUILD_CFLAGS += $(cflags-y) -CFLAGS_KERNEL += $(cflags-kernel-y) -KBUILD_AFLAGS += -m64 - -head-y := arch/x86/kernel/head_64.o arch/x86/kernel/head64.o arch/x86/kernel/init_task.o - -libs-y += arch/x86/lib/ -core-y += arch/x86/kernel/ \ - arch/x86/mm/ \ - arch/x86/crypto/ \ - arch/x86/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ -drivers-$(CONFIG_PCI) += arch/x86/pci/ -drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/ - -boot := arch/x86/boot - -PHONY += bzImage bzlilo install archmrproper \ - fdimage fdimage144 fdimage288 isoimage archclean - -#Default target when executing "make" -all: bzImage - -BOOTIMAGE := arch/x86/boot/bzImage -KBUILD_IMAGE := $(BOOTIMAGE) - -bzImage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) $(BOOTIMAGE) - $(Q)mkdir -p $(objtree)/arch/x86_64/boot - $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/x86_64/boot/bzImage - -bzlilo: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zlilo - -bzdisk: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) zdisk - -fdimage fdimage144 fdimage288 isoimage: vmlinux - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ - -install: vdso_install - $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(BOOTIMAGE) $@ - -vdso_install: -ifeq ($(CONFIG_IA32_EMULATION),y) - $(Q)$(MAKE) $(build)=arch/x86/ia32 $@ -endif - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ - -archclean: - $(Q)rm -rf $(objtree)/arch/x86_64/boot - $(Q)$(MAKE) $(clean)=$(boot) - -define archhelp - echo '* bzImage - Compressed kernel image (arch/x86/boot/bzImage)' - echo ' install - Install kernel using' - echo ' (your) ~/bin/installkernel or' - echo ' (distribution) /sbin/installkernel or' - echo ' install to $$(INSTALL_PATH) and run lilo' - echo ' bzdisk - Create a boot floppy in /dev/fd0' - echo ' fdimage - Create a boot floppy image' - echo ' isoimage - Create a boot CD-ROM image' -endef - -CLEAN_FILES += arch/x86/boot/fdimage \ - arch/x86/boot/image.iso \ - arch/x86/boot/mtools.conf - - diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 7a3116ccf38..349b81a39c4 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -28,9 +28,11 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA targets := vmlinux.bin setup.bin setup.elf zImage bzImage subdir- := compressed -setup-y += a20.o apm.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o string.o tty.o video.o version.o voyager.o +setup-y += printf.o string.o tty.o video.o version.o +setup-$(CONFIG_X86_APM_BOOT) += apm.o +setup-$(CONFIG_X86_VOYAGER) += voyager.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. @@ -49,10 +51,7 @@ HOSTCFLAGS_build.o := $(LINUXINCLUDE) # How to compile the 16-bit code. Note we always compile for -march=i386, # that way we can complain to the user if the CPU is insufficient. -cflags-$(CONFIG_X86_32) := -cflags-$(CONFIG_X86_64) := -m32 KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ - $(cflags-y) \ -Wall -Wstrict-prototypes \ -march=i386 -mregparm=3 \ -include $(srctree)/$(src)/code16gcc.h \ @@ -62,6 +61,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ $(call cc-option, -fno-unit-at-a-time)) \ $(call cc-option, -fno-stack-protector) \ $(call cc-option, -mpreferred-stack-boundary=2) +KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ $(obj)/zImage: IMAGE_OFFSET := 0x1000 diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index eab50c55a3a..c117c7fb859 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -19,8 +19,6 @@ #include "boot.h" -#if defined(CONFIG_APM) || defined(CONFIG_APM_MODULE) - int query_apm_bios(void) { u16 ax, bx, cx, dx, di; @@ -95,4 +93,3 @@ int query_apm_bios(void) return 0; } -#endif diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index d2b5adf4651..7822a4983da 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -109,7 +109,7 @@ typedef unsigned int addr_t; static inline u8 rdfs8(addr_t addr) { u8 v; - asm volatile("movb %%fs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%fs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); return v; } static inline u16 rdfs16(addr_t addr) @@ -127,21 +127,21 @@ static inline u32 rdfs32(addr_t addr) static inline void wrfs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "r" (v)); + asm volatile("movb %1,%%fs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); } static inline void wrfs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "r" (v)); + asm volatile("movw %1,%%fs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); } static inline void wrfs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "r" (v)); + asm volatile("movl %1,%%fs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); } static inline u8 rdgs8(addr_t addr) { u8 v; - asm volatile("movb %%gs:%1,%0" : "=r" (v) : "m" (*(u8 *)addr)); + asm volatile("movb %%gs:%1,%0" : "=q" (v) : "m" (*(u8 *)addr)); return v; } static inline u16 rdgs16(addr_t addr) @@ -159,15 +159,15 @@ static inline u32 rdgs32(addr_t addr) static inline void wrgs8(u8 v, addr_t addr) { - asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "r" (v)); + asm volatile("movb %1,%%gs:%0" : "+m" (*(u8 *)addr) : "qi" (v)); } static inline void wrgs16(u16 v, addr_t addr) { - asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "r" (v)); + asm volatile("movw %1,%%gs:%0" : "+m" (*(u16 *)addr) : "ri" (v)); } static inline void wrgs32(u32 v, addr_t addr) { - asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "r" (v)); + asm volatile("movl %1,%%gs:%0" : "+m" (*(u32 *)addr) : "ri" (v)); } /* Note: these only return true/false, not a signed return value! */ @@ -241,6 +241,7 @@ int query_apm_bios(void); /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); +int cmdline_find_option_bool(const char *option); /* cpu.c, cpucheck.c */ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c index 34bb778c435..680408a0f46 100644 --- a/arch/x86/boot/cmdline.c +++ b/arch/x86/boot/cmdline.c @@ -95,3 +95,68 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize) return len; } + +/* + * Find a boolean option (like quiet,noapic,nosmp....) + * + * Returns the position of that option (starts counting with 1) + * or 0 on not found + */ +int cmdline_find_option_bool(const char *option) +{ + u32 cmdline_ptr = boot_params.hdr.cmd_line_ptr; + addr_t cptr; + char c; + int pos = 0, wstart = 0; + const char *opptr = NULL; + enum { + st_wordstart, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + } state = st_wordstart; + + if (!cmdline_ptr || cmdline_ptr >= 0x100000) + return -1; /* No command line, or inaccessible */ + + cptr = cmdline_ptr & 0xf; + set_fs(cmdline_ptr >> 4); + + while (cptr < 0x10000) { + c = rdfs8(cptr++); + pos++; + + switch (state) { + case st_wordstart: + if (!c) + return 0; + else if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + wstart = pos; + /* fall through */ + + case st_wordcmp: + if (!*opptr) + if (!c || myisspace(c)) + return wstart; + else + state = st_wordskip; + else if (!c) + return 0; + else if (c != *opptr++) + state = st_wordskip; + break; + + case st_wordskip: + if (!c) + return 0; + else if (myisspace(c)) + state = st_wordstart; + break; + } + } + + return 0; /* Buffer overrun */ +} diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 52c1db85452..fe24ceabd90 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -1,5 +1,63 @@ +# +# linux/arch/x86/boot/compressed/Makefile +# +# create a compressed vmlinux image from the original vmlinux +# + +targets := vmlinux vmlinux.bin vmlinux.bin.gz head_$(BITS).o misc.o piggy.o + +KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 +KBUILD_CFLAGS += -fno-strict-aliasing -fPIC +cflags-$(CONFIG_X86_64) := -mcmodel=small +KBUILD_CFLAGS += $(cflags-y) +KBUILD_CFLAGS += $(call cc-option,-ffreestanding) +KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector) + +KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ + +LDFLAGS := -m elf_$(UTS_MACHINE) +LDFLAGS_vmlinux := -T + +$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE + $(call if_changed,ld) + @: + +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/boot/compressed/Makefile_32 +targets += vmlinux.bin.all vmlinux.relocs +hostprogs-y := relocs + +quiet_cmd_relocs = RELOCS $@ + cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< +$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE + $(call if_changed,relocs) + +vmlinux.bin.all-y := $(obj)/vmlinux.bin +vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs +quiet_cmd_relocbin = BUILD $@ + cmd_relocbin = cat $(filter-out FORCE,$^) > $@ +$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE + $(call if_changed,relocbin) + +ifdef CONFIG_RELOCATABLE +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE + $(call if_changed,gzip) else -include ${srctree}/arch/x86/boot/compressed/Makefile_64 +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) endif +LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T + +else +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) + +LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T +endif + + +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE + $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/Makefile_32 b/arch/x86/boot/compressed/Makefile_32 deleted file mode 100644 index e43ff7c56e6..00000000000 --- a/arch/x86/boot/compressed/Makefile_32 +++ /dev/null @@ -1,50 +0,0 @@ -# -# linux/arch/x86/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz head_32.o misc_32.o piggy.o \ - vmlinux.bin.all vmlinux.relocs -EXTRA_AFLAGS := -traditional - -LDFLAGS_vmlinux := -T -hostprogs-y := relocs - -KBUILD_CFLAGS := -m32 -D__KERNEL__ $(LINUX_INCLUDE) -O2 \ - -fno-strict-aliasing -fPIC \ - $(call cc-option,-ffreestanding) \ - $(call cc-option,-fno-stack-protector) -LDFLAGS := -m elf_i386 - -$(obj)/vmlinux: $(src)/vmlinux_32.lds $(obj)/head_32.o $(obj)/misc_32.o $(obj)/piggy.o FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -quiet_cmd_relocs = RELOCS $@ - cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< -$(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE - $(call if_changed,relocs) - -vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs -quiet_cmd_relocbin = BUILD $@ - cmd_relocbin = cat $(filter-out FORCE,$^) > $@ -$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE - $(call if_changed,relocbin) - -ifdef CONFIG_RELOCATABLE -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE - $(call if_changed,gzip) -else -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -endif - -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T - -$(obj)/piggy.o: $(src)/vmlinux_32.scr $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/Makefile_64 b/arch/x86/boot/compressed/Makefile_64 deleted file mode 100644 index 7801e8dd90b..00000000000 --- a/arch/x86/boot/compressed/Makefile_64 +++ /dev/null @@ -1,30 +0,0 @@ -# -# linux/arch/x86/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz head_64.o misc_64.o piggy.o - -KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUXINCLUDE) -O2 \ - -fno-strict-aliasing -fPIC -mcmodel=small \ - $(call cc-option, -ffreestanding) \ - $(call cc-option, -fno-stack-protector) -KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -LDFLAGS := -m elf_x86_64 - -LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o $(obj)/piggy.o FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T - -$(obj)/piggy.o: $(obj)/vmlinux_64.scr $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,ld) diff --git a/arch/x86/boot/compressed/misc_32.c b/arch/x86/boot/compressed/misc.c index b74d60d1b2f..8182e32c1b4 100644 --- a/arch/x86/boot/compressed/misc_32.c +++ b/arch/x86/boot/compressed/misc.c @@ -1,7 +1,7 @@ /* * misc.c - * - * This is a collection of several routines from gzip-1.0.3 + * + * This is a collection of several routines from gzip-1.0.3 * adapted for Linux. * * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 @@ -9,9 +9,18 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +/* + * we have to be careful, because no indirections are allowed here, and + * paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening + */ #undef CONFIG_PARAVIRT +#ifdef CONFIG_X86_64 +#define _LINUX_STRING_H_ 1 +#define __LINUX_BITMAP_H 1 +#endif + #include <linux/linkage.h> -#include <linux/vmalloc.h> #include <linux/screen_info.h> #include <asm/io.h> #include <asm/page.h> @@ -186,10 +195,20 @@ static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); -static unsigned long free_mem_ptr; -static unsigned long free_mem_end_ptr; +#ifdef CONFIG_X86_64 +#define memptr long +#else +#define memptr unsigned +#endif + +static memptr free_mem_ptr; +static memptr free_mem_end_ptr; +#ifdef CONFIG_X86_64 +#define HEAP_SIZE 0x7000 +#else #define HEAP_SIZE 0x4000 +#endif static char *vidmem = (char *)0xb8000; static int vidport; @@ -230,7 +249,7 @@ static void gzip_mark(void **ptr) static void gzip_release(void **ptr) { - free_mem_ptr = (unsigned long) *ptr; + free_mem_ptr = (memptr) *ptr; } static void scroll(void) @@ -247,8 +266,10 @@ static void putstr(const char *s) int x,y,pos; char c; +#ifdef CONFIG_X86_32 if (RM_SCREEN_INFO.orig_video_mode == 0 && lines == 0 && cols == 0) return; +#endif x = RM_SCREEN_INFO.orig_x; y = RM_SCREEN_INFO.orig_y; @@ -261,7 +282,7 @@ static void putstr(const char *s) y--; } } else { - vidmem [ ( x + cols * y ) * 2 ] = c; + vidmem [(x + cols * y) * 2] = c; if ( ++x >= cols ) { x = 0; if ( ++y >= lines ) { @@ -276,16 +297,16 @@ static void putstr(const char *s) RM_SCREEN_INFO.orig_y = y; pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); + outb(14, vidport); + outb(0xff & (pos >> 9), vidport+1); + outb(15, vidport); + outb(0xff & (pos >> 1), vidport+1); } static void* memset(void* s, int c, unsigned n) { int i; - char *ss = (char*)s; + char *ss = s; for (i=0;i<n;i++) ss[i] = c; return s; @@ -294,7 +315,8 @@ static void* memset(void* s, int c, unsigned n) static void* memcpy(void* dest, const void* src, unsigned n) { int i; - char *d = (char *)dest, *s = (char *)src; + const char *s = src; + char *d = dest; for (i=0;i<n;i++) d[i] = s[i]; return dest; @@ -339,11 +361,13 @@ static void error(char *x) putstr(x); putstr("\n\n -- System halted"); - while(1); /* Halt */ + while (1) + asm("hlt"); } -asmlinkage void decompress_kernel(void *rmode, unsigned long end, - uch *input_data, unsigned long input_len, uch *output) +asmlinkage void decompress_kernel(void *rmode, memptr heap, + uch *input_data, unsigned long input_len, + uch *output) { real_mode = rmode; @@ -358,25 +382,32 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long end, lines = RM_SCREEN_INFO.orig_video_lines; cols = RM_SCREEN_INFO.orig_video_cols; - window = output; /* Output buffer (Normally at 1M) */ - free_mem_ptr = end; /* Heap */ - free_mem_end_ptr = end + HEAP_SIZE; - inbuf = input_data; /* Input buffer */ + window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = heap; /* Heap */ + free_mem_end_ptr = heap + HEAP_SIZE; + inbuf = input_data; /* Input buffer */ insize = input_len; inptr = 0; +#ifdef CONFIG_X86_64 + if ((ulg)output & (__KERNEL_ALIGN - 1)) + error("Destination address not 2M aligned"); + if ((ulg)output >= 0xffffffffffUL) + error("Destination address too large"); +#else if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1)) error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); - if (end > ((-__PAGE_OFFSET-(512 <<20)-1) & 0x7fffffff)) + if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) error("Destination address too large"); #ifndef CONFIG_RELOCATABLE if ((u32)output != LOAD_PHYSICAL_ADDR) error("Wrong destination address"); #endif +#endif makecrc(); - putstr("Uncompressing Linux... "); + putstr("\nDecompressing Linux... "); gunzip(); - putstr("Ok, booting the kernel.\n"); + putstr("done.\nBooting the kernel.\n"); return; } diff --git a/arch/x86/boot/compressed/misc_64.c b/arch/x86/boot/compressed/misc_64.c deleted file mode 100644 index 6ea015aa65e..00000000000 --- a/arch/x86/boot/compressed/misc_64.c +++ /dev/null @@ -1,371 +0,0 @@ -/* - * misc.c - * - * This is a collection of several routines from gzip-1.0.3 - * adapted for Linux. - * - * malloc by Hannu Savolainen 1993 and Matthias Urlichs 1994 - * puts by Nick Holloway 1993, better puts by Martin Mares 1995 - * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 - */ - -#define _LINUX_STRING_H_ 1 -#define __LINUX_BITMAP_H 1 - -#include <linux/linkage.h> -#include <linux/screen_info.h> -#include <asm/io.h> -#include <asm/page.h> - -/* WARNING!! - * This code is compiled with -fPIC and it is relocated dynamically - * at run time, but no relocation processing is performed. - * This means that it is not safe to place pointers in static structures. - */ - -/* - * Getting to provable safe in place decompression is hard. - * Worst case behaviours need to be analyzed. - * Background information: - * - * The file layout is: - * magic[2] - * method[1] - * flags[1] - * timestamp[4] - * extraflags[1] - * os[1] - * compressed data blocks[N] - * crc[4] orig_len[4] - * - * resulting in 18 bytes of non compressed data overhead. - * - * Files divided into blocks - * 1 bit (last block flag) - * 2 bits (block type) - * - * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. - * The smallest block type encoding is always used. - * - * stored: - * 32 bits length in bytes. - * - * fixed: - * magic fixed tree. - * symbols. - * - * dynamic: - * dynamic tree encoding. - * symbols. - * - * - * The buffer for decompression in place is the length of the - * uncompressed data, plus a small amount extra to keep the algorithm safe. - * The compressed data is placed at the end of the buffer. The output - * pointer is placed at the start of the buffer and the input pointer - * is placed where the compressed data starts. Problems will occur - * when the output pointer overruns the input pointer. - * - * The output pointer can only overrun the input pointer if the input - * pointer is moving faster than the output pointer. A condition only - * triggered by data whose compressed form is larger than the uncompressed - * form. - * - * The worst case at the block level is a growth of the compressed data - * of 5 bytes per 32767 bytes. - * - * The worst case internal to a compressed block is very hard to figure. - * The worst case can at least be boundined by having one bit that represents - * 32764 bytes and then all of the rest of the bytes representing the very - * very last byte. - * - * All of which is enough to compute an amount of extra data that is required - * to be safe. To avoid problems at the block level allocating 5 extra bytes - * per 32767 bytes of data is sufficient. To avoind problems internal to a block - * adding an extra 32767 bytes (the worst case uncompressed block size) is - * sufficient, to ensure that in the worst case the decompressed data for - * block will stop the byte before the compressed data for a block begins. - * To avoid problems with the compressed data's meta information an extra 18 - * bytes are needed. Leading to the formula: - * - * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. - * - * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. - * Adding 32768 instead of 32767 just makes for round numbers. - * Adding the decompressor_size is necessary as it musht live after all - * of the data as well. Last I measured the decompressor is about 14K. - * 10K of actual data and 4K of bss. - * - */ - -/* - * gzip declarations - */ - -#define OF(args) args -#define STATIC static - -#undef memset -#undef memcpy -#define memzero(s, n) memset ((s), 0, (n)) - -typedef unsigned char uch; -typedef unsigned short ush; -typedef unsigned long ulg; - -#define WSIZE 0x80000000 /* Window size must be at least 32k, - * and a power of two - * We don't actually have a window just - * a huge output buffer so I report - * a 2G windows size, as that should - * always be larger than our output buffer. - */ - -static uch *inbuf; /* input buffer */ -static uch *window; /* Sliding window buffer, (and final output buffer) */ - -static unsigned insize; /* valid bytes in inbuf */ -static unsigned inptr; /* index of next byte to be processed in inbuf */ -static unsigned outcnt; /* bytes in output buffer */ - -/* gzip flag byte */ -#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ -#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */ -#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */ -#define ORIG_NAME 0x08 /* bit 3 set: original file name present */ -#define COMMENT 0x10 /* bit 4 set: file comment present */ -#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */ -#define RESERVED 0xC0 /* bit 6,7: reserved */ - -#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf()) - -/* Diagnostic functions */ -#ifdef DEBUG -# define Assert(cond,msg) {if(!(cond)) error(msg);} -# define Trace(x) fprintf x -# define Tracev(x) {if (verbose) fprintf x ;} -# define Tracevv(x) {if (verbose>1) fprintf x ;} -# define Tracec(c,x) {if (verbose && (c)) fprintf x ;} -# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;} -#else -# define Assert(cond,msg) -# define Trace(x) -# define Tracev(x) -# define Tracevv(x) -# define Tracec(c,x) -# define Tracecv(c,x) -#endif - -static int fill_inbuf(void); -static void flush_window(void); -static void error(char *m); -static void gzip_mark(void **); -static void gzip_release(void **); - -/* - * This is set up by the setup-routine at boot-time - */ -static unsigned char *real_mode; /* Pointer to real-mode data */ - -#define RM_EXT_MEM_K (*(unsigned short *)(real_mode + 0x2)) -#ifndef STANDARD_MEMORY_BIOS_CALL -#define RM_ALT_MEM_K (*(unsigned long *)(real_mode + 0x1e0)) -#endif -#define RM_SCREEN_INFO (*(struct screen_info *)(real_mode+0)) - -extern unsigned char input_data[]; -extern int input_len; - -static long bytes_out = 0; - -static void *malloc(int size); -static void free(void *where); - -static void *memset(void *s, int c, unsigned n); -static void *memcpy(void *dest, const void *src, unsigned n); - -static void putstr(const char *); - -static long free_mem_ptr; -static long free_mem_end_ptr; - -#define HEAP_SIZE 0x7000 - -static char *vidmem = (char *)0xb8000; -static int vidport; -static int lines, cols; - -#include "../../../../lib/inflate.c" - -static void *malloc(int size) -{ - void *p; - - if (size <0) error("Malloc error"); - if (free_mem_ptr <= 0) error("Memory error"); - - free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ - - p = (void *)free_mem_ptr; - free_mem_ptr += size; - - if (free_mem_ptr >= free_mem_end_ptr) - error("Out of memory"); - - return p; -} - -static void free(void *where) -{ /* Don't care */ -} - -static void gzip_mark(void **ptr) -{ - *ptr = (void *) free_mem_ptr; -} - -static void gzip_release(void **ptr) -{ - free_mem_ptr = (long) *ptr; -} - -static void scroll(void) -{ - int i; - - memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 ); - for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 ) - vidmem[i] = ' '; -} - -static void putstr(const char *s) -{ - int x,y,pos; - char c; - - x = RM_SCREEN_INFO.orig_x; - y = RM_SCREEN_INFO.orig_y; - - while ( ( c = *s++ ) != '\0' ) { - if ( c == '\n' ) { - x = 0; - if ( ++y >= lines ) { - scroll(); - y--; - } - } else { - vidmem [ ( x + cols * y ) * 2 ] = c; - if ( ++x >= cols ) { - x = 0; - if ( ++y >= lines ) { - scroll(); - y--; - } - } - } - } - - RM_SCREEN_INFO.orig_x = x; - RM_SCREEN_INFO.orig_y = y; - - pos = (x + cols * y) * 2; /* Update cursor position */ - outb_p(14, vidport); - outb_p(0xff & (pos >> 9), vidport+1); - outb_p(15, vidport); - outb_p(0xff & (pos >> 1), vidport+1); -} - -static void* memset(void* s, int c, unsigned n) -{ - int i; - char *ss = (char*)s; - - for (i=0;i<n;i++) ss[i] = c; - return s; -} - -static void* memcpy(void* dest, const void* src, unsigned n) -{ - int i; - char *d = (char *)dest, *s = (char *)src; - - for (i=0;i<n;i++) d[i] = s[i]; - return dest; -} - -/* =========================================================================== - * Fill the input buffer. This is called only when the buffer is empty - * and at least one byte is really needed. - */ -static int fill_inbuf(void) -{ - error("ran out of input data"); - return 0; -} - -/* =========================================================================== - * Write the output window window[0..outcnt-1] and update crc and bytes_out. - * (Used for the decompressed data only.) - */ -static void flush_window(void) -{ - /* With my window equal to my output buffer - * I only need to compute the crc here. - */ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - - in = window; - for (n = 0; n < outcnt; n++) { - ch = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - -static void error(char *x) -{ - putstr("\n\n"); - putstr(x); - putstr("\n\n -- System halted"); - - while(1); /* Halt */ -} - -asmlinkage void decompress_kernel(void *rmode, unsigned long heap, - uch *input_data, unsigned long input_len, uch *output) -{ - real_mode = rmode; - - if (RM_SCREEN_INFO.orig_video_mode == 7) { - vidmem = (char *) 0xb0000; - vidport = 0x3b4; - } else { - vidmem = (char *) 0xb8000; - vidport = 0x3d4; - } - - lines = RM_SCREEN_INFO.orig_video_lines; - cols = RM_SCREEN_INFO.orig_video_cols; - - window = output; /* Output buffer (Normally at 1M) */ - free_mem_ptr = heap; /* Heap */ - free_mem_end_ptr = heap + HEAP_SIZE; - inbuf = input_data; /* Input buffer */ - insize = input_len; - inptr = 0; - - if ((ulg)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((ulg)output >= 0xffffffffffUL) - error("Destination address too large"); - - makecrc(); - putstr(".\nDecompressing Linux..."); - gunzip(); - putstr("done.\nBooting the kernel.\n"); - return; -} diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index 7a0d00b2cf2..d01ea42187e 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -27,11 +27,6 @@ static unsigned long *relocs; * absolute relocations present w.r.t these symbols. */ static const char* safe_abs_relocs[] = { - "__kernel_vsyscall", - "__kernel_rt_sigreturn", - "__kernel_sigreturn", - "SYSENTER_RETURN", - "VDSO_NOTE_MASK", "xen_irq_disable_direct_reloc", "xen_save_fl_direct_reloc", }; @@ -45,6 +40,8 @@ static int is_safe_abs_reloc(const char* sym_name) /* Match found */ return 1; } + if (strncmp(sym_name, "VDSO", 4) == 0) + return 1; if (strncmp(sym_name, "__crc_", 6) == 0) return 1; return 0; diff --git a/arch/x86/boot/compressed/vmlinux_64.scr b/arch/x86/boot/compressed/vmlinux.scr index bd1429ce193..f02382ae5c4 100644 --- a/arch/x86/boot/compressed/vmlinux_64.scr +++ b/arch/x86/boot/compressed/vmlinux.scr @@ -1,6 +1,6 @@ SECTIONS { - .text.compressed : { + .rodata.compressed : { input_len = .; LONG(input_data_end - input_data) input_data = .; *(.data) diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds index cc4854f6c6c..bb3c48379c4 100644 --- a/arch/x86/boot/compressed/vmlinux_32.lds +++ b/arch/x86/boot/compressed/vmlinux_32.lds @@ -3,17 +3,17 @@ OUTPUT_ARCH(i386) ENTRY(startup_32) SECTIONS { - /* Be careful parts of head.S assume startup_32 is at - * address 0. + /* Be careful parts of head_32.S assume startup_32 is at + * address 0. */ - . = 0 ; + . = 0; .text.head : { _head = . ; *(.text.head) _ehead = . ; } - .data.compressed : { - *(.data.compressed) + .rodata.compressed : { + *(.rodata.compressed) } .text : { _text = .; /* Text */ diff --git a/arch/x86/boot/compressed/vmlinux_32.scr b/arch/x86/boot/compressed/vmlinux_32.scr deleted file mode 100644 index 707a88f7f29..00000000000 --- a/arch/x86/boot/compressed/vmlinux_32.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .data.compressed : { - input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds index 94c13e557fb..f6e5b445f45 100644 --- a/arch/x86/boot/compressed/vmlinux_64.lds +++ b/arch/x86/boot/compressed/vmlinux_64.lds @@ -3,15 +3,19 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) SECTIONS { - /* Be careful parts of head.S assume startup_32 is at - * address 0. + /* Be careful parts of head_64.S assume startup_64 is at + * address 0. */ . = 0; - .text : { + .text.head : { _head = . ; *(.text.head) _ehead = . ; - *(.text.compressed) + } + .rodata.compressed : { + *(.rodata.compressed) + } + .text : { _text = .; /* Text */ *(.text) *(.text.*) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index bd138e442ec..8721dc46a0b 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -129,6 +129,7 @@ void query_edd(void) char eddarg[8]; int do_mbr = 1; int do_edd = 1; + int be_quiet; int devno; struct edd_info ei, *edp; u32 *mbrptr; @@ -140,12 +141,21 @@ void query_edd(void) do_edd = 0; } + be_quiet = cmdline_find_option_bool("quiet"); + edp = boot_params.eddbuf; mbrptr = boot_params.edd_mbr_sig_buffer; if (!do_edd) return; + /* Bugs in OnBoard or AddOnCards Bios may hang the EDD probe, + * so give a hint if this happens. + */ + + if (!be_quiet) + printf("Probing EDD (edd=off to disable)... "); + for (devno = 0x80; devno < 0x80+EDD_MBR_SIG_MAX; devno++) { /* * Scan the BIOS-supported hard disks and query EDD @@ -162,6 +172,9 @@ void query_edd(void) if (do_mbr && !read_mbr_sig(devno, &ei, mbrptr++)) boot_params.edd_mbr_sig_buf_entries = devno-0x80+1; } + + if (!be_quiet) + printf("ok\n"); } #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 4cc5b0411db..64ad9016585 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -195,10 +195,13 @@ cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long (-__PAGE_OFFSET-(512 << 20)-1) & 0x7fffffff +ramdisk_max: .long 0x7fffffff # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd + # The current kernel allows up to 4 GB, + # but leave it at 2 GB to avoid + # possible bootloader bugs. kernel_alignment: .long CONFIG_PHYSICAL_ALIGN #physical addr alignment #required for protected mode diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 1f95750ede2..7828da5cfd0 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -100,20 +100,32 @@ static void set_bios_mode(void) #endif } -void main(void) +static void init_heap(void) { - /* First, copy the boot header into the "zeropage" */ - copy_boot_params(); + char *stack_end; - /* End of heap check */ if (boot_params.hdr.loadflags & CAN_USE_HEAP) { - heap_end = (char *)(boot_params.hdr.heap_end_ptr - +0x200-STACK_SIZE); + asm("leal %P1(%%esp),%0" + : "=r" (stack_end) : "i" (-STACK_SIZE)); + + heap_end = (char *) + ((size_t)boot_params.hdr.heap_end_ptr + 0x200); + if (heap_end > stack_end) + heap_end = stack_end; } else { /* Boot protocol 2.00 only, no heap available */ puts("WARNING: Ancient bootloader, some functionality " "may be limited!\n"); } +} + +void main(void) +{ + /* First, copy the boot header into the "zeropage" */ + copy_boot_params(); + + /* End of heap check */ + init_heap(); /* Make sure we have all the proper CPU support */ if (validate_cpu()) { @@ -131,9 +143,6 @@ void main(void) /* Set keyboard repeat rate (why?) */ keyboard_set_repeat(); - /* Set the video mode */ - set_video(); - /* Query MCA information */ query_mca(); @@ -154,6 +163,10 @@ void main(void) #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) query_edd(); #endif + + /* Set the video mode */ + set_video(); + /* Do the last things and invoke protected mode */ go_to_protected_mode(); } diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 09fb342cc62..1a0f936c160 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -104,7 +104,7 @@ static void reset_coprocessor(void) (((u64)(base & 0xff000000) << 32) | \ ((u64)flags << 40) | \ ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffff00) << 16) | \ + ((u64)(base & 0x00ffffff) << 16) | \ ((u64)(limit & 0x0000ffff))) struct gdt_ptr { @@ -121,6 +121,10 @@ static void setup_gdt(void) [GDT_ENTRY_BOOT_CS] = GDT_ENTRY(0xc09b, 0, 0xfffff), /* DS: data, read/write, 4 GB, base 0 */ [GDT_ENTRY_BOOT_DS] = GDT_ENTRY(0xc093, 0, 0xfffff), + /* TSS: 32-bit tss, 104 bytes, base 4096 */ + /* We only have a TSS here to keep Intel VT happy; + we don't actually use it for anything. */ + [GDT_ENTRY_BOOT_TSS] = GDT_ENTRY(0x0089, 4096, 103), }; /* Xen HVM incorrectly stores a pointer to the gdt_ptr, instead of the gdt_ptr contents. Thus, make it static so it will diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S index fa6bed1fac1..f5402d51f7c 100644 --- a/arch/x86/boot/pmjump.S +++ b/arch/x86/boot/pmjump.S @@ -15,6 +15,7 @@ */ #include <asm/boot.h> +#include <asm/processor-flags.h> #include <asm/segment.h> .text @@ -29,28 +30,55 @@ */ protected_mode_jump: movl %edx, %esi # Pointer to boot_params table - movl %eax, 2f # Patch ljmpl instruction + + xorl %ebx, %ebx + movw %cs, %bx + shll $4, %ebx + addl %ebx, 2f movw $__BOOT_DS, %cx - xorl %ebx, %ebx # Per the 32-bit boot protocol - xorl %ebp, %ebp # Per the 32-bit boot protocol - xorl %edi, %edi # Per the 32-bit boot protocol + movw $__BOOT_TSS, %di movl %cr0, %edx - orb $1, %dl # Protected mode (PE) bit + orb $X86_CR0_PE, %dl # Protected mode movl %edx, %cr0 jmp 1f # Short jump to serialize on 386/486 1: - movw %cx, %ds - movw %cx, %es - movw %cx, %fs - movw %cx, %gs - movw %cx, %ss - - # Jump to the 32-bit entrypoint + # Transition to 32-bit mode .byte 0x66, 0xea # ljmpl opcode -2: .long 0 # offset +2: .long in_pm32 # offset .word __BOOT_CS # segment .size protected_mode_jump, .-protected_mode_jump + + .code32 + .type in_pm32, @function +in_pm32: + # Set up data segments for flat 32-bit mode + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + # The 32-bit code sets up its own stack, but this way we do have + # a valid stack if some debugging hack wants to use it. + addl %ebx, %esp + + # Set up TR to make Intel VT happy + ltr %di + + # Clear registers to allow for future extensions to the + # 32-bit boot protocol + xorl %ecx, %ecx + xorl %edx, %edx + xorl %ebx, %ebx + xorl %ebp, %ebp + xorl %edi, %edi + + # Set up LDTR to make Intel VT happy + lldt %cx + + jmpl *%eax # Jump to the 32-bit entrypoint + + .size in_pm32, .-in_pm32 diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index ed0672a8187..ff664a11709 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -104,6 +104,7 @@ static int bios_probe(void) mi = GET_HEAP(struct mode_info, 1); mi->mode = VIDEO_FIRST_BIOS+mode; + mi->depth = 0; /* text */ mi->x = rdfs16(0x44a); mi->y = rdfs8(0x484)+1; nmodes++; @@ -116,7 +117,7 @@ static int bios_probe(void) __videocard video_bios = { - .card_name = "BIOS (scanned)", + .card_name = "BIOS", .probe = bios_probe, .set_mode = bios_set_mode, .unsafe = 1, diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 4716b9a9635..662dd2f1306 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -79,20 +79,28 @@ static int vesa_probe(void) /* Text Mode, TTY BIOS supported, supported by hardware */ mi = GET_HEAP(struct mode_info, 1); - mi->mode = mode + VIDEO_FIRST_VESA; - mi->x = vminfo.h_res; - mi->y = vminfo.v_res; + mi->mode = mode + VIDEO_FIRST_VESA; + mi->depth = 0; /* text */ + mi->x = vminfo.h_res; + mi->y = vminfo.v_res; nmodes++; - } else if ((vminfo.mode_attr & 0x99) == 0x99) { + } else if ((vminfo.mode_attr & 0x99) == 0x99 && + (vminfo.memory_layout == 4 || + vminfo.memory_layout == 6) && + vminfo.memory_planes == 1) { #ifdef CONFIG_FB /* Graphics mode, color, linear frame buffer - supported -- register the mode but hide from - the menu. Only do this if framebuffer is - configured, however, otherwise the user will - be left without a screen. */ + supported. Only register the mode if + if framebuffer is configured, however, + otherwise the user will be left without a screen. + We don't require CONFIG_FB_VESA, however, since + some of the other framebuffer drivers can use + this mode-setting, too. */ mi = GET_HEAP(struct mode_info, 1); mi->mode = mode + VIDEO_FIRST_VESA; - mi->x = mi->y = 0; + mi->depth = vminfo.bpp; + mi->x = vminfo.h_res; + mi->y = vminfo.v_res; nmodes++; #endif } diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index aef02f9ec0c..7259387b7d1 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -18,22 +18,22 @@ #include "video.h" static struct mode_info vga_modes[] = { - { VIDEO_80x25, 80, 25 }, - { VIDEO_8POINT, 80, 50 }, - { VIDEO_80x43, 80, 43 }, - { VIDEO_80x28, 80, 28 }, - { VIDEO_80x30, 80, 30 }, - { VIDEO_80x34, 80, 34 }, - { VIDEO_80x60, 80, 60 }, + { VIDEO_80x25, 80, 25, 0 }, + { VIDEO_8POINT, 80, 50, 0 }, + { VIDEO_80x43, 80, 43, 0 }, + { VIDEO_80x28, 80, 28, 0 }, + { VIDEO_80x30, 80, 30, 0 }, + { VIDEO_80x34, 80, 34, 0 }, + { VIDEO_80x60, 80, 60, 0 }, }; static struct mode_info ega_modes[] = { - { VIDEO_80x25, 80, 25 }, - { VIDEO_8POINT, 80, 43 }, + { VIDEO_80x25, 80, 25, 0 }, + { VIDEO_8POINT, 80, 43, 0 }, }; static struct mode_info cga_modes[] = { - { VIDEO_80x25, 80, 25 }, + { VIDEO_80x25, 80, 25, 0 }, }; __videocard video_vga; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index ad9712f0173..696d08f3843 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -293,13 +293,28 @@ static void display_menu(void) struct mode_info *mi; char ch; int i; + int nmodes; + int modes_per_line; + int col; - puts("Mode: COLSxROWS:\n"); + nmodes = 0; + for (card = video_cards; card < video_cards_end; card++) + nmodes += card->nmodes; + modes_per_line = 1; + if (nmodes >= 20) + modes_per_line = 3; + + for (col = 0; col < modes_per_line; col++) + puts("Mode: Resolution: Type: "); + putchar('\n'); + + col = 0; ch = '0'; for (card = video_cards; card < video_cards_end; card++) { mi = card->modes; for (i = 0; i < card->nmodes; i++, mi++) { + char resbuf[32]; int visible = mi->x && mi->y; u16 mode_id = mi->mode ? mi->mode : (mi->y << 8)+mi->x; @@ -307,8 +322,18 @@ static void display_menu(void) if (!visible) continue; /* Hidden mode */ - printf("%c %04X %3dx%-3d %s\n", - ch, mode_id, mi->x, mi->y, card->card_name); + if (mi->depth) + sprintf(resbuf, "%dx%d", mi->y, mi->depth); + else + sprintf(resbuf, "%d", mi->y); + + printf("%c %03X %4dx%-7s %-6s", + ch, mode_id, mi->x, resbuf, card->card_name); + col++; + if (col >= modes_per_line) { + putchar('\n'); + col = 0; + } if (ch == '9') ch = 'a'; @@ -318,6 +343,8 @@ static void display_menu(void) ch++; } } + if (col) + putchar('\n'); } #define H(x) ((x)-'a'+10) diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index b92447d5121..d69347f79e8 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -83,7 +83,8 @@ void store_screen(void); struct mode_info { u16 mode; /* Mode number (vga= style) */ - u8 x, y; /* Width, height */ + u16 x, y; /* Width, height */ + u16 depth; /* Bits per pixel, 0 for text mode */ }; struct card_info { diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c index 61c8fe0453b..6499e3239b4 100644 --- a/arch/x86/boot/voyager.c +++ b/arch/x86/boot/voyager.c @@ -16,8 +16,6 @@ #include "boot.h" -#ifdef CONFIG_X86_VOYAGER - int query_voyager(void) { u8 err; @@ -42,5 +40,3 @@ int query_voyager(void) copy_from_fs(data_ptr, di, 7); /* Table is 7 bytes apparently */ return 0; } - -#endif /* CONFIG_X86_VOYAGER */ diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 54ee1764fda..77562e7cdab 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -99,9 +99,9 @@ CONFIG_IOSCHED_NOOP=y CONFIG_IOSCHED_AS=y CONFIG_IOSCHED_DEADLINE=y CONFIG_IOSCHED_CFQ=y -CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_AS is not set # CONFIG_DEFAULT_DEADLINE is not set -# CONFIG_DEFAULT_CFQ is not set +CONFIG_DEFAULT_CFQ=y # CONFIG_DEFAULT_NOOP is not set CONFIG_DEFAULT_IOSCHED="anticipatory" diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 38a83f9c966..9e2b0ef851d 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -145,15 +145,6 @@ CONFIG_K8_NUMA=y CONFIG_NODES_SHIFT=6 CONFIG_X86_64_ACPI_NUMA=y CONFIG_NUMA_EMU=y -CONFIG_ARCH_DISCONTIGMEM_ENABLE=y -CONFIG_ARCH_DISCONTIGMEM_DEFAULT=y -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_SELECT_MEMORY_MODEL=y -# CONFIG_FLATMEM_MANUAL is not set -CONFIG_DISCONTIGMEM_MANUAL=y -# CONFIG_SPARSEMEM_MANUAL is not set -CONFIG_DISCONTIGMEM=y -CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_NEED_MULTIPLE_NODES=y # CONFIG_SPARSEMEM_STATIC is not set CONFIG_SPLIT_PTLOCK_CPUS=4 diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e2edda255a8..52d0ccfcf6e 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -2,9 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o tls32.o \ - ia32_binfmt.o fpu32.o ptrace32.o syscall32.o syscall32_syscall.o \ - mmap32.o +obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o sysv-$(CONFIG_SYSVIPC) := ipc32.o obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) @@ -13,40 +11,3 @@ obj-$(CONFIG_IA32_AOUT) += ia32_aout.o audit-class-$(CONFIG_AUDIT) := audit.o obj-$(CONFIG_IA32_EMULATION) += $(audit-class-y) - -$(obj)/syscall32_syscall.o: \ - $(foreach F,sysenter syscall,$(obj)/vsyscall-$F.so) - -# Teach kbuild about targets -targets := $(foreach F,$(addprefix vsyscall-,sysenter syscall),\ - $F.o $F.so $F.so.dbg) - -# The DSO images are built using a special linker script -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m32 -nostdlib -shared \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) \ - -Wl,-soname=linux-gate.so.1 -o $@ \ - -Wl,-T,$(filter-out FORCE,$^) - -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) - -$(obj)/vsyscall-sysenter.so.dbg $(obj)/vsyscall-syscall.so.dbg: \ -$(obj)/vsyscall-%.so.dbg: $(src)/vsyscall.lds $(obj)/vsyscall-%.o FORCE - $(call if_changed,syscall) - -AFLAGS_vsyscall-sysenter.o = -m32 -Wa,-32 -AFLAGS_vsyscall-syscall.o = -m32 -Wa,-32 - -vdsos := vdso32-sysenter.so vdso32-syscall.so - -quiet_cmd_vdso_install = INSTALL $@ - cmd_vdso_install = cp $(@:vdso32-%.so=$(obj)/vsyscall-%.so.dbg) \ - $(MODLIB)/vdso/$@ - -$(vdsos): - @mkdir -p $(MODLIB)/vdso - $(call cmd,vdso_install) - -vdso_install: $(vdsos) diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 91b7b5922df..5d7b381da69 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -27,7 +27,7 @@ unsigned ia32_signal_class[] = { int ia32_classify_syscall(unsigned syscall) { - switch(syscall) { + switch (syscall) { case __NR_open: return 2; case __NR_openat: diff --git a/arch/x86/ia32/fpu32.c b/arch/x86/ia32/fpu32.c deleted file mode 100644 index 2c8209a3605..00000000000 --- a/arch/x86/ia32/fpu32.c +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * FXSAVE<->i387 conversion support. Based on code by Gareth Hughes. - * This is used for ptrace, signals and coredumps in 32bit emulation. - */ - -#include <linux/sched.h> -#include <asm/sigcontext32.h> -#include <asm/processor.h> -#include <asm/uaccess.h> -#include <asm/i387.h> - -static inline unsigned short twd_i387_to_fxsr(unsigned short twd) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) -{ - struct _fpxreg *st = NULL; - unsigned long tos = (fxsave->swd >> 11) & 7; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000; - int i; - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); - - for (i = 0 ; i < 8 ; i++) { - if (twd & 0x1) { - st = FPREG_ADDR( fxsave, (i - tos) & 7 ); - - switch (st->exponent & 0x7fff) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if (st->significand[3] & 0x8000) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - - -static inline int convert_fxsr_from_user(struct i387_fxsave_struct *fxsave, - struct _fpstate_ia32 __user *buf) -{ - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - u32 v; - int err = 0; - -#define G(num,val) err |= __get_user(val, num + (u32 __user *)buf) - G(0, fxsave->cwd); - G(1, fxsave->swd); - G(2, fxsave->twd); - fxsave->twd = twd_i387_to_fxsr(fxsave->twd); - G(3, fxsave->rip); - G(4, v); - fxsave->fop = v>>16; /* cs ignored */ - G(5, fxsave->rdp); - /* 6: ds ignored */ -#undef G - if (err) - return -1; - - to = (struct _fpxreg *)&fxsave->st_space[0]; - from = &buf->_st[0]; - for (i = 0 ; i < 8 ; i++, to++, from++) { - if (__copy_from_user(to, from, sizeof(*from))) - return -1; - } - return 0; -} - - -static inline int convert_fxsr_to_user(struct _fpstate_ia32 __user *buf, - struct i387_fxsave_struct *fxsave, - struct pt_regs *regs, - struct task_struct *tsk) -{ - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - u16 cs,ds; - int err = 0; - - if (tsk == current) { - /* should be actually ds/cs at fpu exception time, - but that information is not available in 64bit mode. */ - asm("movw %%ds,%0 " : "=r" (ds)); - asm("movw %%cs,%0 " : "=r" (cs)); - } else { /* ptrace. task has stopped. */ - ds = tsk->thread.ds; - cs = regs->cs; - } - -#define P(num,val) err |= __put_user(val, num + (u32 __user *)buf) - P(0, (u32)fxsave->cwd | 0xffff0000); - P(1, (u32)fxsave->swd | 0xffff0000); - P(2, twd_fxsr_to_i387(fxsave)); - P(3, (u32)fxsave->rip); - P(4, cs | ((u32)fxsave->fop) << 16); - P(5, fxsave->rdp); - P(6, 0xffff0000 | ds); -#undef P - - if (err) - return -1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - if (__copy_to_user(to, from, sizeof(*to))) - return -1; - } - return 0; -} - -int restore_i387_ia32(struct task_struct *tsk, struct _fpstate_ia32 __user *buf, int fsave) -{ - clear_fpu(tsk); - if (!fsave) { - if (__copy_from_user(&tsk->thread.i387.fxsave, - &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct))) - return -1; - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - set_stopped_child_used_math(tsk); - } - return convert_fxsr_from_user(&tsk->thread.i387.fxsave, buf); -} - -int save_i387_ia32(struct task_struct *tsk, - struct _fpstate_ia32 __user *buf, - struct pt_regs *regs, - int fsave) -{ - int err = 0; - - init_fpu(tsk); - if (convert_fxsr_to_user(buf, &tsk->thread.i387.fxsave, regs, tsk)) - return -1; - if (fsave) - return 0; - err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); - if (fsave) - return err ? -1 : 1; - err |= __put_user(X86_FXSR_MAGIC, &buf->magic); - err |= __copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct)); - return err ? -1 : 1; -} diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index f82e1a94fcb..e4c12079171 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -25,6 +25,7 @@ #include <linux/binfmts.h> #include <linux/personality.h> #include <linux/init.h> +#include <linux/jiffies.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -36,61 +37,67 @@ #undef WARN_OLD #undef CORE_DUMP /* probably broken */ -static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); -static int load_aout_library(struct file*); +static int load_aout_binary(struct linux_binprm *, struct pt_regs *regs); +static int load_aout_library(struct file *); #ifdef CORE_DUMP -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, + unsigned long limit); /* * fill in the user structure for a core dump.. */ -static void dump_thread32(struct pt_regs * regs, struct user32 * dump) +static void dump_thread32(struct pt_regs *regs, struct user32 *dump) { - u32 fs,gs; + u32 fs, gs; /* changed the size calculations - should hopefully work better. lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->rsp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; - dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) + (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - dump->u_debugreg[0] = current->thread.debugreg0; - dump->u_debugreg[1] = current->thread.debugreg1; - dump->u_debugreg[2] = current->thread.debugreg2; - dump->u_debugreg[3] = current->thread.debugreg3; - dump->u_debugreg[4] = 0; - dump->u_debugreg[5] = 0; - dump->u_debugreg[6] = current->thread.debugreg6; - dump->u_debugreg[7] = current->thread.debugreg7; - - if (dump->start_stack < 0xc0000000) - dump->u_ssize = ((unsigned long) (0xc0000000 - dump->start_stack)) >> PAGE_SHIFT; - - dump->regs.ebx = regs->rbx; - dump->regs.ecx = regs->rcx; - dump->regs.edx = regs->rdx; - dump->regs.esi = regs->rsi; - dump->regs.edi = regs->rdi; - dump->regs.ebp = regs->rbp; - dump->regs.eax = regs->rax; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; + + if (dump->start_stack < 0xc0000000) { + unsigned long tmp; + + tmp = (unsigned long) (0xc0000000 - dump->start_stack); + dump->u_ssize = tmp >> PAGE_SHIFT; + } + + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; dump->regs.ds = current->thread.ds; dump->regs.es = current->thread.es; asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; - asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; - dump->regs.orig_eax = regs->orig_rax; - dump->regs.eip = regs->rip; + asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; dump->regs.cs = regs->cs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->rsp; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; dump->regs.ss = regs->ss; #if 1 /* FIXME */ dump->u_fpvalid = 0; #else - dump->u_fpvalid = dump_fpu (regs, &dump->i387); + dump->u_fpvalid = dump_fpu(regs, &dump->i387); #endif } @@ -128,15 +135,19 @@ static int dump_write(struct file *file, const void *addr, int nr) return file->f_op->write(file, addr, nr, &file->f_pos) == nr; } -#define DUMP_WRITE(addr, nr) \ +#define DUMP_WRITE(addr, nr) \ if (!dump_write(file, (void *)(addr), (nr))) \ goto end_coredump; -#define DUMP_SEEK(offset) \ -if (file->f_op->llseek) { \ - if (file->f_op->llseek(file,(offset),0) != (offset)) \ - goto end_coredump; \ -} else file->f_pos = (offset) +#define DUMP_SEEK(offset) \ + if (file->f_op->llseek) { \ + if (file->f_op->llseek(file, (offset), 0) != (offset)) \ + goto end_coredump; \ + } else \ + file->f_pos = (offset) + +#define START_DATA() (u.u_tsize << PAGE_SHIFT) +#define START_STACK(u) (u.start_stack) /* * Routine writes a core dump image in the current directory. @@ -148,62 +159,70 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, + unsigned long limit) { mm_segment_t fs; int has_dumped = 0; unsigned long dump_start, dump_size; struct user32 dump; -# define START_DATA(u) (u.u_tsize << PAGE_SHIFT) -# define START_STACK(u) (u.start_stack) fs = get_fs(); set_fs(KERNEL_DS); has_dumped = 1; current->flags |= PF_DUMPCORE; - strncpy(dump.u_comm, current->comm, sizeof(current->comm)); - dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - ((unsigned long)(&dump))); + strncpy(dump.u_comm, current->comm, sizeof(current->comm)); + dump.u_ar0 = (u32)(((unsigned long)(&dump.regs)) - + ((unsigned long)(&dump))); dump.signal = signr; dump_thread32(regs, &dump); -/* If the size of the dump file exceeds the rlimit, then see what would happen - if we wrote the stack, but not the data area. */ + /* + * If the size of the dump file exceeds the rlimit, then see + * what would happen if we wrote the stack, but not the data + * area. + */ if ((dump.u_dsize + dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_dsize = 0; -/* Make sure we have enough room to write the stack and data areas. */ + /* Make sure we have enough room to write the stack and data areas. */ if ((dump.u_ssize + 1) * PAGE_SIZE > limit) dump.u_ssize = 0; -/* make sure we actually have a data and stack area to dump */ + /* make sure we actually have a data and stack area to dump */ set_fs(USER_DS); - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_DATA(dump), + dump.u_dsize << PAGE_SHIFT)) dump.u_dsize = 0; - if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) + if (!access_ok(VERIFY_READ, (void *) (unsigned long)START_STACK(dump), + dump.u_ssize << PAGE_SHIFT)) dump.u_ssize = 0; set_fs(KERNEL_DS); -/* struct user */ - DUMP_WRITE(&dump,sizeof(dump)); -/* Now dump all of the user data. Include malloced stuff as well */ + /* struct user */ + DUMP_WRITE(&dump, sizeof(dump)); + /* Now dump all of the user data. Include malloced stuff as well */ DUMP_SEEK(PAGE_SIZE); -/* now we start writing out the user space info */ + /* now we start writing out the user space info */ set_fs(USER_DS); -/* Dump the data area */ + /* Dump the data area */ if (dump.u_dsize != 0) { dump_start = START_DATA(dump); dump_size = dump.u_dsize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + DUMP_WRITE(dump_start, dump_size); } -/* Now prepare to dump the stack area */ + /* Now prepare to dump the stack area */ if (dump.u_ssize != 0) { dump_start = START_STACK(dump); dump_size = dump.u_ssize << PAGE_SHIFT; - DUMP_WRITE(dump_start,dump_size); + DUMP_WRITE(dump_start, dump_size); } -/* Finally dump the task struct. Not be used by gdb, but could be useful */ + /* + * Finally dump the task struct. Not be used by gdb, but + * could be useful + */ set_fs(KERNEL_DS); - DUMP_WRITE(current,sizeof(*current)); + DUMP_WRITE(current, sizeof(*current)); end_coredump: set_fs(fs); return has_dumped; @@ -217,35 +236,34 @@ end_coredump: */ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) { - u32 __user *argv; - u32 __user *envp; - u32 __user *sp; - int argc = bprm->argc; - int envc = bprm->envc; + u32 __user *argv, *envp, *sp; + int argc = bprm->argc, envc = bprm->envc; sp = (u32 __user *) ((-(unsigned long)sizeof(u32)) & (unsigned long) p); sp -= envc+1; envp = sp; sp -= argc+1; argv = sp; - put_user((unsigned long) envp,--sp); - put_user((unsigned long) argv,--sp); - put_user(argc,--sp); + put_user((unsigned long) envp, --sp); + put_user((unsigned long) argv, --sp); + put_user(argc, --sp); current->mm->arg_start = (unsigned long) p; - while (argc-->0) { + while (argc-- > 0) { char c; - put_user((u32)(unsigned long)p,argv++); + + put_user((u32)(unsigned long)p, argv++); do { - get_user(c,p++); + get_user(c, p++); } while (c); } put_user(0, argv); current->mm->arg_end = current->mm->env_start = (unsigned long) p; - while (envc-->0) { + while (envc-- > 0) { char c; - put_user((u32)(unsigned long)p,envp++); + + put_user((u32)(unsigned long)p, envp++); do { - get_user(c,p++); + get_user(c, p++); } while (c); } put_user(0, envp); @@ -257,20 +275,18 @@ static u32 __user *create_aout_tables(char __user *p, struct linux_binprm *bprm) * These are the functions used to load a.out style executables and shared * libraries. There is no binary dependent code anywhere else. */ - -static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) +static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) { + unsigned long error, fd_offset, rlim; struct exec ex; - unsigned long error; - unsigned long fd_offset; - unsigned long rlim; int retval; ex = *((struct exec *) bprm->buf); /* exec-header */ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != OMAGIC && N_MAGIC(ex) != QMAGIC && N_MAGIC(ex) != NMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || - i_size_read(bprm->file->f_path.dentry->d_inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + i_size_read(bprm->file->f_path.dentry->d_inode) < + ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { return -ENOEXEC; } @@ -291,13 +307,13 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if (retval) return retval; - regs->cs = __USER32_CS; + regs->cs = __USER32_CS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; /* OK, This is the point of no return */ set_personality(PER_LINUX); - set_thread_flag(TIF_IA32); + set_thread_flag(TIF_IA32); clear_thread_flag(TIF_ABI_PENDING); current->mm->end_code = ex.a_text + @@ -311,7 +327,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) current->mm->mmap = NULL; compute_creds(bprm); - current->flags &= ~PF_FORKNOEXEC; + current->flags &= ~PF_FORKNOEXEC; if (N_MAGIC(ex) == OMAGIC) { unsigned long text_addr, map_size; @@ -338,30 +354,31 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) send_sig(SIGKILL, current, 0); return error; } - + flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { #ifdef WARN_OLD static unsigned long error_time, error_time2; if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) - { + (N_MAGIC(ex) != NMAGIC) && + time_after(jiffies, error_time2 + 5*HZ)) { printk(KERN_NOTICE "executable not page aligned\n"); error_time2 = jiffies; } if ((fd_offset & ~PAGE_MASK) != 0 && - (jiffies-error_time) > 5*HZ) - { - printk(KERN_WARNING - "fd_offset is not page aligned. Please convert program: %s\n", + time_after(jiffies, error_time + 5*HZ)) { + printk(KERN_WARNING + "fd_offset is not page aligned. Please convert " + "program: %s\n", bprm->file->f_path.dentry->d_name.name); error_time = jiffies; } #endif - if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { + if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { loff_t pos = fd_offset; + down_write(¤t->mm->mmap_sem); do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); up_write(¤t->mm->mmap_sem); @@ -376,9 +393,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) down_write(¤t->mm->mmap_sem); error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, - fd_offset); + PROT_READ | PROT_EXEC, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_32BIT, + fd_offset); up_write(¤t->mm->mmap_sem); if (error != N_TXTADDR(ex)) { @@ -387,9 +405,10 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) } down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE | MAP_32BIT, + MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | + MAP_EXECUTABLE | MAP_32BIT, fd_offset + ex.a_text); up_write(¤t->mm->mmap_sem); if (error != N_DATADDR(ex)) { @@ -403,9 +422,9 @@ beyond_if: set_brk(current->mm->start_brk, current->mm->brk); retval = setup_arg_pages(bprm, IA32_STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); + if (retval < 0) { + /* Someone check-me: is this error path enough? */ + send_sig(SIGKILL, current, 0); return retval; } @@ -414,10 +433,10 @@ beyond_if: /* start thread */ asm volatile("movl %0,%%fs" :: "r" (0)); \ asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); - load_gs_index(0); - (regs)->rip = ex.a_entry; - (regs)->rsp = current->mm->start_stack; - (regs)->eflags = 0x200; + load_gs_index(0); + (regs)->ip = ex.a_entry; + (regs)->sp = current->mm->start_stack; + (regs)->flags = 0x200; (regs)->cs = __USER32_CS; (regs)->ss = __USER32_DS; regs->r8 = regs->r9 = regs->r10 = regs->r11 = @@ -425,7 +444,7 @@ beyond_if: set_fs(USER_DS); if (unlikely(current->ptrace & PT_PTRACED)) { if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify ((PTRACE_EVENT_EXEC << 8) | SIGTRAP); + ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); else send_sig(SIGTRAP, current, 0); } @@ -434,9 +453,8 @@ beyond_if: static int load_aout_library(struct file *file) { - struct inode * inode; - unsigned long bss, start_addr, len; - unsigned long error; + struct inode *inode; + unsigned long bss, start_addr, len, error; int retval; struct exec ex; @@ -450,7 +468,8 @@ static int load_aout_library(struct file *file) /* We come in here for the regular a.out style of shared libraries */ if ((N_MAGIC(ex) != ZMAGIC && N_MAGIC(ex) != QMAGIC) || N_TRSIZE(ex) || N_DRSIZE(ex) || ((ex.a_entry & 0xfff) && N_MAGIC(ex) == ZMAGIC) || - i_size_read(inode) < ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { + i_size_read(inode) < + ex.a_text+ex.a_data+N_SYMSIZE(ex)+N_TXTOFF(ex)) { goto out; } @@ -467,10 +486,10 @@ static int load_aout_library(struct file *file) #ifdef WARN_OLD static unsigned long error_time; - if ((jiffies-error_time) > 5*HZ) - { - printk(KERN_WARNING - "N_TXTOFF is not page aligned. Please convert library: %s\n", + if (time_after(jiffies, error_time + 5*HZ)) { + printk(KERN_WARNING + "N_TXTOFF is not page aligned. Please convert " + "library: %s\n", file->f_path.dentry->d_name.name); error_time = jiffies; } @@ -478,11 +497,12 @@ static int load_aout_library(struct file *file) down_write(¤t->mm->mmap_sem); do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); up_write(¤t->mm->mmap_sem); - + file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); flush_icache_range((unsigned long) start_addr, - (unsigned long) start_addr + ex.a_text + ex.a_data); + (unsigned long) start_addr + ex.a_text + + ex.a_data); retval = 0; goto out; diff --git a/arch/x86/ia32/ia32_binfmt.c b/arch/x86/ia32/ia32_binfmt.c deleted file mode 100644 index 55822d2cf05..00000000000 --- a/arch/x86/ia32/ia32_binfmt.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * Written 2000,2002 by Andi Kleen. - * - * Loosely based on the sparc64 and IA64 32bit emulation loaders. - * This tricks binfmt_elf.c into loading 32bit binaries using lots - * of ugly preprocessor tricks. Talk about very very poor man's inheritance. - */ - -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/rwsem.h> -#include <linux/sched.h> -#include <linux/compat.h> -#include <linux/string.h> -#include <linux/binfmts.h> -#include <linux/mm.h> -#include <linux/security.h> -#include <linux/elfcore-compat.h> - -#include <asm/segment.h> -#include <asm/ptrace.h> -#include <asm/processor.h> -#include <asm/user32.h> -#include <asm/sigcontext32.h> -#include <asm/fpu32.h> -#include <asm/i387.h> -#include <asm/uaccess.h> -#include <asm/ia32.h> -#include <asm/vsyscall32.h> - -#undef ELF_ARCH -#undef ELF_CLASS -#define ELF_CLASS ELFCLASS32 -#define ELF_ARCH EM_386 - -#undef elfhdr -#undef elf_phdr -#undef elf_note -#undef elf_addr_t -#define elfhdr elf32_hdr -#define elf_phdr elf32_phdr -#define elf_note elf32_note -#define elf_addr_t Elf32_Off - -#define ELF_NAME "elf/i386" - -#define AT_SYSINFO 32 -#define AT_SYSINFO_EHDR 33 - -int sysctl_vsyscall32 = 1; - -#undef ARCH_DLINFO -#define ARCH_DLINFO do { \ - if (sysctl_vsyscall32) { \ - current->mm->context.vdso = (void *)VSYSCALL32_BASE; \ - NEW_AUX_ENT(AT_SYSINFO, (u32)(u64)VSYSCALL32_VSYSCALL); \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, VSYSCALL32_BASE); \ - } \ -} while(0) - -struct file; - -#define IA32_EMULATOR 1 - -#undef ELF_ET_DYN_BASE - -#define ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) - -#define jiffies_to_timeval(a,b) do { (b)->tv_usec = 0; (b)->tv_sec = (a)/HZ; }while(0) - -#define _GET_SEG(x) \ - ({ __u32 seg; asm("movl %%" __stringify(x) ",%0" : "=r"(seg)); seg; }) - -/* Assumes current==process to be dumped */ -#undef ELF_CORE_COPY_REGS -#define ELF_CORE_COPY_REGS(pr_reg, regs) \ - pr_reg[0] = regs->rbx; \ - pr_reg[1] = regs->rcx; \ - pr_reg[2] = regs->rdx; \ - pr_reg[3] = regs->rsi; \ - pr_reg[4] = regs->rdi; \ - pr_reg[5] = regs->rbp; \ - pr_reg[6] = regs->rax; \ - pr_reg[7] = _GET_SEG(ds); \ - pr_reg[8] = _GET_SEG(es); \ - pr_reg[9] = _GET_SEG(fs); \ - pr_reg[10] = _GET_SEG(gs); \ - pr_reg[11] = regs->orig_rax; \ - pr_reg[12] = regs->rip; \ - pr_reg[13] = regs->cs; \ - pr_reg[14] = regs->eflags; \ - pr_reg[15] = regs->rsp; \ - pr_reg[16] = regs->ss; - - -#define elf_prstatus compat_elf_prstatus -#define elf_prpsinfo compat_elf_prpsinfo -#define elf_fpregset_t struct user_i387_ia32_struct -#define elf_fpxregset_t struct user32_fxsr_struct -#define user user32 - -#undef elf_read_implies_exec -#define elf_read_implies_exec(ex, executable_stack) (executable_stack != EXSTACK_DISABLE_X) - -#define elf_core_copy_regs elf32_core_copy_regs -static inline void elf32_core_copy_regs(compat_elf_gregset_t *elfregs, - struct pt_regs *regs) -{ - ELF_CORE_COPY_REGS((&elfregs->ebx), regs) -} - -#define elf_core_copy_task_regs elf32_core_copy_task_regs -static inline int elf32_core_copy_task_regs(struct task_struct *t, - compat_elf_gregset_t* elfregs) -{ - struct pt_regs *pp = task_pt_regs(t); - ELF_CORE_COPY_REGS((&elfregs->ebx), pp); - /* fix wrong segments */ - elfregs->ds = t->thread.ds; - elfregs->fs = t->thread.fsindex; - elfregs->gs = t->thread.gsindex; - elfregs->es = t->thread.es; - return 1; -} - -#define elf_core_copy_task_fpregs elf32_core_copy_task_fpregs -static inline int -elf32_core_copy_task_fpregs(struct task_struct *tsk, struct pt_regs *regs, - elf_fpregset_t *fpu) -{ - struct _fpstate_ia32 *fpstate = (void*)fpu; - mm_segment_t oldfs = get_fs(); - - if (!tsk_used_math(tsk)) - return 0; - if (!regs) - regs = task_pt_regs(tsk); - if (tsk == current) - unlazy_fpu(tsk); - set_fs(KERNEL_DS); - save_i387_ia32(tsk, fpstate, regs, 1); - /* Correct for i386 bug. It puts the fop into the upper 16bits of - the tag word (like FXSAVE), not into the fcs*/ - fpstate->cssel |= fpstate->tag & 0xffff0000; - set_fs(oldfs); - return 1; -} - -#define ELF_CORE_COPY_XFPREGS 1 -#define ELF_CORE_XFPREG_TYPE NT_PRXFPREG -#define elf_core_copy_task_xfpregs elf32_core_copy_task_xfpregs -static inline int -elf32_core_copy_task_xfpregs(struct task_struct *t, elf_fpxregset_t *xfpu) -{ - struct pt_regs *regs = task_pt_regs(t); - if (!tsk_used_math(t)) - return 0; - if (t == current) - unlazy_fpu(t); - memcpy(xfpu, &t->thread.i387.fxsave, sizeof(elf_fpxregset_t)); - xfpu->fcs = regs->cs; - xfpu->fos = t->thread.ds; /* right? */ - return 1; -} - -#undef elf_check_arch -#define elf_check_arch(x) \ - ((x)->e_machine == EM_386) - -extern int force_personality32; - -#undef ELF_EXEC_PAGESIZE -#undef ELF_HWCAP -#undef ELF_PLATFORM -#undef SET_PERSONALITY -#define ELF_EXEC_PAGESIZE PAGE_SIZE -#define ELF_HWCAP (boot_cpu_data.x86_capability[0]) -#define ELF_PLATFORM ("i686") -#define SET_PERSONALITY(ex, ibcs2) \ -do { \ - unsigned long new_flags = 0; \ - if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags = _TIF_IA32; \ - if ((current_thread_info()->flags & _TIF_IA32) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ - else \ - clear_thread_flag(TIF_ABI_PENDING); \ - /* XXX This overwrites the user set personality */ \ - current->personality |= force_personality32; \ -} while (0) - -/* Override some function names */ -#define elf_format elf32_format - -#define init_elf_binfmt init_elf32_binfmt -#define exit_elf_binfmt exit_elf32_binfmt - -#define load_elf_binary load_elf32_binary - -#undef ELF_PLAT_INIT -#define ELF_PLAT_INIT(r, load_addr) elf32_init(r) - -#undef start_thread -#define start_thread(regs,new_rip,new_rsp) do { \ - asm volatile("movl %0,%%fs" :: "r" (0)); \ - asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); \ - load_gs_index(0); \ - (regs)->rip = (new_rip); \ - (regs)->rsp = (new_rsp); \ - (regs)->eflags = 0x200; \ - (regs)->cs = __USER32_CS; \ - (regs)->ss = __USER32_DS; \ - set_fs(USER_DS); \ -} while(0) - - -#include <linux/module.h> - -MODULE_DESCRIPTION("Binary format loader for compatibility with IA32 ELF binaries."); -MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); - -#undef MODULE_DESCRIPTION -#undef MODULE_AUTHOR - -static void elf32_init(struct pt_regs *); - -#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 -#define arch_setup_additional_pages syscall32_setup_pages -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); - -#include "../../../fs/binfmt_elf.c" - -static void elf32_init(struct pt_regs *regs) -{ - struct task_struct *me = current; - regs->rdi = 0; - regs->rsi = 0; - regs->rdx = 0; - regs->rcx = 0; - regs->rax = 0; - regs->rbx = 0; - regs->rbp = 0; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - me->thread.fs = 0; - me->thread.gs = 0; - me->thread.fsindex = 0; - me->thread.gsindex = 0; - me->thread.ds = __USER_DS; - me->thread.es = __USER_DS; -} - -#ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ -#include <linux/sysctl.h> - -static ctl_table abi_table2[] = { - { - .procname = "vsyscall32", - .data = &sysctl_vsyscall32, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static ctl_table abi_root_table2[] = { - { - .ctl_name = CTL_ABI, - .procname = "abi", - .mode = 0555, - .child = abi_table2 - }, - {} -}; - -static __init int ia32_binfmt_init(void) -{ - register_sysctl_table(abi_root_table2); - return 0; -} -__initcall(ia32_binfmt_init); -#endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 6ea19c25f90..1c0503bdfb1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -29,9 +29,8 @@ #include <asm/ia32_unistd.h> #include <asm/user32.h> #include <asm/sigcontext32.h> -#include <asm/fpu32.h> #include <asm/proto.h> -#include <asm/vsyscall32.h> +#include <asm/vdso.h> #define DEBUG_SIG 0 @@ -43,7 +42,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where); int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err; - if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; /* If you change siginfo_t structure, please make sure that @@ -53,16 +53,19 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) 3 ints plus the relevant union member. */ err = __put_user(from->si_signo, &to->si_signo); err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user((short)from->si_code, &to->si_code); + err |= __put_user((short)from->si_code, &to->si_code); if (from->si_code < 0) { err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(ptr_to_compat(from->si_ptr), &to->si_ptr); } else { - /* First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) */ - err |= __put_user(from->_sifields._pad[0], &to->_sifields._pad[0]); + /* + * First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) + */ + err |= __put_user(from->_sifields._pad[0], + &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: break; @@ -76,14 +79,15 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); break; case __SI_POLL >> 16: - err |= __put_user(from->si_fd, &to->si_fd); + err |= __put_user(from->si_fd, &to->si_fd); break; case __SI_TIMER >> 16: - err |= __put_user(from->si_overrun, &to->si_overrun); + err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user(ptr_to_compat(from->si_ptr), - &to->si_ptr); + &to->si_ptr); break; - case __SI_RT >> 16: /* This is not generated by the kernel as of now. */ + /* This is not generated by the kernel as of now. */ + case __SI_RT >> 16: case __SI_MESGQ >> 16: err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); @@ -97,7 +101,8 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) { int err; u32 ptr32; - if (!access_ok (VERIFY_READ, from, sizeof(compat_siginfo_t))) + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) return -EFAULT; err = __get_user(to->si_signo, &from->si_signo); @@ -112,8 +117,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) return err; } -asmlinkage long -sys32_sigsuspend(int history0, int history1, old_sigset_t mask) +asmlinkage long sys32_sigsuspend(int history0, int history1, old_sigset_t mask) { mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); @@ -128,36 +132,37 @@ sys32_sigsuspend(int history0, int history1, old_sigset_t mask) return -ERESTARTNOHAND; } -asmlinkage long -sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, - stack_ia32_t __user *uoss_ptr, - struct pt_regs *regs) +asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr, + stack_ia32_t __user *uoss_ptr, + struct pt_regs *regs) { - stack_t uss,uoss; + stack_t uss, uoss; int ret; - mm_segment_t seg; - if (uss_ptr) { + mm_segment_t seg; + + if (uss_ptr) { u32 ptr; - memset(&uss,0,sizeof(stack_t)); - if (!access_ok(VERIFY_READ,uss_ptr,sizeof(stack_ia32_t)) || + + memset(&uss, 0, sizeof(stack_t)); + if (!access_ok(VERIFY_READ, uss_ptr, sizeof(stack_ia32_t)) || __get_user(ptr, &uss_ptr->ss_sp) || __get_user(uss.ss_flags, &uss_ptr->ss_flags) || __get_user(uss.ss_size, &uss_ptr->ss_size)) return -EFAULT; uss.ss_sp = compat_ptr(ptr); } - seg = get_fs(); - set_fs(KERNEL_DS); - ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->rsp); - set_fs(seg); + seg = get_fs(); + set_fs(KERNEL_DS); + ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); + set_fs(seg); if (ret >= 0 && uoss_ptr) { - if (!access_ok(VERIFY_WRITE,uoss_ptr,sizeof(stack_ia32_t)) || + if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t)) || __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) || __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) || __put_user(uoss.ss_size, &uoss_ptr->ss_size)) ret = -EFAULT; - } - return ret; + } + return ret; } /* @@ -186,87 +191,85 @@ struct rt_sigframe char retcode[8]; }; -static int -ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_ia32 __user *sc, unsigned int *peax) +#define COPY(x) { \ + unsigned int reg; \ + err |= __get_user(reg, &sc->x); \ + regs->x = reg; \ +} + +#define RELOAD_SEG(seg,mask) \ + { unsigned int cur; \ + unsigned short pre; \ + err |= __get_user(pre, &sc->seg); \ + asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ + pre |= mask; \ + if (pre != cur) loadsegment(seg, pre); } + +static int ia32_restore_sigcontext(struct pt_regs *regs, + struct sigcontext_ia32 __user *sc, + unsigned int *peax) { - unsigned int err = 0; - + unsigned int tmpflags, gs, oldgs, err = 0; + struct _fpstate_ia32 __user *buf; + u32 tmp; + /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; #if DEBUG_SIG - printk("SIG restore_sigcontext: sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", - sc, sc->err, sc->eip, sc->cs, sc->eflags); + printk(KERN_DEBUG "SIG restore_sigcontext: " + "sc=%p err(%x) eip(%x) cs(%x) flg(%x)\n", + sc, sc->err, sc->ip, sc->cs, sc->flags); #endif -#define COPY(x) { \ - unsigned int reg; \ - err |= __get_user(reg, &sc->e ##x); \ - regs->r ## x = reg; \ -} -#define RELOAD_SEG(seg,mask) \ - { unsigned int cur; \ - unsigned short pre; \ - err |= __get_user(pre, &sc->seg); \ - asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ - pre |= mask; \ - if (pre != cur) loadsegment(seg,pre); } - - /* Reload fs and gs if they have changed in the signal handler. - This does not handle long fs/gs base changes in the handler, but - does not clobber them at least in the normal case. */ - - { - unsigned gs, oldgs; - err |= __get_user(gs, &sc->gs); - gs |= 3; - asm("movl %%gs,%0" : "=r" (oldgs)); - if (gs != oldgs) - load_gs_index(gs); - } - RELOAD_SEG(fs,3); - RELOAD_SEG(ds,3); - RELOAD_SEG(es,3); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + err |= __get_user(gs, &sc->gs); + gs |= 3; + asm("movl %%gs,%0" : "=r" (oldgs)); + if (gs != oldgs) + load_gs_index(gs); + + RELOAD_SEG(fs, 3); + RELOAD_SEG(ds, 3); + RELOAD_SEG(es, 3); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); - /* Don't touch extended registers */ - - err |= __get_user(regs->cs, &sc->cs); - regs->cs |= 3; - err |= __get_user(regs->ss, &sc->ss); - regs->ss |= 3; - - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ - } + /* Don't touch extended registers */ + + err |= __get_user(regs->cs, &sc->cs); + regs->cs |= 3; + err |= __get_user(regs->ss, &sc->ss); + regs->ss |= 3; + + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + /* disable syscall checks */ + regs->orig_ax = -1; + + err |= __get_user(tmp, &sc->fpstate); + buf = compat_ptr(tmp); + if (buf) { + if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) + goto badframe; + err |= restore_i387_ia32(buf); + } else { + struct task_struct *me = current; - { - u32 tmp; - struct _fpstate_ia32 __user * buf; - err |= __get_user(tmp, &sc->fpstate); - buf = compat_ptr(tmp); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387_ia32(current, buf, 0); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } + if (used_math()) { + clear_fpu(me); + clear_used_math(); } } - { - u32 tmp; - err |= __get_user(tmp, &sc->eax); - *peax = tmp; - } + err |= __get_user(tmp, &sc->ax); + *peax = tmp; + return err; badframe: @@ -275,15 +278,16 @@ badframe: asmlinkage long sys32_sigreturn(struct pt_regs *regs) { - struct sigframe __user *frame = (struct sigframe __user *)(regs->rsp-8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp-8); sigset_t set; - unsigned int eax; + unsigned int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; if (__get_user(set.sig[0], &frame->sc.oldmask) || (_COMPAT_NSIG_WORDS > 1 - && __copy_from_user((((char *) &set.sig) + 4), &frame->extramask, + && __copy_from_user((((char *) &set.sig) + 4), + &frame->extramask, sizeof(frame->extramask)))) goto badframe; @@ -292,24 +296,24 @@ asmlinkage long sys32_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - - if (ia32_restore_sigcontext(regs, &frame->sc, &eax)) + + if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: signal_fault(regs, frame, "32bit sigreturn"); return 0; -} +} asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned int eax; + unsigned int ax; struct pt_regs tregs; - frame = (struct rt_sigframe __user *)(regs->rsp - 4); + frame = (struct rt_sigframe __user *)(regs->sp - 4); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -321,28 +325,28 @@ asmlinkage long sys32_rt_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; tregs = *regs; if (sys32_sigaltstack(&frame->uc.uc_stack, NULL, &tregs) == -EFAULT) goto badframe; - return eax; + return ax; badframe: - signal_fault(regs,frame,"32bit rt sigreturn"); + signal_fault(regs, frame, "32bit rt sigreturn"); return 0; -} +} /* * Set up a signal frame. */ -static int -ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __user *fpstate, - struct pt_regs *regs, unsigned int mask) +static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, + struct _fpstate_ia32 __user *fpstate, + struct pt_regs *regs, unsigned int mask) { int tmp, err = 0; @@ -356,26 +360,26 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __ __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->rdi, &sc->edi); - err |= __put_user((u32)regs->rsi, &sc->esi); - err |= __put_user((u32)regs->rbp, &sc->ebp); - err |= __put_user((u32)regs->rsp, &sc->esp); - err |= __put_user((u32)regs->rbx, &sc->ebx); - err |= __put_user((u32)regs->rdx, &sc->edx); - err |= __put_user((u32)regs->rcx, &sc->ecx); - err |= __put_user((u32)regs->rax, &sc->eax); + err |= __put_user((u32)regs->di, &sc->di); + err |= __put_user((u32)regs->si, &sc->si); + err |= __put_user((u32)regs->bp, &sc->bp); + err |= __put_user((u32)regs->sp, &sc->sp); + err |= __put_user((u32)regs->bx, &sc->bx); + err |= __put_user((u32)regs->dx, &sc->dx); + err |= __put_user((u32)regs->cx, &sc->cx); + err |= __put_user((u32)regs->ax, &sc->ax); err |= __put_user((u32)regs->cs, &sc->cs); err |= __put_user((u32)regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->rip, &sc->eip); - err |= __put_user((u32)regs->eflags, &sc->eflags); - err |= __put_user((u32)regs->rsp, &sc->esp_at_signal); + err |= __put_user((u32)regs->ip, &sc->ip); + err |= __put_user((u32)regs->flags, &sc->flags); + err |= __put_user((u32)regs->sp, &sc->sp_at_signal); - tmp = save_i387_ia32(current, fpstate, regs, 0); + tmp = save_i387_ia32(fpstate); if (tmp < 0) err = -EFAULT; - else { + else { clear_used_math(); stts(); err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), @@ -392,40 +396,53 @@ ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, struct _fpstate_ia32 __ /* * Determine which stack to use.. */ -static void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) +static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, + size_t frame_size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack */ - rsp = regs->rsp; + sp = regs->sp; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && - ka->sa.sa_restorer) { - rsp = (unsigned long) ka->sa.sa_restorer; - } + ka->sa.sa_restorer) + sp = (unsigned long) ka->sa.sa_restorer; - rsp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - rsp = ((rsp + 4) & -16ul) - 4; - return (void __user *) rsp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } int ia32_setup_frame(int sig, struct k_sigaction *ka, - compat_sigset_t *set, struct pt_regs * regs) + compat_sigset_t *set, struct pt_regs *regs) { struct sigframe __user *frame; + void __user *restorer; int err = 0; + /* copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u16 poplmovl; + u32 val; + u16 int80; + u16 pad; + } __attribute__((packed)) code = { + 0xb858, /* popl %eax ; movl $...,%eax */ + __NR_ia32_sigreturn, + 0x80cd, /* int $0x80 */ + 0, + }; + frame = get_sigframe(ka, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) @@ -443,64 +460,53 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (_COMPAT_NSIG_WORDS > 1) { err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); + if (err) + goto give_sigsegv; } - if (err) - goto give_sigsegv; - /* Return stub is in 32bit vsyscall page */ - { - void __user *restorer; + if (ka->sa.sa_flags & SA_RESTORER) { + restorer = ka->sa.sa_restorer; + } else { + /* Return stub is in 32bit vsyscall page */ if (current->binfmt->hasvdso) - restorer = VSYSCALL32_SIGRETURN; + restorer = VDSO32_SYMBOL(current->mm->context.vdso, + sigreturn); else - restorer = (void *)&frame->retcode; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); - } - /* These are actually not used anymore, but left because some - gdb versions depend on them as a marker. */ - { - /* copy_to_user optimizes that into a single 8 byte store */ - static const struct { - u16 poplmovl; - u32 val; - u16 int80; - u16 pad; - } __attribute__((packed)) code = { - 0xb858, /* popl %eax ; movl $...,%eax */ - __NR_ia32_sigreturn, - 0x80cd, /* int $0x80 */ - 0, - }; - err |= __copy_to_user(frame->retcode, &code, 8); + restorer = &frame->retcode; } + err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + + /* + * These are actually not used anymore, but left because some + * gdb versions depend on them as a marker. + */ + err |= __copy_to_user(frame->retcode, &code, 8); if (err) goto give_sigsegv; /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = 0; - regs->rcx = 0; + regs->ax = sig; + regs->dx = 0; + regs->cx = 0; - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -511,25 +517,34 @@ give_sigsegv: } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - compat_sigset_t *set, struct pt_regs * regs) + compat_sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; + struct exec_domain *ed = current_thread_info()->exec_domain; + void __user *restorer; int err = 0; + /* __copy_to_user optimizes that into a single 8 byte store */ + static const struct { + u8 movl; + u32 val; + u16 int80; + u16 pad; + u8 pad2; + } __attribute__((packed)) code = { + 0xb8, + __NR_ia32_rt_sigreturn, + 0x80cd, + 0, + }; + frame = get_sigframe(ka, regs, sizeof(*frame)); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - { - struct exec_domain *ed = current_thread_info()->exec_domain; - err |= __put_user((ed - && ed->signal_invmap - && sig < 32 - ? ed->signal_invmap[sig] - : sig), - &frame->sig); - } + err |= __put_user((ed && ed->signal_invmap && sig < 32 + ? ed->signal_invmap[sig] : sig), &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); @@ -540,73 +555,58 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, - regs, set->sig[0]); + regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) goto give_sigsegv; - - { - void __user *restorer = VSYSCALL32_RTSIGRETURN; - if (ka->sa.sa_flags & SA_RESTORER) - restorer = ka->sa.sa_restorer; - err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); - } - - /* This is movl $,%eax ; int $0x80 */ - /* Not actually used anymore, but left because some gdb versions - need it. */ - { - /* __copy_to_user optimizes that into a single 8 byte store */ - static const struct { - u8 movl; - u32 val; - u16 int80; - u16 pad; - u8 pad2; - } __attribute__((packed)) code = { - 0xb8, - __NR_ia32_rt_sigreturn, - 0x80cd, - 0, - }; - err |= __copy_to_user(frame->retcode, &code, 8); - } + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + else + restorer = VDSO32_SYMBOL(current->mm->context.vdso, + rt_sigreturn); + err |= __put_user(ptr_to_compat(restorer), &frame->pretcode); + + /* + * Not actually used anymore, but left because some gdb + * versions need it. + */ + err |= __copy_to_user(frame->retcode, &code, 8); if (err) goto give_sigsegv; /* Set up registers for signal handler */ - regs->rsp = (unsigned long) frame; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; /* Make -mregparm=3 work */ - regs->rax = sig; - regs->rdx = (unsigned long) &frame->info; - regs->rcx = (unsigned long) &frame->uc; + regs->ax = sig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; + + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); + asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); - - regs->cs = __USER32_CS; - regs->ss = __USER32_DS; + regs->cs = __USER32_CS; + regs->ss = __USER32_DS; set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index df588f0f76e..0db0a6291bb 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -12,7 +12,6 @@ #include <asm/ia32_unistd.h> #include <asm/thread_info.h> #include <asm/segment.h> -#include <asm/vsyscall32.h> #include <asm/irqflags.h> #include <linux/linkage.h> @@ -104,7 +103,7 @@ ENTRY(ia32_sysenter_target) pushfq CFI_ADJUST_CFA_OFFSET 8 /*CFI_REL_OFFSET rflags,0*/ - movl $VSYSCALL32_SYSEXIT, %r10d + movl 8*3-THREAD_SIZE+threadinfo_sysenter_return(%rsp), %r10d CFI_REGISTER rip,r10 pushq $__USER32_CS CFI_ADJUST_CFA_OFFSET 8 @@ -142,6 +141,8 @@ sysenter_do_call: andl $~TS_COMPAT,threadinfo_status(%r10) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) + movl RIP-R11(%rsp),%edx /* User %eip */ + CFI_REGISTER rip,rdx RESTORE_ARGS 1,24,1,1,1,1 popfq CFI_ADJUST_CFA_OFFSET -8 @@ -149,8 +150,6 @@ sysenter_do_call: popq %rcx /* User %esp */ CFI_ADJUST_CFA_OFFSET -8 CFI_REGISTER rsp,rcx - movl $VSYSCALL32_SYSEXIT,%edx /* User %eip */ - CFI_REGISTER rip,rdx TRACE_IRQS_ON swapgs sti /* sti only takes effect after the next instruction */ @@ -644,8 +643,8 @@ ia32_sys_call_table: .quad compat_sys_futex /* 240 */ .quad compat_sys_sched_setaffinity .quad compat_sys_sched_getaffinity - .quad sys32_set_thread_area - .quad sys32_get_thread_area + .quad sys_set_thread_area + .quad sys_get_thread_area .quad compat_sys_io_setup /* 245 */ .quad sys_io_destroy .quad compat_sys_io_getevents diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index 7b3342e5aab..d21991ce606 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -9,9 +9,8 @@ #include <linux/ipc.h> #include <linux/compat.h> -asmlinkage long -sys32_ipc(u32 call, int first, int second, int third, - compat_uptr_t ptr, u32 fifth) +asmlinkage long sys32_ipc(u32 call, int first, int second, int third, + compat_uptr_t ptr, u32 fifth) { int version; @@ -19,36 +18,35 @@ sys32_ipc(u32 call, int first, int second, int third, call &= 0xffff; switch (call) { - case SEMOP: + case SEMOP: /* struct sembuf is the same on 32 and 64bit :)) */ return sys_semtimedop(first, compat_ptr(ptr), second, NULL); - case SEMTIMEDOP: + case SEMTIMEDOP: return compat_sys_semtimedop(first, compat_ptr(ptr), second, compat_ptr(fifth)); - case SEMGET: + case SEMGET: return sys_semget(first, second, third); - case SEMCTL: + case SEMCTL: return compat_sys_semctl(first, second, third, compat_ptr(ptr)); - case MSGSND: + case MSGSND: return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); - case MSGRCV: + case MSGRCV: return compat_sys_msgrcv(first, second, fifth, third, version, compat_ptr(ptr)); - case MSGGET: + case MSGGET: return sys_msgget((key_t) first, second); - case MSGCTL: + case MSGCTL: return compat_sys_msgctl(first, second, compat_ptr(ptr)); - case SHMAT: + case SHMAT: return compat_sys_shmat(first, second, third, version, compat_ptr(ptr)); - break; - case SHMDT: + case SHMDT: return sys_shmdt(compat_ptr(ptr)); - case SHMGET: + case SHMGET: return sys_shmget(first, (unsigned)second, third); - case SHMCTL: + case SHMCTL: return compat_sys_shmctl(first, second, compat_ptr(ptr)); } return -ENOSYS; diff --git a/arch/x86/ia32/mmap32.c b/arch/x86/ia32/mmap32.c deleted file mode 100644 index e4b84b4a417..00000000000 --- a/arch/x86/ia32/mmap32.c +++ /dev/null @@ -1,79 +0,0 @@ -/* - * linux/arch/x86_64/ia32/mm/mmap.c - * - * flexible mmap layout support - * - * Based on the i386 version which was - * - * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> - */ - -#include <linux/personality.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/sched.h> - -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole. - */ -#define MIN_GAP (128*1024*1024) -#define MAX_GAP (TASK_SIZE/6*5) - -static inline unsigned long mmap_base(struct mm_struct *mm) -{ - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - unsigned long random_factor = 0; - - if (current->flags & PF_RANDOMIZE) - random_factor = get_random_int() % (1024*1024); - - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; - - return PAGE_ALIGN(TASK_SIZE - gap - random_factor); -} - -/* - * This function, called very early during the creation of a new - * process VM image, sets up which VM layout function to use: - */ -void ia32_pick_mmap_layout(struct mm_struct *mm) -{ - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { - mm->mmap_base = TASK_UNMAPPED_BASE; - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; - } else { - mm->mmap_base = mmap_base(mm); - mm->get_unmapped_area = arch_get_unmapped_area_topdown; - mm->unmap_area = arch_unmap_area_topdown; - } -} diff --git a/arch/x86/ia32/ptrace32.c b/arch/x86/ia32/ptrace32.c deleted file mode 100644 index 4a233ad6269..00000000000 --- a/arch/x86/ia32/ptrace32.c +++ /dev/null @@ -1,404 +0,0 @@ -/* - * 32bit ptrace for x86-64. - * - * Copyright 2001,2002 Andi Kleen, SuSE Labs. - * Some parts copied from arch/i386/kernel/ptrace.c. See that file for earlier - * copyright. - * - * This allows to access 64bit processes too; but there is no way to see the extended - * register contents. - */ - -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/sched.h> -#include <linux/syscalls.h> -#include <linux/unistd.h> -#include <linux/mm.h> -#include <linux/err.h> -#include <linux/ptrace.h> -#include <asm/ptrace.h> -#include <asm/compat.h> -#include <asm/uaccess.h> -#include <asm/user32.h> -#include <asm/user.h> -#include <asm/errno.h> -#include <asm/debugreg.h> -#include <asm/i387.h> -#include <asm/fpu32.h> -#include <asm/ia32.h> - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (31-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -#define R32(l,q) \ - case offsetof(struct user32, regs.l): stack[offsetof(struct pt_regs, q)/8] = val; break - -static int putreg32(struct task_struct *child, unsigned regno, u32 val) -{ - int i; - __u64 *stack = (__u64 *)task_pt_regs(child); - - switch (regno) { - case offsetof(struct user32, regs.fs): - if (val && (val & 3) != 3) return -EIO; - child->thread.fsindex = val & 0xffff; - break; - case offsetof(struct user32, regs.gs): - if (val && (val & 3) != 3) return -EIO; - child->thread.gsindex = val & 0xffff; - break; - case offsetof(struct user32, regs.ds): - if (val && (val & 3) != 3) return -EIO; - child->thread.ds = val & 0xffff; - break; - case offsetof(struct user32, regs.es): - child->thread.es = val & 0xffff; - break; - case offsetof(struct user32, regs.ss): - if ((val & 3) != 3) return -EIO; - stack[offsetof(struct pt_regs, ss)/8] = val & 0xffff; - break; - case offsetof(struct user32, regs.cs): - if ((val & 3) != 3) return -EIO; - stack[offsetof(struct pt_regs, cs)/8] = val & 0xffff; - break; - - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(esp, rsp); - - case offsetof(struct user32, regs.eflags): { - __u64 *flags = &stack[offsetof(struct pt_regs, eflags)/8]; - val &= FLAG_MASK; - *flags = val | (*flags & ~FLAG_MASK); - break; - } - - case offsetof(struct user32, u_debugreg[4]): - case offsetof(struct user32, u_debugreg[5]): - return -EIO; - - case offsetof(struct user32, u_debugreg[0]): - child->thread.debugreg0 = val; - break; - - case offsetof(struct user32, u_debugreg[1]): - child->thread.debugreg1 = val; - break; - - case offsetof(struct user32, u_debugreg[2]): - child->thread.debugreg2 = val; - break; - - case offsetof(struct user32, u_debugreg[3]): - child->thread.debugreg3 = val; - break; - - case offsetof(struct user32, u_debugreg[6]): - child->thread.debugreg6 = val; - break; - - case offsetof(struct user32, u_debugreg[7]): - val &= ~DR_CONTROL_RESERVED; - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - for(i=0; i<4; i++) - if ((0x5454 >> ((val >> (16 + 4*i)) & 0xf)) & 1) - return -EIO; - child->thread.debugreg7 = val; - if (val) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - break; - - default: - if (regno > sizeof(struct user32) || (regno & 3)) - return -EIO; - - /* Other dummy fields in the virtual user structure are ignored */ - break; - } - return 0; -} - -#undef R32 - -#define R32(l,q) \ - case offsetof(struct user32, regs.l): *val = stack[offsetof(struct pt_regs, q)/8]; break - -static int getreg32(struct task_struct *child, unsigned regno, u32 *val) -{ - __u64 *stack = (__u64 *)task_pt_regs(child); - - switch (regno) { - case offsetof(struct user32, regs.fs): - *val = child->thread.fsindex; - break; - case offsetof(struct user32, regs.gs): - *val = child->thread.gsindex; - break; - case offsetof(struct user32, regs.ds): - *val = child->thread.ds; - break; - case offsetof(struct user32, regs.es): - *val = child->thread.es; - break; - - R32(cs, cs); - R32(ss, ss); - R32(ebx, rbx); - R32(ecx, rcx); - R32(edx, rdx); - R32(edi, rdi); - R32(esi, rsi); - R32(ebp, rbp); - R32(eax, rax); - R32(orig_eax, orig_rax); - R32(eip, rip); - R32(eflags, eflags); - R32(esp, rsp); - - case offsetof(struct user32, u_debugreg[0]): - *val = child->thread.debugreg0; - break; - case offsetof(struct user32, u_debugreg[1]): - *val = child->thread.debugreg1; - break; - case offsetof(struct user32, u_debugreg[2]): - *val = child->thread.debugreg2; - break; - case offsetof(struct user32, u_debugreg[3]): - *val = child->thread.debugreg3; - break; - case offsetof(struct user32, u_debugreg[6]): - *val = child->thread.debugreg6; - break; - case offsetof(struct user32, u_debugreg[7]): - *val = child->thread.debugreg7; - break; - - default: - if (regno > sizeof(struct user32) || (regno & 3)) - return -EIO; - - /* Other dummy fields in the virtual user structure are ignored */ - *val = 0; - break; - } - return 0; -} - -#undef R32 - -static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) -{ - int ret; - compat_siginfo_t __user *si32 = compat_ptr(data); - siginfo_t ssi; - siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); - if (request == PTRACE_SETSIGINFO) { - memset(&ssi, 0, sizeof(siginfo_t)); - ret = copy_siginfo_from_user32(&ssi, si32); - if (ret) - return ret; - if (copy_to_user(si, &ssi, sizeof(siginfo_t))) - return -EFAULT; - } - ret = sys_ptrace(request, pid, addr, (unsigned long)si); - if (ret) - return ret; - if (request == PTRACE_GETSIGINFO) { - if (copy_from_user(&ssi, si, sizeof(siginfo_t))) - return -EFAULT; - ret = copy_siginfo_to_user32(si32, &ssi); - } - return ret; -} - -asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) -{ - struct task_struct *child; - struct pt_regs *childregs; - void __user *datap = compat_ptr(data); - int ret; - __u32 val; - - switch (request) { - case PTRACE_TRACEME: - case PTRACE_ATTACH: - case PTRACE_KILL: - case PTRACE_CONT: - case PTRACE_SINGLESTEP: - case PTRACE_DETACH: - case PTRACE_SYSCALL: - case PTRACE_OLDSETOPTIONS: - case PTRACE_SETOPTIONS: - case PTRACE_SET_THREAD_AREA: - case PTRACE_GET_THREAD_AREA: - return sys_ptrace(request, pid, addr, data); - - default: - return -EINVAL; - - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - case PTRACE_POKEUSR: - case PTRACE_PEEKUSR: - case PTRACE_GETREGS: - case PTRACE_SETREGS: - case PTRACE_SETFPREGS: - case PTRACE_GETFPREGS: - case PTRACE_SETFPXREGS: - case PTRACE_GETFPXREGS: - case PTRACE_GETEVENTMSG: - break; - - case PTRACE_SETSIGINFO: - case PTRACE_GETSIGINFO: - return ptrace32_siginfo(request, pid, addr, data); - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) - return PTR_ERR(child); - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out; - - childregs = task_pt_regs(child); - - switch (request) { - case PTRACE_PEEKDATA: - case PTRACE_PEEKTEXT: - ret = 0; - if (access_process_vm(child, addr, &val, sizeof(u32), 0)!=sizeof(u32)) - ret = -EIO; - else - ret = put_user(val, (unsigned int __user *)datap); - break; - - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(u32), 1)!=sizeof(u32)) - ret = -EIO; - break; - - case PTRACE_PEEKUSR: - ret = getreg32(child, addr, &val); - if (ret == 0) - ret = put_user(val, (__u32 __user *)datap); - break; - - case PTRACE_POKEUSR: - ret = putreg32(child, addr, data); - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - int i; - if (!access_ok(VERIFY_WRITE, datap, 16*4)) { - ret = -EIO; - break; - } - ret = 0; - for ( i = 0; i <= 16*4 ; i += sizeof(__u32) ) { - getreg32(child, i, &val); - ret |= __put_user(val,(u32 __user *)datap); - datap += sizeof(u32); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - int i; - if (!access_ok(VERIFY_READ, datap, 16*4)) { - ret = -EIO; - break; - } - ret = 0; - for ( i = 0; i <= 16*4; i += sizeof(u32) ) { - ret |= __get_user(tmp, (u32 __user *)datap); - putreg32(child, i, tmp); - datap += sizeof(u32); - } - break; - } - - case PTRACE_GETFPREGS: - ret = -EIO; - if (!access_ok(VERIFY_READ, compat_ptr(data), - sizeof(struct user_i387_struct))) - break; - save_i387_ia32(child, datap, childregs, 1); - ret = 0; - break; - - case PTRACE_SETFPREGS: - ret = -EIO; - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_i387_struct))) - break; - ret = 0; - /* don't check EFAULT to be bug-to-bug compatible to i386 */ - restore_i387_ia32(child, datap, 1); - break; - - case PTRACE_GETFPXREGS: { - struct user32_fxsr_struct __user *u = datap; - init_fpu(child); - ret = -EIO; - if (!access_ok(VERIFY_WRITE, u, sizeof(*u))) - break; - ret = -EFAULT; - if (__copy_to_user(u, &child->thread.i387.fxsave, sizeof(*u))) - break; - ret = __put_user(childregs->cs, &u->fcs); - ret |= __put_user(child->thread.ds, &u->fos); - break; - } - case PTRACE_SETFPXREGS: { - struct user32_fxsr_struct __user *u = datap; - unlazy_fpu(child); - ret = -EIO; - if (!access_ok(VERIFY_READ, u, sizeof(*u))) - break; - /* no checking to be bug-to-bug compatible with i386. */ - /* but silence warning */ - if (__copy_from_user(&child->thread.i387.fxsave, u, sizeof(*u))) - ; - set_stopped_child_used_math(child); - child->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - ret = 0; - break; - } - - case PTRACE_GETEVENTMSG: - ret = put_user(child->ptrace_message,(unsigned int __user *)compat_ptr(data)); - break; - - default: - BUG(); - } - - out: - put_task_struct(child); - return ret; -} - diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index bee96d61443..abf71d26fc2 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -1,29 +1,29 @@ /* * sys_ia32.c: Conversion between 32bit and 64bit native syscalls. Based on - * sys_sparc32 + * sys_sparc32 * * Copyright (C) 2000 VA Linux Co * Copyright (C) 2000 Don Dugger <n0ano@valinux.com> - * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> - * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) - * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 1999 Arun Sharma <arun.sharma@intel.com> + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) * Copyright (C) 2000 Hewlett-Packard Co. * Copyright (C) 2000 David Mosberger-Tang <davidm@hpl.hp.com> - * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) + * Copyright (C) 2000,2001,2002 Andi Kleen, SuSE Labs (x86-64 port) * * These routines maintain argument size conversion between 32bit and 64bit - * environment. In 2.5 most of this should be moved to a generic directory. + * environment. In 2.5 most of this should be moved to a generic directory. * * This file assumes that there is a hole at the end of user address space. - * - * Some of the functions are LE specific currently. These are hopefully all marked. - * This should be fixed. + * + * Some of the functions are LE specific currently. These are + * hopefully all marked. This should be fixed. */ #include <linux/kernel.h> #include <linux/sched.h> -#include <linux/fs.h> -#include <linux/file.h> +#include <linux/fs.h> +#include <linux/file.h> #include <linux/signal.h> #include <linux/syscalls.h> #include <linux/resource.h> @@ -90,43 +90,44 @@ int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) return -EOVERFLOW; if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || - __put_user (old_encode_dev(kbuf->dev), &ubuf->st_dev) || - __put_user (ino, &ubuf->st_ino) || - __put_user (kbuf->mode, &ubuf->st_mode) || - __put_user (kbuf->nlink, &ubuf->st_nlink) || - __put_user (uid, &ubuf->st_uid) || - __put_user (gid, &ubuf->st_gid) || - __put_user (old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || - __put_user (kbuf->size, &ubuf->st_size) || - __put_user (kbuf->atime.tv_sec, &ubuf->st_atime) || - __put_user (kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user (kbuf->mtime.tv_sec, &ubuf->st_mtime) || - __put_user (kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user (kbuf->ctime.tv_sec, &ubuf->st_ctime) || - __put_user (kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user (kbuf->blksize, &ubuf->st_blksize) || - __put_user (kbuf->blocks, &ubuf->st_blocks)) + __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || + __put_user(ino, &ubuf->st_ino) || + __put_user(kbuf->mode, &ubuf->st_mode) || + __put_user(kbuf->nlink, &ubuf->st_nlink) || + __put_user(uid, &ubuf->st_uid) || + __put_user(gid, &ubuf->st_gid) || + __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || + __put_user(kbuf->size, &ubuf->st_size) || + __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || + __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || + __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || + __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user(kbuf->blksize, &ubuf->st_blksize) || + __put_user(kbuf->blocks, &ubuf->st_blocks)) return -EFAULT; return 0; } -asmlinkage long -sys32_truncate64(char __user * filename, unsigned long offset_low, unsigned long offset_high) +asmlinkage long sys32_truncate64(char __user *filename, + unsigned long offset_low, + unsigned long offset_high) { return sys_truncate(filename, ((loff_t) offset_high << 32) | offset_low); } -asmlinkage long -sys32_ftruncate64(unsigned int fd, unsigned long offset_low, unsigned long offset_high) +asmlinkage long sys32_ftruncate64(unsigned int fd, unsigned long offset_low, + unsigned long offset_high) { return sys_ftruncate(fd, ((loff_t) offset_high << 32) | offset_low); } -/* Another set for IA32/LFS -- x86_64 struct stat is different due to - support for 64bit inode numbers. */ - -static int -cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) +/* + * Another set for IA32/LFS -- x86_64 struct stat is different due to + * support for 64bit inode numbers. + */ +static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) { typeof(ubuf->st_uid) uid = 0; typeof(ubuf->st_gid) gid = 0; @@ -134,38 +135,39 @@ cp_stat64(struct stat64 __user *ubuf, struct kstat *stat) SET_GID(gid, stat->gid); if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct stat64)) || __put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) || - __put_user (stat->ino, &ubuf->__st_ino) || - __put_user (stat->ino, &ubuf->st_ino) || - __put_user (stat->mode, &ubuf->st_mode) || - __put_user (stat->nlink, &ubuf->st_nlink) || - __put_user (uid, &ubuf->st_uid) || - __put_user (gid, &ubuf->st_gid) || - __put_user (huge_encode_dev(stat->rdev), &ubuf->st_rdev) || - __put_user (stat->size, &ubuf->st_size) || - __put_user (stat->atime.tv_sec, &ubuf->st_atime) || - __put_user (stat->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user (stat->mtime.tv_sec, &ubuf->st_mtime) || - __put_user (stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user (stat->ctime.tv_sec, &ubuf->st_ctime) || - __put_user (stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user (stat->blksize, &ubuf->st_blksize) || - __put_user (stat->blocks, &ubuf->st_blocks)) + __put_user(stat->ino, &ubuf->__st_ino) || + __put_user(stat->ino, &ubuf->st_ino) || + __put_user(stat->mode, &ubuf->st_mode) || + __put_user(stat->nlink, &ubuf->st_nlink) || + __put_user(uid, &ubuf->st_uid) || + __put_user(gid, &ubuf->st_gid) || + __put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) || + __put_user(stat->size, &ubuf->st_size) || + __put_user(stat->atime.tv_sec, &ubuf->st_atime) || + __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) || + __put_user(stat->mtime.tv_sec, &ubuf->st_mtime) || + __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) || + __put_user(stat->ctime.tv_sec, &ubuf->st_ctime) || + __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) || + __put_user(stat->blksize, &ubuf->st_blksize) || + __put_user(stat->blocks, &ubuf->st_blocks)) return -EFAULT; return 0; } -asmlinkage long -sys32_stat64(char __user * filename, struct stat64 __user *statbuf) +asmlinkage long sys32_stat64(char __user *filename, + struct stat64 __user *statbuf) { struct kstat stat; int ret = vfs_stat(filename, &stat); + if (!ret) ret = cp_stat64(statbuf, &stat); return ret; } -asmlinkage long -sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) +asmlinkage long sys32_lstat64(char __user *filename, + struct stat64 __user *statbuf) { struct kstat stat; int ret = vfs_lstat(filename, &stat); @@ -174,8 +176,7 @@ sys32_lstat64(char __user * filename, struct stat64 __user *statbuf) return ret; } -asmlinkage long -sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) +asmlinkage long sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) { struct kstat stat; int ret = vfs_fstat(fd, &stat); @@ -184,9 +185,8 @@ sys32_fstat64(unsigned int fd, struct stat64 __user *statbuf) return ret; } -asmlinkage long -sys32_fstatat(unsigned int dfd, char __user *filename, - struct stat64 __user* statbuf, int flag) +asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag) { struct kstat stat; int error = -EINVAL; @@ -221,8 +221,7 @@ struct mmap_arg_struct { unsigned int offset; }; -asmlinkage long -sys32_mmap(struct mmap_arg_struct __user *arg) +asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) { struct mmap_arg_struct a; struct file *file = NULL; @@ -233,33 +232,33 @@ sys32_mmap(struct mmap_arg_struct __user *arg) return -EFAULT; if (a.offset & ~PAGE_MASK) - return -EINVAL; + return -EINVAL; if (!(a.flags & MAP_ANONYMOUS)) { file = fget(a.fd); if (!file) return -EBADF; } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, a.offset>>PAGE_SHIFT); + + mm = current->mm; + down_write(&mm->mmap_sem); + retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, + a.offset>>PAGE_SHIFT); if (file) fput(file); - up_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); return retval; } -asmlinkage long -sys32_mprotect(unsigned long start, size_t len, unsigned long prot) +asmlinkage long sys32_mprotect(unsigned long start, size_t len, + unsigned long prot) { - return sys_mprotect(start,len,prot); + return sys_mprotect(start, len, prot); } -asmlinkage long -sys32_pipe(int __user *fd) +asmlinkage long sys32_pipe(int __user *fd) { int retval; int fds[2]; @@ -269,13 +268,13 @@ sys32_pipe(int __user *fd) goto out; if (copy_to_user(fd, fds, sizeof(fds))) retval = -EFAULT; - out: +out: return retval; } -asmlinkage long -sys32_rt_sigaction(int sig, struct sigaction32 __user *act, - struct sigaction32 __user *oact, unsigned int sigsetsize) +asmlinkage long sys32_rt_sigaction(int sig, struct sigaction32 __user *act, + struct sigaction32 __user *oact, + unsigned int sigsetsize) { struct k_sigaction new_ka, old_ka; int ret; @@ -291,12 +290,17 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act, if (!access_ok(VERIFY_READ, act, sizeof(*act)) || __get_user(handler, &act->sa_handler) || __get_user(new_ka.sa.sa_flags, &act->sa_flags) || - __get_user(restorer, &act->sa_restorer)|| - __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t))) + __get_user(restorer, &act->sa_restorer) || + __copy_from_user(&set32, &act->sa_mask, + sizeof(compat_sigset_t))) return -EFAULT; new_ka.sa.sa_handler = compat_ptr(handler); new_ka.sa.sa_restorer = compat_ptr(restorer); - /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ + + /* + * FIXME: here we rely on _COMPAT_NSIG_WORS to be >= + * than _NSIG_WORDS << 1 + */ switch (_NSIG_WORDS) { case 4: new_ka.sa.sa_mask.sig[3] = set32.sig[6] | (((long)set32.sig[7]) << 32); @@ -312,7 +316,10 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act, ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { - /* FIXME: here we rely on _COMPAT_NSIG_WORS to be >= than _NSIG_WORDS << 1 */ + /* + * FIXME: here we rely on _COMPAT_NSIG_WORS to be >= + * than _NSIG_WORDS << 1 + */ switch (_NSIG_WORDS) { case 4: set32.sig[7] = (old_ka.sa.sa_mask.sig[3] >> 32); @@ -328,23 +335,26 @@ sys32_rt_sigaction(int sig, struct sigaction32 __user *act, set32.sig[0] = old_ka.sa.sa_mask.sig[0]; } if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || - __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t))) + __copy_to_user(&oact->sa_mask, &set32, + sizeof(compat_sigset_t))) return -EFAULT; } return ret; } -asmlinkage long -sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigaction32 __user *oact) +asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act, + struct old_sigaction32 __user *oact) { - struct k_sigaction new_ka, old_ka; - int ret; + struct k_sigaction new_ka, old_ka; + int ret; - if (act) { + if (act) { compat_old_sigset_t mask; compat_uptr_t handler, restorer; @@ -359,33 +369,35 @@ sys32_sigaction (int sig, struct old_sigaction32 __user *act, struct old_sigacti new_ka.sa.sa_restorer = compat_ptr(restorer); siginitset(&new_ka.sa.sa_mask, mask); - } + } - ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); if (!ret && oact) { if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || - __put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler) || - __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer) || + __put_user(ptr_to_compat(old_ka.sa.sa_handler), + &oact->sa_handler) || + __put_user(ptr_to_compat(old_ka.sa.sa_restorer), + &oact->sa_restorer) || __put_user(old_ka.sa.sa_flags, &oact->sa_flags) || __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask)) return -EFAULT; - } + } return ret; } -asmlinkage long -sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, - compat_sigset_t __user *oset, unsigned int sigsetsize) +asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, + compat_sigset_t __user *oset, + unsigned int sigsetsize) { sigset_t s; compat_sigset_t s32; int ret; mm_segment_t old_fs = get_fs(); - + if (set) { - if (copy_from_user (&s32, set, sizeof(compat_sigset_t))) + if (copy_from_user(&s32, set, sizeof(compat_sigset_t))) return -EFAULT; switch (_NSIG_WORDS) { case 4: s.sig[3] = s32.sig[6] | (((long)s32.sig[7]) << 32); @@ -394,13 +406,14 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, case 1: s.sig[0] = s32.sig[0] | (((long)s32.sig[1]) << 32); } } - set_fs (KERNEL_DS); + set_fs(KERNEL_DS); ret = sys_rt_sigprocmask(how, set ? (sigset_t __user *)&s : NULL, oset ? (sigset_t __user *)&s : NULL, - sigsetsize); - set_fs (old_fs); - if (ret) return ret; + sigsetsize); + set_fs(old_fs); + if (ret) + return ret; if (oset) { switch (_NSIG_WORDS) { case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; @@ -408,52 +421,49 @@ sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; } - if (copy_to_user (oset, &s32, sizeof(compat_sigset_t))) + if (copy_to_user(oset, &s32, sizeof(compat_sigset_t))) return -EFAULT; } return 0; } -static inline long -get_tv32(struct timeval *o, struct compat_timeval __user *i) +static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) { - int err = -EFAULT; - if (access_ok(VERIFY_READ, i, sizeof(*i))) { + int err = -EFAULT; + + if (access_ok(VERIFY_READ, i, sizeof(*i))) { err = __get_user(o->tv_sec, &i->tv_sec); err |= __get_user(o->tv_usec, &i->tv_usec); } - return err; + return err; } -static inline long -put_tv32(struct compat_timeval __user *o, struct timeval *i) +static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) { int err = -EFAULT; - if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { + + if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { err = __put_user(i->tv_sec, &o->tv_sec); err |= __put_user(i->tv_usec, &o->tv_usec); - } - return err; + } + return err; } -extern unsigned int alarm_setitimer(unsigned int seconds); - -asmlinkage long -sys32_alarm(unsigned int seconds) +asmlinkage long sys32_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } -/* Translations due to time_t size differences. Which affects all - sorts of things, like timeval and itimerval. */ - -extern struct timezone sys_tz; - -asmlinkage long -sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) +/* + * Translations due to time_t size differences. Which affects all + * sorts of things, like timeval and itimerval. + */ +asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) { if (tv) { struct timeval ktv; + do_gettimeofday(&ktv); if (put_tv32(tv, &ktv)) return -EFAULT; @@ -465,14 +475,14 @@ sys32_gettimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) return 0; } -asmlinkage long -sys32_settimeofday(struct compat_timeval __user *tv, struct timezone __user *tz) +asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) { struct timeval ktv; struct timespec kts; struct timezone ktz; - if (tv) { + if (tv) { if (get_tv32(&ktv, tv)) return -EFAULT; kts.tv_sec = ktv.tv_sec; @@ -494,8 +504,7 @@ struct sel_arg_struct { unsigned int tvp; }; -asmlinkage long -sys32_old_select(struct sel_arg_struct __user *arg) +asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg) { struct sel_arg_struct a; @@ -505,50 +514,45 @@ sys32_old_select(struct sel_arg_struct __user *arg) compat_ptr(a.exp), compat_ptr(a.tvp)); } -extern asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t * stat_addr, int options, - struct compat_rusage *ru); - -asmlinkage long -sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, int options) +asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, + int options) { return compat_sys_wait4(pid, stat_addr, options, NULL); } /* 32-bit timeval and related flotsam. */ -asmlinkage long -sys32_sysfs(int option, u32 arg1, u32 arg2) +asmlinkage long sys32_sysfs(int option, u32 arg1, u32 arg2) { return sys_sysfs(option, arg1, arg2); } -asmlinkage long -sys32_sched_rr_get_interval(compat_pid_t pid, struct compat_timespec __user *interval) +asmlinkage long sys32_sched_rr_get_interval(compat_pid_t pid, + struct compat_timespec __user *interval) { struct timespec t; int ret; - mm_segment_t old_fs = get_fs (); - - set_fs (KERNEL_DS); + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t); - set_fs (old_fs); + set_fs(old_fs); if (put_compat_timespec(&t, interval)) return -EFAULT; return ret; } -asmlinkage long -sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) +asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *set, + compat_size_t sigsetsize) { sigset_t s; compat_sigset_t s32; int ret; mm_segment_t old_fs = get_fs(); - - set_fs (KERNEL_DS); + + set_fs(KERNEL_DS); ret = sys_rt_sigpending((sigset_t __user *)&s, sigsetsize); - set_fs (old_fs); + set_fs(old_fs); if (!ret) { switch (_NSIG_WORDS) { case 4: s32.sig[7] = (s.sig[3] >> 32); s32.sig[6] = s.sig[3]; @@ -556,30 +560,29 @@ sys32_rt_sigpending(compat_sigset_t __user *set, compat_size_t sigsetsize) case 2: s32.sig[3] = (s.sig[1] >> 32); s32.sig[2] = s.sig[1]; case 1: s32.sig[1] = (s.sig[0] >> 32); s32.sig[0] = s.sig[0]; } - if (copy_to_user (set, &s32, sizeof(compat_sigset_t))) + if (copy_to_user(set, &s32, sizeof(compat_sigset_t))) return -EFAULT; } return ret; } -asmlinkage long -sys32_rt_sigqueueinfo(int pid, int sig, compat_siginfo_t __user *uinfo) +asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, + compat_siginfo_t __user *uinfo) { siginfo_t info; int ret; mm_segment_t old_fs = get_fs(); - + if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; - set_fs (KERNEL_DS); + set_fs(KERNEL_DS); ret = sys_rt_sigqueueinfo(pid, sig, (siginfo_t __user *)&info); - set_fs (old_fs); + set_fs(old_fs); return ret; } /* These are here just in case some old ia32 binary calls it. */ -asmlinkage long -sys32_pause(void) +asmlinkage long sys32_pause(void) { current->state = TASK_INTERRUPTIBLE; schedule(); @@ -599,25 +602,25 @@ struct sysctl_ia32 { }; -asmlinkage long -sys32_sysctl(struct sysctl_ia32 __user *args32) +asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32) { struct sysctl_ia32 a32; - mm_segment_t old_fs = get_fs (); + mm_segment_t old_fs = get_fs(); void __user *oldvalp, *newvalp; size_t oldlen; int __user *namep; long ret; - if (copy_from_user(&a32, args32, sizeof (a32))) + if (copy_from_user(&a32, args32, sizeof(a32))) return -EFAULT; /* - * We need to pre-validate these because we have to disable address checking - * before calling do_sysctl() because of OLDLEN but we can't run the risk of the - * user specifying bad addresses here. Well, since we're dealing with 32 bit - * addresses, we KNOW that access_ok() will always succeed, so this is an - * expensive NOP, but so what... + * We need to pre-validate these because we have to disable + * address checking before calling do_sysctl() because of + * OLDLEN but we can't run the risk of the user specifying bad + * addresses here. Well, since we're dealing with 32 bit + * addresses, we KNOW that access_ok() will always succeed, so + * this is an expensive NOP, but so what... */ namep = compat_ptr(a32.name); oldvalp = compat_ptr(a32.oldval); @@ -636,34 +639,34 @@ sys32_sysctl(struct sysctl_ia32 __user *args32) unlock_kernel(); set_fs(old_fs); - if (oldvalp && put_user (oldlen, (int __user *)compat_ptr(a32.oldlenp))) + if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp))) return -EFAULT; return ret; } #endif -/* warning: next two assume little endian */ -asmlinkage long -sys32_pread(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) +/* warning: next two assume little endian */ +asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, + u32 poslo, u32 poshi) { return sys_pread64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long -sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, u32 poslo, u32 poshi) +asmlinkage long sys32_pwrite(unsigned int fd, char __user *ubuf, u32 count, + u32 poslo, u32 poshi) { return sys_pwrite64(fd, ubuf, count, ((loff_t)AA(poshi) << 32) | AA(poslo)); } -asmlinkage long -sys32_personality(unsigned long personality) +asmlinkage long sys32_personality(unsigned long personality) { int ret; - if (personality(current->personality) == PER_LINUX32 && + + if (personality(current->personality) == PER_LINUX32 && personality == PER_LINUX) personality = PER_LINUX32; ret = sys_personality(personality); @@ -672,34 +675,33 @@ sys32_personality(unsigned long personality) return ret; } -asmlinkage long -sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, s32 count) +asmlinkage long sys32_sendfile(int out_fd, int in_fd, + compat_off_t __user *offset, s32 count) { mm_segment_t old_fs = get_fs(); int ret; off_t of; - + if (offset && get_user(of, offset)) return -EFAULT; - + set_fs(KERNEL_DS); ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, count); set_fs(old_fs); - + if (offset && put_user(of, offset)) return -EFAULT; - return ret; } asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { struct mm_struct *mm = current->mm; unsigned long error; - struct file * file = NULL; + struct file *file = NULL; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); if (!(flags & MAP_ANONYMOUS)) { @@ -717,36 +719,35 @@ asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, return error; } -asmlinkage long sys32_olduname(struct oldold_utsname __user * name) +asmlinkage long sys32_olduname(struct oldold_utsname __user *name) { + char *arch = "x86_64"; int err; if (!name) return -EFAULT; if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) return -EFAULT; - - down_read(&uts_sem); - - err = __copy_to_user(&name->sysname,&utsname()->sysname, - __OLD_UTS_LEN); - err |= __put_user(0,name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename,&utsname()->nodename, - __OLD_UTS_LEN); - err |= __put_user(0,name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release,&utsname()->release, - __OLD_UTS_LEN); - err |= __put_user(0,name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version,&utsname()->version, - __OLD_UTS_LEN); - err |= __put_user(0,name->version+__OLD_UTS_LEN); - { - char *arch = "x86_64"; - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; - - err |= __copy_to_user(&name->machine, arch, strlen(arch)+1); - } + + down_read(&uts_sem); + + err = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + err |= __put_user(0, name->sysname+__OLD_UTS_LEN); + err |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + err |= __put_user(0, name->nodename+__OLD_UTS_LEN); + err |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + err |= __put_user(0, name->release+__OLD_UTS_LEN); + err |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + err |= __put_user(0, name->version+__OLD_UTS_LEN); + + if (personality(current->personality) == PER_LINUX32) + arch = "i686"; + + err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1); up_read(&uts_sem); @@ -755,17 +756,19 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name) return err; } -long sys32_uname(struct old_utsname __user * name) +long sys32_uname(struct old_utsname __user *name) { int err; + if (!name) return -EFAULT; down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof(*name)); up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) + if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); - return err?-EFAULT:0; + + return err ? -EFAULT : 0; } long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) @@ -773,27 +776,28 @@ long sys32_ustat(unsigned dev, struct ustat32 __user *u32p) struct ustat u; mm_segment_t seg; int ret; - - seg = get_fs(); - set_fs(KERNEL_DS); + + seg = get_fs(); + set_fs(KERNEL_DS); ret = sys_ustat(dev, (struct ustat __user *)&u); set_fs(seg); - if (ret >= 0) { - if (!access_ok(VERIFY_WRITE,u32p,sizeof(struct ustat32)) || - __put_user((__u32) u.f_tfree, &u32p->f_tfree) || - __put_user((__u32) u.f_tinode, &u32p->f_tfree) || - __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || - __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) - ret = -EFAULT; - } + if (ret < 0) + return ret; + + if (!access_ok(VERIFY_WRITE, u32p, sizeof(struct ustat32)) || + __put_user((__u32) u.f_tfree, &u32p->f_tfree) || + __put_user((__u32) u.f_tinode, &u32p->f_tfree) || + __copy_to_user(&u32p->f_fname, u.f_fname, sizeof(u.f_fname)) || + __copy_to_user(&u32p->f_fpack, u.f_fpack, sizeof(u.f_fpack))) + ret = -EFAULT; return ret; -} +} asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { long error; - char * filename; + char *filename; filename = getname(name); error = PTR_ERR(filename); @@ -812,18 +816,19 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, asmlinkage long sys32_clone(unsigned int clone_flags, unsigned int newsp, struct pt_regs *regs) { - void __user *parent_tid = (void __user *)regs->rdx; - void __user *child_tid = (void __user *)regs->rdi; + void __user *parent_tid = (void __user *)regs->dx; + void __user *child_tid = (void __user *)regs->di; + if (!newsp) - newsp = regs->rsp; - return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); + newsp = regs->sp; + return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } /* - * Some system calls that need sign extended arguments. This could be done by a generic wrapper. - */ - -long sys32_lseek (unsigned int fd, int offset, unsigned int whence) + * Some system calls that need sign extended arguments. This could be + * done by a generic wrapper. + */ +long sys32_lseek(unsigned int fd, int offset, unsigned int whence) { return sys_lseek(fd, offset, whence); } @@ -832,49 +837,52 @@ long sys32_kill(int pid, int sig) { return sys_kill(pid, sig); } - -long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, + +long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, __u32 len_low, __u32 len_high, int advice) -{ +{ return sys_fadvise64_64(fd, (((u64)offset_high)<<32) | offset_low, (((u64)len_high)<<32) | len_low, - advice); -} + advice); +} long sys32_vm86_warning(void) -{ +{ struct task_struct *me = current; static char lastcomm[sizeof(me->comm)]; + if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); + compat_printk(KERN_INFO + "%s: vm86 mode not supported on 64 bit kernel\n", + me->comm); strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } + } return -ENOSYS; -} +} long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, - char __user * buf, size_t len) + char __user *buf, size_t len) { return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); } -asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) +asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, + size_t count) { return sys_readahead(fd, ((u64)off_hi << 32) | off_lo, count); } asmlinkage long sys32_sync_file_range(int fd, unsigned off_low, unsigned off_hi, - unsigned n_low, unsigned n_hi, int flags) + unsigned n_low, unsigned n_hi, int flags) { return sys_sync_file_range(fd, ((u64)off_hi << 32) | off_low, ((u64)n_hi << 32) | n_low, flags); } -asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, size_t len, - int advice) +asmlinkage long sys32_fadvise64(int fd, unsigned offset_lo, unsigned offset_hi, + size_t len, int advice) { return sys_fadvise64_64(fd, ((u64)offset_hi << 32) | offset_lo, len, advice); diff --git a/arch/x86/ia32/syscall32.c b/arch/x86/ia32/syscall32.c deleted file mode 100644 index 15013bac181..00000000000 --- a/arch/x86/ia32/syscall32.c +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright 2002,2003 Andi Kleen, SuSE Labs */ - -/* vsyscall handling for 32bit processes. Map a stub page into it - on demand because 32bit cannot reach the kernel's fixmaps */ - -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/kernel.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/stringify.h> -#include <linux/security.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> -#include <asm/ia32_unistd.h> -#include <asm/vsyscall32.h> - -extern unsigned char syscall32_syscall[], syscall32_syscall_end[]; -extern unsigned char syscall32_sysenter[], syscall32_sysenter_end[]; -extern int sysctl_vsyscall32; - -static struct page *syscall32_pages[1]; -static int use_sysenter = -1; - -struct linux_binprm; - -/* Setup a VMA at program startup for the vsyscall page */ -int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) -{ - struct mm_struct *mm = current->mm; - int ret; - - down_write(&mm->mmap_sem); - /* - * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - /* Could randomize here */ - ret = install_special_mapping(mm, VSYSCALL32_BASE, PAGE_SIZE, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - syscall32_pages); - up_write(&mm->mmap_sem); - return ret; -} - -static int __init init_syscall32(void) -{ - char *syscall32_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!syscall32_page) - panic("Cannot allocate syscall32 page"); - syscall32_pages[0] = virt_to_page(syscall32_page); - if (use_sysenter > 0) { - memcpy(syscall32_page, syscall32_sysenter, - syscall32_sysenter_end - syscall32_sysenter); - } else { - memcpy(syscall32_page, syscall32_syscall, - syscall32_syscall_end - syscall32_syscall); - } - return 0; -} - -__initcall(init_syscall32); - -/* May not be __init: called during resume */ -void syscall32_cpu_init(void) -{ - if (use_sysenter < 0) - use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); - - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} diff --git a/arch/x86/ia32/syscall32_syscall.S b/arch/x86/ia32/syscall32_syscall.S deleted file mode 100644 index 933f0f08b1c..00000000000 --- a/arch/x86/ia32/syscall32_syscall.S +++ /dev/null @@ -1,17 +0,0 @@ -/* 32bit VDSOs mapped into user space. */ - - .section ".init.data","aw" - - .globl syscall32_syscall - .globl syscall32_syscall_end - -syscall32_syscall: - .incbin "arch/x86/ia32/vsyscall-syscall.so" -syscall32_syscall_end: - - .globl syscall32_sysenter - .globl syscall32_sysenter_end - -syscall32_sysenter: - .incbin "arch/x86/ia32/vsyscall-sysenter.so" -syscall32_sysenter_end: diff --git a/arch/x86/ia32/tls32.c b/arch/x86/ia32/tls32.c deleted file mode 100644 index 1cc4340de3c..00000000000 --- a/arch/x86/ia32/tls32.c +++ /dev/null @@ -1,163 +0,0 @@ -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/user.h> - -#include <asm/uaccess.h> -#include <asm/desc.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/processor.h> -#include <asm/proto.h> - -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty((struct n_desc_struct *)(t->tls_array) + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - * When you want addresses > 32bit use arch_prctl() - */ -int do_set_thread_area(struct thread_struct *t, struct user_desc __user *u_info) -{ - struct user_desc info; - struct n_desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - if (t == ¤t->thread) - load_TLS(t, cpu); - - put_cpu(); - return 0; -} - -asmlinkage long sys32_set_thread_area(struct user_desc __user *u_info) -{ - return do_set_thread_area(¤t->thread, u_info); -} - - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) -#define GET_LONGMODE(desc) (((desc)->b >> 21) & 1) - -int do_get_thread_area(struct thread_struct *t, struct user_desc __user *u_info) -{ - struct user_desc info; - struct n_desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = ((struct n_desc_struct *)t->tls_array) + idx - GDT_ENTRY_TLS_MIN; - - memset(&info, 0, sizeof(struct user_desc)); - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - info.lm = GET_LONGMODE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - -asmlinkage long sys32_get_thread_area(struct user_desc __user *u_info) -{ - return do_get_thread_area(¤t->thread, u_info); -} - - -int ia32_child_tls(struct task_struct *p, struct pt_regs *childregs) -{ - struct n_desc_struct *desc; - struct user_desc info; - struct user_desc __user *cp; - int idx; - - cp = (void __user *)childregs->rsi; - if (copy_from_user(&info, cp, sizeof(info))) - return -EFAULT; - if (LDT_empty(&info)) - return -EINVAL; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = (struct n_desc_struct *)(p->thread.tls_array) + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - - return 0; -} diff --git a/arch/x86/ia32/vsyscall-sigreturn.S b/arch/x86/ia32/vsyscall-sigreturn.S deleted file mode 100644 index b383be00bae..00000000000 --- a/arch/x86/ia32/vsyscall-sigreturn.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Common code for the sigreturn entry points on the vsyscall page. - * This code uses SYSCALL_ENTER_KERNEL (either syscall or int $0x80) - * to enter the kernel. - * This file is #include'd by vsyscall-*.S to define them after the - * vsyscall entry point. The addresses we get for these entry points - * by doing ".balign 32" must match in both versions of the page. - */ - - .code32 - .section .text.sigreturn,"ax" - .balign 32 - .globl __kernel_sigreturn - .type __kernel_sigreturn,@function -__kernel_sigreturn: -.LSTART_sigreturn: - popl %eax - movl $__NR_ia32_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_sigreturn: - .size __kernel_sigreturn,.-.LSTART_sigreturn - - .section .text.rtsigreturn,"ax" - .balign 32 - .globl __kernel_rt_sigreturn - .type __kernel_rt_sigreturn,@function -__kernel_rt_sigreturn: -.LSTART_rt_sigreturn: - movl $__NR_ia32_rt_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_rt_sigreturn: - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - - .section .eh_frame,"a",@progbits -.LSTARTFRAMES: - .long .LENDCIES-.LSTARTCIES -.LSTARTCIES: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIES: - - .long .LENDFDE2-.LSTARTFDE2 /* Length FDE */ -.LSTARTFDE2: - .long .LSTARTFDE2-.LSTARTFRAMES /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. */ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(IA32_SIGCONTEXT_esp+4) - do_expr(0, IA32_SIGCONTEXT_eax+4) - do_expr(1, IA32_SIGCONTEXT_ecx+4) - do_expr(2, IA32_SIGCONTEXT_edx+4) - do_expr(3, IA32_SIGCONTEXT_ebx+4) - do_expr(5, IA32_SIGCONTEXT_ebp+4) - do_expr(6, IA32_SIGCONTEXT_esi+4) - do_expr(7, IA32_SIGCONTEXT_edi+4) - do_expr(8, IA32_SIGCONTEXT_eip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(IA32_SIGCONTEXT_esp) - do_expr(0, IA32_SIGCONTEXT_eax) - do_expr(1, IA32_SIGCONTEXT_ecx) - do_expr(2, IA32_SIGCONTEXT_edx) - do_expr(3, IA32_SIGCONTEXT_ebx) - do_expr(5, IA32_SIGCONTEXT_ebp) - do_expr(6, IA32_SIGCONTEXT_esi) - do_expr(7, IA32_SIGCONTEXT_edi) - do_expr(8, IA32_SIGCONTEXT_eip) - - .align 4 -.LENDFDE2: - - .long .LENDFDE3-.LSTARTFDE3 /* Length FDE */ -.LSTARTFDE3: - .long .LSTARTFDE3-.LSTARTFRAMES /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esp) - do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eax) - do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ecx) - do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edx) - do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebx) - do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ebp) - do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_esi) - do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_edi) - do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_eip) - - .align 4 -.LENDFDE3: - -#include "../../x86/kernel/vsyscall-note_32.S" - diff --git a/arch/x86/ia32/vsyscall-sysenter.S b/arch/x86/ia32/vsyscall-sysenter.S deleted file mode 100644 index ae056e553d1..00000000000 --- a/arch/x86/ia32/vsyscall-sysenter.S +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Code for the vsyscall page. This version uses the sysenter instruction. - */ - -#include <asm/ia32_unistd.h> -#include <asm/asm-offsets.h> - - .code32 - .text - .section .text.vsyscall,"ax" - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - .space 7,0x90 - jmp .Lenter_kernel - /* 16: System call normal return point is here! */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_ecx-.LSTART_vsyscall - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_edx-.Lpush_ecx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lenter_kernel-.Lpush_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ebp-.Lenter_kernel - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x12 /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_edx-.Lpop_ebp - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ecx-.Lpop_edx - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDE1: - -#define SYSCALL_ENTER_KERNEL int $0x80 -#include "vsyscall-sigreturn.S" diff --git a/arch/x86/ia32/vsyscall.lds b/arch/x86/ia32/vsyscall.lds deleted file mode 100644 index 1dc86ff5bcb..00000000000 --- a/arch/x86/ia32/vsyscall.lds +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address. This script controls its layout. - */ - -/* This must match <asm/fixmap.h>. */ -VSYSCALL_BASE = 0xffffe000; - -SECTIONS -{ - . = VSYSCALL_BASE + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VSYSCALL_BASE + 0x400; - - .text.vsyscall : { *(.text.vsyscall) } :text =0x90909090 - - /* This is an 32bit object and we cannot easily get the offsets - into the 64bit kernel. Just hardcode them here. This assumes - that all the stubs don't need more than 0x100 bytes. */ - . = VSYSCALL_BASE + 0x500; - - .text.sigreturn : { *(.text.sigreturn) } :text =0x90909090 - - . = VSYSCALL_BASE + 0x600; - - .text.rtsigreturn : { *(.text.rtsigreturn) } :text =0x90909090 - - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 38573340b14..6f813009d44 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -1,9 +1,91 @@ -ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/kernel/Makefile_32 -else -include ${srctree}/arch/x86/kernel/Makefile_64 +# +# Makefile for the linux kernel. +# + +extra-y := head_$(BITS).o init_task.o vmlinux.lds +extra-$(CONFIG_X86_64) += head64.o + +CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +CFLAGS_vsyscall_64.o := $(PROFILING) -g0 + +obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o +obj-y += traps_$(BITS).o irq_$(BITS).o +obj-y += time_$(BITS).o ioport.o ldt.o +obj-y += setup_$(BITS).o i8259_$(BITS).o +obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o +obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o +obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o +obj-y += quirks.o i8237.o topology.o kdebugfs.o +obj-y += alternative.o i8253.o +obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o +obj-y += tsc_$(BITS).o io_delay.o rtc.o + +obj-y += i387.o +obj-y += ptrace.o +obj-y += ds.o +obj-$(CONFIG_X86_32) += tls.o +obj-$(CONFIG_IA32_EMULATION) += tls.o +obj-y += step.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += cpu/ +obj-y += acpi/ +obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o +obj-$(CONFIG_X86_64) += reboot.o +obj-$(CONFIG_MCA) += mca_32.o +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_PCI) += early-quirks.o +obj-$(CONFIG_APM) += apm_32.o +obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o +obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o +obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o +obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o +obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o +obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o +obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o +obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o +obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o +obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o +obj-$(CONFIG_X86_VSMP) += vsmp_64.o +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_MODULES) += module_$(BITS).o +obj-$(CONFIG_ACPI_SRAT) += srat_32.o +obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +obj-$(CONFIG_VM86) += vm86_32.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + +obj-$(CONFIG_HPET_TIMER) += hpet.o + +obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o +obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o +obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o + +obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o + +ifdef CONFIG_INPUT_PCSPKR +obj-y += pcspeaker.o endif -# Workaround to delete .lds files with make clean -# The problem is that we do not enter Makefile_32 with make clean. -clean-files := vsyscall*.lds vsyscall*.so +obj-$(CONFIG_SCx200) += scx200_32.o + +### +# 64 bit specific files +ifeq ($(CONFIG_X86_64),y) + obj-y += genapic_64.o genapic_flat_64.o + obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o + obj-$(CONFIG_AUDIT) += audit_64.o + obj-$(CONFIG_PM) += suspend_64.o + obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o + + obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o + obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o + obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o +endif diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 deleted file mode 100644 index a7bc93c2766..00000000000 --- a/arch/x86/kernel/Makefile_32 +++ /dev/null @@ -1,88 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head_32.o init_task.o vmlinux.lds -CPPFLAGS_vmlinux.lds += -Ui386 - -obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ - ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ - pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ - quirks.o i8237.o topology.o alternative.o i8253.o tsc_32.o - -obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-y += cpu/ -obj-y += acpi/ -obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o -obj-$(CONFIG_MCA) += mca_32.o -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_PCI) += early-quirks.o -obj-$(CONFIG_APM) += apm_32.o -obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o -obj-$(CONFIG_SMP) += smpcommon_32.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o -obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o -obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o -obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o -obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o -obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o -obj-$(CONFIG_KPROBES) += kprobes_32.o -obj-$(CONFIG_MODULES) += module_32.o -obj-y += sysenter_32.o vsyscall_32.o -obj-$(CONFIG_ACPI_SRAT) += srat_32.o -obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -obj-$(CONFIG_VM86) += vm86_32.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_HPET_TIMER) += hpet.o -obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o - -obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o -obj-$(CONFIG_PARAVIRT) += paravirt_32.o -obj-y += pcspeaker.o - -obj-$(CONFIG_SCx200) += scx200_32.o - -# vsyscall_32.o contains the vsyscall DSO images as __initdata. -# We must build both images before we can assemble it. -# Note: kbuild does not track this dependency due to usage of .incbin -$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so -targets += $(foreach F,int80 sysenter,vsyscall-$F_32.o vsyscall-$F_32.so) -targets += vsyscall-note_32.o vsyscall_32.lds - -# The DSO images are built using a special linker script. -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ - -export CPPFLAGS_vsyscall_32.lds += -P -C -Ui386 - -vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) -SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags) -SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags) - -$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \ -$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \ - $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE - $(call if_changed,syscall) - -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. -extra-y += vsyscall-syms.o -$(obj)/built-in.o: $(obj)/vsyscall-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o - -SYSCFLAGS_vsyscall-syms.o = -r -$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \ - $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE - $(call if_changed,syscall) - - diff --git a/arch/x86/kernel/Makefile_64 b/arch/x86/kernel/Makefile_64 deleted file mode 100644 index 5a88890d8ee..00000000000 --- a/arch/x86/kernel/Makefile_64 +++ /dev/null @@ -1,45 +0,0 @@ -# -# Makefile for the linux kernel. -# - -extra-y := head_64.o head64.o init_task.o vmlinux.lds -CPPFLAGS_vmlinux.lds += -Ux86_64 -EXTRA_AFLAGS := -traditional - -obj-y := process_64.o signal_64.o entry_64.o traps_64.o irq_64.o \ - ptrace_64.o time_64.o ioport_64.o ldt_64.o setup_64.o i8259_64.o sys_x86_64.o \ - x8664_ksyms_64.o i387_64.o syscall_64.o vsyscall_64.o \ - setup64.o bootflag.o e820_64.o reboot_64.o quirks.o i8237.o \ - pci-dma_64.o pci-nommu_64.o alternative.o hpet.o tsc_64.o bugs_64.o \ - i8253.o - -obj-$(CONFIG_STACKTRACE) += stacktrace.o -obj-y += cpu/ -obj-y += acpi/ -obj-$(CONFIG_X86_MSR) += msr.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_SMP) += smp_64.o smpboot_64.o trampoline_64.o tsc_sync.o -obj-y += apic_64.o nmi_64.o -obj-y += io_apic_64.o mpparse_64.o genapic_64.o genapic_flat_64.o -obj-$(CONFIG_KEXEC) += machine_kexec_64.o relocate_kernel_64.o crash.o -obj-$(CONFIG_CRASH_DUMP) += crash_dump_64.o -obj-$(CONFIG_PM) += suspend_64.o -obj-$(CONFIG_HIBERNATION) += suspend_asm_64.o -obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o -obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o -obj-$(CONFIG_KPROBES) += kprobes_64.o -obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o -obj-$(CONFIG_X86_VSMP) += vsmp_64.o -obj-$(CONFIG_K8_NB) += k8.o -obj-$(CONFIG_AUDIT) += audit_64.o - -obj-$(CONFIG_MODULES) += module_64.o -obj-$(CONFIG_PCI) += early-quirks.o - -obj-y += topology.o -obj-y += pcspeaker.o - -CFLAGS_vsyscall_64.o := $(PROFILING) -g0 diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 1351c3982ee..19d3d6e9d09 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_ACPI) += boot.o -obj-$(CONFIG_ACPI_SLEEP) += sleep_$(BITS).o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o processor.o diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c new file mode 100644 index 00000000000..6bc815cd8cb --- /dev/null +++ b/arch/x86/kernel/acpi/sleep.c @@ -0,0 +1,87 @@ +/* + * sleep.c - x86-specific ACPI sleep support. + * + * Copyright (C) 2001-2003 Patrick Mochel + * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + */ + +#include <linux/acpi.h> +#include <linux/bootmem.h> +#include <linux/dmi.h> +#include <linux/cpumask.h> + +#include <asm/smp.h> + +/* address in low memory of the wakeup routine. */ +unsigned long acpi_wakeup_address = 0; +unsigned long acpi_realmode_flags; +extern char wakeup_start, wakeup_end; + +extern unsigned long acpi_copy_wakeup_routine(unsigned long); + +/** + * acpi_save_state_mem - save kernel state + * + * Create an identity mapped page table and copy the wakeup routine to + * low memory. + */ +int acpi_save_state_mem(void) +{ + if (!acpi_wakeup_address) { + printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n"); + return -ENOMEM; + } + memcpy((void *)acpi_wakeup_address, &wakeup_start, + &wakeup_end - &wakeup_start); + acpi_copy_wakeup_routine(acpi_wakeup_address); + + return 0; +} + +/* + * acpi_restore_state - undo effects of acpi_save_state_mem + */ +void acpi_restore_state_mem(void) +{ +} + + +/** + * acpi_reserve_bootmem - do _very_ early ACPI initialisation + * + * We allocate a page from the first 1MB of memory for the wakeup + * routine for when we come back from a sleep state. The + * runtime allocator allows specification of <16MB pages, but not + * <1MB pages. + */ +void __init acpi_reserve_bootmem(void) +{ + if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) { + printk(KERN_ERR + "ACPI: Wakeup code way too big, S3 disabled.\n"); + return; + } + + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if (!acpi_wakeup_address) + printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); +} + + +static int __init acpi_sleep_setup(char *str) +{ + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "s3_bios", 7) == 0) + acpi_realmode_flags |= 1; + if (strncmp(str, "s3_mode", 7) == 0) + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("acpi_sleep=", acpi_sleep_setup); diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c index 10699489cfe..63fe5525e02 100644 --- a/arch/x86/kernel/acpi/sleep_32.c +++ b/arch/x86/kernel/acpi/sleep_32.c @@ -12,76 +12,6 @@ #include <asm/smp.h> -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long FASTCALL(acpi_copy_wakeup_routine(unsigned long)); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ - if (!acpi_wakeup_address) - return 1; - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. - */ -void __init acpi_reserve_bootmem(void) -{ - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if (!acpi_wakeup_address) - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - /* Ouch, we want to delete this. We already have better version in userspace, in s2ram from suspend.sf.net project */ static __init int reset_videomode_after_s3(const struct dmi_system_id *d) diff --git a/arch/x86/kernel/acpi/sleep_64.c b/arch/x86/kernel/acpi/sleep_64.c deleted file mode 100644 index da42de261ba..00000000000 --- a/arch/x86/kernel/acpi/sleep_64.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * acpi.c - Architecture-Specific Low-Level ACPI Support - * - * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> - * Copyright (C) 2001 Jun Nakajima <jun.nakajima@intel.com> - * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org> - * Copyright (C) 2002 Andi Kleen, SuSE Labs (x86-64 port) - * Copyright (C) 2003 Pavel Machek, SuSE Labs - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/pci.h> -#include <linux/bootmem.h> -#include <linux/acpi.h> -#include <linux/cpumask.h> - -#include <asm/mpspec.h> -#include <asm/io.h> -#include <asm/apic.h> -#include <asm/apicdef.h> -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/pgalloc.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/tlbflush.h> - -/* -------------------------------------------------------------------------- - Low-Level Sleep Support - -------------------------------------------------------------------------- */ - -/* address in low memory of the wakeup routine. */ -unsigned long acpi_wakeup_address = 0; -unsigned long acpi_realmode_flags; -extern char wakeup_start, wakeup_end; - -extern unsigned long acpi_copy_wakeup_routine(unsigned long); - -/** - * acpi_save_state_mem - save kernel state - * - * Create an identity mapped page table and copy the wakeup routine to - * low memory. - */ -int acpi_save_state_mem(void) -{ - memcpy((void *)acpi_wakeup_address, &wakeup_start, - &wakeup_end - &wakeup_start); - acpi_copy_wakeup_routine(acpi_wakeup_address); - - return 0; -} - -/* - * acpi_restore_state - */ -void acpi_restore_state_mem(void) -{ -} - -/** - * acpi_reserve_bootmem - do _very_ early ACPI initialisation - * - * We allocate a page in low memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16M pages, but not - * <1M pages. - */ -void __init acpi_reserve_bootmem(void) -{ - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); - if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) - printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt" - " to suspend\n"); -} - -static int __init acpi_sleep_setup(char *str) -{ - while ((str != NULL) && (*str != '\0')) { - if (strncmp(str, "s3_bios", 7) == 0) - acpi_realmode_flags |= 1; - if (strncmp(str, "s3_mode", 7) == 0) - acpi_realmode_flags |= 2; - if (strncmp(str, "s3_beep", 7) == 0) - acpi_realmode_flags |= 4; - str = strchr(str, ','); - if (str != NULL) - str += strspn(str, ", \t"); - } - return 1; -} - -__setup("acpi_sleep=", acpi_sleep_setup); - diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 1e931aaf2ef..f53e3277f8e 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,4 +1,4 @@ -.text + .section .text.page_aligned #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 5ed3bc5c61d..2e1b9e0d076 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -344,13 +344,13 @@ do_suspend_lowlevel: call save_processor_state movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -360,7 +360,7 @@ do_suspend_lowlevel: movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) movq $.L97, saved_rip(%rip) @@ -391,15 +391,15 @@ do_suspend_lowlevel: movq %rbx, %cr2 movq saved_context_cr0(%rax), %rbx movq %rbx, %cr0 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d6405e0842b..45d79ea890a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -273,6 +273,7 @@ struct smp_alt_module { }; static LIST_HEAD(smp_alt_modules); static DEFINE_SPINLOCK(smp_alt); +static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, @@ -341,12 +342,13 @@ void alternatives_smp_switch(int smp) #ifdef CONFIG_LOCKDEP /* - * A not yet fixed binutils section handling bug prevents - * alternatives-replacement from working reliably, so turn - * it off: + * Older binutils section handling bug prevented + * alternatives-replacement from working reliably. + * + * If this still occurs then you should see a hang + * or crash shortly after this line: */ - printk("lockdep: not fixing up alternatives.\n"); - return; + printk("lockdep: fixing up alternatives.\n"); #endif if (noreplace_smp || smp_alt_once) @@ -354,21 +356,29 @@ void alternatives_smp_switch(int smp) BUG_ON(!smp && (num_online_cpus() > 1)); spin_lock_irqsave(&smp_alt, flags); - if (smp) { + + /* + * Avoid unnecessary switches because it forces JIT based VMs to + * throw away all cached translations, which can be quite costly. + */ + if (smp == smp_mode) { + /* nothing */ + } else if (smp) { printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); - clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); - set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_unlock(mod->locks, mod->locks_end, mod->text, mod->text_end); } + smp_mode = smp; spin_unlock_irqrestore(&smp_alt, flags); } @@ -431,8 +441,9 @@ void __init alternative_instructions(void) if (smp_alt_once) { if (1 == num_possible_cpus()) { printk(KERN_INFO "SMP alternatives: switching to UP code\n"); - set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); - set_bit(X86_FEATURE_UP, cpu_data(0).x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); + set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); + alternatives_smp_unlock(__smp_locks, __smp_locks_end, _text, _etext); } @@ -440,7 +451,10 @@ void __init alternative_instructions(void) alternatives_smp_module_add(NULL, "core kernel", __smp_locks, __smp_locks_end, _text, _etext); - alternatives_smp_switch(0); + + /* Only switch to UP mode if we don't immediately boot others */ + if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + alternatives_smp_switch(0); } #endif apply_paravirt(__parainstructions, __parainstructions_end); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5b6992799c9..608152a2a05 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -1,12 +1,12 @@ -/* +/* * Firmware replacement code. - * + * * Work around broken BIOSes that don't set an aperture or only set the - * aperture in the AGP bridge. - * If all fails map the aperture over some low memory. This is cheaper than - * doing bounce buffering. The memory is lost. This is done at early boot - * because only the bootmem allocator can allocate 32+MB. - * + * aperture in the AGP bridge. + * If all fails map the aperture over some low memory. This is cheaper than + * doing bounce buffering. The memory is lost. This is done at early boot + * because only the bootmem allocator can allocate 32+MB. + * * Copyright 2002 Andi Kleen, SuSE Labs. */ #include <linux/kernel.h> @@ -30,7 +30,7 @@ int gart_iommu_aperture_disabled __initdata = 0; int gart_iommu_aperture_allowed __initdata = 0; int fallback_aper_order __initdata = 1; /* 64MB */ -int fallback_aper_force __initdata = 0; +int fallback_aper_force __initdata = 0; int fix_aperture __initdata = 1; @@ -49,167 +49,270 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) /* This code runs before the PCI subsystem is initialized, so just access the northbridge directly. */ -static u32 __init allocate_aperture(void) +static u32 __init allocate_aperture(void) { u32 aper_size; - void *p; + void *p; - if (fallback_aper_order > 7) - fallback_aper_order = 7; - aper_size = (32 * 1024 * 1024) << fallback_aper_order; + if (fallback_aper_order > 7) + fallback_aper_order = 7; + aper_size = (32 * 1024 * 1024) << fallback_aper_order; - /* - * Aperture has to be naturally aligned. This means an 2GB aperture won't - * have much chance of finding a place in the lower 4GB of memory. - * Unfortunately we cannot move it up because that would make the - * IOMMU useless. + /* + * Aperture has to be naturally aligned. This means a 2GB aperture + * won't have much chance of finding a place in the lower 4GB of + * memory. Unfortunately we cannot move it up because that would + * make the IOMMU useless. */ p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { - printk("Cannot allocate aperture memory hole (%p,%uK)\n", - p, aper_size>>10); + printk(KERN_ERR + "Cannot allocate aperture memory hole (%p,%uK)\n", + p, aper_size>>10); if (p) free_bootmem(__pa(p), aper_size); return 0; } - printk("Mapping aperture over %d KB of RAM @ %lx\n", - aper_size >> 10, __pa(p)); + printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", + aper_size >> 10, __pa(p)); insert_aperture_resource((u32)__pa(p), aper_size); - return (u32)__pa(p); + + return (u32)__pa(p); } static int __init aperture_valid(u64 aper_base, u32 aper_size) -{ - if (!aper_base) - return 0; - if (aper_size < 64*1024*1024) { - printk("Aperture too small (%d MB)\n", aper_size>>20); +{ + if (!aper_base) return 0; - } + if (aper_base + aper_size > 0x100000000UL) { - printk("Aperture beyond 4GB. Ignoring.\n"); - return 0; + printk(KERN_ERR "Aperture beyond 4GB. Ignoring.\n"); + return 0; } if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) { - printk("Aperture pointing to e820 RAM. Ignoring.\n"); - return 0; - } + printk(KERN_ERR "Aperture pointing to e820 RAM. Ignoring.\n"); + return 0; + } + if (aper_size < 64*1024*1024) { + printk(KERN_ERR "Aperture too small (%d MB)\n", aper_size>>20); + return 0; + } + return 1; -} +} /* Find a PCI capability */ -static __u32 __init find_cap(int num, int slot, int func, int cap) -{ - u8 pos; +static __u32 __init find_cap(int num, int slot, int func, int cap) +{ int bytes; - if (!(read_pci_config_16(num,slot,func,PCI_STATUS) & PCI_STATUS_CAP_LIST)) + u8 pos; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) return 0; - pos = read_pci_config_byte(num,slot,func,PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { u8 id; - pos &= ~3; - id = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_ID); + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); if (id == 0xff) break; - if (id == cap) - return pos; - pos = read_pci_config_byte(num,slot,func,pos+PCI_CAP_LIST_NEXT); - } + if (id == cap) + return pos; + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } return 0; -} +} /* Read a standard AGPv3 bridge header */ static __u32 __init read_agp(int num, int slot, int func, int cap, u32 *order) -{ +{ u32 apsize; u32 apsizereg; int nbits; u32 aper_low, aper_hi; u64 aper; - printk("AGP bridge at %02x:%02x:%02x\n", num, slot, func); - apsizereg = read_pci_config_16(num,slot,func, cap + 0x14); + printk(KERN_INFO "AGP bridge at %02x:%02x:%02x\n", num, slot, func); + apsizereg = read_pci_config_16(num, slot, func, cap + 0x14); if (apsizereg == 0xffffffff) { - printk("APSIZE in AGP bridge unreadable\n"); + printk(KERN_ERR "APSIZE in AGP bridge unreadable\n"); return 0; } apsize = apsizereg & 0xfff; /* Some BIOS use weird encodings not in the AGPv3 table. */ - if (apsize & 0xff) - apsize |= 0xf00; + if (apsize & 0xff) + apsize |= 0xf00; nbits = hweight16(apsize); *order = 7 - nbits; if ((int)*order < 0) /* < 32MB */ *order = 0; - - aper_low = read_pci_config(num,slot,func, 0x10); - aper_hi = read_pci_config(num,slot,func,0x14); + + aper_low = read_pci_config(num, slot, func, 0x10); + aper_hi = read_pci_config(num, slot, func, 0x14); aper = (aper_low & ~((1<<22)-1)) | ((u64)aper_hi << 32); - printk("Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", - aper, 32 << *order, apsizereg); + printk(KERN_INFO "Aperture from AGP @ %Lx size %u MB (APSIZE %x)\n", + aper, 32 << *order, apsizereg); if (!aperture_valid(aper, (32*1024*1024) << *order)) - return 0; - return (u32)aper; -} - -/* Look for an AGP bridge. Windows only expects the aperture in the - AGP bridge and some BIOS forget to initialize the Northbridge too. - Work around this here. - - Do an PCI bus scan by hand because we're running before the PCI - subsystem. + return 0; + return (u32)aper; +} - All K8 AGP bridges are AGPv3 compliant, so we can do this scan - generically. It's probably overkill to always scan all slots because - the AGP bridges should be always an own bus on the HT hierarchy, - but do it here for future safety. */ +/* + * Look for an AGP bridge. Windows only expects the aperture in the + * AGP bridge and some BIOS forget to initialize the Northbridge too. + * Work around this here. + * + * Do an PCI bus scan by hand because we're running before the PCI + * subsystem. + * + * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * generically. It's probably overkill to always scan all slots because + * the AGP bridges should be always an own bus on the HT hierarchy, + * but do it here for future safety. + */ static __u32 __init search_agp_bridge(u32 *order, int *valid_agp) { int num, slot, func; /* Poor man's PCI discovery */ - for (num = 0; num < 256; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { + for (num = 0; num < 256; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { u32 class, cap; u8 type; - class = read_pci_config(num,slot,func, + class = read_pci_config(num, slot, func, PCI_CLASS_REVISION); if (class == 0xffffffff) - break; - - switch (class >> 16) { + break; + + switch (class >> 16) { case PCI_CLASS_BRIDGE_HOST: case PCI_CLASS_BRIDGE_OTHER: /* needed? */ /* AGP bridge? */ - cap = find_cap(num,slot,func,PCI_CAP_ID_AGP); + cap = find_cap(num, slot, func, + PCI_CAP_ID_AGP); if (!cap) break; - *valid_agp = 1; - return read_agp(num,slot,func,cap,order); - } - + *valid_agp = 1; + return read_agp(num, slot, func, cap, + order); + } + /* No multi-function device? */ - type = read_pci_config_byte(num,slot,func, + type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) break; - } - } + } + } } - printk("No AGP bridge found\n"); + printk(KERN_INFO "No AGP bridge found\n"); + return 0; } +static int gart_fix_e820 __initdata = 1; + +static int __init parse_gart_mem(char *p) +{ + if (!p) + return -EINVAL; + + if (!strncmp(p, "off", 3)) + gart_fix_e820 = 0; + else if (!strncmp(p, "on", 2)) + gart_fix_e820 = 1; + + return 0; +} +early_param("gart_fix_e820", parse_gart_mem); + +void __init early_gart_iommu_check(void) +{ + /* + * in case it is enabled before, esp for kexec/kdump, + * previous kernel already enable that. memset called + * by allocate_aperture/__alloc_bootmem_nopanic cause restart. + * or second kernel have different position for GART hole. and new + * kernel could use hole as RAM that is still used by GART set by + * first kernel + * or BIOS forget to put that in reserved. + * try to update e820 to make that region as reserved. + */ + int fix, num; + u32 ctl; + u32 aper_size = 0, aper_order = 0, last_aper_order = 0; + u64 aper_base = 0, last_aper_base = 0; + int aper_enabled = 0, last_aper_enabled = 0; + + if (!early_pci_allowed()) + return; + + fix = 0; + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + ctl = read_pci_config(0, num, 3, 0x90); + aper_enabled = ctl & 1; + aper_order = (ctl >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; + aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; + aper_base <<= 25; + + if ((last_aper_order && aper_order != last_aper_order) || + (last_aper_base && aper_base != last_aper_base) || + (last_aper_enabled && aper_enabled != last_aper_enabled)) { + fix = 1; + break; + } + last_aper_order = aper_order; + last_aper_base = aper_base; + last_aper_enabled = aper_enabled; + } + + if (!fix && !aper_enabled) + return; + + if (!aper_base || !aper_size || aper_base + aper_size > 0x100000000UL) + fix = 1; + + if (gart_fix_e820 && !fix && aper_enabled) { + if (e820_any_mapped(aper_base, aper_base + aper_size, + E820_RAM)) { + /* reserved it, so we can resuse it in second kernel */ + printk(KERN_INFO "update e820 for GART\n"); + add_memory_region(aper_base, aper_size, E820_RESERVED); + update_e820(); + } + return; + } + + /* different nodes have different setting, disable them all at first*/ + for (num = 24; num < 32; num++) { + if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) + continue; + + ctl = read_pci_config(0, num, 3, 0x90); + ctl &= ~1; + write_pci_config(0, num, 3, 0x90, ctl); + } + +} + void __init gart_iommu_hole_init(void) -{ - int fix, num; +{ u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; u64 aper_base, last_aper_base = 0; - int valid_agp = 0; + int fix, num, valid_agp = 0; + int node; if (gart_iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) @@ -218,24 +321,26 @@ void __init gart_iommu_hole_init(void) printk(KERN_INFO "Checking aperture...\n"); fix = 0; - for (num = 24; num < 32; num++) { + node = 0; + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) continue; iommu_detected = 1; gart_iommu_aperture = 1; - aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; - aper_size = (32 * 1024 * 1024) << aper_order; + aper_order = (read_pci_config(0, num, 3, 0x90) >> 1) & 7; + aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(0, num, 3, 0x94) & 0x7fff; - aper_base <<= 25; + aper_base <<= 25; + + printk(KERN_INFO "Node %d: aperture @ %Lx size %u MB\n", + node, aper_base, aper_size >> 20); + node++; - printk("CPU %d: aperture @ %Lx size %u MB\n", num-24, - aper_base, aper_size>>20); - if (!aperture_valid(aper_base, aper_size)) { - fix = 1; - break; + fix = 1; + break; } if ((last_aper_order && aper_order != last_aper_order) || @@ -245,55 +350,64 @@ void __init gart_iommu_hole_init(void) } last_aper_order = aper_order; last_aper_base = aper_base; - } + } if (!fix && !fallback_aper_force) { if (last_aper_base) { unsigned long n = (32 * 1024 * 1024) << last_aper_order; + insert_aperture_resource((u32)last_aper_base, n); } - return; + return; } if (!fallback_aper_force) - aper_alloc = search_agp_bridge(&aper_order, &valid_agp); - - if (aper_alloc) { + aper_alloc = search_agp_bridge(&aper_order, &valid_agp); + + if (aper_alloc) { /* Got the aperture from the AGP bridge */ } else if (swiotlb && !valid_agp) { /* Do nothing */ } else if ((!no_iommu && end_pfn > MAX_DMA32_PFN) || force_iommu || valid_agp || - fallback_aper_force) { - printk("Your BIOS doesn't leave a aperture memory hole\n"); - printk("Please enable the IOMMU option in the BIOS setup\n"); - printk("This costs you %d MB of RAM\n", - 32 << fallback_aper_order); + fallback_aper_force) { + printk(KERN_ERR + "Your BIOS doesn't leave a aperture memory hole\n"); + printk(KERN_ERR + "Please enable the IOMMU option in the BIOS setup\n"); + printk(KERN_ERR + "This costs you %d MB of RAM\n", + 32 << fallback_aper_order); aper_order = fallback_aper_order; aper_alloc = allocate_aperture(); - if (!aper_alloc) { - /* Could disable AGP and IOMMU here, but it's probably - not worth it. But the later users cannot deal with - bad apertures and turning on the aperture over memory - causes very strange problems, so it's better to - panic early. */ + if (!aper_alloc) { + /* + * Could disable AGP and IOMMU here, but it's + * probably not worth it. But the later users + * cannot deal with bad apertures and turning + * on the aperture over memory causes very + * strange problems, so it's better to panic + * early. + */ panic("Not enough memory for aperture"); } - } else { - return; - } + } else { + return; + } /* Fix up the north bridges */ - for (num = 24; num < 32; num++) { + for (num = 24; num < 32; num++) { if (!early_is_k8_nb(read_pci_config(0, num, 3, 0x00))) - continue; - - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(0, num, 3, 0x90, aper_order<<1); - write_pci_config(0, num, 3, 0x94, aper_alloc>>25); - } -} + continue; + + /* + * Don't enable translation yet. That is done later. + * Assume this BIOS didn't initialise the GART so + * just overwrite all previous bits + */ + write_pci_config(0, num, 3, 0x90, aper_order<<1); + write_pci_config(0, num, 3, 0x94, aper_alloc>>25); + } +} diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a56c782653b..35a568ea840 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -43,12 +43,10 @@ #include <mach_apicdef.h> #include <mach_ipi.h> -#include "io_ports.h" - /* * Sanity check */ -#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) # error SPURIOUS_APIC_VECTOR definition error #endif @@ -57,7 +55,7 @@ * * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; +static int enable_local_apic __initdata; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; @@ -101,6 +99,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); /* Local APIC was disabled by the BIOS and enabled by the kernel */ static int enabled_via_apicbase; +static unsigned long apic_phys; + /* * Get the LAPIC version */ @@ -110,7 +110,7 @@ static inline int lapic_get_version(void) } /* - * Check, if the APIC is integrated or a seperate chip + * Check, if the APIC is integrated or a separate chip */ static inline int lapic_is_integrated(void) { @@ -135,9 +135,9 @@ void apic_wait_icr_idle(void) cpu_relax(); } -unsigned long safe_apic_wait_icr_idle(void) +u32 safe_apic_wait_icr_idle(void) { - unsigned long send_status; + u32 send_status; int timeout; timeout = 0; @@ -154,7 +154,7 @@ unsigned long safe_apic_wait_icr_idle(void) /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ -void enable_NMI_through_LVT0 (void * dummy) +void __cpuinit enable_NMI_through_LVT0(void) { unsigned int v = APIC_DM_NMI; @@ -379,8 +379,10 @@ void __init setup_boot_APIC_clock(void) */ if (local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; setup_APIC_timer(); + } return; } @@ -434,7 +436,7 @@ void __init setup_boot_APIC_clock(void) "with PM Timer: %ldms instead of 100ms\n", (long)res); /* Correct the lapic counter value */ - res = (((u64) delta ) * pm_100ms); + res = (((u64) delta) * pm_100ms); do_div(res, deltapm); printk(KERN_INFO "APIC delta adjusted to PM-Timer: " "%lu (%ld)\n", (unsigned long) res, delta); @@ -472,6 +474,19 @@ void __init setup_boot_APIC_clock(void) local_apic_timer_verify_ok = 1; + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -563,6 +578,9 @@ static void local_apic_timer_interrupt(void) return; } + /* + * the NMI deadlock-detector uses this. + */ per_cpu(irq_stat, cpu).apic_timer_irqs++; evt->event_handler(evt); @@ -576,8 +594,7 @@ static void local_apic_timer_interrupt(void) * [ if a single-CPU system runs an SMP kernel then we call the local * interrupt as well. Thus we cannot inline the local irq ... ] */ - -void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +void smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -616,9 +633,14 @@ int setup_profiling_timer(unsigned int multiplier) */ void clear_local_APIC(void) { - int maxlvt = lapic_get_maxlvt(); - unsigned long v; + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -976,7 +998,8 @@ void __cpuinit setup_local_APIC(void) value |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT1, value); - if (integrated && !esr_disable) { /* !82489DX */ + if (integrated && !esr_disable) { + /* !82489DX */ maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -1020,7 +1043,7 @@ void __cpuinit setup_local_APIC(void) /* * Detect and initialize APIC */ -static int __init detect_init_APIC (void) +static int __init detect_init_APIC(void) { u32 h, l, features; @@ -1077,7 +1100,7 @@ static int __init detect_init_APIC (void) printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } - set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; /* The BIOS may have set up the APIC at some other address */ @@ -1104,8 +1127,6 @@ no_apic: */ void __init init_apic_mappings(void) { - unsigned long apic_phys; - /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1164,10 +1185,10 @@ fake_ioapic_page: * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int __init APIC_init_uniprocessor (void) +int __init APIC_init_uniprocessor(void) { if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); if (!smp_found_config && !cpu_has_apic) return -1; @@ -1179,7 +1200,7 @@ int __init APIC_init_uniprocessor (void) APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } @@ -1210,50 +1231,6 @@ int __init APIC_init_uniprocessor (void) } /* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); - -static int __init parse_nolapic(char *arg) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -early_param("nolapic", parse_nolapic); - -static int __init parse_disable_lapic_timer(char *arg) -{ - local_apic_timer_disabled = 1; - return 0; -} -early_param("nolapic_timer", parse_disable_lapic_timer); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init apic_set_verbosity(char *str) -{ - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} - -__setup("apic=", apic_set_verbosity); - - -/* * Local APIC interrupts */ @@ -1306,7 +1283,7 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); } @@ -1393,7 +1370,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value = apic_read(APIC_LVT0); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); apic_write_around(APIC_LVT0, value); @@ -1565,3 +1542,46 @@ device_initcall(init_lapic_sysfs); static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + enable_local_apic = -1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return 0; +} +early_param("nolapic", parse_nolapic); + +static int __init parse_disable_lapic_timer(char *arg) +{ + local_apic_timer_disabled = 1; + return 0; +} +early_param("nolapic_timer", parse_disable_lapic_timer); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; +} +__setup("apic=", apic_set_verbosity); + diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index fa6cdee6d30..d8d03e09dea 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -23,32 +23,37 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> -#include <linux/module.h> #include <linux/ioport.h> #include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> +#include <linux/module.h> #include <asm/atomic.h> #include <asm/smp.h> #include <asm/mtrr.h> #include <asm/mpspec.h> +#include <asm/hpet.h> #include <asm/pgalloc.h> #include <asm/mach_apic.h> #include <asm/nmi.h> #include <asm/idle.h> #include <asm/proto.h> #include <asm/timex.h> -#include <asm/hpet.h> #include <asm/apic.h> -int apic_verbosity; int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; +int disable_apic; -/* Local APIC timer works in C2? */ +/* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); -static struct resource *ioapic_resources; +/* + * Debug level, exported for io_apic.c + */ +int apic_verbosity; + static struct resource lapic_resource = { .name = "Local APIC", .flags = IORESOURCE_MEM | IORESOURCE_BUSY, @@ -60,10 +65,8 @@ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); - static void lapic_timer_broadcast(cpumask_t mask); - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen); +static void apic_pm_activate(void); static struct clock_event_device lapic_clockevent = { .name = "lapic", @@ -78,6 +81,150 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); +static unsigned long apic_phys; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return 1; +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 safe_apic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void __cpuinit enable_NMI_through_LVT0(void) +{ + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; + apic_write(APIC_LVT0, v); +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v, maxlvt; + + v = apic_read(APIC_LVR); + maxlvt = GET_APIC_MAXLVT(v); + return maxlvt; +} + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ + +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks); +} + +/* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} + +/* + * Program the next event, relative to now + */ static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -85,6 +232,9 @@ static int lapic_next_event(unsigned long delta, return 0; } +/* + * Setup the lapic timer in periodic or oneshot mode + */ static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt) { @@ -127,75 +277,261 @@ static void lapic_timer_broadcast(cpumask_t mask) #endif } -static void apic_pm_activate(void); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); -void apic_wait_icr_idle(void) + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static void __init calibrate_APIC_clock(void) { - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! + */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = result / HZ; } -unsigned int safe_apic_wait_icr_idle(void) +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) { - unsigned int send_status; - int timeout; + /* + * The local apic timer can be disabled via the kernel commandline. + * Register the lapic timer as a dummy clock event source on SMP + * systems, so the broadcast mechanism is used. On UP systems simply + * ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); + printk(KERN_INFO "Using local APIC timer interrupts.\n"); + calibrate_APIC_clock(); - return send_status; + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); + + setup_APIC_timer(); } -void enable_NMI_through_LVT0 (void * dummy) +/* + * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the + * C1E flag only in the secondary CPU, so when we detect the wreckage + * we already have enabled the boot CPU local apic timer. Check, if + * disable_apic_timer is set and the DUMMY flag is cleared. If yes, + * set the DUMMY flag again and force the broadcast mode in the + * clockevents layer. + */ +void __cpuinit check_boot_apic_timer_broadcast(void) { - unsigned int v; + if (!disable_apic_timer || + (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) + return; - /* unmask and set to NMI */ - v = APIC_DM_NMI; - apic_write(APIC_LVT0, v); + printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); + lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; + + local_irq_enable(); + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); + local_irq_disable(); } -int get_maxlvt(void) +void __cpuinit setup_secondary_APIC_clock(void) { - unsigned int v, maxlvt; + check_boot_apic_timer_broadcast(); + setup_APIC_timer(); +} - v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ + add_pda(apic_timer_irqs, 1); + + evt->event_handler(evt); } /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] */ -void ack_bad_irq(unsigned int irq) +void smp_apic_timer_interrupt(struct pt_regs *regs) { - printk("unexpected IRQ trap at vector %02x\n", irq); + struct pt_regs *old_regs = set_irq_regs(regs); + /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. */ - if (!disable_apic) - ack_APIC_irq(); + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + exit_idle(); + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; } + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ void clear_local_APIC(void) { - int maxlvt; - unsigned int v; + int maxlvt = lapic_get_maxlvt(); + u32 v; - maxlvt = get_maxlvt(); + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + maxlvt = lapic_get_maxlvt(); /* * Masking an LVT entry can trigger a local APIC error * if the vector is zero. Mask LVTERR first to prevent this. @@ -233,45 +569,9 @@ void clear_local_APIC(void) apic_read(APIC_ESR); } -void disconnect_bsp_APIC(int virt_wire_setup) -{ - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - +/** + * disable_local_APIC - clear and disable the local APIC + */ void disable_local_APIC(void) { unsigned int value; @@ -333,7 +633,7 @@ int __init verify_local_APIC(void) reg1 = GET_APIC_VERSION(reg0); if (reg1 == 0x00 || reg1 == 0xff) return 0; - reg1 = get_maxlvt(); + reg1 = lapic_get_maxlvt(); if (reg1 < 0x02 || reg1 == 0xff) return 0; @@ -355,18 +655,20 @@ int __init verify_local_APIC(void) * compatibility mode, but most boxes are anymore. */ reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG,"Getting LVT0: %x\n", reg0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); reg1 = apic_read(APIC_LVT1); apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); return 1; } +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ void __init sync_Arb_IDs(void) { /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (ver >= 0x14) /* P4 or higher */ + if (modern_apic()) return; /* @@ -418,9 +720,12 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } -void __cpuinit setup_local_APIC (void) +/** + * setup_local_APIC - setup the local APIC + */ +void __cpuinit setup_local_APIC(void) { - unsigned int value, maxlvt; + unsigned int value; int i, j; value = apic_read(APIC_LVR); @@ -516,30 +821,217 @@ void __cpuinit setup_local_APIC (void) else value = APIC_DM_NMI | APIC_LVT_MASKED; apic_write(APIC_LVT1, value); +} - { - unsigned oldvalue; - maxlvt = get_maxlvt(); - oldvalue = apic_read(APIC_ESR); - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, - "ESR value after enabling vector: %08x, after %08x\n", - oldvalue, value); - } +void __cpuinit lapic_setup_esr(void) +{ + unsigned maxlvt = lapic_get_maxlvt(); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); +} +void __cpuinit end_local_APIC_setup(void) +{ + lapic_setup_esr(); nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_id = 0; + return 0; +} + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor(void) +{ + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } + + verify_local_APIC(); + + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); + + setup_local_APIC(); + + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + + end_local_APIC_setup(); + + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); + else + nr_ioapics = 0; + setup_boot_APIC_clock(); + check_nmi_watchdog(); + return 0; +} + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +asmlinkage void smp_spurious_interrupt(void) +{ + unsigned int v; + exit_idle(); + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + add_pda(irq_spurious_count, 1); + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +asmlinkage void smp_error_interrupt(void) +{ + unsigned int v, v1; + + exit_idle(); + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +void disconnect_bsp_APIC(int virt_wire_setup) +{ + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +/* + * Power management + */ #ifdef CONFIG_PM static struct { @@ -571,7 +1063,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); @@ -605,7 +1097,7 @@ static int lapic_resume(struct sys_device *dev) if (!apic_pm_state.active) return 0; - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); @@ -645,8 +1137,8 @@ static struct sysdev_class lapic_sysclass = { }; static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, + .id = 0, + .cls = &lapic_sysclass, }; static void __cpuinit apic_pm_activate(void) @@ -657,9 +1149,11 @@ static void __cpuinit apic_pm_activate(void) static int __init init_lapic_sysfs(void) { int error; + if (!cpu_has_apic) return 0; /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + error = sysdev_class_register(&lapic_sysclass); if (!error) error = sysdev_register(&device_lapic); @@ -673,423 +1167,6 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ -static int __init apic_set_verbosity(char *str) -{ - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. - * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) - */ - -static int __init detect_init_APIC (void) -{ - if (!cpu_has_apic) { - printk(KERN_INFO "No local APIC present\n"); - return -1; - } - - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - boot_cpu_id = 0; - return 0; -} - -#ifdef CONFIG_X86_IO_APIC -static struct resource * __init ioapic_setup_resources(void) -{ -#define IOAPIC_RESOURCE_NAME_SIZE 11 - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - memset(mem, 0, n); - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} - -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk("IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif - -void __init init_apic_mappings(void) -{ - unsigned long apic_phys; - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID)); - - { - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mpc_apicaddr; - } else { - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } - } - } -} - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks); -} - -static void setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static void __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = result / HZ; -} - -void __init setup_boot_APIC_clock (void) -{ - /* - * The local apic timer can be disabled via the kernel commandline. - * Register the lapic timer as a dummy clock event source on SMP - * systems, so the broadcast mechanism is used. On UP systems simply - * ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=1!\n"); - - setup_APIC_timer(); -} - -/* - * AMD C1E enabled CPUs have a real nasty problem: Some BIOSes set the - * C1E flag only in the secondary CPU, so when we detect the wreckage - * we already have enabled the boot CPU local apic timer. Check, if - * disable_apic_timer is set and the DUMMY flag is cleared. If yes, - * set the DUMMY flag again and force the broadcast mode in the - * clockevents layer. - */ -void __cpuinit check_boot_apic_timer_broadcast(void) -{ - if (!disable_apic_timer || - (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) - return; - - printk(KERN_INFO "AMD C1E detected late. Force timer broadcast.\n"); - lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY; - - local_irq_enable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id); - local_irq_disable(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - check_boot_apic_timer_broadcast(); - setup_APIC_timer(); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) -{ - unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ - -void smp_local_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ - add_pda(apic_timer_irqs, 1); - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - exit_idle(); - irq_enter(); - smp_local_timer_interrupt(); - irq_exit(); - set_irq_regs(old_regs); -} - /* * apic_is_clustered_box() -- Check if we can expect good TSC * @@ -1103,21 +1180,34 @@ __cpuinit int apic_is_clustered_box(void) { int i, clusters, zeros; unsigned id; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); bitmap_zero(clustermap, NUM_APIC_CLUSTERS); for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + if (id != BAD_APICID) __set_bit(APIC_CLUSTERID(id), clustermap); } /* Problem: Partially populated chassis may not have CPUs in some of * the APIC clusters they have been allocated. Only present CPUs have - * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since - * clusters are allocated sequentially, count zeros only if they are - * bounded by ones. + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. */ clusters = 0; zeros = 0; @@ -1138,96 +1228,33 @@ __cpuinit int apic_is_clustered_box(void) } /* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -asmlinkage void smp_spurious_interrupt(void) -{ - unsigned int v; - exit_idle(); - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - - add_pda(irq_spurious_count, 1); - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture + * APIC command line parameters */ - -asmlinkage void smp_error_interrupt(void) -{ - unsigned int v, v1; - - exit_idle(); - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -int disable_apic; - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int __init APIC_init_uniprocessor (void) +static int __init apic_set_verbosity(char *str) { - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; + if (str == NULL) { + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", str); + return -EINVAL; } - verify_local_APIC(); - - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id); - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id)); - - setup_local_APIC(); - - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); - else - nr_ioapics = 0; - setup_boot_APIC_clock(); - check_nmi_watchdog(); return 0; } +early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index af045ca0f65..d4438ef296d 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -227,6 +227,7 @@ #include <linux/dmi.h> #include <linux/suspend.h> #include <linux/kthread.h> +#include <linux/jiffies.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -235,8 +236,6 @@ #include <asm/paravirt.h> #include <asm/reboot.h> -#include "io_ports.h" - #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) extern int (*console_blank_hook)(int); #endif @@ -324,7 +323,7 @@ extern int (*console_blank_hook)(int); /* * Ignore suspend events for this amount of time after a resume */ -#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) /* * Maximum number of events stored @@ -336,7 +335,7 @@ extern int (*console_blank_hook)(int); */ struct apm_user { int magic; - struct apm_user * next; + struct apm_user *next; unsigned int suser: 1; unsigned int writer: 1; unsigned int reader: 1; @@ -372,44 +371,44 @@ struct apm_user { static struct { unsigned long offset; unsigned short segment; -} apm_bios_entry; -static int clock_slowed; -static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; -static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; -static int set_pm_idle; -static int suspends_pending; -static int standbys_pending; -static int ignore_sys_suspend; -static int ignore_normal_resume; -static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; - -static int debug __read_mostly; -static int smp __read_mostly; -static int apm_disabled = -1; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int set_pm_idle; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static int debug __read_mostly; +static int smp __read_mostly; +static int apm_disabled = -1; #ifdef CONFIG_SMP -static int power_off; +static int power_off; #else -static int power_off = 1; +static int power_off = 1; #endif #ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; +static int realmode_power_off = 1; #else -static int realmode_power_off; +static int realmode_power_off; #endif #ifdef CONFIG_APM_ALLOW_INTS -static int allow_ints = 1; +static int allow_ints = 1; #else -static int allow_ints; +static int allow_ints; #endif -static int broken_psr; +static int broken_psr; static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); -static struct apm_user * user_list; +static struct apm_user *user_list; static DEFINE_SPINLOCK(user_list_lock); -static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; +static const struct desc_struct bad_bios_desc = { { { 0, 0x00409200 } } }; -static const char driver_version[] = "1.16ac"; /* no spaces */ +static const char driver_version[] = "1.16ac"; /* no spaces */ static struct task_struct *kapmd_task; @@ -417,7 +416,7 @@ static struct task_struct *kapmd_task; * APM event names taken from the APM 1.2 specification. These are * the message codes that the BIOS uses to tell us about events */ -static const char * const apm_event_name[] = { +static const char * const apm_event_name[] = { "system standby", "system suspend", "normal resume", @@ -435,14 +434,14 @@ static const char * const apm_event_name[] = { typedef struct lookup_t { int key; - char * msg; + char *msg; } lookup_t; /* * The BIOS returns a set of standard error codes in AX when the * carry flag is set. */ - + static const lookup_t error_table[] = { /* N/A { APM_SUCCESS, "Operation succeeded" }, */ { APM_DISABLED, "Power management disabled" }, @@ -472,24 +471,25 @@ static const lookup_t error_table[] = { * Write a meaningful log entry to the kernel log in the event of * an APM error. */ - + static void apm_error(char *str, int err) { - int i; + int i; for (i = 0; i < ERROR_COUNT; i++) - if (error_table[i].key == err) break; + if (error_table[i].key == err) + break; if (i < ERROR_COUNT) printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); else printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", - str, err); + str, err); } /* * Lock APM functionality to physical CPU 0 */ - + #ifdef CONFIG_SMP static cpumask_t apm_save_cpus(void) @@ -511,7 +511,7 @@ static inline void apm_restore_cpus(cpumask_t mask) /* * No CPU lockdown needed on a uniprocessor */ - + #define apm_save_cpus() (current->cpus_allowed) #define apm_restore_cpus(x) (void)(x) @@ -590,7 +590,7 @@ static inline void apm_irq_restore(unsigned long flags) * code is returned in AH (bits 8-15 of eax) and this function * returns non-zero. */ - + static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) { @@ -602,7 +602,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, struct desc_struct *gdt; cpus = apm_save_cpus(); - + cpu = get_cpu(); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; @@ -616,7 +616,7 @@ static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, gdt[0x40 / 8] = save_desc_40; put_cpu(); apm_restore_cpus(cpus); - + return *eax & 0xff; } @@ -645,7 +645,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) struct desc_struct *gdt; cpus = apm_save_cpus(); - + cpu = get_cpu(); gdt = get_cpu_gdt_table(cpu); save_desc_40 = gdt[0x40 / 8]; @@ -680,7 +680,7 @@ static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) static int apm_driver_version(u_short *val) { - u32 eax; + u32 eax; if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) return (eax >> 8) & 0xff; @@ -704,16 +704,16 @@ static int apm_driver_version(u_short *val) * that APM 1.2 is in use. If no messges are pending the value 0x80 * is returned (No power management events pending). */ - + static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) { - u32 eax; - u32 ebx; - u32 ecx; - u32 dummy; + u32 eax; + u32 ebx; + u32 ecx; + u32 dummy; if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, - &dummy, &dummy)) + &dummy, &dummy)) return (eax >> 8) & 0xff; *event = ebx; if (apm_info.connection_version < 0x0102) @@ -736,10 +736,10 @@ static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) * The state holds the state to transition to, which may in fact * be an acceptance of a BIOS requested state change. */ - + static int set_power_state(u_short what, u_short state) { - u32 eax; + u32 eax; if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) return (eax >> 8) & 0xff; @@ -752,7 +752,7 @@ static int set_power_state(u_short what, u_short state) * * Transition the entire system into a new APM power state. */ - + static int set_system_power_state(u_short state) { return set_power_state(APM_DEVICE_ALL, state); @@ -766,13 +766,13 @@ static int set_system_power_state(u_short state) * to handle the idle request. On a success the function returns 1 * if the BIOS did clock slowing or 0 otherwise. */ - + static int apm_do_idle(void) { - u32 eax; - u8 ret = 0; - int idled = 0; - int polling; + u32 eax; + u8 ret = 0; + int idled = 0; + int polling; polling = !!(current_thread_info()->status & TS_POLLING); if (polling) { @@ -799,10 +799,9 @@ static int apm_do_idle(void) /* This always fails on some SMP boards running UP kernels. * Only report the failure the first 5 times. */ - if (++t < 5) - { + if (++t < 5) { printk(KERN_DEBUG "apm_do_idle failed (%d)\n", - (eax >> 8) & 0xff); + (eax >> 8) & 0xff); t = jiffies; } return -1; @@ -814,15 +813,15 @@ static int apm_do_idle(void) /** * apm_do_busy - inform the BIOS the CPU is busy * - * Request that the BIOS brings the CPU back to full performance. + * Request that the BIOS brings the CPU back to full performance. */ - + static void apm_do_busy(void) { - u32 dummy; + u32 dummy; if (clock_slowed || ALWAYS_CALL_BUSY) { - (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); + (void)apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); clock_slowed = 0; } } @@ -833,15 +832,15 @@ static void apm_do_busy(void) * power management - we probably want * to conserve power. */ -#define IDLE_CALC_LIMIT (HZ * 100) -#define IDLE_LEAKY_MAX 16 +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 static void (*original_pm_idle)(void) __read_mostly; /** * apm_cpu_idle - cpu idling for APM capable Linux * - * This is the idling function the kernel executes when APM is available. It + * This is the idling function the kernel executes when APM is available. It * tries to do BIOS powermanagement based on the average system idle time. * Furthermore it calls the system default idle routine. */ @@ -882,7 +881,8 @@ recalc: t = jiffies; switch (apm_do_idle()) { - case 0: apm_idle_done = 1; + case 0: + apm_idle_done = 1; if (t != jiffies) { if (bucket) { bucket = IDLE_LEAKY_MAX; @@ -893,7 +893,8 @@ recalc: continue; } break; - case 1: apm_idle_done = 1; + case 1: + apm_idle_done = 1; break; default: /* BIOS refused */ break; @@ -921,10 +922,10 @@ recalc: * the SMP call on CPU0 as some systems will only honour this call * on their first cpu. */ - + static void apm_power_off(void) { - unsigned char po_bios_call[] = { + unsigned char po_bios_call[] = { 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ 0x8e, 0xd0, /* movw ax,ss */ 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ @@ -935,13 +936,12 @@ static void apm_power_off(void) }; /* Some bioses don't like being called from CPU != 0 */ - if (apm_info.realmode_power_off) - { + if (apm_info.realmode_power_off) { (void)apm_save_cpus(); machine_real_restart(po_bios_call, sizeof(po_bios_call)); + } else { + (void)set_system_power_state(APM_STATE_OFF); } - else - (void) set_system_power_state(APM_STATE_OFF); } #ifdef CONFIG_APM_DO_ENABLE @@ -950,17 +950,17 @@ static void apm_power_off(void) * apm_enable_power_management - enable BIOS APM power management * @enable: enable yes/no * - * Enable or disable the APM BIOS power services. + * Enable or disable the APM BIOS power services. */ - + static int apm_enable_power_management(int enable) { - u32 eax; + u32 eax; if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) return APM_NOT_ENGAGED; if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, - enable, &eax)) + enable, &eax)) return (eax >> 8) & 0xff; if (enable) apm_info.bios.flags &= ~APM_BIOS_DISABLED; @@ -983,19 +983,19 @@ static int apm_enable_power_management(int enable) * if reported is a lifetime in secodnds/minutes at current powwer * consumption. */ - + static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 dummy; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 dummy; if (apm_info.get_power_status_broken) return APM_32_UNSUPPORTED; if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, - &eax, &ebx, &ecx, &edx, &dummy)) + &eax, &ebx, &ecx, &edx, &dummy)) return (eax >> 8) & 0xff; *status = ebx; *bat = ecx; @@ -1011,11 +1011,11 @@ static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) static int apm_get_battery_status(u_short which, u_short *status, u_short *bat, u_short *life, u_short *nbat) { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; if (apm_info.connection_version < 0x0102) { /* pretend we only have one battery. */ @@ -1026,7 +1026,7 @@ static int apm_get_battery_status(u_short which, u_short *status, } if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, - &ebx, &ecx, &edx, &esi)) + &ebx, &ecx, &edx, &esi)) return (eax >> 8) & 0xff; *status = ebx; *bat = ecx; @@ -1044,10 +1044,10 @@ static int apm_get_battery_status(u_short which, u_short *status, * Activate or deactive power management on either a specific device * or the entire system (%APM_DEVICE_ALL). */ - + static int apm_engage_power_management(u_short device, int enable) { - u32 eax; + u32 eax; if ((enable == 0) && (device == APM_DEVICE_ALL) && (apm_info.bios.flags & APM_BIOS_DISABLED)) @@ -1074,7 +1074,7 @@ static int apm_engage_power_management(u_short device, int enable) * all video devices. Typically the BIOS will do laptop backlight and * monitor powerdown for us. */ - + static int apm_console_blank(int blank) { int error = APM_NOT_ENGAGED; /* silence gcc */ @@ -1126,7 +1126,7 @@ static apm_event_t get_queued_event(struct apm_user *as) static void queue_event(apm_event_t event, struct apm_user *sender) { - struct apm_user * as; + struct apm_user *as; spin_lock(&user_list_lock); if (user_list == NULL) @@ -1174,11 +1174,11 @@ static void reinit_timer(void) spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ - outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); - outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + outb_pit(LATCH & 0xff, PIT_CH0); /* LSB */ udelay(10); - outb(LATCH >> 8, PIT_CH0); /* MSB */ + outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); spin_unlock_irqrestore(&i8253_lock, flags); #endif @@ -1186,7 +1186,7 @@ static void reinit_timer(void) static int suspend(int vetoable) { - int err; + int err; struct apm_user *as; if (pm_send_all(PM_SUSPEND, (void *)3)) { @@ -1239,7 +1239,7 @@ static int suspend(int vetoable) static void standby(void) { - int err; + int err; local_irq_disable(); device_power_down(PMSG_SUSPEND); @@ -1256,8 +1256,8 @@ static void standby(void) static apm_event_t get_event(void) { - int error; - apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ apm_eventinfo_t info; static int notified; @@ -1275,9 +1275,9 @@ static apm_event_t get_event(void) static void check_events(void) { - apm_event_t event; - static unsigned long last_resume; - static int ignore_bounce; + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; while ((event = get_event()) != 0) { if (debug) { @@ -1289,7 +1289,7 @@ static void check_events(void) "event 0x%02x\n", event); } if (ignore_bounce - && ((jiffies - last_resume) > bounce_interval)) + && (time_after(jiffies, last_resume + bounce_interval))) ignore_bounce = 0; switch (event) { @@ -1357,7 +1357,7 @@ static void check_events(void) /* * We are not allowed to reject a critical suspend. */ - (void) suspend(0); + (void)suspend(0); break; } } @@ -1365,12 +1365,12 @@ static void check_events(void) static void apm_event_handler(void) { - static int pending_count = 4; - int err; + static int pending_count = 4; + int err; if ((standbys_pending > 0) || (suspends_pending > 0)) { if ((apm_info.connection_version > 0x100) && - (pending_count-- <= 0)) { + (pending_count-- <= 0)) { pending_count = 4; if (debug) printk(KERN_DEBUG "apm: setting state busy\n"); @@ -1418,9 +1418,9 @@ static int check_apm_user(struct apm_user *as, const char *func) static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) { - struct apm_user * as; - int i; - apm_event_t event; + struct apm_user *as; + int i; + apm_event_t event; as = fp->private_data; if (check_apm_user(as, "read")) @@ -1459,9 +1459,9 @@ static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t * return 0; } -static unsigned int do_poll(struct file *fp, poll_table * wait) +static unsigned int do_poll(struct file *fp, poll_table *wait) { - struct apm_user * as; + struct apm_user *as; as = fp->private_data; if (check_apm_user(as, "poll")) @@ -1472,10 +1472,10 @@ static unsigned int do_poll(struct file *fp, poll_table * wait) return 0; } -static int do_ioctl(struct inode * inode, struct file *filp, +static int do_ioctl(struct inode *inode, struct file *filp, u_int cmd, u_long arg) { - struct apm_user * as; + struct apm_user *as; as = filp->private_data; if (check_apm_user(as, "ioctl")) @@ -1515,9 +1515,9 @@ static int do_ioctl(struct inode * inode, struct file *filp, return 0; } -static int do_release(struct inode * inode, struct file * filp) +static int do_release(struct inode *inode, struct file *filp) { - struct apm_user * as; + struct apm_user *as; as = filp->private_data; if (check_apm_user(as, "release")) @@ -1533,11 +1533,11 @@ static int do_release(struct inode * inode, struct file * filp) if (suspends_pending <= 0) (void) suspend(1); } - spin_lock(&user_list_lock); + spin_lock(&user_list_lock); if (user_list == as) user_list = as->next; else { - struct apm_user * as1; + struct apm_user *as1; for (as1 = user_list; (as1 != NULL) && (as1->next != as); @@ -1553,9 +1553,9 @@ static int do_release(struct inode * inode, struct file * filp) return 0; } -static int do_open(struct inode * inode, struct file * filp) +static int do_open(struct inode *inode, struct file *filp) { - struct apm_user * as; + struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { @@ -1569,7 +1569,7 @@ static int do_open(struct inode * inode, struct file * filp) as->suspends_read = as->standbys_read = 0; /* * XXX - this is a tiny bit broken, when we consider BSD - * process accounting. If the device is opened by root, we + * process accounting. If the device is opened by root, we * instantly flag that we used superuser privs. Who knows, * we might close the device immediately without doing a * privileged operation -- cevans @@ -1652,16 +1652,16 @@ static int proc_apm_show(struct seq_file *m, void *v) 8) min = minutes; sec = seconds */ seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", - driver_version, - (apm_info.bios.version >> 8) & 0xff, - apm_info.bios.version & 0xff, - apm_info.bios.flags, - ac_line_status, - battery_status, - battery_flag, - percentage, - time_units, - units); + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); return 0; } @@ -1684,8 +1684,8 @@ static int apm(void *unused) unsigned short cx; unsigned short dx; int error; - char * power_stat; - char * bat_stat; + char *power_stat; + char *bat_stat; #ifdef CONFIG_SMP /* 2002/08/01 - WT @@ -1744,23 +1744,41 @@ static int apm(void *unused) } } - if (debug && (num_online_cpus() == 1 || smp )) { + if (debug && (num_online_cpus() == 1 || smp)) { error = apm_get_power_status(&bx, &cx, &dx); if (error) printk(KERN_INFO "apm: power status not available\n"); else { switch ((bx >> 8) & 0xff) { - case 0: power_stat = "off line"; break; - case 1: power_stat = "on line"; break; - case 2: power_stat = "on backup power"; break; - default: power_stat = "unknown"; break; + case 0: + power_stat = "off line"; + break; + case 1: + power_stat = "on line"; + break; + case 2: + power_stat = "on backup power"; + break; + default: + power_stat = "unknown"; + break; } switch (bx & 0xff) { - case 0: bat_stat = "high"; break; - case 1: bat_stat = "low"; break; - case 2: bat_stat = "critical"; break; - case 3: bat_stat = "charging"; break; - default: bat_stat = "unknown"; break; + case 0: + bat_stat = "high"; + break; + case 1: + bat_stat = "low"; + break; + case 2: + bat_stat = "critical"; + break; + case 3: + bat_stat = "charging"; + break; + default: + bat_stat = "unknown"; + break; } printk(KERN_INFO "apm: AC %s, battery status %s, battery life ", @@ -1777,8 +1795,8 @@ static int apm(void *unused) printk("unknown\n"); else printk("%d %s\n", dx & 0x7fff, - (dx & 0x8000) ? - "minutes" : "seconds"); + (dx & 0x8000) ? + "minutes" : "seconds"); } } } @@ -1803,7 +1821,7 @@ static int apm(void *unused) #ifndef MODULE static int __init apm_setup(char *str) { - int invert; + int invert; while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "off", 3) == 0) @@ -1828,14 +1846,13 @@ static int __init apm_setup(char *str) if ((strncmp(str, "power-off", 9) == 0) || (strncmp(str, "power_off", 9) == 0)) power_off = !invert; - if (strncmp(str, "smp", 3) == 0) - { + if (strncmp(str, "smp", 3) == 0) { smp = !invert; idle_threshold = 100; } if ((strncmp(str, "allow-ints", 10) == 0) || (strncmp(str, "allow_ints", 10) == 0)) - apm_info.allow_ints = !invert; + apm_info.allow_ints = !invert; if ((strncmp(str, "broken-psr", 10) == 0) || (strncmp(str, "broken_psr", 10) == 0)) apm_info.get_power_status_broken = !invert; @@ -1881,7 +1898,8 @@ static int __init print_if_true(const struct dmi_system_id *d) */ static int __init broken_ps2_resume(const struct dmi_system_id *d) { - printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug " + "workaround hopefully not needed.\n", d->ident); return 0; } @@ -1890,7 +1908,8 @@ static int __init set_realmode_power_off(const struct dmi_system_id *d) { if (apm_info.realmode_power_off == 0) { apm_info.realmode_power_off = 1; - printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); + printk(KERN_INFO "%s bios detected. " + "Using realmode poweroff only.\n", d->ident); } return 0; } @@ -1900,7 +1919,8 @@ static int __init set_apm_ints(const struct dmi_system_id *d) { if (apm_info.allow_ints == 0) { apm_info.allow_ints = 1; - printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Enabling interrupts during APM calls.\n", d->ident); } return 0; } @@ -1910,7 +1930,8 @@ static int __init apm_is_horked(const struct dmi_system_id *d) { if (apm_info.disabled == 0) { apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); } return 0; } @@ -1919,7 +1940,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) { if (apm_info.disabled == 0) { apm_info.disabled = 1; - printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM.\n", d->ident); printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); printk(KERN_INFO "download from support.intel.com \n"); } @@ -1931,7 +1953,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d) { if (apm_info.forbid_idle == 0) { apm_info.forbid_idle = 1; - printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); + printk(KERN_INFO "%s machine detected. " + "Disabling APM idle calls.\n", d->ident); } return 0; } @@ -1954,7 +1977,8 @@ static int __init apm_likes_to_melt(const struct dmi_system_id *d) static int __init broken_apm_power(const struct dmi_system_id *d) { apm_info.get_power_status_broken = 1; - printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); + printk(KERN_WARNING "BIOS strings suggest APM bugs, " + "disabling power status reporting.\n"); return 0; } @@ -1965,7 +1989,8 @@ static int __init broken_apm_power(const struct dmi_system_id *d) static int __init swab_apm_power_in_minutes(const struct dmi_system_id *d) { apm_info.get_power_status_swabinminutes = 1; - printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); + printk(KERN_WARNING "BIOS strings suggest APM reports battery life " + "in minutes and wrong byte order.\n"); return 0; } @@ -1990,8 +2015,8 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Dell Inspiron 2500", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* Allow interrupts during suspend on Dell Inspiron laptops*/ set_apm_ints, "Dell Inspiron", { @@ -2014,15 +2039,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Dell Dimension 4100", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), - DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* Allow interrupts during suspend on Compaq Laptops*/ set_apm_ints, "Compaq 12XL125", { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, + DMI_MATCH(DMI_BIOS_VERSION, "4.06"), }, }, { /* Allow interrupts during APM or the clock goes slow */ set_apm_ints, "ASUSTeK", @@ -2064,15 +2089,15 @@ static struct dmi_system_id __initdata apm_dmi_table[] = { apm_is_horked, "Sharp PC-PJ/AX", { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), - DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), - DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION, "Version R2.08"), }, }, { /* APM crashes */ apm_is_horked, "Dell Inspiron 2500", { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), - DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), - DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), }, }, { /* APM idle hangs */ apm_likes_to_melt, "Jabil AMD", @@ -2203,11 +2228,11 @@ static int __init apm_init(void) return -ENODEV; } printk(KERN_INFO - "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", - ((apm_info.bios.version >> 8) & 0xff), - (apm_info.bios.version & 0xff), - apm_info.bios.flags, - driver_version); + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { printk(KERN_INFO "apm: no 32 bit BIOS support\n"); return -ENODEV; @@ -2312,9 +2337,9 @@ static int __init apm_init(void) } wake_up_process(kapmd_task); - if (num_online_cpus() > 1 && !smp ) { + if (num_online_cpus() > 1 && !smp) { printk(KERN_NOTICE - "apm: disabled - APM is not SMP safe (power off active).\n"); + "apm: disabled - APM is not SMP safe (power off active).\n"); return 0; } @@ -2339,7 +2364,7 @@ static int __init apm_init(void) static void __exit apm_exit(void) { - int error; + int error; if (set_pm_idle) { pm_idle = original_pm_idle; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 0e45981b2dd..afd84463b71 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -38,15 +38,15 @@ void foo(void); void foo(void) { - OFFSET(SIGCONTEXT_eax, sigcontext, eax); - OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); - OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); - OFFSET(SIGCONTEXT_edx, sigcontext, edx); - OFFSET(SIGCONTEXT_esi, sigcontext, esi); - OFFSET(SIGCONTEXT_edi, sigcontext, edi); - OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); - OFFSET(SIGCONTEXT_esp, sigcontext, esp); - OFFSET(SIGCONTEXT_eip, sigcontext, eip); + OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax); + OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx); + OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx); + OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx); + OFFSET(IA32_SIGCONTEXT_si, sigcontext, si); + OFFSET(IA32_SIGCONTEXT_di, sigcontext, di); + OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp); + OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp); + OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip); BLANK(); OFFSET(CPUINFO_x86, cpuinfo_x86, x86); @@ -70,39 +70,38 @@ void foo(void) OFFSET(TI_cpu, thread_info, cpu); BLANK(); - OFFSET(GDS_size, Xgt_desc_struct, size); - OFFSET(GDS_address, Xgt_desc_struct, address); - OFFSET(GDS_pad, Xgt_desc_struct, pad); + OFFSET(GDS_size, desc_ptr, size); + OFFSET(GDS_address, desc_ptr, address); BLANK(); - OFFSET(PT_EBX, pt_regs, ebx); - OFFSET(PT_ECX, pt_regs, ecx); - OFFSET(PT_EDX, pt_regs, edx); - OFFSET(PT_ESI, pt_regs, esi); - OFFSET(PT_EDI, pt_regs, edi); - OFFSET(PT_EBP, pt_regs, ebp); - OFFSET(PT_EAX, pt_regs, eax); - OFFSET(PT_DS, pt_regs, xds); - OFFSET(PT_ES, pt_regs, xes); - OFFSET(PT_FS, pt_regs, xfs); - OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); - OFFSET(PT_EIP, pt_regs, eip); - OFFSET(PT_CS, pt_regs, xcs); - OFFSET(PT_EFLAGS, pt_regs, eflags); - OFFSET(PT_OLDESP, pt_regs, esp); - OFFSET(PT_OLDSS, pt_regs, xss); + OFFSET(PT_EBX, pt_regs, bx); + OFFSET(PT_ECX, pt_regs, cx); + OFFSET(PT_EDX, pt_regs, dx); + OFFSET(PT_ESI, pt_regs, si); + OFFSET(PT_EDI, pt_regs, di); + OFFSET(PT_EBP, pt_regs, bp); + OFFSET(PT_EAX, pt_regs, ax); + OFFSET(PT_DS, pt_regs, ds); + OFFSET(PT_ES, pt_regs, es); + OFFSET(PT_FS, pt_regs, fs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_ax); + OFFSET(PT_EIP, pt_regs, ip); + OFFSET(PT_CS, pt_regs, cs); + OFFSET(PT_EFLAGS, pt_regs, flags); + OFFSET(PT_OLDESP, pt_regs, sp); + OFFSET(PT_OLDSS, pt_regs, ss); BLANK(); OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); - OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); + OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); BLANK(); OFFSET(pbe_address, pbe, address); OFFSET(pbe_orig_address, pbe, orig_address); OFFSET(pbe_next, pbe, next); - /* Offset from the sysenter stack to tss.esp0 */ - DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - + /* Offset from the sysenter stack to tss.sp0 */ + DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - sizeof(struct tss_struct)); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); @@ -111,8 +110,6 @@ void foo(void) DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); - DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); - OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); #ifdef CONFIG_PARAVIRT @@ -123,7 +120,7 @@ void foo(void) OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); + OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); #endif diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d1b6ed98774..494e1e096ee 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -38,7 +38,6 @@ int main(void) #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) ENTRY(state); ENTRY(flags); - ENTRY(thread); ENTRY(pid); BLANK(); #undef ENTRY @@ -47,6 +46,9 @@ int main(void) ENTRY(addr_limit); ENTRY(preempt_count); ENTRY(status); +#ifdef CONFIG_IA32_EMULATION + ENTRY(sysenter_return); +#endif BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry)) @@ -59,17 +61,31 @@ int main(void) ENTRY(data_offset); BLANK(); #undef ENTRY +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled); + OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops); + OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops); + OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); + OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); + OFFSET(PV_CPU_iret, pv_cpu_ops, iret); + OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret); + OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); + OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); +#endif + + #ifdef CONFIG_IA32_EMULATION #define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) - ENTRY(eax); - ENTRY(ebx); - ENTRY(ecx); - ENTRY(edx); - ENTRY(esi); - ENTRY(edi); - ENTRY(ebp); - ENTRY(esp); - ENTRY(eip); + ENTRY(ax); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(si); + ENTRY(di); + ENTRY(bp); + ENTRY(sp); + ENTRY(ip); BLANK(); #undef ENTRY DEFINE(IA32_RT_SIGFRAME_sigcontext, @@ -81,14 +97,14 @@ int main(void) DEFINE(pbe_next, offsetof(struct pbe, next)); BLANK(); #define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry)) - ENTRY(rbx); - ENTRY(rbx); - ENTRY(rcx); - ENTRY(rdx); - ENTRY(rsp); - ENTRY(rbp); - ENTRY(rsi); - ENTRY(rdi); + ENTRY(bx); + ENTRY(bx); + ENTRY(cx); + ENTRY(dx); + ENTRY(sp); + ENTRY(bp); + ENTRY(si); + ENTRY(di); ENTRY(r8); ENTRY(r9); ENTRY(r10); @@ -97,7 +113,7 @@ int main(void) ENTRY(r13); ENTRY(r14); ENTRY(r15); - ENTRY(eflags); + ENTRY(flags); BLANK(); #undef ENTRY #define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) @@ -108,7 +124,7 @@ int main(void) ENTRY(cr8); BLANK(); #undef ENTRY - DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); + DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist)); BLANK(); DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); BLANK(); diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c index 0b9860530a6..30f25a75fe2 100644 --- a/arch/x86/kernel/bootflag.c +++ b/arch/x86/kernel/bootflag.c @@ -1,8 +1,6 @@ /* * Implement 'Simple Boot Flag Specification 2.0' */ - - #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> @@ -14,40 +12,38 @@ #include <linux/mc146818rtc.h> - #define SBF_RESERVED (0x78) #define SBF_PNPOS (1<<0) #define SBF_BOOTING (1<<1) #define SBF_DIAG (1<<2) #define SBF_PARITY (1<<7) - int sbf_port __initdata = -1; /* set via acpi_boot_init() */ - static int __init parity(u8 v) { int x = 0; int i; - - for(i=0;i<8;i++) - { - x^=(v&1); - v>>=1; + + for (i = 0; i < 8; i++) { + x ^= (v & 1); + v >>= 1; } + return x; } static void __init sbf_write(u8 v) { unsigned long flags; - if(sbf_port != -1) - { + + if (sbf_port != -1) { v &= ~SBF_PARITY; - if(!parity(v)) - v|=SBF_PARITY; + if (!parity(v)) + v |= SBF_PARITY; - printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", + sbf_port, v); spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(v, sbf_port); @@ -57,33 +53,41 @@ static void __init sbf_write(u8 v) static u8 __init sbf_read(void) { - u8 v; unsigned long flags; - if(sbf_port == -1) + u8 v; + + if (sbf_port == -1) return 0; + spin_lock_irqsave(&rtc_lock, flags); v = CMOS_READ(sbf_port); spin_unlock_irqrestore(&rtc_lock, flags); + return v; } static int __init sbf_value_valid(u8 v) { - if(v&SBF_RESERVED) /* Reserved bits */ + if (v & SBF_RESERVED) /* Reserved bits */ return 0; - if(!parity(v)) + if (!parity(v)) return 0; + return 1; } static int __init sbf_init(void) { u8 v; - if(sbf_port == -1) + + if (sbf_port == -1) return 0; + v = sbf_read(); - if(!sbf_value_valid(v)) - printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); + if (!sbf_value_valid(v)) { + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from " + "CMOS RAM was invalid\n", v); + } v &= ~SBF_RESERVED; v &= ~SBF_BOOTING; @@ -92,7 +96,7 @@ static int __init sbf_init(void) v |= SBF_PNPOS; #endif sbf_write(v); + return 0; } - module_init(sbf_init); diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c index 9a189cef640..8f520f93ffd 100644 --- a/arch/x86/kernel/bugs_64.c +++ b/arch/x86/kernel/bugs_64.c @@ -13,7 +13,6 @@ void __init check_bugs(void) { identify_cpu(&boot_cpu_data); - mtrr_bp_init(); #if !defined(CONFIG_SMP) printk("CPU: "); print_cpu_info(&boot_cpu_data); diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 3e91d3ee26e..238468ae199 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -45,6 +45,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) ®s[CR_ECX], ®s[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) - set_bit(cb->feature, c->x86_capability); + set_cpu_cap(c, cb->feature); } } diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1ff88c7f45c..06fa159232f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -63,6 +63,15 @@ static __cpuinit int amd_apic_timer_broken(void) int force_mwait __cpuinitdata; +void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + if (cpuid_eax(0x80000000) >= 0x80000007) { + c->x86_power = cpuid_edx(0x80000007); + if (c->x86_power & (1<<8)) + set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); + } +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -85,6 +94,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif + early_init_amd(c); + /* * FIXME: We should handle the K5 here. Set up the write * range and also turn on MSR 83 bits 4 and 31 (write alloc, @@ -257,12 +268,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; } - if (cpuid_eax(0x80000000) >= 0x80000007) { - c->x86_power = cpuid_edx(0x80000007); - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - } - #ifdef CONFIG_X86_HT /* * On a AMD multi core setup the lower bits of the APIC id @@ -295,12 +300,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) local_apic_timer_disabled = 1; #endif - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, c->x86_capability); - /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_bit(X86_FEATURE_MCE, c->x86_capability); + + if (cpu_has_xmm) + set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 205fd5ba57f..9b95edcfc6a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,6 +11,7 @@ #include <linux/utsname.h> #include <asm/bugs.h> #include <asm/processor.h> +#include <asm/processor-flags.h> #include <asm/i387.h> #include <asm/msr.h> #include <asm/paravirt.h> @@ -35,7 +36,7 @@ __setup("mca-pentium", mca_pentium); static int __init no_387(char *s) { boot_cpu_data.hard_math = 0; - write_cr0(0xE | read_cr0()); + write_cr0(X86_CR0_TS | X86_CR0_EM | X86_CR0_MP | read_cr0()); return 1; } @@ -153,7 +154,7 @@ static void __init check_config(void) * If we configured ourselves for a TSC, we'd better have one! */ #ifdef CONFIG_X86_TSC - if (!cpu_has_tsc && !tsc_disable) + if (!cpu_has_tsc) panic("Kernel compiled for Pentium+, requires TSC feature!"); #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e2fcf2051bd..db28aa9e2f6 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -22,43 +22,48 @@ #include "cpu.h" DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, - [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, - [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, - [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, /* * Segments used for calling PnP BIOS have byte granularity. * They code segments and data segments have fixed 64k limits, * the transfer segment sizes are set at run time. */ - [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ - [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, /* * The APM segments have byte granularity and their bases * are set at run time. All have 64k limits. */ - [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, - [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, - [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + static int cachesize_override __cpuinitdata = -1; -static int disable_x86_fxsr __cpuinitdata; static int disable_x86_serial_nr __cpuinitdata = 1; -static int disable_x86_sep __cpuinitdata; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; -extern int disable_pse; - static void __cpuinit default_init(struct cpuinfo_x86 * c) { /* Not much we can do here... */ @@ -207,16 +212,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) static int __init x86_fxsr_setup(char * s) { - /* Tell all the other CPUs to not use it... */ - disable_x86_fxsr = 1; - - /* - * ... and clear the bits early in the boot_cpu_data - * so that the bootup process doesn't try to do this - * either. - */ - clear_bit(X86_FEATURE_FXSR, boot_cpu_data.x86_capability); - clear_bit(X86_FEATURE_XMM, boot_cpu_data.x86_capability); + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); return 1; } __setup("nofxsr", x86_fxsr_setup); @@ -224,7 +221,7 @@ __setup("nofxsr", x86_fxsr_setup); static int __init x86_sep_setup(char * s) { - disable_x86_sep = 1; + setup_clear_cpu_cap(X86_FEATURE_SEP); return 1; } __setup("nosep", x86_sep_setup); @@ -281,6 +278,33 @@ void __init cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; } } +static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + int ebx; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (have_cpuid_p()) { + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } + + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + } + + } + +} /* Do minimum CPU detection early. Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment. @@ -300,6 +324,17 @@ static void __init early_cpu_detect(void) cpu_detect(c); get_cpu_vendor(c, 1); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + + early_get_cap(c); } static void __cpuinit generic_identify(struct cpuinfo_x86 * c) @@ -357,8 +392,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c) init_scattered_cpuid_features(c); } - early_intel_workaround(c); - #ifdef CONFIG_X86_HT c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif @@ -392,7 +425,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -418,20 +451,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) generic_identify(c); - printk(KERN_DEBUG "CPU: After generic identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - - if (this_cpu->c_identify) { + if (this_cpu->c_identify) this_cpu->c_identify(c); - printk(KERN_DEBUG "CPU: After vendor identify, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - } - /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -453,23 +475,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) * we do "generic changes." */ - /* TSC disabled? */ - if ( tsc_disable ) - clear_bit(X86_FEATURE_TSC, c->x86_capability); - - /* FXSR disabled? */ - if (disable_x86_fxsr) { - clear_bit(X86_FEATURE_FXSR, c->x86_capability); - clear_bit(X86_FEATURE_XMM, c->x86_capability); - } - - /* SEP disabled? */ - if (disable_x86_sep) - clear_bit(X86_FEATURE_SEP, c->x86_capability); - - if (disable_pse) - clear_bit(X86_FEATURE_PSE, c->x86_capability); - /* If the model name is still unset, do table lookup. */ if ( !c->x86_model_id[0] ) { char *p; @@ -482,13 +487,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } - /* Now the feature flags better reflect actual CPU features! */ - - printk(KERN_DEBUG "CPU: After all inits, caps:"); - for (i = 0; i < NCAPINTS; i++) - printk(" %08lx", c->x86_capability[i]); - printk("\n"); - /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -501,8 +499,14 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] ^= cleared_cpu_caps[i]; + /* Init Machine Check Exception if available. */ mcheck_init(c); + + select_idle_routine(c); } void __init identify_boot_cpu(void) @@ -510,7 +514,6 @@ void __init identify_boot_cpu(void) identify_cpu(&boot_cpu_data); sysenter_setup(); enable_sep_cpu(); - mtrr_bp_init(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -567,6 +570,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } #endif +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -590,6 +600,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk("\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; /* This is hacky. :) @@ -620,21 +641,13 @@ void __init early_cpu_init(void) nexgen_init_cpu(); umc_init_cpu(); early_cpu_detect(); - -#ifdef CONFIG_DEBUG_PAGEALLOC - /* pse is not compatible with on-the-fly unmapping, - * disable it even if the cpus claim to support it. - */ - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; -#endif } /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); - regs->xfs = __KERNEL_PERCPU; + regs->fs = __KERNEL_PERCPU; return regs; } @@ -642,7 +655,7 @@ struct pt_regs * __devinit idle_regs(struct pt_regs *regs) * it's on the real one. */ void switch_to_new_gdt(void) { - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; @@ -672,12 +685,6 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - if (tsc_disable && cpu_has_tsc) { - printk(KERN_NOTICE "Disabling TSC...\n"); - /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/ - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); - set_in_cr4(X86_CR4_TSD); - } load_idt(&idt_descr); switch_to_new_gdt(); @@ -691,7 +698,7 @@ void __cpuinit cpu_init(void) BUG(); enter_lazy_tlb(&init_mm, curr); - load_esp0(t, thread); + load_sp0(t, thread); set_tss_desc(cpu,t); load_TR_desc(); load_LDT(&init_mm.context); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2f6432cef6f..ad6527a5beb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -24,5 +24,6 @@ extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); -extern void early_intel_workaround(struct cpuinfo_x86 *c); +extern void early_init_intel(struct cpuinfo_x86 *c); +extern void early_init_amd(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index fea0af0476b..a962dcb9c40 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -67,7 +67,8 @@ struct acpi_cpufreq_data { unsigned int cpu_feature; }; -static struct acpi_cpufreq_data *drv_data[NR_CPUS]; +static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); + /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask) if (unlikely(cpus_empty(mask))) return 0; - switch (drv_data[first_cpu(mask)]->cpu_feature) { + switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; break; case SYSTEM_IO_CAPABLE: cmd.type = SYSTEM_IO_CAPABLE; - perf = drv_data[first_cpu(mask)]->acpi_data; + perf = per_cpu(drv_data, first_cpu(mask))->acpi_data; cmd.addr.io.port = perf->control_register.address; cmd.addr.io.bit_width = perf->control_register.bit_width; break; @@ -325,7 +326,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = drv_data[cpu]->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed(current, saved_mask); @@ -336,7 +337,7 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - struct acpi_cpufreq_data *data = drv_data[cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; dprintk("get_cur_freq_on_cpu (%d)\n", cpu); @@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_t mask, unsigned int freq, static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); struct acpi_processor_performance *perf; struct cpufreq_freqs freqs; cpumask_t online_policy_cpus; @@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, static int acpi_cpufreq_verify(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_verify\n"); @@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) return -ENOMEM; data->acpi_data = percpu_ptr(acpi_perf_data, cpu); - drv_data[cpu] = data; + per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; @@ -714,20 +715,20 @@ err_unreg: acpi_processor_unregister_performance(perf, cpu); err_free: kfree(data); - drv_data[cpu] = NULL; + per_cpu(drv_data, cpu) = NULL; return result; } static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_cpu_exit\n"); if (data) { cpufreq_frequency_table_put_attr(policy->cpu); - drv_data[policy->cpu] = NULL; + per_cpu(drv_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); kfree(data); @@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) static int acpi_cpufreq_resume(struct cpufreq_policy *policy) { - struct acpi_cpufreq_data *data = drv_data[policy->cpu]; + struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); dprintk("acpi_cpufreq_resume\n"); diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 749d00cb2eb..06fcce516d5 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -694,7 +694,7 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle, if ( acpi_bus_get_device(obj_handle, &d) ) { return 0; } - *return_value = (void *)acpi_driver_data(d); + *return_value = acpi_driver_data(d); return 1; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 99e1ef9939b..a0522735dd9 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -52,7 +52,7 @@ /* serialize freq changes */ static DEFINE_MUTEX(fidvid_mutex); -static struct powernow_k8_data *powernow_data[NR_CPUS]; +static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); static int cpu_family = CPU_OPTERON; @@ -1018,7 +1018,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask = CPU_MASK_ALL; - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; unsigned int newstate; @@ -1094,7 +1094,7 @@ err_out: /* Driver entry point to verify the policy and range of frequencies */ static int powernowk8_verify(struct cpufreq_policy *pol) { - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); if (!data) return -EINVAL; @@ -1202,7 +1202,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); - powernow_data[pol->cpu] = data; + per_cpu(powernow_data, pol->cpu) = data; return 0; @@ -1216,7 +1216,7 @@ err_out: static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol) { - struct powernow_k8_data *data = powernow_data[pol->cpu]; + struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); if (!data) return -EINVAL; @@ -1237,7 +1237,7 @@ static unsigned int powernowk8_get (unsigned int cpu) cpumask_t oldmask = current->cpus_allowed; unsigned int khz = 0; - data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))]; + data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu))); if (!data) return -EINVAL; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 88d66fb8411..404a6a2d401 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -5,6 +5,7 @@ #include <asm/dma.h> #include <asm/io.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include <asm/timer.h> #include <asm/pci-direct.h> #include <asm/tsc.h> @@ -126,15 +127,12 @@ static void __cpuinit set_cx86_reorder(void) static void __cpuinit set_cx86_memwb(void) { - u32 cr0; - printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ - cr0 = 0x20000000; - write_cr0(read_cr0() | cr0); + write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 ); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cc8c501b9f3..d1c372b018d 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -11,6 +11,8 @@ #include <asm/pgtable.h> #include <asm/msr.h> #include <asm/uaccess.h> +#include <asm/ptrace.h> +#include <asm/ds.h> #include "cpu.h" @@ -27,13 +29,14 @@ struct movsl_mask movsl_mask __read_mostly; #endif -void __cpuinit early_intel_workaround(struct cpuinfo_x86 *c) +void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { - if (c->x86_vendor != X86_VENDOR_INTEL) - return; /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ if (c->x86 == 15 && c->x86_cache_alignment == 64) c->x86_cache_alignment = 128; + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } /* @@ -113,6 +116,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) unsigned int l2 = 0; char *p = NULL; + early_init_intel(c); + #ifdef CONFIG_X86_F00F_BUG /* * All current models of Pentium and Pentium with MMX technology CPUs @@ -132,7 +137,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif - select_idle_routine(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9 ) { unsigned eax = cpuid_eax(10); @@ -201,16 +205,13 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) } #endif + if (cpu_has_xmm2) + set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability); if (c->x86 == 15) { set_bit(X86_FEATURE_P4, c->x86_capability); - set_bit(X86_FEATURE_SYNC_RDTSC, c->x86_capability); } if (c->x86 == 6) set_bit(X86_FEATURE_P3, c->x86_capability); - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability); - if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); @@ -219,6 +220,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (!(l1 & (1<<12))) set_bit(X86_FEATURE_PEBS, c->x86_capability); } + + if (cpu_has_bts) + ds_init_intel(c); } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) @@ -342,5 +346,22 @@ unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) EXPORT_SYMBOL(cmpxchg_386_u32); #endif +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + // arch_initcall(intel_cpu_init); diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index eef63e3630c..e633c9c2b76 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine Check Handler For AMD Athlon/Duron */ -static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) +static void k7_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -27,29 +27,32 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? */ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - for (i=1; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 1; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high&(1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); /* Clear it */ - wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); /* Serialize */ wmb(); add_taint(TAINT_MACHINE_CHECK); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h index 81fb6e2d35f..ae9f628838f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.h +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -8,7 +8,7 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c); void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Call the installed machine check handler for this CPU setup. */ -extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); +extern void (*machine_check_vector)(struct pt_regs *, long error_code); extern int nr_mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 34c781eddee..a5182dcd94a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -22,13 +22,13 @@ int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ /* Handle unconfigured int18 (should never happen) */ -static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) +static void unexpected_machine_check(struct pt_regs * regs, long error_code) { printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); } /* Call the installed machine check handler for this CPU setup. */ -void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 242e8668dbe..9a699ed0359 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -63,7 +63,7 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); * separate MCEs from kernel messages to avoid bogus bug reports. */ -struct mce_log mcelog = { +static struct mce_log mcelog = { MCE_LOG_SIGNATURE, MCE_LOG_LEN, }; @@ -80,7 +80,7 @@ void mce_log(struct mce *mce) /* When the buffer fills up discard new entries. Assume that the earlier errors are the more interesting. */ if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, &mcelog.flags); + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); return; } /* Old left over entry. Skip. */ @@ -110,12 +110,12 @@ static void print_mce(struct mce *m) KERN_EMERG "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->cpu, m->mcgstatus, m->bank, m->status); - if (m->rip) { + if (m->ip) { printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->rip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) - print_symbol("{%s}", m->rip); + print_symbol("{%s}", m->ip); printk("\n"); } printk(KERN_EMERG "TSC %Lx ", m->tsc); @@ -156,16 +156,16 @@ static int mce_available(struct cpuinfo_x86 *c) static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) { if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) { - m->rip = regs->rip; + m->ip = regs->ip; m->cs = regs->cs; } else { - m->rip = 0; + m->ip = 0; m->cs = 0; } if (rip_msr) { /* Assume the RIP in the MSR is exact. Is this true? */ m->mcgstatus |= MCG_STATUS_EIPV; - rdmsrl(rip_msr, m->rip); + rdmsrl(rip_msr, m->ip); m->cs = 0; } } @@ -192,10 +192,10 @@ void do_machine_check(struct pt_regs * regs, long error_code) atomic_inc(&mce_entry); - if (regs) - notify_die(DIE_NMI, "machine check", regs, error_code, 18, - SIGKILL); - if (!banks) + if ((regs + && notify_die(DIE_NMI, "machine check", regs, error_code, + 18, SIGKILL) == NOTIFY_STOP) + || !banks) goto out2; memset(&m, 0, sizeof(struct mce)); @@ -288,7 +288,7 @@ void do_machine_check(struct pt_regs * regs, long error_code) * instruction which caused the MCE. */ if (m.mcgstatus & MCG_STATUS_EIPV) - user_space = panicm.rip && (panicm.cs & 3); + user_space = panicm.ip && (panicm.cs & 3); /* * If we know that the error was in user space, send a @@ -564,7 +564,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { unsigned long *cpu_tsc; - static DECLARE_MUTEX(mce_read_sem); + static DEFINE_MUTEX(mce_read_mutex); unsigned next; char __user *buf = ubuf; int i, err; @@ -573,12 +573,12 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, if (!cpu_tsc) return -ENOMEM; - down(&mce_read_sem); + mutex_lock(&mce_read_mutex); next = rcu_dereference(mcelog.next); /* Only supports full reads right now */ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - up(&mce_read_sem); + mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); return -EINVAL; } @@ -621,7 +621,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } - up(&mce_read_sem); + mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); return err ? -EFAULT : buf - ubuf; } @@ -634,8 +634,7 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) return 0; } -static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, - unsigned long arg) +static long mce_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; @@ -664,7 +663,7 @@ static const struct file_operations mce_chrdev_ops = { .release = mce_release, .read = mce_read, .poll = mce_poll, - .ioctl = mce_ioctl, + .unlocked_ioctl = mce_ioctl, }; static struct miscdevice mce_log_device = { @@ -855,8 +854,8 @@ static void mce_remove_device(unsigned int cpu) } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; @@ -873,7 +872,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block mce_cpu_notifier = { +static struct notifier_block mce_cpu_notifier __cpuinitdata = { .notifier_call = mce_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 753588755fe..32671da8184 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -118,6 +118,7 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) { unsigned int bank, block; unsigned int cpu = smp_processor_id(); + u8 lvt_off; u32 low = 0, high = 0, address = 0; for (bank = 0; bank < NR_BANKS; ++bank) { @@ -153,14 +154,13 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) if (shared_bank[bank] && c->cpu_core_id) break; #endif + lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0); + high &= ~MASK_LVTOFF_HI; - high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; + high |= lvt_off << 20; wrmsr(address, low, high); - setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); - threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); } @@ -450,7 +450,8 @@ recurse: if (err) goto out_free; - kobject_uevent(&b->kobj, KOBJ_ADD); + if (b) + kobject_uevent(&b->kobj, KOBJ_ADD); return err; @@ -554,7 +555,7 @@ static __cpuinit int threshold_create_device(unsigned int cpu) int err = 0; for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; err = threshold_create_bank(cpu, bank); if (err) @@ -637,14 +638,14 @@ static void threshold_remove_device(unsigned int cpu) unsigned int bank; for (bank = 0; bank < NR_BANKS; ++bank) { - if (!(per_cpu(bank_map, cpu) & 1 << bank)) + if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } } /* get notified when a cpu comes on/off */ -static int threshold_cpu_callback(struct notifier_block *nfb, +static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { /* cpu was unsigned int to begin with */ @@ -669,7 +670,7 @@ static int threshold_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier = { +static struct notifier_block threshold_cpu_notifier __cpuinitdata = { .notifier_call = threshold_cpu_callback, }; diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index be4dabfee1f..cb03345554a 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -57,7 +57,7 @@ static void intel_thermal_interrupt(struct pt_regs *regs) /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -fastcall void smp_thermal_interrupt(struct pt_regs *regs) +void smp_thermal_interrupt(struct pt_regs *regs) { irq_enter(); vendor_thermal_interrupt(regs); @@ -141,7 +141,7 @@ static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) rdmsr (MSR_IA32_MCG_EIP, r->eip, h); } -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -152,38 +152,41 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? */ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); if (mce_num_extended_msrs > 0) { struct intel_mce_extended_msrs dbg; intel_get_extended_msrs(&dbg); - printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags); - printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); - printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n" + "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n" + "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + smp_processor_id(), dbg.eip, dbg.eflags, + dbg.eax, dbg.ebx, dbg.ecx, dbg.edx, dbg.esi, dbg.edi, dbg.ebp, dbg.esp); } - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 0; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high & (1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); } } diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 94bc43d950c..a18310aaae0 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine check handler for Pentium class Intel */ -static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) +static void pentium_machine_check(struct pt_regs * regs, long error_code) { u32 loaddr, hi, lotype; rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c index deeae42ce19..74342604d30 100644 --- a/arch/x86/kernel/cpu/mcheck/p6.c +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -16,7 +16,7 @@ #include "mce.h" /* Machine Check Handler For PII/PIII */ -static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +static void intel_machine_check(struct pt_regs * regs, long error_code) { int recover=1; u32 alow, ahigh, high, low; @@ -27,27 +27,30 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) if (mcgstl & (1<<0)) /* Recoverable ? */ recover=0; - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", smp_processor_id(), mcgsth, mcgstl); - for (i=0; i<nr_mce_banks; i++) { - rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + for (i = 0; i < nr_mce_banks; i++) { + rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high); if (high & (1<<31)) { + char misc[20]; + char addr[24]; + misc[0] = addr[0] = '\0'; if (high & (1<<29)) recover |= 1; if (high & (1<<25)) recover |= 2; - printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); high &= ~(1<<31); if (high & (1<<27)) { - rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); - printk ("[%08x%08x]", ahigh, alow); + rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh); + snprintf(misc, 20, "[%08x%08x]", ahigh, alow); } if (high & (1<<26)) { - rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); - printk (" at %08x%08x", ahigh, alow); + rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + snprintf(addr, 24, " at %08x%08x", ahigh, alow); } - printk ("\n"); + printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n", + smp_processor_id(), i, high, low, misc, addr); } } diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 9e424b6c293..3d428d5afc5 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,7 +15,7 @@ #include "mce.h" /* Machine check handler for WinChip C6 */ -static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) +static void winchip_machine_check(struct pt_regs * regs, long error_code) { printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK); diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 0949cdbf848..ee2331b0e58 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -53,8 +53,6 @@ static void amd_set_mtrr(unsigned int reg, unsigned long base, <base> The base address of the region. <size> The size of the region. If this is 0 the region is disabled. <type> The type of the region. - <do_safe> If TRUE, do the change safely. If FALSE, safety measures should - be done externally. [RETURNS] Nothing. */ { diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9964be3de2b..8e139c70f88 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -4,6 +4,7 @@ #include <asm/msr.h> #include <asm/io.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include "mtrr.h" int arr3_protected; @@ -142,7 +143,7 @@ static void prepare_set(void) /* Disable and flush caches. Note that wbinvd flushes the TLBs as a side-effect */ - cr0 = read_cr0() | 0x40000000; + cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); wbinvd(); diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 992f08dfbb6..103d61a59b1 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -9,11 +9,12 @@ #include <asm/msr.h> #include <asm/system.h> #include <asm/cpufeature.h> +#include <asm/processor-flags.h> #include <asm/tlbflush.h> #include "mtrr.h" struct mtrr_state { - struct mtrr_var_range *var_ranges; + struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; unsigned char enabled; unsigned char have_fixed; @@ -85,12 +86,6 @@ void __init get_mtrr_state(void) struct mtrr_var_range *vrs; unsigned lo, dummy; - if (!mtrr_state.var_ranges) { - mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range), - GFP_KERNEL); - if (!mtrr_state.var_ranges) - return; - } vrs = mtrr_state.var_ranges; rdmsr(MTRRcap_MSR, lo, dummy); @@ -188,7 +183,7 @@ static inline void k8_enable_fixed_iorrs(void) * \param changed pointer which indicates whether the MTRR needed to be changed * \param msrwords pointer to the MSR values which the MSR should have */ -static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) +static void set_fixed_range(int msr, bool *changed, unsigned int *msrwords) { unsigned lo, hi; @@ -200,7 +195,7 @@ static void set_fixed_range(int msr, int * changed, unsigned int * msrwords) ((msrwords[0] | msrwords[1]) & K8_MTRR_RDMEM_WRMEM_MASK)) k8_enable_fixed_iorrs(); mtrr_wrmsr(msr, msrwords[0], msrwords[1]); - *changed = TRUE; + *changed = true; } } @@ -260,7 +255,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, static int set_fixed_ranges(mtrr_type * frs) { unsigned long long *saved = (unsigned long long *) frs; - int changed = FALSE; + bool changed = false; int block=-1, range; while (fixed_range_blocks[++block].ranges) @@ -273,17 +268,17 @@ static int set_fixed_ranges(mtrr_type * frs) /* Set the MSR pair relating to a var range. Returns TRUE if changes are made */ -static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +static bool set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) { unsigned int lo, hi; - int changed = FALSE; + bool changed = false; rdmsr(MTRRphysBase_MSR(index), lo, hi); if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) || (vr->base_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { mtrr_wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); - changed = TRUE; + changed = true; } rdmsr(MTRRphysMask_MSR(index), lo, hi); @@ -292,7 +287,7 @@ static int set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) || (vr->mask_hi & (size_and_mask >> (32 - PAGE_SHIFT))) != (hi & (size_and_mask >> (32 - PAGE_SHIFT)))) { mtrr_wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); - changed = TRUE; + changed = true; } return changed; } @@ -350,7 +345,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ - cr0 = read_cr0() | 0x40000000; /* set CD flag */ + cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); wbinvd(); @@ -417,8 +412,6 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, <base> The base address of the region. <size> The size of the region. If this is 0 the region is disabled. <type> The type of the region. - <do_safe> If TRUE, do the change safely. If FALSE, safety measures should - be done externally. [RETURNS] Nothing. */ { diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index c7d8f175674..91e150acb46 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -11,10 +11,6 @@ #include <asm/mtrr.h> #include "mtrr.h" -/* RED-PEN: this is accessed without any locking */ -extern unsigned int *usage_table; - - #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private) static const char *const mtrr_strings[MTRR_NUM_TYPES] = @@ -37,7 +33,7 @@ const char *mtrr_attrib_to_str(int x) static int mtrr_file_add(unsigned long base, unsigned long size, - unsigned int type, char increment, struct file *file, int page) + unsigned int type, bool increment, struct file *file, int page) { int reg, max; unsigned int *fcount = FILE_FCOUNT(file); @@ -55,7 +51,7 @@ mtrr_file_add(unsigned long base, unsigned long size, base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; } - reg = mtrr_add_page(base, size, type, 1); + reg = mtrr_add_page(base, size, type, true); if (reg >= 0) ++fcount[reg]; return reg; @@ -141,7 +137,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size >>= PAGE_SHIFT; err = mtrr_add_page((unsigned long) base, (unsigned long) size, i, - 1); + true); if (err < 0) return err; return len; @@ -217,7 +213,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = - mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 0); break; case MTRRIOC_SET_ENTRY: @@ -226,7 +222,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); + err = mtrr_add(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_ENTRY: #ifdef CONFIG_COMPAT @@ -270,7 +266,7 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; err = - mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add(sentry.base, sentry.size, sentry.type, true, file, 1); break; case MTRRIOC_SET_PAGE_ENTRY: @@ -279,7 +275,8 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg) #endif if (!capable(CAP_SYS_ADMIN)) return -EPERM; - err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); + err = + mtrr_add_page(sentry.base, sentry.size, sentry.type, false); break; case MTRRIOC_DEL_PAGE_ENTRY: #ifdef CONFIG_COMPAT @@ -396,7 +393,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) for (i = 0; i < max; i++) { mtrr_if->get(i, &base, &size, &type); if (size == 0) - usage_table[i] = 0; + mtrr_usage_table[i] = 0; else { if (size < (0x100000 >> PAGE_SHIFT)) { /* less than 1MB */ @@ -410,7 +407,7 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) len += seq_printf(seq, "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_attrib_to_str(type), usage_table[i]); + mtrr_attrib_to_str(type), mtrr_usage_table[i]); } } return 0; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index beb45c9c083..71591958265 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -38,8 +38,8 @@ #include <linux/cpu.h> #include <linux/mutex.h> +#include <asm/e820.h> #include <asm/mtrr.h> - #include <asm/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> @@ -47,7 +47,7 @@ u32 num_var_ranges = 0; -unsigned int *usage_table; +unsigned int mtrr_usage_table[MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -121,13 +121,8 @@ static void __init init_table(void) int i, max; max = num_var_ranges; - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) - == NULL) { - printk(KERN_ERR "mtrr: could not allocate\n"); - return; - } for (i = 0; i < max; i++) - usage_table[i] = 1; + mtrr_usage_table[i] = 1; } struct set_mtrr_data { @@ -311,7 +306,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, */ int mtrr_add_page(unsigned long base, unsigned long size, - unsigned int type, char increment) + unsigned int type, bool increment) { int i, replace, error; mtrr_type ltype; @@ -383,7 +378,7 @@ int mtrr_add_page(unsigned long base, unsigned long size, goto out; } if (increment) - ++usage_table[i]; + ++mtrr_usage_table[i]; error = i; goto out; } @@ -391,13 +386,15 @@ int mtrr_add_page(unsigned long base, unsigned long size, i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { set_mtrr(i, base, size, type); - if (likely(replace < 0)) - usage_table[i] = 1; - else { - usage_table[i] = usage_table[replace] + !!increment; + if (likely(replace < 0)) { + mtrr_usage_table[i] = 1; + } else { + mtrr_usage_table[i] = mtrr_usage_table[replace]; + if (increment) + mtrr_usage_table[i]++; if (unlikely(replace != i)) { set_mtrr(replace, 0, 0, 0); - usage_table[replace] = 0; + mtrr_usage_table[replace] = 0; } } } else @@ -460,7 +457,7 @@ static int mtrr_check(unsigned long base, unsigned long size) int mtrr_add(unsigned long base, unsigned long size, unsigned int type, - char increment) + bool increment) { if (mtrr_check(base, size)) return -EINVAL; @@ -527,11 +524,11 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); goto out; } - if (usage_table[reg] < 1) { + if (mtrr_usage_table[reg] < 1) { printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg); goto out; } - if (--usage_table[reg] < 1) + if (--mtrr_usage_table[reg] < 1) set_mtrr(reg, 0, 0, 0); error = reg; out: @@ -591,16 +588,11 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value * mtrr_state; +static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { int i; - int size = num_var_ranges * sizeof(struct mtrr_value); - - mtrr_state = kzalloc(size,GFP_ATOMIC); - if (!mtrr_state) - return -ENOMEM; for (i = 0; i < num_var_ranges; i++) { mtrr_if->get(i, @@ -622,7 +614,6 @@ static int mtrr_restore(struct sys_device * sysdev) mtrr_state[i].lsize, mtrr_state[i].ltype); } - kfree(mtrr_state); return 0; } @@ -633,6 +624,112 @@ static struct sysdev_driver mtrr_sysdev_driver = { .resume = mtrr_restore, }; +static int disable_mtrr_trim; + +static int __init disable_mtrr_trim_setup(char *str) +{ + disable_mtrr_trim = 1; + return 0; +} +early_param("disable_mtrr_trim", disable_mtrr_trim_setup); + +/* + * Newer AMD K8s and later CPUs have a special magic MSR way to force WB + * for memory >4GB. Check for that here. + * Note this won't check if the MTRRs < 4GB where the magic bit doesn't + * apply to are wrong, but so far we don't know of any such case in the wild. + */ +#define Tom2Enabled (1U << 21) +#define Tom2ForceMemTypeWB (1U << 22) + +static __init int amd_special_default_mtrr(void) +{ + u32 l, h; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + return 0; + /* In case some hypervisor doesn't pass SYSCFG through */ + if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) + return 0; + /* + * Memory between 4GB and top of mem is forced WB by this magic bit. + * Reserved before K8RevF, but should be zero there. + */ + if ((l & (Tom2Enabled | Tom2ForceMemTypeWB)) == + (Tom2Enabled | Tom2ForceMemTypeWB)) + return 1; + return 0; +} + +/** + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs + * + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain + * memory configurations. This routine checks that the highest MTRR matches + * the end of memory, to make sure the MTRRs having a write back type cover + * all of the memory the kernel is intending to use. If not, it'll trim any + * memory off the end by adjusting end_pfn, removing it from the kernel's + * allocation pools, warning the user with an obnoxious message. + */ +int __init mtrr_trim_uncached_memory(unsigned long end_pfn) +{ + unsigned long i, base, size, highest_addr = 0, def, dummy; + mtrr_type type; + u64 trim_start, trim_size; + + /* + * Make sure we only trim uncachable memory on machines that + * support the Intel MTRR architecture: + */ + if (!is_cpu(INTEL) || disable_mtrr_trim) + return 0; + rdmsr(MTRRdefType_MSR, def, dummy); + def &= 0xff; + if (def != MTRR_TYPE_UNCACHABLE) + return 0; + + if (amd_special_default_mtrr()) + return 0; + + /* Find highest cached pfn */ + for (i = 0; i < num_var_ranges; i++) { + mtrr_if->get(i, &base, &size, &type); + if (type != MTRR_TYPE_WRBACK) + continue; + base <<= PAGE_SHIFT; + size <<= PAGE_SHIFT; + if (highest_addr < base + size) + highest_addr = base + size; + } + + /* kvm/qemu doesn't have mtrr set right, don't trim them all */ + if (!highest_addr) { + printk(KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); + WARN_ON(1); + return 0; + } + + if ((highest_addr >> PAGE_SHIFT) < end_pfn) { + printk(KERN_WARNING "WARNING: BIOS bug: CPU MTRRs don't cover" + " all of memory, losing %LdMB of RAM.\n", + (((u64)end_pfn << PAGE_SHIFT) - highest_addr) >> 20); + + WARN_ON(1); + + printk(KERN_INFO "update e820 for mtrr\n"); + trim_start = highest_addr; + trim_size = end_pfn; + trim_size <<= PAGE_SHIFT; + trim_size -= trim_start; + add_memory_region(trim_start, trim_size, E820_RESERVED); + update_e820(); + return 1; + } + + return 0; +} /** * mtrr_bp_init - initialize mtrrs on the boot CPU diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 289dfe6030e..fb74a2c2081 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -2,10 +2,8 @@ * local mtrr defines. */ -#ifndef TRUE -#define TRUE 1 -#define FALSE 0 -#endif +#include <linux/types.h> +#include <linux/stddef.h> #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff @@ -14,6 +12,7 @@ #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) #define NUM_FIXED_RANGES 88 +#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -34,6 +33,8 @@ an 8 bit field: */ typedef u8 mtrr_type; +extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; + struct mtrr_ops { u32 vendor; u32 use_intel_if; diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 49e20c2afcd..9f8ba923d1c 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -4,6 +4,7 @@ #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/processor-cyrix.h> +#include <asm/processor-flags.h> #include "mtrr.h" @@ -25,7 +26,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) /* Disable and flush caches. Note that wbinvd flushes the TLBs as a side-effect */ - cr0 = read_cr0() | 0x40000000; + cr0 = read_cr0() | X86_CR0_CD; wbinvd(); write_cr0(cr0); wbinvd(); diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index c02541e6e65..9b838324b81 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -167,7 +167,6 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); EXPORT_SYMBOL(reserve_perfctr_nmi); EXPORT_SYMBOL(release_perfctr_nmi); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 3900e46d66d..02821326014 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -188,7 +188,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { +const struct seq_operations cpuinfo_op = { .start = c_start, .next = c_next, .stop = c_stop, diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index d387c770c51..dec66e45281 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -50,7 +50,7 @@ struct cpuid_command { static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; + struct cpuid_command *cmd = cmd_block; cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], &cmd->data[3]); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 40978af630e..a47798b59f0 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -17,7 +17,7 @@ static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; static void doublefault_fn(void) { - struct Xgt_desc_struct gdt_desc = {0, 0}; + struct desc_ptr gdt_desc = {0, 0}; unsigned long gdt, tss; store_gdt(&gdt_desc); @@ -33,14 +33,15 @@ static void doublefault_fn(void) printk(KERN_EMERG "double fault, tss at %08lx\n", tss); if (ptr_ok(tss)) { - struct i386_hw_tss *t = (struct i386_hw_tss *)tss; + struct x86_hw_tss *t = (struct x86_hw_tss *)tss; - printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); + printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", + t->ip, t->sp); printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", - t->eax, t->ebx, t->ecx, t->edx); + t->ax, t->bx, t->cx, t->dx); printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + t->si, t->di); } } @@ -50,15 +51,15 @@ static void doublefault_fn(void) struct tss_struct doublefault_tss __cacheline_aligned = { .x86_tss = { - .esp0 = STACK_START, + .sp0 = STACK_START, .ss0 = __KERNEL_DS, .ldt = 0, .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .eip = (unsigned long) doublefault_fn, + .ip = (unsigned long) doublefault_fn, /* 0x2 bit is always set */ - .eflags = X86_EFLAGS_SF | 0x2, - .esp = STACK_START, + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, .es = __USER_DS, .cs = __KERNEL_CS, .ss = __KERNEL_DS, diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c new file mode 100644 index 00000000000..1c5ca4d1878 --- /dev/null +++ b/arch/x86/kernel/ds.c @@ -0,0 +1,464 @@ +/* + * Debug Store support + * + * This provides a low-level interface to the hardware's Debug Store + * feature that is used for last branch recording (LBR) and + * precise-event based sampling (PEBS). + * + * Different architectures use a different DS layout/pointer size. + * The below functions therefore work on a void*. + * + * + * Since there is no user for PEBS, yet, only LBR (or branch + * trace store, BTS) is supported. + * + * + * Copyright (C) 2007 Intel Corporation. + * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + */ + +#include <asm/ds.h> + +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/slab.h> + + +/* + * Debug Store (DS) save area configuration (see Intel64 and IA32 + * Architectures Software Developer's Manual, section 18.5) + * + * The DS configuration consists of the following fields; different + * architetures vary in the size of those fields. + * - double-word aligned base linear address of the BTS buffer + * - write pointer into the BTS buffer + * - end linear address of the BTS buffer (one byte beyond the end of + * the buffer) + * - interrupt pointer into BTS buffer + * (interrupt occurs when write pointer passes interrupt pointer) + * - double-word aligned base linear address of the PEBS buffer + * - write pointer into the PEBS buffer + * - end linear address of the PEBS buffer (one byte beyond the end of + * the buffer) + * - interrupt pointer into PEBS buffer + * (interrupt occurs when write pointer passes interrupt pointer) + * - value to which counter is reset following counter overflow + * + * On later architectures, the last branch recording hardware uses + * 64bit pointers even in 32bit mode. + * + * + * Branch Trace Store (BTS) records store information about control + * flow changes. They at least provide the following information: + * - source linear address + * - destination linear address + * + * Netburst supported a predicated bit that had been dropped in later + * architectures. We do not suppor it. + * + * + * In order to abstract from the actual DS and BTS layout, we describe + * the access to the relevant fields. + * Thanks to Andi Kleen for proposing this design. + * + * The implementation, however, is not as general as it might seem. In + * order to stay somewhat simple and efficient, we assume an + * underlying unsigned type (mostly a pointer type) and we expect the + * field to be at least as big as that type. + */ + +/* + * A special from_ip address to indicate that the BTS record is an + * info record that needs to be interpreted or skipped. + */ +#define BTS_ESCAPE_ADDRESS (-1) + +/* + * A field access descriptor + */ +struct access_desc { + unsigned char offset; + unsigned char size; +}; + +/* + * The configuration for a particular DS/BTS hardware implementation. + */ +struct ds_configuration { + /* the DS configuration */ + unsigned char sizeof_ds; + struct access_desc bts_buffer_base; + struct access_desc bts_index; + struct access_desc bts_absolute_maximum; + struct access_desc bts_interrupt_threshold; + /* the BTS configuration */ + unsigned char sizeof_bts; + struct access_desc from_ip; + struct access_desc to_ip; + /* BTS variants used to store additional information like + timestamps */ + struct access_desc info_type; + struct access_desc info_data; + unsigned long debugctl_mask; +}; + +/* + * The global configuration used by the below accessor functions + */ +static struct ds_configuration ds_cfg; + +/* + * Accessor functions for some DS and BTS fields using the above + * global ptrace_bts_cfg. + */ +static inline unsigned long get_bts_buffer_base(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); +} +static inline void set_bts_buffer_base(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; +} +static inline unsigned long get_bts_index(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_index.offset); +} +static inline void set_bts_index(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; +} +static inline unsigned long get_bts_absolute_maximum(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); +} +static inline void set_bts_absolute_maximum(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; +} +static inline unsigned long get_bts_interrupt_threshold(char *base) +{ + return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); +} +static inline void set_bts_interrupt_threshold(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; +} +static inline unsigned long get_from_ip(char *base) +{ + return *(unsigned long *)(base + ds_cfg.from_ip.offset); +} +static inline void set_from_ip(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; +} +static inline unsigned long get_to_ip(char *base) +{ + return *(unsigned long *)(base + ds_cfg.to_ip.offset); +} +static inline void set_to_ip(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; +} +static inline unsigned char get_info_type(char *base) +{ + return *(unsigned char *)(base + ds_cfg.info_type.offset); +} +static inline void set_info_type(char *base, unsigned char value) +{ + (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; +} +static inline unsigned long get_info_data(char *base) +{ + return *(unsigned long *)(base + ds_cfg.info_data.offset); +} +static inline void set_info_data(char *base, unsigned long value) +{ + (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; +} + + +int ds_allocate(void **dsp, size_t bts_size_in_bytes) +{ + size_t bts_size_in_records; + unsigned long bts; + void *ds; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (bts_size_in_bytes < 0) + return -EINVAL; + + bts_size_in_records = + bts_size_in_bytes / ds_cfg.sizeof_bts; + bts_size_in_bytes = + bts_size_in_records * ds_cfg.sizeof_bts; + + if (bts_size_in_bytes <= 0) + return -EINVAL; + + bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); + + if (!bts) + return -ENOMEM; + + ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + + if (!ds) { + kfree((void *)bts); + return -ENOMEM; + } + + set_bts_buffer_base(ds, bts); + set_bts_index(ds, bts); + set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); + set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); + + *dsp = ds; + return 0; +} + +int ds_free(void **dsp) +{ + if (*dsp) + kfree((void *)get_bts_buffer_base(*dsp)); + kfree(*dsp); + *dsp = 0; + + return 0; +} + +int ds_get_bts_size(void *ds) +{ + int size_in_bytes; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (!ds) + return 0; + + size_in_bytes = + get_bts_absolute_maximum(ds) - + get_bts_buffer_base(ds); + return size_in_bytes; +} + +int ds_get_bts_end(void *ds) +{ + int size_in_bytes = ds_get_bts_size(ds); + + if (size_in_bytes <= 0) + return size_in_bytes; + + return size_in_bytes / ds_cfg.sizeof_bts; +} + +int ds_get_bts_index(void *ds) +{ + int index_offset_in_bytes; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + index_offset_in_bytes = + get_bts_index(ds) - + get_bts_buffer_base(ds); + + return index_offset_in_bytes / ds_cfg.sizeof_bts; +} + +int ds_set_overflow(void *ds, int method) +{ + switch (method) { + case DS_O_SIGNAL: + return -EOPNOTSUPP; + case DS_O_WRAP: + return 0; + default: + return -EINVAL; + } +} + +int ds_get_overflow(void *ds) +{ + return DS_O_WRAP; +} + +int ds_clear(void *ds) +{ + int bts_size = ds_get_bts_size(ds); + unsigned long bts_base; + + if (bts_size <= 0) + return bts_size; + + bts_base = get_bts_buffer_base(ds); + memset((void *)bts_base, 0, bts_size); + + set_bts_index(ds, bts_base); + return 0; +} + +int ds_read_bts(void *ds, int index, struct bts_struct *out) +{ + void *bts; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (index < 0) + return -EINVAL; + + if (index >= ds_get_bts_size(ds)) + return -EINVAL; + + bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); + + memset(out, 0, sizeof(*out)); + if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { + out->qualifier = get_info_type(bts); + out->variant.jiffies = get_info_data(bts); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = get_from_ip(bts); + out->variant.lbr.to_ip = get_to_ip(bts); + } + + return sizeof(*out);; +} + +int ds_write_bts(void *ds, const struct bts_struct *in) +{ + unsigned long bts; + + if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) + return -EOPNOTSUPP; + + if (ds_get_bts_size(ds) <= 0) + return -ENXIO; + + bts = get_bts_index(ds); + + memset((void *)bts, 0, ds_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: + break; + + case BTS_BRANCH: + set_from_ip((void *)bts, in->variant.lbr.from_ip); + set_to_ip((void *)bts, in->variant.lbr.to_ip); + break; + + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); + set_info_type((void *)bts, in->qualifier); + set_info_data((void *)bts, in->variant.jiffies); + break; + + default: + return -EINVAL; + } + + bts = bts + ds_cfg.sizeof_bts; + if (bts >= get_bts_absolute_maximum(ds)) + bts = get_bts_buffer_base(ds); + set_bts_index(ds, bts); + + return ds_cfg.sizeof_bts; +} + +unsigned long ds_debugctl_mask(void) +{ + return ds_cfg.debugctl_mask; +} + +#ifdef __i386__ +static const struct ds_configuration ds_cfg_netburst = { + .sizeof_ds = 9 * 4, + .bts_buffer_base = { 0, 4 }, + .bts_index = { 4, 4 }, + .bts_absolute_maximum = { 8, 4 }, + .bts_interrupt_threshold = { 12, 4 }, + .sizeof_bts = 3 * 4, + .from_ip = { 0, 4 }, + .to_ip = { 4, 4 }, + .info_type = { 4, 1 }, + .info_data = { 8, 4 }, + .debugctl_mask = (1<<2)|(1<<3) +}; + +static const struct ds_configuration ds_cfg_pentium_m = { + .sizeof_ds = 9 * 4, + .bts_buffer_base = { 0, 4 }, + .bts_index = { 4, 4 }, + .bts_absolute_maximum = { 8, 4 }, + .bts_interrupt_threshold = { 12, 4 }, + .sizeof_bts = 3 * 4, + .from_ip = { 0, 4 }, + .to_ip = { 4, 4 }, + .info_type = { 4, 1 }, + .info_data = { 8, 4 }, + .debugctl_mask = (1<<6)|(1<<7) +}; +#endif /* _i386_ */ + +static const struct ds_configuration ds_cfg_core2 = { + .sizeof_ds = 9 * 8, + .bts_buffer_base = { 0, 8 }, + .bts_index = { 8, 8 }, + .bts_absolute_maximum = { 16, 8 }, + .bts_interrupt_threshold = { 24, 8 }, + .sizeof_bts = 3 * 8, + .from_ip = { 0, 8 }, + .to_ip = { 8, 8 }, + .info_type = { 8, 1 }, + .info_data = { 16, 8 }, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void +ds_configure(const struct ds_configuration *cfg) +{ + ds_cfg = *cfg; +} + +void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { +#ifdef __i386__ + case 0xD: + case 0xE: /* Pentium M */ + ds_configure(&ds_cfg_pentium_m); + break; +#endif /* _i386_ */ + case 0xF: /* Core2 */ + ds_configure(&ds_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { +#ifdef __i386__ + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + ds_configure(&ds_cfg_netburst); + break; +#endif /* _i386_ */ + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c index 18f500d185a..4e16ef4a265 100644 --- a/arch/x86/kernel/e820_32.c +++ b/arch/x86/kernel/e820_32.c @@ -7,7 +7,6 @@ #include <linux/kexec.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/efi.h> #include <linux/pfn.h> #include <linux/uaccess.h> #include <linux/suspend.h> @@ -17,11 +16,6 @@ #include <asm/e820.h> #include <asm/setup.h> -#ifdef CONFIG_EFI -int efi_enabled = 0; -EXPORT_SYMBOL(efi_enabled); -#endif - struct e820map e820; struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -37,26 +31,6 @@ unsigned long pci_mem_start = 0x10000000; EXPORT_SYMBOL(pci_mem_start); #endif extern int user_defined_memmap; -struct resource data_resource = { - .name = "Kernel data", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource code_resource = { - .name = "Kernel code", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -struct resource bss_resource = { - .name = "Kernel bss", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; static struct resource system_rom_resource = { .name = "System ROM", @@ -111,60 +85,6 @@ static struct resource video_rom_resource = { .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM }; -static struct resource video_ram_resource = { - .name = "Video RAM area", - .start = 0xa0000, - .end = 0xbffff, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM -}; - -static struct resource standard_io_resources[] = { { - .name = "dma1", - .start = 0x0000, - .end = 0x001f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic1", - .start = 0x0020, - .end = 0x0021, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer0", - .start = 0x0040, - .end = 0x0043, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "timer1", - .start = 0x0050, - .end = 0x0053, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "keyboard", - .start = 0x0060, - .end = 0x006f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma page reg", - .start = 0x0080, - .end = 0x008f, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "pic2", - .start = 0x00a0, - .end = 0x00a1, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "dma2", - .start = 0x00c0, - .end = 0x00df, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -}, { - .name = "fpu", - .start = 0x00f0, - .end = 0x00ff, - .flags = IORESOURCE_BUSY | IORESOURCE_IO -} }; - #define ROMSIGNATURE 0xaa55 static int __init romsignature(const unsigned char *rom) @@ -260,10 +180,9 @@ static void __init probe_roms(void) * Request address space for all standard RAM and ROM resources * and also for regions reported as reserved by the e820. */ -static void __init -legacy_init_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) +void __init init_iomem_resources(struct resource *code_resource, + struct resource *data_resource, + struct resource *bss_resource) { int i; @@ -305,35 +224,6 @@ legacy_init_iomem_resources(struct resource *code_resource, } } -/* - * Request address space for all standard resources - * - * This is called just before pcibios_init(), which is also a - * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). - */ -static int __init request_standard_resources(void) -{ - int i; - - printk("Setting up standard PCI resources\n"); - if (efi_enabled) - efi_initialize_iomem_resources(&code_resource, - &data_resource, &bss_resource); - else - legacy_init_iomem_resources(&code_resource, - &data_resource, &bss_resource); - - /* EFI systems may still have VGA */ - request_resource(&iomem_resource, &video_ram_resource); - - /* request I/O space for devices used on all i[345]86 PCs */ - for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) - request_resource(&ioport_resource, &standard_io_resources[i]); - return 0; -} - -subsys_initcall(request_standard_resources); - #if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) /** * e820_mark_nosave_regions - Find the ranges of physical addresses that do not @@ -370,19 +260,17 @@ void __init add_memory_region(unsigned long long start, { int x; - if (!efi_enabled) { - x = e820.nr_map; - - if (x == E820MAX) { - printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); - return; - } + x = e820.nr_map; - e820.map[x].addr = start; - e820.map[x].size = size; - e820.map[x].type = type; - e820.nr_map++; + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; } /* add_memory_region */ /* @@ -598,29 +486,6 @@ int __init copy_e820_map(struct e820entry * biosmap, int nr_map) } /* - * Callback for efi_memory_walk. - */ -static int __init -efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) -{ - unsigned long *max_pfn = arg, pfn; - - if (start < end) { - pfn = PFN_UP(end -1); - if (pfn > *max_pfn) - *max_pfn = pfn; - } - return 0; -} - -static int __init -efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) -{ - memory_present(0, PFN_UP(start), PFN_DOWN(end)); - return 0; -} - -/* * Find the highest page frame number we have available */ void __init find_max_pfn(void) @@ -628,11 +493,6 @@ void __init find_max_pfn(void) int i; max_pfn = 0; - if (efi_enabled) { - efi_memmap_walk(efi_find_max_pfn, &max_pfn); - efi_memmap_walk(efi_memory_present_wrapper, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long start, end; @@ -650,34 +510,12 @@ void __init find_max_pfn(void) } /* - * Free all available memory for boot time allocation. Used - * as a callback function by efi_memory_walk() - */ - -static int __init -free_available_memory(unsigned long start, unsigned long end, void *arg) -{ - /* check max_low_pfn */ - if (start >= (max_low_pfn << PAGE_SHIFT)) - return 0; - if (end >= (max_low_pfn << PAGE_SHIFT)) - end = max_low_pfn << PAGE_SHIFT; - if (start < end) - free_bootmem(start, end - start); - - return 0; -} -/* * Register fully available low RAM pages with the bootmem allocator. */ void __init register_bootmem_low_pages(unsigned long max_low_pfn) { int i; - if (efi_enabled) { - efi_memmap_walk(free_available_memory, NULL); - return; - } for (i = 0; i < e820.nr_map; i++) { unsigned long curr_pfn, last_pfn, size; /* @@ -785,56 +623,12 @@ void __init print_memory_map(char *who) } } -static __init __always_inline void efi_limit_regions(unsigned long long size) -{ - unsigned long long current_addr = 0; - efi_memory_desc_t *md, *next_md; - void *p, *p1; - int i, j; - - j = 0; - p1 = memmap.map; - for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - next_md = p1; - current_addr = md->phys_addr + - PFN_PHYS(md->num_pages); - if (is_available_memory(md)) { - if (md->phys_addr >= size) continue; - memcpy(next_md, md, memmap.desc_size); - if (current_addr >= size) { - next_md->num_pages -= - PFN_UP(current_addr-size); - } - p1 += memmap.desc_size; - next_md = p1; - j++; - } else if ((md->attribute & EFI_MEMORY_RUNTIME) == - EFI_MEMORY_RUNTIME) { - /* In order to make runtime services - * available we have to include runtime - * memory regions in memory map */ - memcpy(next_md, md, memmap.desc_size); - p1 += memmap.desc_size; - next_md = p1; - j++; - } - } - memmap.nr_map = j; - memmap.map_end = memmap.map + - (memmap.nr_map * memmap.desc_size); -} - void __init limit_regions(unsigned long long size) { unsigned long long current_addr; int i; print_memory_map("limit_regions start"); - if (efi_enabled) { - efi_limit_regions(size); - return; - } for (i = 0; i < e820.nr_map; i++) { current_addr = e820.map[i].addr + e820.map[i].size; if (current_addr < size) @@ -955,3 +749,14 @@ static int __init parse_memmap(char *arg) return 0; } early_param("memmap", parse_memmap); +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + print_memory_map("modified"); +} diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index 04698e0b056..c617174e896 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -1,4 +1,4 @@ -/* +/* * Handle the memory map. * The functions here do the job until bootmem takes over. * @@ -26,80 +26,87 @@ #include <asm/proto.h> #include <asm/setup.h> #include <asm/sections.h> +#include <asm/kdebug.h> struct e820map e820; -/* +/* * PFN of last memory page. */ -unsigned long end_pfn; -EXPORT_SYMBOL(end_pfn); +unsigned long end_pfn; -/* +/* * end_pfn only includes RAM, while end_pfn_map includes all e820 entries. * The direct mapping extends to end_pfn_map, so that we can directly access * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long end_pfn_map; + */ +unsigned long end_pfn_map; -/* +/* * Last pfn which the user wants to use. */ static unsigned long __initdata end_user_pfn = MAXMEM>>PAGE_SHIFT; -extern struct resource code_resource, data_resource, bss_resource; - -/* Check for some hardcoded bad areas that early boot is not allowed to touch */ -static inline int bad_addr(unsigned long *addrp, unsigned long size) -{ - unsigned long addr = *addrp, last = addr + size; - - /* various gunk below that needed for SMP startup */ - if (addr < 0x8000) { - *addrp = PAGE_ALIGN(0x8000); - return 1; - } - - /* direct mapping tables of the kernel */ - if (last >= table_start<<PAGE_SHIFT && addr < table_end<<PAGE_SHIFT) { - *addrp = PAGE_ALIGN(table_end << PAGE_SHIFT); - return 1; - } - - /* initrd */ -#ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image+ramdisk_size; - - if (last >= ramdisk_image && addr < ramdisk_end) { - *addrp = PAGE_ALIGN(ramdisk_end); - return 1; - } - } +/* + * Early reserved memory areas. + */ +#define MAX_EARLY_RES 20 + +struct early_res { + unsigned long start, end; +}; +static struct early_res early_res[MAX_EARLY_RES] __initdata = { + { 0, PAGE_SIZE }, /* BIOS data page */ +#ifdef CONFIG_SMP + { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE }, #endif - /* kernel code */ - if (last >= __pa_symbol(&_text) && addr < __pa_symbol(&_end)) { - *addrp = PAGE_ALIGN(__pa_symbol(&_end)); - return 1; + {} +}; + +void __init reserve_early(unsigned long start, unsigned long end) +{ + int i; + struct early_res *r; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + panic("Overlapping early reservations %lx-%lx to %lx-%lx\n", + start, end, r->start, r->end); } + if (i >= MAX_EARLY_RES) + panic("Too many early reservations"); + r = &early_res[i]; + r->start = start; + r->end = end; +} - if (last >= ebda_addr && addr < ebda_addr + ebda_size) { - *addrp = PAGE_ALIGN(ebda_addr + ebda_size); - return 1; +void __init early_res_to_bootmem(void) +{ + int i; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + reserve_bootmem_generic(r->start, r->end - r->start); } +} -#ifdef CONFIG_NUMA - /* NUMA memory to node map */ - if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) { - *addrp = nodemap_addr + nodemap_size; - return 1; +/* Check for already reserved areas */ +static inline int bad_addr(unsigned long *addrp, unsigned long size) +{ + int i; + unsigned long addr = *addrp, last; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last >= r->start && addr < r->end) { + *addrp = addr = r->end; + changed = 1; + goto again; + } } -#endif - /* XXX ramdisk image here? */ - return 0; -} + return changed; +} /* * This function checks if any part of the range <start,end> is mapped @@ -107,16 +114,18 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) */ int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) -{ +{ int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - if (type && ei->type != type) + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (type && ei->type != type) continue; if (ei->addr >= end || ei->addr + ei->size <= start) - continue; - return 1; - } + continue; + return 1; + } return 0; } EXPORT_SYMBOL_GPL(e820_any_mapped); @@ -127,11 +136,14 @@ EXPORT_SYMBOL_GPL(e820_any_mapped); * Note: this function only works correct if the e820 table is sorted and * not-overlapping, which is the case */ -int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type) +int __init e820_all_mapped(unsigned long start, unsigned long end, + unsigned type) { int i; + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) continue; /* is the region (part) in overlap with the current region ?*/ @@ -143,65 +155,73 @@ int __init e820_all_mapped(unsigned long start, unsigned long end, unsigned type */ if (ei->addr <= start) start = ei->addr + ei->size; - /* if start is now at or beyond end, we're done, full coverage */ + /* + * if start is now at or beyond end, we're done, full + * coverage + */ if (start >= end) - return 1; /* we're done */ + return 1; } return 0; } -/* - * Find a free area in a specific range. - */ -unsigned long __init find_e820_area(unsigned long start, unsigned long end, unsigned size) -{ - int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long addr = ei->addr, last; - if (ei->type != E820_RAM) - continue; - if (addr < start) +/* + * Find a free area in a specific range. + */ +unsigned long __init find_e820_area(unsigned long start, unsigned long end, + unsigned size) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + unsigned long addr = ei->addr, last; + + if (ei->type != E820_RAM) + continue; + if (addr < start) addr = start; - if (addr > ei->addr + ei->size) - continue; + if (addr > ei->addr + ei->size) + continue; while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size) ; last = PAGE_ALIGN(addr) + size; if (last > ei->addr + ei->size) continue; - if (last > end) + if (last > end) continue; - return addr; - } - return -1UL; -} + return addr; + } + return -1UL; +} /* * Find the highest page frame number we have available */ unsigned long __init e820_end_of_ram(void) { - unsigned long end_pfn = 0; + unsigned long end_pfn; + end_pfn = find_max_pfn_with_active_regions(); - - if (end_pfn > end_pfn_map) + + if (end_pfn > end_pfn_map) end_pfn_map = end_pfn; if (end_pfn_map > MAXMEM>>PAGE_SHIFT) end_pfn_map = MAXMEM>>PAGE_SHIFT; if (end_pfn > end_user_pfn) end_pfn = end_user_pfn; - if (end_pfn > end_pfn_map) - end_pfn = end_pfn_map; + if (end_pfn > end_pfn_map) + end_pfn = end_pfn_map; - printk("end_pfn_map = %lu\n", end_pfn_map); - return end_pfn; + printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map); + return end_pfn; } /* * Mark e820 reserved areas as busy for the resource manager. */ -void __init e820_reserve_resources(void) +void __init e820_reserve_resources(struct resource *code_resource, + struct resource *data_resource, struct resource *bss_resource) { int i; for (i = 0; i < e820.nr_map; i++) { @@ -219,13 +239,13 @@ void __init e820_reserve_resources(void) request_resource(&iomem_resource, res); if (e820.map[i].type == E820_RAM) { /* - * We don't know which RAM region contains kernel data, - * so we try it repeatedly and let the resource manager - * test it. + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. */ - request_resource(res, &code_resource); - request_resource(res, &data_resource); - request_resource(res, &bss_resource); + request_resource(res, code_resource); + request_resource(res, data_resource); + request_resource(res, bss_resource); #ifdef CONFIG_KEXEC if (crashk_res.start != crashk_res.end) request_resource(res, &crashk_res); @@ -322,9 +342,9 @@ e820_register_active_regions(int nid, unsigned long start_pfn, add_active_range(nid, ei_startpfn, ei_endpfn); } -/* +/* * Add a memory region to the kernel e820 map. - */ + */ void __init add_memory_region(unsigned long start, unsigned long size, int type) { int x = e820.nr_map; @@ -349,9 +369,7 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn; - unsigned long ei_endpfn; - unsigned long ram = 0; + unsigned long ei_startpfn, ei_endpfn, ram = 0; int i; for (i = 0; i < e820.nr_map; i++) { @@ -363,28 +381,31 @@ unsigned long __init e820_hole_size(unsigned long start, unsigned long end) return end - start - (ram << PAGE_SHIFT); } -void __init e820_print_map(char *who) +static void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { printk(KERN_INFO " %s: %016Lx - %016Lx ", who, - (unsigned long long) e820.map[i].addr, - (unsigned long long) (e820.map[i].addr + e820.map[i].size)); + (unsigned long long) e820.map[i].addr, + (unsigned long long) + (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { - case E820_RAM: printk("(usable)\n"); - break; + case E820_RAM: + printk(KERN_CONT "(usable)\n"); + break; case E820_RESERVED: - printk("(reserved)\n"); - break; + printk(KERN_CONT "(reserved)\n"); + break; case E820_ACPI: - printk("(ACPI data)\n"); - break; + printk(KERN_CONT "(ACPI data)\n"); + break; case E820_NVS: - printk("(ACPI NVS)\n"); - break; - default: printk("type %u\n", e820.map[i].type); - break; + printk(KERN_CONT "(ACPI NVS)\n"); + break; + default: + printk(KERN_CONT "type %u\n", e820.map[i].type); + break; } } } @@ -392,11 +413,11 @@ void __init e820_print_map(char *who) /* * Sanitize the BIOS e820 map. * - * Some e820 responses include overlapping entries. The following + * Some e820 responses include overlapping entries. The following * replaces the original e820 map with a new one, removing overlaps. * */ -static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +static int __init sanitize_e820_map(struct e820entry *biosmap, char *pnr_map) { struct change_member { struct e820entry *pbios; /* pointer to original bios entry */ @@ -416,7 +437,8 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) int i; /* - Visually we're performing the following (1,2,3,4 = memory types)... + Visually we're performing the following + (1,2,3,4 = memory types)... Sample memory map (w/overlaps): ____22__________________ @@ -458,22 +480,23 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) old_nr = *pnr_map; /* bail out if we find any unreasonable addresses in bios map */ - for (i=0; i<old_nr; i++) + for (i = 0; i < old_nr; i++) if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) return -1; /* create pointers for initial change-point information (for sorting) */ - for (i=0; i < 2*old_nr; i++) + for (i = 0; i < 2 * old_nr; i++) change_point[i] = &change_point_list[i]; /* record all known change-points (starting and ending addresses), omitting those that are for empty memory regions */ chgidx = 0; - for (i=0; i < old_nr; i++) { + for (i = 0; i < old_nr; i++) { if (biosmap[i].size != 0) { change_point[chgidx]->addr = biosmap[i].addr; change_point[chgidx++]->pbios = &biosmap[i]; - change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx]->addr = biosmap[i].addr + + biosmap[i].size; change_point[chgidx++]->pbios = &biosmap[i]; } } @@ -483,75 +506,106 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) still_changing = 1; while (still_changing) { still_changing = 0; - for (i=1; i < chg_nr; i++) { - /* if <current_addr> > <last_addr>, swap */ - /* or, if current=<start_addr> & last=<end_addr>, swap */ - if ((change_point[i]->addr < change_point[i-1]->addr) || - ((change_point[i]->addr == change_point[i-1]->addr) && - (change_point[i]->addr == change_point[i]->pbios->addr) && - (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) - ) - { + for (i = 1; i < chg_nr; i++) { + unsigned long long curaddr, lastaddr; + unsigned long long curpbaddr, lastpbaddr; + + curaddr = change_point[i]->addr; + lastaddr = change_point[i - 1]->addr; + curpbaddr = change_point[i]->pbios->addr; + lastpbaddr = change_point[i - 1]->pbios->addr; + + /* + * swap entries, when: + * + * curaddr > lastaddr or + * curaddr == lastaddr and curaddr == curpbaddr and + * lastaddr != lastpbaddr + */ + if (curaddr < lastaddr || + (curaddr == lastaddr && curaddr == curpbaddr && + lastaddr != lastpbaddr)) { change_tmp = change_point[i]; change_point[i] = change_point[i-1]; change_point[i-1] = change_tmp; - still_changing=1; + still_changing = 1; } } } /* create a new bios memory map, removing overlaps */ - overlap_entries=0; /* number of entries in the overlap table */ - new_bios_entry=0; /* index for creating new bios map entries */ + overlap_entries = 0; /* number of entries in the overlap table */ + new_bios_entry = 0; /* index for creating new bios map entries */ last_type = 0; /* start with undefined memory type */ last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ - for (chgidx=0; chgidx < chg_nr; chgidx++) - { + for (chgidx = 0; chgidx < chg_nr; chgidx++) { /* keep track of all overlapping bios entries */ - if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) - { - /* add map entry to overlap list (> 1 entry implies an overlap) */ - overlap_list[overlap_entries++]=change_point[chgidx]->pbios; - } - else - { - /* remove entry from list (order independent, so swap with last) */ - for (i=0; i<overlap_entries; i++) - { - if (overlap_list[i] == change_point[chgidx]->pbios) - overlap_list[i] = overlap_list[overlap_entries-1]; + if (change_point[chgidx]->addr == + change_point[chgidx]->pbios->addr) { + /* + * add map entry to overlap list (> 1 entry + * implies an overlap) + */ + overlap_list[overlap_entries++] = + change_point[chgidx]->pbios; + } else { + /* + * remove entry from list (order independent, + * so swap with last) + */ + for (i = 0; i < overlap_entries; i++) { + if (overlap_list[i] == + change_point[chgidx]->pbios) + overlap_list[i] = + overlap_list[overlap_entries-1]; } overlap_entries--; } - /* if there are overlapping entries, decide which "type" to use */ - /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + /* + * if there are overlapping entries, decide which + * "type" to use (larger value takes precedence -- + * 1=usable, 2,3,4,4+=unusable) + */ current_type = 0; - for (i=0; i<overlap_entries; i++) + for (i = 0; i < overlap_entries; i++) if (overlap_list[i]->type > current_type) current_type = overlap_list[i]->type; - /* continue building up new bios map based on this information */ + /* + * continue building up new bios map based on this + * information + */ if (current_type != last_type) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; - /* move forward only if the new size was non-zero */ + /* + * move forward only if the new size + * was non-zero + */ if (new_bios[new_bios_entry].size != 0) + /* + * no more space left for new + * bios entries ? + */ if (++new_bios_entry >= E820MAX) - break; /* no more space left for new bios entries */ + break; } if (current_type != 0) { - new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].addr = + change_point[chgidx]->addr; new_bios[new_bios_entry].type = current_type; - last_addr=change_point[chgidx]->addr; + last_addr = change_point[chgidx]->addr; } last_type = current_type; } } - new_nr = new_bios_entry; /* retain count for new bios entries */ + /* retain count for new bios entries */ + new_nr = new_bios_entry; /* copy new bios mapping into original location */ - memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + memcpy(biosmap, new_bios, new_nr * sizeof(struct e820entry)); *pnr_map = new_nr; return 0; @@ -566,7 +620,7 @@ static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) * will have given us a memory map that we can use to properly * set up memory. If we aren't, we'll fake a memory map. */ -static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +static int __init copy_e820_map(struct e820entry *biosmap, int nr_map) { /* Only one memory region (or negative)? Ignore it */ if (nr_map < 2) @@ -583,18 +637,20 @@ static int __init copy_e820_map(struct e820entry * biosmap, int nr_map) return -1; add_memory_region(start, size, type); - } while (biosmap++,--nr_map); + } while (biosmap++, --nr_map); return 0; } -void early_panic(char *msg) +static void early_panic(char *msg) { early_printk(msg); panic(msg); } -void __init setup_memory_region(void) +/* We're not void only for x86 32-bit compat */ +char * __init machine_specific_memory_setup(void) { + char *who = "BIOS-e820"; /* * Try to copy the BIOS-supplied E820-map. * @@ -605,7 +661,10 @@ void __init setup_memory_region(void) if (copy_e820_map(boot_params.e820_map, boot_params.e820_entries) < 0) early_panic("Cannot find a valid memory map"); printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - e820_print_map("BIOS-e820"); + e820_print_map(who); + + /* In case someone cares... */ + return who; } static int __init parse_memopt(char *p) @@ -613,9 +672,9 @@ static int __init parse_memopt(char *p) if (!p) return -EINVAL; end_user_pfn = memparse(p, &p); - end_user_pfn >>= PAGE_SHIFT; + end_user_pfn >>= PAGE_SHIFT; return 0; -} +} early_param("mem", parse_memopt); static int userdef __initdata; @@ -627,9 +686,9 @@ static int __init parse_memmap_opt(char *p) if (!strcmp(p, "exactmap")) { #ifdef CONFIG_CRASH_DUMP - /* If we are doing a crash dump, we - * still need to know the real mem - * size before original memory map is + /* + * If we are doing a crash dump, we still need to know + * the real mem size before original memory map is * reset. */ e820_register_active_regions(0, 0, -1UL); @@ -646,6 +705,8 @@ static int __init parse_memmap_opt(char *p) mem_size = memparse(p, &p); if (p == oldp) return -EINVAL; + + userdef = 1; if (*p == '@') { start_at = memparse(p+1, &p); add_memory_region(start_at, mem_size, E820_RAM); @@ -665,11 +726,29 @@ early_param("memmap", parse_memmap_opt); void __init finish_e820_parsing(void) { if (userdef) { + char nr = e820.nr_map; + + if (sanitize_e820_map(e820.map, &nr) < 0) + early_panic("Invalid user supplied memory map"); + e820.nr_map = nr; + printk(KERN_INFO "user-defined physical RAM map:\n"); e820_print_map("user"); } } +void __init update_e820(void) +{ + u8 nr_map; + + nr_map = e820.nr_map; + if (sanitize_e820_map(e820.map, &nr_map)) + return; + e820.nr_map = nr_map; + printk(KERN_INFO "modified physical RAM map:\n"); + e820_print_map("modified"); +} + unsigned long pci_mem_start = 0xaeedbabe; EXPORT_SYMBOL(pci_mem_start); @@ -713,8 +792,10 @@ __init void e820_setup_gap(void) if (!found) { gapstart = (end_pfn << PAGE_SHIFT) + 1024*1024; - printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit address range\n" - KERN_ERR "PCI: Unassigned devices with 32bit resource registers may break!\n"); + printk(KERN_ERR "PCI: Warning: Cannot find a gap in the 32bit " + "address range\n" + KERN_ERR "PCI: Unassigned devices with 32bit resource " + "registers may break!\n"); } /* @@ -727,8 +808,9 @@ __init void e820_setup_gap(void) /* Fun with two's complement */ pci_mem_start = (gapstart + round) & -round; - printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", - pci_mem_start, gapstart, gapsize); + printk(KERN_INFO + "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", + pci_mem_start, gapstart, gapsize); } int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 88bb83ec895..9f51e1ea9e8 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -21,7 +21,33 @@ #include <asm/gart.h> #endif -static void __init via_bugs(void) +static void __init fix_hypertransport_config(int num, int slot, int func) +{ + u32 htcfg; + /* + * we found a hypertransport bus + * make sure that we are broadcasting + * interrupts to all cpus on the ht bus + * if we're using extended apic ids + */ + htcfg = read_pci_config(num, slot, func, 0x68); + if (htcfg & (1 << 18)) { + printk(KERN_INFO "Detected use of extended apic ids " + "on hypertransport bus\n"); + if ((htcfg & (1 << 17)) == 0) { + printk(KERN_INFO "Enabling hypertransport extended " + "apic interrupt broadcast\n"); + printk(KERN_INFO "Note this is a bios bug, " + "please contact your hw vendor\n"); + htcfg |= (1 << 17); + write_pci_config(num, slot, func, 0x68, htcfg); + } + } + + +} + +static void __init via_bugs(int num, int slot, int func) { #ifdef CONFIG_GART_IOMMU if ((end_pfn > MAX_DMA32_PFN || force_iommu) && @@ -44,7 +70,7 @@ static int __init nvidia_hpet_check(struct acpi_table_header *header) #endif /* CONFIG_X86_IO_APIC */ #endif /* CONFIG_ACPI */ -static void __init nvidia_bugs(void) +static void __init nvidia_bugs(int num, int slot, int func) { #ifdef CONFIG_ACPI #ifdef CONFIG_X86_IO_APIC @@ -72,7 +98,7 @@ static void __init nvidia_bugs(void) } -static void __init ati_bugs(void) +static void __init ati_bugs(int num, int slot, int func) { #ifdef CONFIG_X86_IO_APIC if (timer_over_8254 == 1) { @@ -83,18 +109,67 @@ static void __init ati_bugs(void) #endif } +#define QFLAG_APPLY_ONCE 0x1 +#define QFLAG_APPLIED 0x2 +#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) struct chipset { - u16 vendor; - void (*f)(void); + u32 vendor; + u32 device; + u32 class; + u32 class_mask; + u32 flags; + void (*f)(int num, int slot, int func); }; static struct chipset early_qrk[] __initdata = { - { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, - { PCI_VENDOR_ID_VIA, via_bugs }, - { PCI_VENDOR_ID_ATI, ati_bugs }, + { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, + { PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, + { PCI_VENDOR_ID_ATI, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, ati_bugs }, + { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, {} }; +static void __init check_dev_quirk(int num, int slot, int func) +{ + u16 class; + u16 vendor; + u16 device; + u8 type; + int i; + + class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); + + if (class == 0xffff) + return; + + vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); + + device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); + + for (i = 0; early_qrk[i].f != NULL; i++) { + if (((early_qrk[i].vendor == PCI_ANY_ID) || + (early_qrk[i].vendor == vendor)) && + ((early_qrk[i].device == PCI_ANY_ID) || + (early_qrk[i].device == device)) && + (!((early_qrk[i].class ^ class) & + early_qrk[i].class_mask))) { + if ((early_qrk[i].flags & + QFLAG_DONE) != QFLAG_DONE) + early_qrk[i].f(num, slot, func); + early_qrk[i].flags |= QFLAG_APPLIED; + } + } + + type = read_pci_config_byte(num, slot, func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + return; +} + void __init early_quirks(void) { int num, slot, func; @@ -103,36 +178,8 @@ void __init early_quirks(void) return; /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - u32 class; - u32 vendor; - u8 type; - int i; - class = read_pci_config(num,slot,func, - PCI_CLASS_REVISION); - if (class == 0xffffffff) - break; - - if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) - continue; - - vendor = read_pci_config(num, slot, func, - PCI_VENDOR_ID); - vendor &= 0xffff; - - for (i = 0; early_qrk[i].f; i++) - if (early_qrk[i].vendor == vendor) { - early_qrk[i].f(); - return; - } - - type = read_pci_config_byte(num, slot, func, - PCI_HEADER_TYPE); - if (!(type & 0x80)) - break; - } - } - } + for (num = 0; num < 32; num++) + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) + check_dev_quirk(num, slot, func); } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c new file mode 100644 index 00000000000..1411324a625 --- /dev/null +++ b/arch/x86/kernel/efi.c @@ -0,0 +1,512 @@ +/* + * Common EFI (Extensible Firmware Interface) support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * + * Copied from efi_32.c to eliminate the duplicated code between EFI + * 32/64 support code. --ying 2007-10-26 + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: <goutham.rao@intel.com> + * Skip non-WB memory and ignore empty memory ranges. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/efi.h> +#include <linux/bootmem.h> +#include <linux/spinlock.h> +#include <linux/uaccess.h> +#include <linux/time.h> +#include <linux/io.h> +#include <linux/reboot.h> +#include <linux/bcd.h> + +#include <asm/setup.h> +#include <asm/efi.h> +#include <asm/time.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#define EFI_DEBUG 1 +#define PFX "EFI: " + +int efi_enabled; +EXPORT_SYMBOL(efi_enabled); + +struct efi efi; +EXPORT_SYMBOL(efi); + +struct efi_memory_map memmap; + +struct efi efi_phys __initdata; +static efi_system_table_t efi_systab __initdata; + +static int __init setup_noefi(char *arg) +{ + efi_enabled = 0; + return 0; +} +early_param("noefi", setup_noefi); + +static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + return efi_call_virt2(get_time, tm, tc); +} + +static efi_status_t virt_efi_set_time(efi_time_t *tm) +{ + return efi_call_virt1(set_time, tm); +} + +static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + return efi_call_virt3(get_wakeup_time, + enabled, pending, tm); +} + +static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + return efi_call_virt2(set_wakeup_time, + enabled, tm); +} + +static efi_status_t virt_efi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 *attr, + unsigned long *data_size, + void *data) +{ + return efi_call_virt5(get_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + return efi_call_virt3(get_next_variable, + name_size, name, vendor); +} + +static efi_status_t virt_efi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + unsigned long attr, + unsigned long data_size, + void *data) +{ + return efi_call_virt5(set_variable, + name, vendor, attr, + data_size, data); +} + +static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) +{ + return efi_call_virt1(get_next_high_mono_count, count); +} + +static void virt_efi_reset_system(int reset_type, + efi_status_t status, + unsigned long data_size, + efi_char16_t *data) +{ + efi_call_virt4(reset_system, reset_type, status, + data_size, data); +} + +static efi_status_t virt_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + return efi_call_virt4(set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); +} + +static efi_status_t __init phys_efi_set_virtual_address_map( + unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys4(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); + efi_call_phys_epilog(); + return status; +} + +static efi_status_t __init phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys2(efi_phys.get_time, tm, tc); + efi_call_phys_epilog(); + return status; +} + +int efi_set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes; + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + return -1; + } + + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + if (((abs(real_minutes - eft.minute) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + eft.minute = real_minutes; + eft.second = real_seconds; + + status = efi.set_time(&eft); + if (status != EFI_SUCCESS) { + printk(KERN_ERR "Oops: efitime: can't write time!\n"); + return -1; + } + return 0; +} + +unsigned long efi_get_time(void) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + status = efi.get_time(&eft, &cap); + if (status != EFI_SUCCESS) + printk(KERN_ERR "Oops: efitime: can't read time!\n"); + + return mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); +} + +#if EFI_DEBUG +static void __init print_efi_memmap(void) +{ + efi_memory_desc_t *md; + void *p; + int i; + + for (p = memmap.map, i = 0; + p < memmap.map_end; + p += memmap.desc_size, i++) { + md = p; + printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, " + "range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} +#endif /* EFI_DEBUG */ + +void __init efi_init(void) +{ + efi_config_table_t *config_tables; + efi_runtime_services_t *runtime; + efi_char16_t *c16; + char vendor[100] = "unknown"; + int i = 0; + void *tmp; + +#ifdef CONFIG_X86_32 + efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab; + memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; +#else + efi_phys.systab = (efi_system_table_t *) + (boot_params.efi_info.efi_systab | + ((__u64)boot_params.efi_info.efi_systab_hi<<32)); + memmap.phys_map = (void *) + (boot_params.efi_info.efi_memmap | + ((__u64)boot_params.efi_info.efi_memmap_hi<<32)); +#endif + memmap.nr_map = boot_params.efi_info.efi_memmap_size / + boot_params.efi_info.efi_memdesc_size; + memmap.desc_version = boot_params.efi_info.efi_memdesc_version; + memmap.desc_size = boot_params.efi_info.efi_memdesc_size; + + efi.systab = early_ioremap((unsigned long)efi_phys.systab, + sizeof(efi_system_table_t)); + if (efi.systab == NULL) + printk(KERN_ERR "Couldn't map the EFI system table!\n"); + memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t)); + early_iounmap(efi.systab, sizeof(efi_system_table_t)); + efi.systab = &efi_systab; + + /* + * Verify the EFI Table + */ + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + printk(KERN_ERR "EFI system table signature incorrect!\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_ERR "Warning: EFI system table version " + "%d.%02d, expected 1.00 or greater!\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* + * Show what we know for posterity + */ + c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < sizeof(vendor) && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + early_iounmap(tmp, 2); + + printk(KERN_INFO "EFI v%u.%.02u by %s \n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = early_ioremap( + efi.systab->tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + if (config_tables == NULL) + printk(KERN_ERR "Could not map EFI Configuration Table!\n"); + + printk(KERN_INFO); + for (i = 0; i < efi.systab->nr_tables; i++) { + if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) { + efi.mps = config_tables[i].table; + printk(" MPS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_20_TABLE_GUID)) { + efi.acpi20 = config_tables[i].table; + printk(" ACPI 2.0=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + ACPI_TABLE_GUID)) { + efi.acpi = config_tables[i].table; + printk(" ACPI=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + SMBIOS_TABLE_GUID)) { + efi.smbios = config_tables[i].table; + printk(" SMBIOS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + HCDP_TABLE_GUID)) { + efi.hcdp = config_tables[i].table; + printk(" HCDP=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + UGA_IO_PROTOCOL_GUID)) { + efi.uga = config_tables[i].table; + printk(" UGA=0x%lx ", config_tables[i].table); + } + } + printk("\n"); + early_iounmap(config_tables, + efi.systab->nr_tables * sizeof(efi_config_table_t)); + + /* + * Check out the runtime services table. We need to map + * the runtime services table so that we can grab the physical + * address of several of the EFI runtime functions, needed to + * set the firmware into virtual mode. + */ + runtime = early_ioremap((unsigned long)efi.systab->runtime, + sizeof(efi_runtime_services_t)); + if (runtime != NULL) { + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. + */ + efi_phys.get_time = (efi_get_time_t *)runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + /* + * Make efi_get_time can be called before entering + * virtual mode. + */ + efi.get_time = phys_efi_get_time; + } else + printk(KERN_ERR "Could not map the EFI runtime service " + "table!\n"); + early_iounmap(runtime, sizeof(efi_runtime_services_t)); + + /* Map the EFI memory map */ + memmap.map = early_ioremap((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); + if (memmap.map == NULL) + printk(KERN_ERR "Could not map the EFI memory map!\n"); + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + if (memmap.desc_size != sizeof(efi_memory_desc_t)) + printk(KERN_WARNING "Kernel-defined memdesc" + "doesn't match the one from EFI!\n"); + + /* Setup for EFI runtime service */ + reboot_type = BOOT_EFI; + +#if EFI_DEBUG + print_efi_memmap(); +#endif +} + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static void __init runtime_code_page_mkexec(void) +{ + efi_memory_desc_t *md; + unsigned long end; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + if (md->type == EFI_RUNTIME_SERVICES_CODE && + (end >> PAGE_SHIFT) <= max_pfn_mapped) { + set_memory_x(md->virt_addr, md->num_pages); + set_memory_uc(md->virt_addr, md->num_pages); + } + } + __flush_tlb_all(); +} +#else +static inline void __init runtime_code_page_mkexec(void) { } +#endif + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor and update + * that memory descriptor with the virtual address obtained from ioremap(). + * This enables the runtime services to be called without having to + * thunk back into physical mode for every invocation. + */ +void __init efi_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + efi_status_t status; + unsigned long end; + void *p; + + efi.systab = NULL; + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); + if ((md->attribute & EFI_MEMORY_WB) && + ((end >> PAGE_SHIFT) <= max_pfn_mapped)) + md->virt_addr = (unsigned long)__va(md->phys_addr); + else + md->virt_addr = (unsigned long) + efi_ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!md->virt_addr) + printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", + (unsigned long long)md->phys_addr); + if ((md->phys_addr <= (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < end)) + efi.systab = (efi_system_table_t *)(unsigned long) + (md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab); + } + + BUG_ON(!efi.systab); + + status = phys_efi_set_virtual_address_map( + memmap.desc_size * memmap.nr_map, + memmap.desc_size, + memmap.desc_version, + memmap.phys_map); + + if (status != EFI_SUCCESS) { + printk(KERN_ALERT "Unable to switch EFI into virtual mode " + "(status=%lx)!\n", status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } + + /* + * Now that EFI is in virtual mode, update the function + * pointers in the runtime service table to the new virtual addresses. + * + * Call EFI services through wrapper functions. + */ + efi.get_time = virt_efi_get_time; + efi.set_time = virt_efi_set_time; + efi.get_wakeup_time = virt_efi_get_wakeup_time; + efi.set_wakeup_time = virt_efi_set_wakeup_time; + efi.get_variable = virt_efi_get_variable; + efi.get_next_variable = virt_efi_get_next_variable; + efi.set_variable = virt_efi_set_variable; + efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; + efi.reset_system = virt_efi_reset_system; + efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + runtime_code_page_mkexec(); + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; +} + +/* + * Convenience functions to obtain memory types and attributes + */ +u32 efi_mem_type(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->type; + } + return 0; +} + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && + (phys_addr < (md->phys_addr + + (md->num_pages << EFI_PAGE_SHIFT)))) + return md->attribute; + } + return 0; +} diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index e2be78f4939..cb91f985b4a 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -20,40 +20,15 @@ */ #include <linux/kernel.h> -#include <linux/init.h> -#include <linux/mm.h> #include <linux/types.h> -#include <linux/time.h> -#include <linux/spinlock.h> -#include <linux/bootmem.h> #include <linux/ioport.h> -#include <linux/module.h> #include <linux/efi.h> -#include <linux/kexec.h> -#include <asm/setup.h> #include <asm/io.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/processor.h> -#include <asm/desc.h> #include <asm/tlbflush.h> -#define EFI_DEBUG 0 -#define PFX "EFI: " - -extern efi_status_t asmlinkage efi_call_phys(void *, ...); - -struct efi efi; -EXPORT_SYMBOL(efi); -static struct efi efi_phys; -struct efi_memory_map memmap; - -/* - * We require an early boot_ioremap mapping mechanism initially - */ -extern void * boot_ioremap(unsigned long, unsigned long); - /* * To make EFI call EFI runtime service in physical addressing mode we need * prelog/epilog before/after the invocation to disable interrupt, to @@ -62,16 +37,14 @@ extern void * boot_ioremap(unsigned long, unsigned long); */ static unsigned long efi_rt_eflags; -static DEFINE_SPINLOCK(efi_rt_lock); static pgd_t efi_bak_pg_dir_pointer[2]; -static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) +void efi_call_phys_prelog(void) { unsigned long cr4; unsigned long temp; - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; - spin_lock(&efi_rt_lock); local_irq_save(efi_rt_eflags); /* @@ -101,17 +74,17 @@ static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) /* * After the lock is released, the original page table is restored. */ - local_flush_tlb(); + __flush_tlb_all(); gdt_descr.address = __pa(get_cpu_gdt_table(0)); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } -static void efi_call_phys_epilog(void) __releases(efi_rt_lock) +void efi_call_phys_epilog(void) { unsigned long cr4; - struct Xgt_desc_struct gdt_descr; + struct desc_ptr gdt_descr; gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); gdt_descr.size = GDT_SIZE - 1; @@ -132,586 +105,7 @@ static void efi_call_phys_epilog(void) __releases(efi_rt_lock) /* * After the lock is released, the original page table is restored. */ - local_flush_tlb(); + __flush_tlb_all(); local_irq_restore(efi_rt_eflags); - spin_unlock(&efi_rt_lock); -} - -static efi_status_t -phys_efi_set_virtual_address_map(unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys(efi_phys.set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); - efi_call_phys_epilog(); - return status; -} - -static efi_status_t -phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - efi_status_t status; - - efi_call_phys_prelog(); - status = efi_call_phys(efi_phys.get_time, tm, tc); - efi_call_phys_epilog(); - return status; -} - -inline int efi_set_rtc_mmss(unsigned long nowtime) -{ - int real_seconds, real_minutes; - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - spin_lock(&efi_rt_lock); - status = efi.get_time(&eft, &cap); - spin_unlock(&efi_rt_lock); - if (status != EFI_SUCCESS) - panic("Ooops, efitime: can't read time!\n"); - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - - if (((abs(real_minutes - eft.minute) + 15)/30) & 1) - real_minutes += 30; - real_minutes %= 60; - - eft.minute = real_minutes; - eft.second = real_seconds; - - if (status != EFI_SUCCESS) { - printk("Ooops: efitime: can't read time!\n"); - return -1; - } - return 0; -} -/* - * This is used during kernel init before runtime - * services have been remapped and also during suspend, therefore, - * we'll need to call both in physical and virtual modes. - */ -inline unsigned long efi_get_time(void) -{ - efi_status_t status; - efi_time_t eft; - efi_time_cap_t cap; - - if (efi.get_time) { - /* if we are in virtual mode use remapped function */ - status = efi.get_time(&eft, &cap); - } else { - /* we are in physical mode */ - status = phys_efi_get_time(&eft, &cap); - } - - if (status != EFI_SUCCESS) - printk("Oops: efitime: can't read time status: 0x%lx\n",status); - - return mktime(eft.year, eft.month, eft.day, eft.hour, - eft.minute, eft.second); -} - -int is_available_memory(efi_memory_desc_t * md) -{ - if (!(md->attribute & EFI_MEMORY_WB)) - return 0; - - switch (md->type) { - case EFI_LOADER_CODE: - case EFI_LOADER_DATA: - case EFI_BOOT_SERVICES_CODE: - case EFI_BOOT_SERVICES_DATA: - case EFI_CONVENTIONAL_MEMORY: - return 1; - } - return 0; -} - -/* - * We need to map the EFI memory map again after paging_init(). - */ -void __init efi_map_memmap(void) -{ - memmap.map = NULL; - - memmap.map = bt_ioremap((unsigned long) memmap.phys_map, - (memmap.nr_map * memmap.desc_size)); - if (memmap.map == NULL) - printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); - - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); -} - -#if EFI_DEBUG -static void __init print_efi_memmap(void) -{ - efi_memory_desc_t *md; - void *p; - int i; - - for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { - md = p; - printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " - "range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, - md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), - (md->num_pages >> (20 - EFI_PAGE_SHIFT))); - } -} -#endif /* EFI_DEBUG */ - -/* - * Walks the EFI memory map and calls CALLBACK once for each EFI - * memory descriptor that has memory that is available for kernel use. - */ -void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) -{ - int prev_valid = 0; - struct range { - unsigned long start; - unsigned long end; - } uninitialized_var(prev), curr; - efi_memory_desc_t *md; - unsigned long start, end; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if ((md->num_pages == 0) || (!is_available_memory(md))) - continue; - - curr.start = md->phys_addr; - curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); - - if (!prev_valid) { - prev = curr; - prev_valid = 1; - } else { - if (curr.start < prev.start) - printk(KERN_INFO PFX "Unordered memory map\n"); - if (prev.end == curr.start) - prev.end = curr.end; - else { - start = - (unsigned long) (PAGE_ALIGN(prev.start)); - end = (unsigned long) (prev.end & PAGE_MASK); - if ((end > start) - && (*callback) (start, end, arg) < 0) - return; - prev = curr; - } - } - } - if (prev_valid) { - start = (unsigned long) PAGE_ALIGN(prev.start); - end = (unsigned long) (prev.end & PAGE_MASK); - if (end > start) - (*callback) (start, end, arg); - } -} - -void __init efi_init(void) -{ - efi_config_table_t *config_tables; - efi_runtime_services_t *runtime; - efi_char16_t *c16; - char vendor[100] = "unknown"; - unsigned long num_config_tables; - int i = 0; - - memset(&efi, 0, sizeof(efi) ); - memset(&efi_phys, 0, sizeof(efi_phys)); - - efi_phys.systab = - (efi_system_table_t *)boot_params.efi_info.efi_systab; - memmap.phys_map = (void *)boot_params.efi_info.efi_memmap; - memmap.nr_map = boot_params.efi_info.efi_memmap_size/ - boot_params.efi_info.efi_memdesc_size; - memmap.desc_version = boot_params.efi_info.efi_memdesc_version; - memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - - efi.systab = (efi_system_table_t *) - boot_ioremap((unsigned long) efi_phys.systab, - sizeof(efi_system_table_t)); - /* - * Verify the EFI Table - */ - if (efi.systab == NULL) - printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n"); - if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) - printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n"); - if ((efi.systab->hdr.revision >> 16) == 0) - printk(KERN_ERR PFX "Warning: EFI system table version " - "%d.%02d, expected 1.00 or greater\n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff); - - /* - * Grab some details from the system table - */ - num_config_tables = efi.systab->nr_tables; - config_tables = (efi_config_table_t *)efi.systab->tables; - runtime = efi.systab->runtime; - - /* - * Show what we know for posterity - */ - c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2); - if (c16) { - for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i) - vendor[i] = *c16++; - vendor[i] = '\0'; - } else - printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); - - printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n", - efi.systab->hdr.revision >> 16, - efi.systab->hdr.revision & 0xffff, vendor); - - /* - * Let's see what config tables the firmware passed to us. - */ - config_tables = (efi_config_table_t *) - boot_ioremap((unsigned long) config_tables, - num_config_tables * sizeof(efi_config_table_t)); - - if (config_tables == NULL) - printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n"); - - efi.mps = EFI_INVALID_TABLE_ADDR; - efi.acpi = EFI_INVALID_TABLE_ADDR; - efi.acpi20 = EFI_INVALID_TABLE_ADDR; - efi.smbios = EFI_INVALID_TABLE_ADDR; - efi.sal_systab = EFI_INVALID_TABLE_ADDR; - efi.boot_info = EFI_INVALID_TABLE_ADDR; - efi.hcdp = EFI_INVALID_TABLE_ADDR; - efi.uga = EFI_INVALID_TABLE_ADDR; - - for (i = 0; i < num_config_tables; i++) { - if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { - efi.mps = config_tables[i].table; - printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { - efi.acpi20 = config_tables[i].table; - printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { - efi.acpi = config_tables[i].table; - printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { - efi.smbios = config_tables[i].table; - printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { - efi.hcdp = config_tables[i].table; - printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table); - } else - if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) { - efi.uga = config_tables[i].table; - printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table); - } - } - printk("\n"); - - /* - * Check out the runtime services table. We need to map - * the runtime services table so that we can grab the physical - * address of several of the EFI runtime functions, needed to - * set the firmware into virtual mode. - */ - - runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long) - runtime, - sizeof(efi_runtime_services_t)); - if (runtime != NULL) { - /* - * We will only need *early* access to the following - * two EFI runtime services before set_virtual_address_map - * is invoked. - */ - efi_phys.get_time = (efi_get_time_t *) runtime->get_time; - efi_phys.set_virtual_address_map = - (efi_set_virtual_address_map_t *) - runtime->set_virtual_address_map; - } else - printk(KERN_ERR PFX "Could not map the runtime service table!\n"); - - /* Map the EFI memory map for use until paging_init() */ - memmap.map = boot_ioremap(boot_params.efi_info.efi_memmap, - boot_params.efi_info.efi_memmap_size); - if (memmap.map == NULL) - printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); - - memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - -#if EFI_DEBUG - print_efi_memmap(); -#endif -} - -static inline void __init check_range_for_systab(efi_memory_desc_t *md) -{ - if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < md->phys_addr + - ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { - unsigned long addr; - - addr = md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab; - efi.systab = (efi_system_table_t *)addr; - } -} - -/* - * Wrap all the virtual calls in a way that forces the parameters on the stack. - */ - -#define efi_call_virt(f, args...) \ - ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) - -static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) -{ - return efi_call_virt(get_time, tm, tc); -} - -static efi_status_t virt_efi_set_time (efi_time_t *tm) -{ - return efi_call_virt(set_time, tm); -} - -static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled, - efi_bool_t *pending, - efi_time_t *tm) -{ - return efi_call_virt(get_wakeup_time, enabled, pending, tm); -} - -static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled, - efi_time_t *tm) -{ - return efi_call_virt(set_wakeup_time, enabled, tm); -} - -static efi_status_t virt_efi_get_variable (efi_char16_t *name, - efi_guid_t *vendor, u32 *attr, - unsigned long *data_size, void *data) -{ - return efi_call_virt(get_variable, name, vendor, attr, data_size, data); -} - -static efi_status_t virt_efi_get_next_variable (unsigned long *name_size, - efi_char16_t *name, - efi_guid_t *vendor) -{ - return efi_call_virt(get_next_variable, name_size, name, vendor); -} - -static efi_status_t virt_efi_set_variable (efi_char16_t *name, - efi_guid_t *vendor, - unsigned long attr, - unsigned long data_size, void *data) -{ - return efi_call_virt(set_variable, name, vendor, attr, data_size, data); -} - -static efi_status_t virt_efi_get_next_high_mono_count (u32 *count) -{ - return efi_call_virt(get_next_high_mono_count, count); -} - -static void virt_efi_reset_system (int reset_type, efi_status_t status, - unsigned long data_size, - efi_char16_t *data) -{ - efi_call_virt(reset_system, reset_type, status, data_size, data); -} - -/* - * This function will switch the EFI runtime services to virtual mode. - * Essentially, look through the EFI memmap and map every region that - * has the runtime attribute bit set in its memory descriptor and update - * that memory descriptor with the virtual address obtained from ioremap(). - * This enables the runtime services to be called without having to - * thunk back into physical mode for every invocation. - */ - -void __init efi_enter_virtual_mode(void) -{ - efi_memory_desc_t *md; - efi_status_t status; - void *p; - - efi.systab = NULL; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if (!(md->attribute & EFI_MEMORY_RUNTIME)) - continue; - - md->virt_addr = (unsigned long)ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!(unsigned long)md->virt_addr) { - printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", - (unsigned long)md->phys_addr); - } - /* update the virtual address of the EFI system table */ - check_range_for_systab(md); - } - - BUG_ON(!efi.systab); - - status = phys_efi_set_virtual_address_map( - memmap.desc_size * memmap.nr_map, - memmap.desc_size, - memmap.desc_version, - memmap.phys_map); - - if (status != EFI_SUCCESS) { - printk (KERN_ALERT "You are screwed! " - "Unable to switch EFI into virtual mode " - "(status=%lx)\n", status); - panic("EFI call to SetVirtualAddressMap() failed!"); - } - - /* - * Now that EFI is in virtual mode, update the function - * pointers in the runtime service table to the new virtual addresses. - */ - - efi.get_time = virt_efi_get_time; - efi.set_time = virt_efi_set_time; - efi.get_wakeup_time = virt_efi_get_wakeup_time; - efi.set_wakeup_time = virt_efi_set_wakeup_time; - efi.get_variable = virt_efi_get_variable; - efi.get_next_variable = virt_efi_get_next_variable; - efi.set_variable = virt_efi_set_variable; - efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; - efi.reset_system = virt_efi_reset_system; -} - -void __init -efi_initialize_iomem_resources(struct resource *code_resource, - struct resource *data_resource, - struct resource *bss_resource) -{ - struct resource *res; - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - - if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > - 0x100000000ULL) - continue; - res = kzalloc(sizeof(struct resource), GFP_ATOMIC); - switch (md->type) { - case EFI_RESERVED_TYPE: - res->name = "Reserved Memory"; - break; - case EFI_LOADER_CODE: - res->name = "Loader Code"; - break; - case EFI_LOADER_DATA: - res->name = "Loader Data"; - break; - case EFI_BOOT_SERVICES_DATA: - res->name = "BootServices Data"; - break; - case EFI_BOOT_SERVICES_CODE: - res->name = "BootServices Code"; - break; - case EFI_RUNTIME_SERVICES_CODE: - res->name = "Runtime Service Code"; - break; - case EFI_RUNTIME_SERVICES_DATA: - res->name = "Runtime Service Data"; - break; - case EFI_CONVENTIONAL_MEMORY: - res->name = "Conventional Memory"; - break; - case EFI_UNUSABLE_MEMORY: - res->name = "Unusable Memory"; - break; - case EFI_ACPI_RECLAIM_MEMORY: - res->name = "ACPI Reclaim"; - break; - case EFI_ACPI_MEMORY_NVS: - res->name = "ACPI NVS"; - break; - case EFI_MEMORY_MAPPED_IO: - res->name = "Memory Mapped IO"; - break; - case EFI_MEMORY_MAPPED_IO_PORT_SPACE: - res->name = "Memory Mapped IO Port Space"; - break; - default: - res->name = "Reserved"; - break; - } - res->start = md->phys_addr; - res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - if (request_resource(&iomem_resource, res) < 0) - printk(KERN_ERR PFX "Failed to allocate res %s : " - "0x%llx-0x%llx\n", res->name, - (unsigned long long)res->start, - (unsigned long long)res->end); - /* - * We don't know which region contains kernel data so we try - * it repeatedly and let the resource manager test it. - */ - if (md->type == EFI_CONVENTIONAL_MEMORY) { - request_resource(res, code_resource); - request_resource(res, data_resource); - request_resource(res, bss_resource); -#ifdef CONFIG_KEXEC - request_resource(res, &crashk_res); -#endif - } - } -} - -/* - * Convenience functions to obtain memory types and attributes - */ - -u32 efi_mem_type(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && (phys_addr < - (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) - return md->type; - } - return 0; -} - -u64 efi_mem_attributes(unsigned long phys_addr) -{ - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if ((md->phys_addr <= phys_addr) && (phys_addr < - (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) - return md->attribute; - } - return 0; } diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c new file mode 100644 index 00000000000..4b73992c1e1 --- /dev/null +++ b/arch/x86/kernel/efi_64.c @@ -0,0 +1,134 @@ +/* + * x86_64 specific EFI support functions + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * + * Code to convert EFI to E820 map has been implemented in elilo bootloader + * based on a EFI patch by Edgar Hucek. Based on the E820 map, the page table + * is setup appropriately for EFI runtime code. + * - mouli 06/14/2007. + * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/bootmem.h> +#include <linux/ioport.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/reboot.h> + +#include <asm/setup.h> +#include <asm/page.h> +#include <asm/e820.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm/efi.h> + +static pgd_t save_pgd __initdata; +static unsigned long efi_flags __initdata; + +static void __init early_mapping_set_exec(unsigned long start, + unsigned long end, + int executable) +{ + pte_t *kpte; + int level; + + while (start < end) { + kpte = lookup_address((unsigned long)__va(start), &level); + BUG_ON(!kpte); + if (executable) + set_pte(kpte, pte_mkexec(*kpte)); + else + set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ + __supported_pte_mask)); + if (level == 4) + start = (start + PMD_SIZE) & PMD_MASK; + else + start = (start + PAGE_SIZE) & PAGE_MASK; + } +} + +static void __init early_runtime_code_mapping_set_exec(int executable) +{ + efi_memory_desc_t *md; + void *p; + + if (!(__supported_pte_mask & _PAGE_NX)) + return; + + /* Make EFI runtime service code area executable */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (md->type == EFI_RUNTIME_SERVICES_CODE) { + unsigned long end; + end = md->phys_addr + (md->num_pages << PAGE_SHIFT); + early_mapping_set_exec(md->phys_addr, end, executable); + } + } +} + +void __init efi_call_phys_prelog(void) +{ + unsigned long vaddress; + + local_irq_save(efi_flags); + early_runtime_code_mapping_set_exec(1); + vaddress = (unsigned long)__va(0x0UL); + save_pgd = *pgd_offset_k(0x0UL); + set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); + __flush_tlb_all(); +} + +void __init efi_call_phys_epilog(void) +{ + /* + * After the lock is released, the original page table is restored. + */ + set_pgd(pgd_offset_k(0x0UL), save_pgd); + early_runtime_code_mapping_set_exec(0); + __flush_tlb_all(); + local_irq_restore(efi_flags); +} + +void __init efi_reserve_bootmem(void) +{ + reserve_bootmem_generic((unsigned long)memmap.phys_map, + memmap.nr_map * memmap.desc_size); +} + +void __iomem * __init efi_ioremap(unsigned long offset, + unsigned long size) +{ + static unsigned pages_mapped; + unsigned long last_addr; + unsigned i, pages; + + last_addr = offset + size - 1; + offset &= PAGE_MASK; + pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT; + if (pages_mapped + pages > MAX_EFI_IO_PAGES) + return NULL; + + for (i = 0; i < pages; i++) { + __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, + offset, PAGE_KERNEL_EXEC_NOCACHE); + offset += PAGE_SIZE; + pages_mapped++; + } + + return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \ + (pages_mapped - pages)); +} diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S new file mode 100644 index 00000000000..99b47d48c9f --- /dev/null +++ b/arch/x86/kernel/efi_stub_64.S @@ -0,0 +1,109 @@ +/* + * Function calling ABI conversion from Linux to EFI for x86_64 + * + * Copyright (C) 2007 Intel Corp + * Bibo Mao <bibo.mao@intel.com> + * Huang Ying <ying.huang@intel.com> + */ + +#include <linux/linkage.h> + +#define SAVE_XMM \ + mov %rsp, %rax; \ + subq $0x70, %rsp; \ + and $~0xf, %rsp; \ + mov %rax, (%rsp); \ + mov %cr0, %rax; \ + clts; \ + mov %rax, 0x8(%rsp); \ + movaps %xmm0, 0x60(%rsp); \ + movaps %xmm1, 0x50(%rsp); \ + movaps %xmm2, 0x40(%rsp); \ + movaps %xmm3, 0x30(%rsp); \ + movaps %xmm4, 0x20(%rsp); \ + movaps %xmm5, 0x10(%rsp) + +#define RESTORE_XMM \ + movaps 0x60(%rsp), %xmm0; \ + movaps 0x50(%rsp), %xmm1; \ + movaps 0x40(%rsp), %xmm2; \ + movaps 0x30(%rsp), %xmm3; \ + movaps 0x20(%rsp), %xmm4; \ + movaps 0x10(%rsp), %xmm5; \ + mov 0x8(%rsp), %rsi; \ + mov %rsi, %cr0; \ + mov (%rsp), %rsp + +ENTRY(efi_call0) + SAVE_XMM + subq $32, %rsp + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call1) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call2) + SAVE_XMM + subq $32, %rsp + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call3) + SAVE_XMM + subq $32, %rsp + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call4) + SAVE_XMM + subq $32, %rsp + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $32, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call5) + SAVE_XMM + subq $48, %rsp + mov %r9, 32(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret + +ENTRY(efi_call6) + SAVE_XMM + mov (%rsp), %rax + mov 8(%rax), %rax + subq $48, %rsp + mov %r9, 32(%rsp) + mov %rax, 40(%rsp) + mov %r8, %r9 + mov %rcx, %r8 + mov %rsi, %rcx + call *%rdi + addq $48, %rsp + RESTORE_XMM + ret diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index dc7f938e501..be5c31d0488 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -58,7 +58,7 @@ * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") - * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * ENABLE_INTERRUPTS_SYSCALL_RET (aka "sti; sysexit"). * * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). @@ -283,12 +283,12 @@ END(resume_kernel) the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ # sysenter call handler stub -ENTRY(sysenter_entry) +ENTRY(ia32_sysenter_target) CFI_STARTPROC simple CFI_SIGNAL_FRAME CFI_DEF_CFA esp, 0 CFI_REGISTER esp, ebp - movl TSS_sysenter_esp0(%esp),%esp + movl TSS_sysenter_sp0(%esp),%esp sysenter_past_esp: /* * No need to follow this irqs on/off section: the syscall @@ -351,7 +351,7 @@ sysenter_past_esp: xorl %ebp,%ebp TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs - ENABLE_INTERRUPTS_SYSEXIT + ENABLE_INTERRUPTS_SYSCALL_RET CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) @@ -360,7 +360,7 @@ sysenter_past_esp: .align 4 .long 1b,2b .popsection -ENDPROC(sysenter_entry) +ENDPROC(ia32_sysenter_target) # system call handler stub ENTRY(system_call) @@ -583,7 +583,7 @@ END(syscall_badsys) * Build the entry stubs and pointer table with * some assembler magic. */ -.data +.section .rodata,"a" ENTRY(interrupt) .text @@ -743,7 +743,7 @@ END(device_not_available) * that sets up the real kernel stack. Check here, since we can't * allow the wrong stack to be used. * - * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have * already pushed 3 words if it hits on the sysenter instruction: * eflags, cs and eip. * @@ -755,7 +755,7 @@ END(device_not_available) cmpw $__KERNEL_CS,4(%esp); \ jne ok; \ label: \ - movl TSS_sysenter_esp0+offset(%esp),%esp; \ + movl TSS_sysenter_sp0+offset(%esp),%esp; \ CFI_DEF_CFA esp, 0; \ CFI_UNDEFINED eip; \ pushfl; \ @@ -768,7 +768,7 @@ label: \ KPROBE_ENTRY(debug) RING0_INT_FRAME - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) jne debug_stack_correct FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) debug_stack_correct: @@ -799,7 +799,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 je nmi_espfix_stack - cmpl $sysenter_entry,(%esp) + cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl %eax CFI_ADJUST_CFA_OFFSET 4 @@ -812,7 +812,7 @@ KPROBE_ENTRY(nmi) popl %eax CFI_ADJUST_CFA_OFFSET -4 jae nmi_stack_correct - cmpl $sysenter_entry,12(%esp) + cmpl $ia32_sysenter_target,12(%esp) je nmi_debug_stack_check nmi_stack_correct: /* We have a RING0_INT_FRAME here */ @@ -882,10 +882,10 @@ ENTRY(native_iret) .previous END(native_iret) -ENTRY(native_irq_enable_sysexit) +ENTRY(native_irq_enable_syscall_ret) sti sysexit -END(native_irq_enable_sysexit) +END(native_irq_enable_syscall_ret) #endif KPROBE_ENTRY(int3) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e70f3881d7e..bea8474744f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -50,6 +50,7 @@ #include <asm/hw_irq.h> #include <asm/page.h> #include <asm/irqflags.h> +#include <asm/paravirt.h> .code64 @@ -57,6 +58,13 @@ #define retint_kernel retint_restore_args #endif +#ifdef CONFIG_PARAVIRT +ENTRY(native_irq_enable_syscall_ret) + movq %gs:pda_oldrsp,%rsp + swapgs + sysretq +#endif /* CONFIG_PARAVIRT */ + .macro TRACE_IRQS_IRETQ offset=ARGOFFSET #ifdef CONFIG_TRACE_IRQFLAGS @@ -216,14 +224,21 @@ ENTRY(system_call) CFI_DEF_CFA rsp,PDA_STACKOFFSET CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ - swapgs + SWAPGS_UNSAFE_STACK + /* + * A hypervisor implementation might want to use a label + * after the swapgs, so that it can do the swapgs + * for the guest and jump here on syscall. + */ +ENTRY(system_call_after_swapgs) + movq %rsp,%gs:pda_oldrsp movq %gs:pda_kernelstack,%rsp /* * No need to follow this irqs off/on section - it's straight * and short: */ - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_ARGS 8,1 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) @@ -246,7 +261,7 @@ ret_from_sys_call: sysret_check: LOCKDEP_SYS_EXIT GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx @@ -260,9 +275,7 @@ sysret_check: CFI_REGISTER rip,rcx RESTORE_ARGS 0,-ARG_SKIP,1 /*CFI_REGISTER rflags,r11*/ - movq %gs:pda_oldrsp,%rsp - swapgs - sysretq + ENABLE_INTERRUPTS_SYSCALL_RET CFI_RESTORE_STATE /* Handle reschedules */ @@ -271,7 +284,7 @@ sysret_careful: bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule @@ -282,7 +295,7 @@ sysret_careful: /* Handle a signal */ sysret_signal: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) testl $_TIF_DO_NOTIFY_MASK,%edx jz 1f @@ -295,7 +308,7 @@ sysret_signal: 1: movl $_TIF_NEED_RESCHED,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -327,7 +340,7 @@ tracesys: */ .globl int_ret_from_sys_call int_ret_from_sys_call: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) je retint_restore_args @@ -349,20 +362,20 @@ int_careful: bt $TIF_NEED_RESCHED,%edx jnc int_very_careful TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check /* handle signals and tracing -- both require a full stack frame */ int_very_careful: TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx @@ -385,7 +398,7 @@ int_signal: 1: movl $_TIF_NEED_RESCHED,%edi int_restore_rest: RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC @@ -506,7 +519,7 @@ END(stub_rt_sigreturn) CFI_DEF_CFA_REGISTER rbp testl $3,CS(%rdi) je 1f - swapgs + SWAPGS /* irqcount is used to check if a CPU is already on an interrupt stack or not. While this is essentially redundant with preempt_count it is a little cheaper to use a separate counter in the PDA @@ -527,7 +540,7 @@ ENTRY(common_interrupt) interrupt do_IRQ /* 0(%rsp): oldrsp-ARGOFFSET */ ret_from_intr: - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF decl %gs:pda_irqcount leaveq @@ -556,13 +569,13 @@ retint_swapgs: /* return to user-space */ /* * The iretq could re-enable interrupts: */ - cli + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - swapgs + SWAPGS jmp restore_args retint_restore_args: /* return to kernel space */ - cli + DISABLE_INTERRUPTS(CLBR_ANY) /* * The iretq could re-enable interrupts: */ @@ -570,10 +583,14 @@ retint_restore_args: /* return to kernel space */ restore_args: RESTORE_ARGS 0,8,0 iret_label: +#ifdef CONFIG_PARAVIRT + INTERRUPT_RETURN +#endif +ENTRY(native_iret) iretq .section __ex_table,"a" - .quad iret_label,bad_iret + .quad native_iret, bad_iret .previous .section .fixup,"ax" /* force a signal here? this matches i386 behaviour */ @@ -581,24 +598,24 @@ iret_label: bad_iret: movq $11,%rdi /* SIGSEGV */ TRACE_IRQS_ON - sti - jmp do_exit - .previous - + ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + jmp do_exit + .previous + /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc retint_signal TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) pushq %rdi CFI_ADJUST_CFA_OFFSET 8 call schedule popq %rdi CFI_ADJUST_CFA_OFFSET -8 GET_THREAD_INFO(%rcx) - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp retint_check @@ -606,14 +623,14 @@ retint_signal: testl $_TIF_DO_NOTIFY_MASK,%edx jz retint_swapgs TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movl $_TIF_NEED_RESCHED,%edi GET_THREAD_INFO(%rcx) @@ -731,7 +748,7 @@ END(spurious_interrupt) rdmsr testl %edx,%edx js 1f - swapgs + SWAPGS xorl %ebx,%ebx 1: .if \ist @@ -747,7 +764,7 @@ END(spurious_interrupt) .if \ist addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) .endif - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF .endif @@ -776,10 +793,10 @@ paranoid_swapgs\trace: .if \trace TRACE_IRQS_IRETQ 0 .endif - swapgs + SWAPGS_UNSAFE_STACK paranoid_restore\trace: RESTORE_ALL 8 - iretq + INTERRUPT_RETURN paranoid_userspace\trace: GET_THREAD_INFO(%rcx) movl threadinfo_flags(%rcx),%ebx @@ -794,11 +811,11 @@ paranoid_userspace\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_NONE) xorl %esi,%esi /* arg2: oldset */ movq %rsp,%rdi /* arg1: &pt_regs */ call do_notify_resume - cli + DISABLE_INTERRUPTS(CLBR_NONE) .if \trace TRACE_IRQS_OFF .endif @@ -807,9 +824,9 @@ paranoid_schedule\trace: .if \trace TRACE_IRQS_ON .endif - sti + ENABLE_INTERRUPTS(CLBR_ANY) call schedule - cli + DISABLE_INTERRUPTS(CLBR_ANY) .if \trace TRACE_IRQS_OFF .endif @@ -862,7 +879,7 @@ KPROBE_ENTRY(error_entry) testl $3,CS(%rsp) je error_kernelspace error_swapgs: - swapgs + SWAPGS error_sti: movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI @@ -874,7 +891,7 @@ error_sti: error_exit: movl %ebx,%eax RESTORE_REST - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) testl %eax,%eax @@ -911,12 +928,12 @@ ENTRY(load_gs_index) CFI_STARTPROC pushf CFI_ADJUST_CFA_OFFSET 8 - cli - swapgs + DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI)) + SWAPGS gs_change: movl %edi,%gs 2: mfence /* workaround */ - swapgs + SWAPGS popf CFI_ADJUST_CFA_OFFSET -8 ret @@ -930,7 +947,7 @@ ENDPROC(load_gs_index) .section .fixup,"ax" /* running with kernelgs */ bad_gs: - swapgs /* switch back to user gs */ + SWAPGS /* switch back to user gs */ xorl %eax,%eax movl %eax,%gs jmp 2b diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index ce703e21c91..4ae7b644026 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -24,18 +24,11 @@ #include <acpi/acpi_bus.h> #endif -/* - * which logical CPU number maps to which CPU (physical APIC ID) - * - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata +/* which logical CPU number maps to which CPU (physical APIC ID) */ +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; +void *x86_cpu_to_apicid_early_ptr; +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); struct genapic __read_mostly *genapic = &apic_flat; diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c index f12d8c5d980..9c7f7d39596 100644 --- a/arch/x86/kernel/geode_32.c +++ b/arch/x86/kernel/geode_32.c @@ -1,6 +1,7 @@ /* * AMD Geode southbridge support code * Copyright (C) 2006, Advanced Micro Devices, Inc. + * Copyright (C) 2007, Andres Salomon <dilinger@debian.org> * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public License @@ -51,45 +52,62 @@ EXPORT_SYMBOL_GPL(geode_get_dev_base); /* === GPIO API === */ -void geode_gpio_set(unsigned int gpio, unsigned int reg) +void geode_gpio_set(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); if (!base) return; - if (gpio < 16) - outl(1 << gpio, base + reg); - else - outl(1 << (gpio - 16), base + 0x80 + reg); + /* low bank register */ + if (gpio & 0xFFFF) + outl(gpio & 0xFFFF, base + reg); + /* high bank register */ + gpio >>= 16; + if (gpio) + outl(gpio, base + 0x80 + reg); } EXPORT_SYMBOL_GPL(geode_gpio_set); -void geode_gpio_clear(unsigned int gpio, unsigned int reg) +void geode_gpio_clear(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); if (!base) return; - if (gpio < 16) - outl(1 << (gpio + 16), base + reg); - else - outl(1 << gpio, base + 0x80 + reg); + /* low bank register */ + if (gpio & 0xFFFF) + outl((gpio & 0xFFFF) << 16, base + reg); + /* high bank register */ + gpio &= (0xFFFF << 16); + if (gpio) + outl(gpio, base + 0x80 + reg); } EXPORT_SYMBOL_GPL(geode_gpio_clear); -int geode_gpio_isset(unsigned int gpio, unsigned int reg) +int geode_gpio_isset(u32 gpio, unsigned int reg) { u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 val; if (!base) return 0; - if (gpio < 16) - return (inl(base + reg) & (1 << gpio)) ? 1 : 0; - else - return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; + /* low bank register */ + if (gpio & 0xFFFF) { + val = inl(base + reg) & (gpio & 0xFFFF); + if ((gpio & 0xFFFF) == val) + return 1; + } + /* high bank register */ + gpio >>= 16; + if (gpio) { + val = inl(base + 0x80 + reg) & gpio; + if (gpio == val) + return 1; + } + return 0; } EXPORT_SYMBOL_GPL(geode_gpio_isset); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 6b3469311e4..a317336cdea 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -10,6 +10,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/percpu.h> +#include <linux/start_kernel.h> #include <asm/processor.h> #include <asm/proto.h> @@ -19,12 +20,14 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/e820.h> static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); pgd_clear(pgd); - __flush_tlb(); + __flush_tlb_all(); } /* Don't add a printk in there. printk relies on the PDA which is not initialized @@ -46,6 +49,35 @@ static void __init copy_bootdata(char *real_mode_data) } } +#define EBDA_ADDR_POINTER 0x40E + +static __init void reserve_ebda(void) +{ + unsigned ebda_addr, ebda_size; + + /* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); + ebda_addr <<= 4; + + if (!ebda_addr) + return; + + ebda_size = *(unsigned short *)__va(ebda_addr); + + /* Round EBDA up to pages */ + if (ebda_size == 0) + ebda_size = 1; + ebda_size <<= 10; + ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); + if (ebda_size > 64*1024) + ebda_size = 64*1024; + + reserve_early(ebda_addr, ebda_addr + ebda_size); +} + void __init x86_64_start_kernel(char * real_mode_data) { int i; @@ -56,8 +88,13 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); - for (i = 0; i < IDT_ENTRIES; i++) + for (i = 0; i < IDT_ENTRIES; i++) { +#ifdef CONFIG_EARLY_PRINTK + set_intr_gate(i, &early_idt_handlers[i]); +#else set_intr_gate(i, early_idt_handler); +#endif + } load_idt((const struct desc_ptr *)&idt_descr); early_printk("Kernel alive\n"); @@ -67,8 +104,24 @@ void __init x86_64_start_kernel(char * real_mode_data) pda_init(0); copy_bootdata(__va(real_mode_data)); -#ifdef CONFIG_SMP - cpu_set(0, cpu_online_map); -#endif + + reserve_early(__pa_symbol(&_text), __pa_symbol(&_end)); + + /* Reserve INITRD */ + if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + reserve_early(ramdisk_image, ramdisk_end); + } + + reserve_ebda(); + + /* + * At this point everything still needed from the boot loader + * or BIOS or kernel text should be early reserved or marked not + * RAM in e820. All other memory is free game. + */ + start_kernel(); } diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fbad51fce67..5d8c5730686 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -9,6 +9,7 @@ .text #include <linux/threads.h> +#include <linux/init.h> #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page.h> @@ -151,7 +152,9 @@ WEAK(xen_entry) /* Unknown implementation; there's really nothing we can do at this point. */ ud2a -.data + + __INITDATA + subarch_entries: .long default_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ @@ -199,7 +202,6 @@ default_entry: addl $0x67, %eax /* 0x67 == _PAGE_TABLE */ movl %eax, 4092(%edx) - xorl %ebx,%ebx /* This is the boot CPU (BSP) */ jmp 3f /* * Non-boot CPU entry point; entered from trampoline.S @@ -222,6 +224,8 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs +#endif /* CONFIG_SMP */ +3: /* * New page tables may be in 4Mbyte page mode and may @@ -268,12 +272,6 @@ ENTRY(startup_32_smp) wrmsr 6: - /* This is a secondary processor (AP) */ - xorl %ebx,%ebx - incl %ebx - -#endif /* CONFIG_SMP */ -3: /* * Enable paging @@ -297,7 +295,7 @@ ENTRY(startup_32_smp) popfl #ifdef CONFIG_SMP - andl %ebx,%ebx + cmpb $0, ready jz 1f /* Initial CPU cleans BSS */ jmp checkCPUtype 1: @@ -502,6 +500,7 @@ early_fault: call printk #endif #endif + call dump_stack hlt_loop: hlt jmp hlt_loop diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b6167fe3330..1d5a7a36120 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -19,6 +19,13 @@ #include <asm/msr.h> #include <asm/cache.h> +#ifdef CONFIG_PARAVIRT +#include <asm/asm-offsets.h> +#include <asm/paravirt.h> +#else +#define GET_CR2_INTO_RCX movq %cr2, %rcx +#endif + /* we are not able to switch in one step to the final KERNEL ADRESS SPACE * because we need identity-mapped pages. * @@ -260,14 +267,43 @@ init_rsp: bad_address: jmp bad_address +#ifdef CONFIG_EARLY_PRINTK +.macro early_idt_tramp first, last + .ifgt \last-\first + early_idt_tramp \first, \last-1 + .endif + movl $\last,%esi + jmp early_idt_handler +.endm + + .globl early_idt_handlers +early_idt_handlers: + early_idt_tramp 0, 63 + early_idt_tramp 64, 127 + early_idt_tramp 128, 191 + early_idt_tramp 192, 255 +#endif + ENTRY(early_idt_handler) +#ifdef CONFIG_EARLY_PRINTK cmpl $2,early_recursion_flag(%rip) jz 1f incl early_recursion_flag(%rip) + GET_CR2_INTO_RCX + movq %rcx,%r9 + xorl %r8d,%r8d # zero for error code + movl %esi,%ecx # get vector number + # Test %ecx against mask of vectors that push error code. + cmpl $31,%ecx + ja 0f + movl $1,%eax + salq %cl,%rax + testl $0x27d00,%eax + je 0f + popq %r8 # get error code +0: movq 0(%rsp),%rcx # get ip + movq 8(%rsp),%rdx # get cs xorl %eax,%eax - movq 8(%rsp),%rsi # get rip - movq (%rsp),%rdx - movq %cr2,%rcx leaq early_idt_msg(%rip),%rdi call early_printk cmpl $2,early_recursion_flag(%rip) @@ -278,15 +314,19 @@ ENTRY(early_idt_handler) movq 8(%rsp),%rsi # get rip again call __print_symbol #endif +#endif /* EARLY_PRINTK */ 1: hlt jmp 1b + +#ifdef CONFIG_EARLY_PRINTK early_recursion_flag: .long 0 early_idt_msg: - .asciz "PANIC: early exception rip %lx error %lx cr2 %lx\n" + .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" early_idt_ripmsg: .asciz "RIP %s\n" +#endif /* CONFIG_EARLY_PRINTK */ .balign PAGE_SIZE diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2f99ee206b9..429d084e014 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -6,7 +6,6 @@ #include <linux/init.h> #include <linux/sysdev.h> #include <linux/pm.h> -#include <linux/delay.h> #include <asm/fixmap.h> #include <asm/hpet.h> @@ -16,7 +15,8 @@ #define HPET_MASK CLOCKSOURCE_MASK(32) #define HPET_SHIFT 22 -/* FSEC = 10^-15 NSEC = 10^-9 */ +/* FSEC = 10^-15 + NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 /* @@ -107,6 +107,7 @@ int is_hpet_enabled(void) { return is_hpet_capable() && hpet_legacy_int_enabled; } +EXPORT_SYMBOL_GPL(is_hpet_enabled); /* * When the hpet driver (/dev/hpet) is enabled, we need to reserve @@ -132,16 +133,13 @@ static void hpet_reserve_platform_timers(unsigned long id) #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif - hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; - for (i = 2; i < nrtimers; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; - + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; hpet_alloc(&hd); - } #else static void hpet_reserve_platform_timers(unsigned long id) { } @@ -478,6 +476,7 @@ void hpet_disable(void) */ #include <linux/mc146818rtc.h> #include <linux/rtc.h> +#include <asm/rtc.h> #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -492,6 +491,38 @@ static unsigned long hpet_default_delta; static unsigned long hpet_pie_delta; static unsigned long hpet_pie_limit; +static rtc_irq_handler irq_handler; + +/* + * Registers a IRQ handler. + */ +int hpet_register_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return -ENODEV; + if (irq_handler) + return -EBUSY; + + irq_handler = handler; + + return 0; +} +EXPORT_SYMBOL_GPL(hpet_register_irq_handler); + +/* + * Deregisters the IRQ handler registered with hpet_register_irq_handler() + * and does cleanup. + */ +void hpet_unregister_irq_handler(rtc_irq_handler handler) +{ + if (!is_hpet_enabled()) + return; + + irq_handler = NULL; + hpet_rtc_flags = 0; +} +EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler); + /* * Timer 1 for RTC emulation. We use one shot mode, as periodic mode * is not supported by all HPET implementations for timer 1. @@ -533,6 +564,7 @@ int hpet_rtc_timer_init(void) return 1; } +EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); /* * The functions below are called from rtc driver. @@ -547,6 +579,7 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags &= ~bit_mask; return 1; } +EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); int hpet_set_rtc_irq_bit(unsigned long bit_mask) { @@ -562,6 +595,7 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) return 1; } +EXPORT_SYMBOL_GPL(hpet_set_rtc_irq_bit); int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) @@ -575,6 +609,7 @@ int hpet_set_alarm_time(unsigned char hrs, unsigned char min, return 1; } +EXPORT_SYMBOL_GPL(hpet_set_alarm_time); int hpet_set_periodic_freq(unsigned long freq) { @@ -593,11 +628,13 @@ int hpet_set_periodic_freq(unsigned long freq) } return 1; } +EXPORT_SYMBOL_GPL(hpet_set_periodic_freq); int hpet_rtc_dropped_irq(void) { return is_hpet_enabled(); } +EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { @@ -641,9 +678,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) unsigned long rtc_int_flag = 0; hpet_rtc_timer_reinit(); + memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - rtc_get_rtc_time(&curr_time); + get_rtc_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { @@ -665,8 +703,10 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (rtc_int_flag) { rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id); + if (irq_handler) + irq_handler(rtc_int_flag, dev_id); } return IRQ_HANDLED; } +EXPORT_SYMBOL_GPL(hpet_rtc_interrupt); #endif diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 02112fcc0de..061627806a2 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -22,12 +22,5 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(empty_zero_page); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c new file mode 100644 index 00000000000..26719bd2c77 --- /dev/null +++ b/arch/x86/kernel/i387.c @@ -0,0 +1,479 @@ +/* + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes <gareth@valinux.com>, May 2000 + */ + +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/regset.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/math_emu.h> +#include <asm/sigcontext.h> +#include <asm/user.h> +#include <asm/ptrace.h> +#include <asm/uaccess.h> + +#ifdef CONFIG_X86_64 + +#include <asm/sigcontext32.h> +#include <asm/user32.h> + +#else + +#define save_i387_ia32 save_i387 +#define restore_i387_ia32 restore_i387 + +#define _fpstate_ia32 _fpstate +#define user_i387_ia32_struct user_i387_struct +#define user32_fxsr_struct user_fxsr_struct + +#endif + +#ifdef CONFIG_MATH_EMULATION +#define HAVE_HWFP (boot_cpu_data.hard_math) +#else +#define HAVE_HWFP 1 +#endif + +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; + +void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + clts(); + if (cpu_has_fxsr) { + memset(¤t->thread.i387.fxsave, 0, + sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) + mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; + stts(); +} + +#ifdef CONFIG_X86_64 +/* + * Called at bootup to set up the initial FPU state that is later cloned + * into all processes. + */ +void __cpuinit fpu_init(void) +{ + unsigned long oldcr0 = read_cr0(); + extern void __bad_fxsave_alignment(void); + + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) + __bad_fxsave_alignment(); + set_in_cr4(X86_CR4_OSFXSR); + set_in_cr4(X86_CR4_OSXMMEXCPT); + + write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ + + mxcsr_feature_mask_init(); + /* clean state in init */ + current_thread_info()->status = 0; + clear_used_math(); +} +#endif /* CONFIG_X86_64 */ + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. + */ +void init_fpu(struct task_struct *tsk) +{ + if (tsk_used_math(tsk)) { + if (tsk == current) + unlazy_fpu(tsk); + return; + } + + if (cpu_has_fxsr) { + memset(&tsk->thread.i387.fxsave, 0, + sizeof(struct i387_fxsave_struct)); + tsk->thread.i387.fxsave.cwd = 0x37f; + if (cpu_has_xmm) + tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT; + } else { + memset(&tsk->thread.i387.fsave, 0, + sizeof(struct i387_fsave_struct)); + tsk->thread.i387.fsave.cwd = 0xffff037fu; + tsk->thread.i387.fsave.swd = 0xffff0000u; + tsk->thread.i387.fsave.twd = 0xffffffffu; + tsk->thread.i387.fsave.fos = 0xffff0000u; + } + /* + * Only the device not available exception or ptrace can call init_fpu. + */ + set_stopped_child_used_math(tsk); +} + +int fpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return tsk_used_math(target) ? regset->n : 0; +} + +int xfpregs_active(struct task_struct *target, const struct user_regset *regset) +{ + return (cpu_has_fxsr && tsk_used_math(target)) ? regset->n : 0; +} + +int xfpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (!cpu_has_fxsr) + return -ENODEV; + + unlazy_fpu(target); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fxsave, 0, -1); +} + +int xfpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + + if (!cpu_has_fxsr) + return -ENODEV; + + unlazy_fpu(target); + set_stopped_child_used_math(target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fxsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + + return ret; +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); +#define FP_EXP_TAG_VALID 0 +#define FP_EXP_TAG_ZERO 1 +#define FP_EXP_TAG_SPECIAL 2 +#define FP_EXP_TAG_EMPTY 3 + +static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) +{ + struct _fpxreg *st; + u32 tos = (fxsave->swd >> 11) & 7; + u32 twd = (unsigned long) fxsave->twd; + u32 tag; + u32 ret = 0xffff0000u; + int i; + + for (i = 0; i < 8; i++, twd >>= 1) { + if (twd & 0x1) { + st = FPREG_ADDR(fxsave, (i - tos) & 7); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = FP_EXP_TAG_SPECIAL; + break; + case 0x0000: + if (!st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3]) + tag = FP_EXP_TAG_ZERO; + else + tag = FP_EXP_TAG_SPECIAL; + break; + default: + if (st->significand[3] & 0x8000) + tag = FP_EXP_TAG_VALID; + else + tag = FP_EXP_TAG_SPECIAL; + break; + } + } else { + tag = FP_EXP_TAG_EMPTY; + } + ret |= tag << (2 * i); + } + return ret; +} + +/* + * FXSR floating point environment conversions. + */ + +static void convert_from_fxsr(struct user_i387_ia32_struct *env, + struct task_struct *tsk) +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + env->cwd = fxsave->cwd | 0xffff0000u; + env->swd = fxsave->swd | 0xffff0000u; + env->twd = twd_fxsr_to_i387(fxsave); + +#ifdef CONFIG_X86_64 + env->fip = fxsave->rip; + env->foo = fxsave->rdp; + if (tsk == current) { + /* + * should be actually ds/cs at fpu exception time, but + * that information is not available in 64bit mode. + */ + asm("mov %%ds,%0" : "=r" (env->fos)); + asm("mov %%cs,%0" : "=r" (env->fcs)); + } else { + struct pt_regs *regs = task_pt_regs(tsk); + env->fos = 0xffff0000 | tsk->thread.ds; + env->fcs = regs->cs; + } +#else + env->fip = fxsave->fip; + env->fcs = fxsave->fcs; + env->foo = fxsave->foo; + env->fos = fxsave->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(to[0])); +} + +static void convert_to_fxsr(struct task_struct *tsk, + const struct user_i387_ia32_struct *env) + +{ + struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave; + struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; + struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; + int i; + + fxsave->cwd = env->cwd; + fxsave->swd = env->swd; + fxsave->twd = twd_i387_to_fxsr(env->twd); + fxsave->fop = (u16) ((u32) env->fcs >> 16); +#ifdef CONFIG_X86_64 + fxsave->rip = env->fip; + fxsave->rdp = env->foo; + /* cs and ds ignored */ +#else + fxsave->fip = env->fip; + fxsave->fcs = (env->fcs & 0xffff); + fxsave->foo = env->foo; + fxsave->fos = env->fos; +#endif + + for (i = 0; i < 8; ++i) + memcpy(&to[i], &from[i], sizeof(from[0])); +} + +int fpregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct user_i387_ia32_struct env; + + if (!HAVE_HWFP) + return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf); + + unlazy_fpu(target); + + if (!cpu_has_fxsr) + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fsave, 0, -1); + + if (kbuf && pos == 0 && count == sizeof(env)) { + convert_from_fxsr(kbuf, target); + return 0; + } + + convert_from_fxsr(&env, target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1); +} + +int fpregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_i387_ia32_struct env; + int ret; + + if (!HAVE_HWFP) + return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); + + unlazy_fpu(target); + set_stopped_child_used_math(target); + + if (!cpu_has_fxsr) + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.i387.fsave, 0, -1); + + if (pos > 0 || count < sizeof(env)) + convert_from_fxsr(&env, target); + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); + if (!ret) + convert_to_fxsr(target, &env); + + return ret; +} + +/* + * Signal frame handlers. + */ + +static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + + unlazy_fpu(tsk); + tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; + if (__copy_to_user(buf, &tsk->thread.i387.fsave, + sizeof(struct i387_fsave_struct))) + return -1; + return 1; +} + +static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + struct user_i387_ia32_struct env; + int err = 0; + + unlazy_fpu(tsk); + + convert_from_fxsr(&env, tsk); + if (__copy_to_user(buf, &env, sizeof(env))) + return -1; + + err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status); + err |= __put_user(X86_FXSR_MAGIC, &buf->magic); + if (err) + return -1; + + if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + return 1; +} + +int save_i387_ia32(struct _fpstate_ia32 __user *buf) +{ + if (!used_math()) + return 0; + + /* This will cause a "finit" to be triggered by the next + * attempted FPU operation by the 'current' process. + */ + clear_used_math(); + + if (HAVE_HWFP) { + if (cpu_has_fxsr) { + return save_i387_fxsave(buf); + } else { + return save_i387_fsave(buf); + } + } else { + return fpregs_soft_get(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) ? -1 : 1; + } +} + +static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) +{ + struct task_struct *tsk = current; + clear_fpu(tsk); + return __copy_from_user(&tsk->thread.i387.fsave, buf, + sizeof(struct i387_fsave_struct)); +} + +static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) +{ + int err; + struct task_struct *tsk = current; + struct user_i387_ia32_struct env; + clear_fpu(tsk); + err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct)); + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + if (err || __copy_from_user(&env, buf, sizeof(env))) + return 1; + convert_to_fxsr(tsk, &env); + return 0; +} + +int restore_i387_ia32(struct _fpstate_ia32 __user *buf) +{ + int err; + + if (HAVE_HWFP) { + if (cpu_has_fxsr) { + err = restore_i387_fxsave(buf); + } else { + err = restore_i387_fsave(buf); + } + } else { + err = fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + } + set_used_math(); + return err; +} + +/* + * FPU state for core dumps. + * This is only used for a.out dumps now. + * It is declared generically using elf_fpregset_t (which is + * struct user_i387_struct) but is in fact only used for 32-bit + * dumps, so on 64-bit it is really struct user_i387_ia32_struct. + */ +int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = !!used_math(); + if (fpvalid) + fpvalid = !fpregs_get(tsk, NULL, + 0, sizeof(struct user_i387_ia32_struct), + fpu, NULL); + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +#endif /* CONFIG_X86_32 || CONFIG_IA32_EMULATION */ diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c deleted file mode 100644 index 7d2e12f6c78..00000000000 --- a/arch/x86/kernel/i387_32.c +++ /dev/null @@ -1,544 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -#include <linux/sched.h> -#include <linux/module.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/math_emu.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -#ifdef CONFIG_MATH_EMULATION -#define HAVE_HWFP (boot_cpu_data.hard_math) -#else -#define HAVE_HWFP 1 -#endif - -static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned long mask = 0; - clts(); - if (cpu_has_fxsr) { - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - } - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. - */ -void init_fpu(struct task_struct *tsk) -{ - if (cpu_has_fxsr) { - memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - tsk->thread.i387.fxsave.cwd = 0x37f; - if (cpu_has_xmm) - tsk->thread.i387.fxsave.mxcsr = 0x1f80; - } else { - memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct)); - tsk->thread.i387.fsave.cwd = 0xffff037fu; - tsk->thread.i387.fsave.swd = 0xffff0000u; - tsk->thread.i387.fsave.twd = 0xffffffffu; - tsk->thread.i387.fsave.fos = 0xffff0000u; - } - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(tsk); -} - -/* - * FPU lazy state save handling. - */ - -void kernel_fpu_begin(void) -{ - struct thread_info *thread = current_thread_info(); - - preempt_disable(); - if (thread->status & TS_USEDFPU) { - __save_init_fpu(thread->task); - return; - } - clts(); -} -EXPORT_SYMBOL_GPL(kernel_fpu_begin); - -/* - * FPU tag word conversions. - */ - -static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) -{ - unsigned int tmp; /* to avoid 16 bit prefixes in the code */ - - /* Transform each pair of bits into 01 (valid) or 00 (empty) */ - tmp = ~twd; - tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ - /* and move the valid bits to the lower byte. */ - tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ - tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ - tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ - return tmp; -} - -static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) -{ - struct _fpxreg *st = NULL; - unsigned long tos = (fxsave->swd >> 11) & 7; - unsigned long twd = (unsigned long) fxsave->twd; - unsigned long tag; - unsigned long ret = 0xffff0000u; - int i; - -#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); - - for ( i = 0 ; i < 8 ; i++ ) { - if ( twd & 0x1 ) { - st = FPREG_ADDR( fxsave, (i - tos) & 7 ); - - switch ( st->exponent & 0x7fff ) { - case 0x7fff: - tag = 2; /* Special */ - break; - case 0x0000: - if ( !st->significand[0] && - !st->significand[1] && - !st->significand[2] && - !st->significand[3] ) { - tag = 1; /* Zero */ - } else { - tag = 2; /* Special */ - } - break; - default: - if ( st->significand[3] & 0x8000 ) { - tag = 0; /* Valid */ - } else { - tag = 2; /* Special */ - } - break; - } - } else { - tag = 3; /* Empty */ - } - ret |= (tag << (2 * i)); - twd = twd >> 1; - } - return ret; -} - -/* - * FPU state interaction. - */ - -unsigned short get_fpu_cwd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.cwd; - } else { - return (unsigned short)tsk->thread.i387.fsave.cwd; - } -} - -unsigned short get_fpu_swd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.swd; - } else { - return (unsigned short)tsk->thread.i387.fsave.swd; - } -} - -#if 0 -unsigned short get_fpu_twd( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - return tsk->thread.i387.fxsave.twd; - } else { - return (unsigned short)tsk->thread.i387.fsave.twd; - } -} -#endif /* 0 */ - -unsigned short get_fpu_mxcsr( struct task_struct *tsk ) -{ - if ( cpu_has_xmm ) { - return tsk->thread.i387.fxsave.mxcsr; - } else { - return 0x1f80; - } -} - -#if 0 - -void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.cwd = cwd; - } else { - tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u); - } -} - -void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.swd = swd; - } else { - tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u); - } -} - -void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) -{ - if ( cpu_has_fxsr ) { - tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd); - } else { - tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u); - } -} - -#endif /* 0 */ - -/* - * FXSR floating point environment conversions. - */ - -static int convert_fxsr_to_user( struct _fpstate __user *buf, - struct i387_fxsave_struct *fxsave ) -{ - unsigned long env[7]; - struct _fpreg __user *to; - struct _fpxreg *from; - int i; - - env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; - env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; - env[2] = twd_fxsr_to_i387(fxsave); - env[3] = fxsave->fip; - env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); - env[5] = fxsave->foo; - env[6] = fxsave->fos; - - if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) - return 1; - - to = &buf->_st[0]; - from = (struct _fpxreg *) &fxsave->st_space[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - unsigned long __user *t = (unsigned long __user *)to; - unsigned long *f = (unsigned long *)from; - - if (__put_user(*f, t) || - __put_user(*(f + 1), t + 1) || - __put_user(from->exponent, &to->exponent)) - return 1; - } - return 0; -} - -static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, - struct _fpstate __user *buf ) -{ - unsigned long env[7]; - struct _fpxreg *to; - struct _fpreg __user *from; - int i; - - if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) - return 1; - - fxsave->cwd = (unsigned short)(env[0] & 0xffff); - fxsave->swd = (unsigned short)(env[1] & 0xffff); - fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); - fxsave->fip = env[3]; - fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); - fxsave->fcs = (env[4] & 0xffff); - fxsave->foo = env[5]; - fxsave->fos = env[6]; - - to = (struct _fpxreg *) &fxsave->st_space[0]; - from = &buf->_st[0]; - for ( i = 0 ; i < 8 ; i++, to++, from++ ) { - unsigned long *t = (unsigned long *)to; - unsigned long __user *f = (unsigned long __user *)from; - - if (__get_user(*t, f) || - __get_user(*(t + 1), f + 1) || - __get_user(to->exponent, &from->exponent)) - return 1; - } - return 0; -} - -/* - * Signal frame handlers. - */ - -static inline int save_i387_fsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - - unlazy_fpu( tsk ); - tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; - if ( __copy_to_user( buf, &tsk->thread.i387.fsave, - sizeof(struct i387_fsave_struct) ) ) - return -1; - return 1; -} - -static int save_i387_fxsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - int err = 0; - - unlazy_fpu( tsk ); - - if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) ) - return -1; - - err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status ); - err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); - if ( err ) - return -1; - - if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct) ) ) - return -1; - return 1; -} - -int save_i387( struct _fpstate __user *buf ) -{ - if ( !used_math() ) - return 0; - - /* This will cause a "finit" to be triggered by the next - * attempted FPU operation by the 'current' process. - */ - clear_used_math(); - - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return save_i387_fxsave( buf ); - } else { - return save_i387_fsave( buf ); - } - } else { - return save_i387_soft( ¤t->thread.i387.soft, buf ); - } -} - -static inline int restore_i387_fsave( struct _fpstate __user *buf ) -{ - struct task_struct *tsk = current; - clear_fpu( tsk ); - return __copy_from_user( &tsk->thread.i387.fsave, buf, - sizeof(struct i387_fsave_struct) ); -} - -static int restore_i387_fxsave( struct _fpstate __user *buf ) -{ - int err; - struct task_struct *tsk = current; - clear_fpu( tsk ); - err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct) ); - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf ); -} - -int restore_i387( struct _fpstate __user *buf ) -{ - int err; - - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - err = restore_i387_fxsave( buf ); - } else { - err = restore_i387_fsave( buf ); - } - } else { - err = restore_i387_soft( ¤t->thread.i387.soft, buf ); - } - set_used_math(); - return err; -} - -/* - * ptrace request handlers. - */ - -static inline int get_fpregs_fsave( struct user_i387_struct __user *buf, - struct task_struct *tsk ) -{ - return __copy_to_user( buf, &tsk->thread.i387.fsave, - sizeof(struct user_i387_struct) ); -} - -static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf, - struct task_struct *tsk ) -{ - return convert_fxsr_to_user( (struct _fpstate __user *)buf, - &tsk->thread.i387.fxsave ); -} - -int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk ) -{ - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return get_fpregs_fxsave( buf, tsk ); - } else { - return get_fpregs_fsave( buf, tsk ); - } - } else { - return save_i387_soft( &tsk->thread.i387.soft, - (struct _fpstate __user *)buf ); - } -} - -static inline int set_fpregs_fsave( struct task_struct *tsk, - struct user_i387_struct __user *buf ) -{ - return __copy_from_user( &tsk->thread.i387.fsave, buf, - sizeof(struct user_i387_struct) ); -} - -static inline int set_fpregs_fxsave( struct task_struct *tsk, - struct user_i387_struct __user *buf ) -{ - return convert_fxsr_from_user( &tsk->thread.i387.fxsave, - (struct _fpstate __user *)buf ); -} - -int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf ) -{ - if ( HAVE_HWFP ) { - if ( cpu_has_fxsr ) { - return set_fpregs_fxsave( tsk, buf ); - } else { - return set_fpregs_fsave( tsk, buf ); - } - } else { - return restore_i387_soft( &tsk->thread.i387.soft, - (struct _fpstate __user *)buf ); - } -} - -int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - if (__copy_to_user( buf, &tsk->thread.i387.fxsave, - sizeof(struct user_fxsr_struct) )) - return -EFAULT; - return 0; - } else { - return -EIO; - } -} - -int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf ) -{ - int ret = 0; - - if ( cpu_has_fxsr ) { - if (__copy_from_user( &tsk->thread.i387.fxsave, buf, - sizeof(struct user_fxsr_struct) )) - ret = -EFAULT; - /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; - } else { - ret = -EIO; - } - return ret; -} - -/* - * FPU state for core dumps. - */ - -static inline void copy_fpu_fsave( struct task_struct *tsk, - struct user_i387_struct *fpu ) -{ - memcpy( fpu, &tsk->thread.i387.fsave, - sizeof(struct user_i387_struct) ); -} - -static inline void copy_fpu_fxsave( struct task_struct *tsk, - struct user_i387_struct *fpu ) -{ - unsigned short *to; - unsigned short *from; - int i; - - memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) ); - - to = (unsigned short *)&fpu->st_space[0]; - from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0]; - for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { - memcpy( to, from, 5 * sizeof(unsigned short) ); - } -} - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - int fpvalid; - struct task_struct *tsk = current; - - fpvalid = !!used_math(); - if ( fpvalid ) { - unlazy_fpu( tsk ); - if ( cpu_has_fxsr ) { - copy_fpu_fxsave( tsk, fpu ); - } else { - copy_fpu_fsave( tsk, fpu ); - } - } - - return fpvalid; -} -EXPORT_SYMBOL(dump_fpu); - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - if (cpu_has_fxsr) - copy_fpu_fxsave(tsk, fpu); - else - copy_fpu_fsave(tsk, fpu); - } - return fpvalid; -} - -int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu) -{ - int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr; - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu)); - } - return fpvalid; -} diff --git a/arch/x86/kernel/i387_64.c b/arch/x86/kernel/i387_64.c deleted file mode 100644 index bfaff28fb13..00000000000 --- a/arch/x86/kernel/i387_64.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * General FPU state handling cleanups - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 rework 2002 Andi Kleen. - * Does direct fxsave in and out of user space now for signal handlers. - * All the FSAVE<->FXSAVE conversion code has been moved to the 32bit emulation, - * the 64bit user space sees a FXSAVE frame directly. - */ - -#include <linux/sched.h> -#include <linux/init.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/sigcontext.h> -#include <asm/user.h> -#include <asm/ptrace.h> -#include <asm/uaccess.h> - -unsigned int mxcsr_feature_mask __read_mostly = 0xffffffff; - -void mxcsr_feature_mask_init(void) -{ - unsigned int mask; - clts(); - memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); - mask = current->thread.i387.fxsave.mxcsr_mask; - if (mask == 0) mask = 0x0000ffbf; - mxcsr_feature_mask &= mask; - stts(); -} - -/* - * Called at bootup to set up the initial FPU state that is later cloned - * into all processes. - */ -void __cpuinit fpu_init(void) -{ - unsigned long oldcr0 = read_cr0(); - extern void __bad_fxsave_alignment(void); - - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - __bad_fxsave_alignment(); - set_in_cr4(X86_CR4_OSFXSR); - set_in_cr4(X86_CR4_OSXMMEXCPT); - - write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */ - - mxcsr_feature_mask_init(); - /* clean state in init */ - current_thread_info()->status = 0; - clear_used_math(); -} - -void init_fpu(struct task_struct *child) -{ - if (tsk_used_math(child)) { - if (child == current) - unlazy_fpu(child); - return; - } - memset(&child->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); - child->thread.i387.fxsave.cwd = 0x37f; - child->thread.i387.fxsave.mxcsr = 0x1f80; - /* only the device not available exception or ptrace can call init_fpu */ - set_stopped_child_used_math(child); -} - -/* - * Signal frame handlers. - */ - -int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.i387.fxsave)); - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n",buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *)buf); - if (err) return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; -} - -/* - * ptrace request handlers. - */ - -int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *tsk) -{ - init_fpu(tsk); - return __copy_to_user(buf, &tsk->thread.i387.fxsave, - sizeof(struct user_i387_struct)) ? -EFAULT : 0; -} - -int set_fpregs(struct task_struct *tsk, struct user_i387_struct __user *buf) -{ - if (__copy_from_user(&tsk->thread.i387.fxsave, buf, - sizeof(struct user_i387_struct))) - return -EFAULT; - return 0; -} - -/* - * FPU state for core dumps. - */ - -int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) -{ - struct task_struct *tsk = current; - - if (!used_math()) - return 0; - - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); - return 1; -} - -int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) -{ - int fpvalid = !!tsk_used_math(tsk); - - if (fpvalid) { - if (tsk == current) - unlazy_fpu(tsk); - memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(struct user_i387_struct)); -} - return fpvalid; -} diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index a42c8074532..ef62b07b2b4 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -13,10 +13,17 @@ #include <asm/delay.h> #include <asm/i8253.h> #include <asm/io.h> +#include <asm/hpet.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +#ifdef CONFIG_X86_32 +static void pit_disable_clocksource(void); +#else +static inline void pit_disable_clocksource(void) { } +#endif + /* * HPET replaces the PIT, when enabled. So we need to know, which of * the two timers is used @@ -31,38 +38,38 @@ struct clock_event_device *global_clock_event; static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); + spin_lock(&i8253_lock); switch(mode) { case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_pit(0x34, PIT_MODE); + outb_pit(LATCH & 0xff , PIT_CH0); /* LSB */ + outb_pit(LATCH >> 8 , PIT_CH0); /* MSB */ break; case CLOCK_EVT_MODE_SHUTDOWN: case CLOCK_EVT_MODE_UNUSED: if (evt->mode == CLOCK_EVT_MODE_PERIODIC || evt->mode == CLOCK_EVT_MODE_ONESHOT) { - outb_p(0x30, PIT_MODE); - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); + outb_pit(0x30, PIT_MODE); + outb_pit(0, PIT_CH0); + outb_pit(0, PIT_CH0); } + pit_disable_clocksource(); break; case CLOCK_EVT_MODE_ONESHOT: /* One shot setup */ - outb_p(0x38, PIT_MODE); + pit_disable_clocksource(); + outb_pit(0x38, PIT_MODE); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ break; } - spin_unlock_irqrestore(&i8253_lock, flags); + spin_unlock(&i8253_lock); } /* @@ -72,12 +79,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - unsigned long flags; - - spin_lock_irqsave(&i8253_lock, flags); - outb_p(delta & 0xff , PIT_CH0); /* LSB */ - outb(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); + spin_lock(&i8253_lock); + outb_pit(delta & 0xff , PIT_CH0); /* LSB */ + outb_pit(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock(&i8253_lock); return 0; } @@ -148,15 +153,15 @@ static cycle_t pit_read(void) * count), it cannot be newer. */ jifs = jiffies; - outb_p(0x00, PIT_MODE); /* latch the count ASAP */ - count = inb_p(PIT_CH0); /* read the latched count */ - count |= inb_p(PIT_CH0) << 8; + outb_pit(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_pit(PIT_CH0); /* read the latched count */ + count |= inb_pit(PIT_CH0) << 8; /* VIA686a test code... reset the latch if count > max + 1 */ if (count > LATCH) { - outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff, PIT_CH0); - outb(LATCH >> 8, PIT_CH0); + outb_pit(0x34, PIT_MODE); + outb_pit(LATCH & 0xff, PIT_CH0); + outb_pit(LATCH >> 8, PIT_CH0); count = LATCH - 1; } @@ -195,9 +200,28 @@ static struct clocksource clocksource_pit = { .shift = 20, }; +static void pit_disable_clocksource(void) +{ + /* + * Use mult to check whether it is registered or not + */ + if (clocksource_pit.mult) { + clocksource_unregister(&clocksource_pit); + clocksource_pit.mult = 0; + } +} + static int __init init_pit_clocksource(void) { - if (num_possible_cpus() > 1) /* PIT does not scale! */ + /* + * Several reasons not to register PIT as a clocksource: + * + * - On SMP PIT does not scale due to i8253_lock + * - when HPET is enabled + * - when local APIC timer is active (PIT is switched off) + */ + if (num_possible_cpus() > 1 || is_hpet_enabled() || + pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) return 0; clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c index 5f3496d0198..2d25b77102f 100644 --- a/arch/x86/kernel/i8259_32.c +++ b/arch/x86/kernel/i8259_32.c @@ -21,8 +21,6 @@ #include <asm/arch_hooks.h> #include <asm/i8259.h> -#include <io_ports.h> - /* * This is the 'legacy' 8259A Programmable Interrupt Controller, * present in the majority of PC/AT boxes. @@ -291,20 +289,20 @@ void init_8259A(int auto_eoi) outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ /* - * outb_p - this has to work on a wide range of PC hardware. + * outb_pic - this has to work on a wide range of PC hardware. */ - outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ - outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ - outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + outb_pic(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ - outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); - outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ - outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ - outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ - outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + outb_pic(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt @@ -341,7 +339,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0,0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error((void __user *)get_irq_regs()->eip); + math_error((void __user *)get_irq_regs()->ip); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index ba6d57286f5..fa57a156850 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -21,6 +21,7 @@ #include <asm/delay.h> #include <asm/desc.h> #include <asm/apic.h> +#include <asm/i8259.h> /* * Common place to define all x86 IRQ vectors @@ -48,7 +49,7 @@ */ /* - * The IO-APIC gives us many more interrupt sources. Most of these + * The IO-APIC gives us many more interrupt sources. Most of these * are unused but an SMP system is supposed to have enough memory ... * sometimes (mostly wrt. hw bugs) we get corrupted vectors all * across the spectrum, so we really want to be prepared to get all @@ -76,7 +77,7 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) /* for the irq vectors */ -static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { +static void (*__initdata interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), @@ -114,11 +115,7 @@ static struct irq_chip i8259A_chip = { /* * This contains the irq mask for both 8259A irq controllers, */ -static unsigned int cached_irq_mask = 0xffff; - -#define __byte(x,y) (((unsigned char *)&(y))[x]) -#define cached_21 (__byte(0,cached_irq_mask)) -#define cached_A1 (__byte(1,cached_irq_mask)) +unsigned int cached_irq_mask = 0xffff; /* * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) @@ -139,9 +136,9 @@ void disable_8259A_irq(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) - outb(cached_A1,0xA1); + outb(cached_slave_mask, PIC_SLAVE_IMR); else - outb(cached_21,0x21); + outb(cached_master_mask, PIC_MASTER_IMR); spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -153,9 +150,9 @@ void enable_8259A_irq(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) - outb(cached_A1,0xA1); + outb(cached_slave_mask, PIC_SLAVE_IMR); else - outb(cached_21,0x21); + outb(cached_master_mask, PIC_MASTER_IMR); spin_unlock_irqrestore(&i8259A_lock, flags); } @@ -167,9 +164,9 @@ int i8259A_irq_pending(unsigned int irq) spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) - ret = inb(0x20) & mask; + ret = inb(PIC_MASTER_CMD) & mask; else - ret = inb(0xA0) & (mask >> 8); + ret = inb(PIC_SLAVE_CMD) & (mask >> 8); spin_unlock_irqrestore(&i8259A_lock, flags); return ret; @@ -196,14 +193,14 @@ static inline int i8259A_irq_real(unsigned int irq) int irqmask = 1<<irq; if (irq < 8) { - outb(0x0B,0x20); /* ISR register */ - value = inb(0x20) & irqmask; - outb(0x0A,0x20); /* back to the IRR register */ + outb(0x0B,PIC_MASTER_CMD); /* ISR register */ + value = inb(PIC_MASTER_CMD) & irqmask; + outb(0x0A,PIC_MASTER_CMD); /* back to the IRR register */ return value; } - outb(0x0B,0xA0); /* ISR register */ - value = inb(0xA0) & (irqmask >> 8); - outb(0x0A,0xA0); /* back to the IRR register */ + outb(0x0B,PIC_SLAVE_CMD); /* ISR register */ + value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ return value; } @@ -240,14 +237,17 @@ static void mask_and_ack_8259A(unsigned int irq) handle_real_irq: if (irq & 8) { - inb(0xA1); /* DUMMY - (do we need this?) */ - outb(cached_A1,0xA1); - outb(0x60+(irq&7),0xA0);/* 'Specific EOI' to slave */ - outb(0x62,0x20); /* 'Specific EOI' to master-IRQ2 */ + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + /* 'Specific EOI' to slave */ + outb(0x60+(irq&7),PIC_SLAVE_CMD); + /* 'Specific EOI' to master-IRQ2 */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); } else { - inb(0x21); /* DUMMY - (do we need this?) */ - outb(cached_21,0x21); - outb(0x60+irq,0x20); /* 'Specific EOI' to master */ + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + /* 'Specific EOI' to master */ + outb(0x60+irq,PIC_MASTER_CMD); } spin_unlock_irqrestore(&i8259A_lock, flags); return; @@ -270,7 +270,8 @@ spurious_8259A_irq: * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { - printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + printk(KERN_DEBUG + "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); @@ -283,51 +284,6 @@ spurious_8259A_irq: } } -void init_8259A(int auto_eoi) -{ - unsigned long flags; - - i8259A_auto_eoi = auto_eoi; - - spin_lock_irqsave(&i8259A_lock, flags); - - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-2 */ - - /* - * outb_p - this has to work on a wide range of PC hardware. - */ - outb_p(0x11, 0x20); /* ICW1: select 8259A-1 init */ - outb_p(IRQ0_VECTOR, 0x21); /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ - outb_p(0x04, 0x21); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) - outb_p(0x03, 0x21); /* master does Auto EOI */ - else - outb_p(0x01, 0x21); /* master expects normal EOI */ - - outb_p(0x11, 0xA0); /* ICW1: select 8259A-2 init */ - outb_p(IRQ8_VECTOR, 0xA1); /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ - outb_p(0x02, 0xA1); /* 8259A-2 is a slave on master's IR2 */ - outb_p(0x01, 0xA1); /* (slave's support for AEOI in flat mode - is to be investigated) */ - - if (auto_eoi) - /* - * in AEOI mode we just have to mask the interrupt - * when acking. - */ - i8259A_chip.mask_ack = disable_8259A_irq; - else - i8259A_chip.mask_ack = mask_and_ack_8259A; - - udelay(100); /* wait for 8259A to initialize */ - - outb(cached_21, 0x21); /* restore master IRQ mask */ - outb(cached_A1, 0xA1); /* restore slave IRQ mask */ - - spin_unlock_irqrestore(&i8259A_lock, flags); -} - static char irq_trigger[2]; /** * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ @@ -364,8 +320,8 @@ static int i8259A_shutdown(struct sys_device *dev) * the kernel initialization code can get it * out of. */ - outb(0xff, 0x21); /* mask all of 8259A-1 */ - outb(0xff, 0xA1); /* mask all of 8259A-1 */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ return 0; } @@ -391,6 +347,58 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_pic - this has to work on a wide range of PC hardware. + */ + outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + /* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */ + outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR); + /* 8259A-1 (the master) has a slave on IR2 */ + outb_pic(0x04, PIC_MASTER_IMR); + if (auto_eoi) /* master does Auto EOI */ + outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + /* ICW2: 8259A-2 IR0-7 mapped to 0x38-0x3f */ + outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR); + /* 8259A-2 is a slave on master's IR2 */ + outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); + /* (slave's support for AEOI in flat mode is to be investigated) */ + outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); + + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + + + + /* * IRQ2 is cascade interrupt to second interrupt controller */ @@ -448,7 +456,9 @@ void __init init_ISA_irqs (void) } } -void __init init_IRQ(void) +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) { int i; diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 468c9c43784..5b3ce793436 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct files_struct init_files = INIT_FILES; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); -EXPORT_SYMBOL(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ab77f190546..4ca548632c8 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -35,6 +35,7 @@ #include <linux/htirq.h> #include <linux/freezer.h> #include <linux/kthread.h> +#include <linux/jiffies.h> /* time_after() */ #include <asm/io.h> #include <asm/smp.h> @@ -48,8 +49,6 @@ #include <mach_apic.h> #include <mach_apicdef.h> -#include "io_ports.h" - int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; @@ -351,7 +350,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include <asm/processor.h> /* kernel_thread() */ # include <linux/kernel_stat.h> /* kstat */ # include <linux/slab.h> /* kmalloc() */ -# include <linux/timer.h> /* time_after() */ +# include <linux/timer.h> #define IRQBALANCE_CHECK_ARCH -999 #define MAX_BALANCED_IRQ_INTERVAL (5*HZ) @@ -727,7 +726,7 @@ late_initcall(balanced_irq_init); #endif /* CONFIG_SMP */ #ifndef CONFIG_SMP -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { unsigned int cfg; @@ -1900,7 +1899,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; @@ -2080,7 +2079,7 @@ static struct irq_chip lapic_chip __read_mostly = { .eoi = ack_apic, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -2093,7 +2092,7 @@ static void setup_nmi (void) */ apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + enable_NMI_through_LVT0(); apic_printk(APIC_VERBOSE, " done.\n"); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 23a3ac06a23..1627c0d53e0 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -32,9 +32,11 @@ #include <linux/msi.h> #include <linux/htirq.h> #include <linux/dmar.h> +#include <linux/jiffies.h> #ifdef CONFIG_ACPI #include <acpi/acpi_bus.h> #endif +#include <linux/bootmem.h> #include <asm/idle.h> #include <asm/io.h> @@ -1069,7 +1071,7 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); @@ -1171,7 +1173,7 @@ void __apicdebuginit print_PIC(void) #endif /* 0 */ -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; @@ -1298,7 +1300,7 @@ static int __init timer_irq_works(void) */ /* jiffies wrap? */ - if (jiffies - t1 > 4) + if (time_after(jiffies, t1 + 4)) return 1; return 0; } @@ -1411,7 +1413,7 @@ static void irq_complete_move(unsigned int irq) if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_rax; + vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; @@ -1438,7 +1440,7 @@ static void ack_apic_level(unsigned int irq) int do_unmask_irq = 0; irq_complete_move(irq); -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; @@ -1565,7 +1567,7 @@ static struct hw_interrupt_type lapic_irq_type __read_mostly = { .end = end_lapic_irq, }; -static void setup_nmi (void) +static void __init setup_nmi(void) { /* * Dirty trick to enable the NMI watchdog ... @@ -1578,7 +1580,7 @@ static void setup_nmi (void) */ printk(KERN_INFO "activating NMI Watchdog ..."); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); printk(" done.\n"); } @@ -1654,7 +1656,7 @@ static inline void unlock_ExtINT_logic(void) * * FIXME: really need to revamp this for modern platforms only. */ -static inline void check_timer(void) +static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg + 0; int apic1, pin1, apic2, pin2; @@ -1788,7 +1790,10 @@ __setup("no_timer_check", notimercheck); void __init setup_IO_APIC(void) { - enable_IO_APIC(); + + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ if (acpi_ioapic) io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ @@ -2288,3 +2293,92 @@ void __init setup_ioapic_dest(void) } #endif +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + struct resource *ioapic_res; + int i; + + ioapic_res = ioapic_setup_resources(); + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + } else { + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %016lx (%016lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } + } +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); + diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c new file mode 100644 index 00000000000..bd49321034d --- /dev/null +++ b/arch/x86/kernel/io_delay.c @@ -0,0 +1,114 @@ +/* + * I/O delay strategies for inb_p/outb_p + * + * Allow for a DMI based override of port 0x80, needed for certain HP laptops + * and possibly other systems. Also allow for the gradual elimination of + * outb_p/inb_p API uses. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/dmi.h> +#include <asm/io.h> + +int io_delay_type __read_mostly = CONFIG_DEFAULT_IO_DELAY_TYPE; +EXPORT_SYMBOL_GPL(io_delay_type); + +static int __initdata io_delay_override; + +/* + * Paravirt wants native_io_delay to be a constant. + */ +void native_io_delay(void) +{ + switch (io_delay_type) { + default: + case CONFIG_IO_DELAY_TYPE_0X80: + asm volatile ("outb %al, $0x80"); + break; + case CONFIG_IO_DELAY_TYPE_0XED: + asm volatile ("outb %al, $0xed"); + break; + case CONFIG_IO_DELAY_TYPE_UDELAY: + /* + * 2 usecs is an upper-bound for the outb delay but + * note that udelay doesn't have the bus-level + * side-effects that outb does, nor does udelay() have + * precise timings during very early bootup (the delays + * are shorter until calibrated): + */ + udelay(2); + case CONFIG_IO_DELAY_TYPE_NONE: + break; + } +} +EXPORT_SYMBOL(native_io_delay); + +static int __init dmi_io_delay_0xed_port(const struct dmi_system_id *id) +{ + if (io_delay_type == CONFIG_IO_DELAY_TYPE_0X80) { + printk(KERN_NOTICE "%s: using 0xed I/O delay port\n", + id->ident); + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + } + + return 0; +} + +/* + * Quirk table for systems that misbehave (lock up, etc.) if port + * 0x80 is used: + */ +static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { + { + .callback = dmi_io_delay_0xed_port, + .ident = "Compaq Presario V6000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B7") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion dv9000z", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30B9") + } + }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "HP Pavilion tx1000", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30BF") + } + }, + { } +}; + +void __init io_delay_init(void) +{ + if (!io_delay_override) + dmi_check_system(io_delay_0xed_port_dmi_table); +} + +static int __init io_delay_param(char *s) +{ + if (!strcmp(s, "0x80")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; + else if (!strcmp(s, "0xed")) + io_delay_type = CONFIG_IO_DELAY_TYPE_0XED; + else if (!strcmp(s, "udelay")) + io_delay_type = CONFIG_IO_DELAY_TYPE_UDELAY; + else if (!strcmp(s, "none")) + io_delay_type = CONFIG_IO_DELAY_TYPE_NONE; + else + return -EINVAL; + + io_delay_override = 1; + return 0; +} + +early_param("io_delay", io_delay_param); diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport.c index 4ed48dc8df1..50e5e4a31c8 100644 --- a/arch/x86/kernel/ioport_32.c +++ b/arch/x86/kernel/ioport.c @@ -1,6 +1,6 @@ /* * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. + * by Linus. 32/64 bits code unification by Miguel Botón. */ #include <linux/sched.h> @@ -16,49 +16,27 @@ #include <linux/syscalls.h> /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +static void set_bitmap(unsigned long *bitmap, unsigned int base, + unsigned int extent, int new_value) { - unsigned long mask; - unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); - unsigned int low_index = base & (BITS_PER_LONG-1); - int length = low_index + extent; - - if (low_index != 0) { - mask = (~0UL << low_index); - if (length < BITS_PER_LONG) - mask &= ~(~0UL << length); - if (new_value) - *bitmap_base++ |= mask; - else - *bitmap_base++ &= ~mask; - length -= BITS_PER_LONG; - } - - mask = (new_value ? ~0UL : 0UL); - while (length >= BITS_PER_LONG) { - *bitmap_base++ = mask; - length -= BITS_PER_LONG; - } + unsigned int i; - if (length > 0) { - mask = ~(~0UL << length); + for (i = base; i < base + extent; i++) { if (new_value) - *bitmap_base++ |= mask; + __set_bit(i, bitmap); else - *bitmap_base++ &= ~mask; + __clear_bit(i, bitmap); } } - /* * this changes the io permissions bitmap in the current task. */ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) { - unsigned long i, max_long, bytes, bytes_updated; struct thread_struct * t = ¤t->thread; struct tss_struct * tss; - unsigned long *bitmap; + unsigned int i, max_long, bytes, bytes_updated; if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; @@ -71,7 +49,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * this is why we delay this operation until now: */ if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + unsigned long *bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) return -ENOMEM; @@ -100,11 +79,12 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) if (t->io_bitmap_ptr[i] != ~0UL) max_long = i; - bytes = (max_long + 1) * sizeof(long); + bytes = (max_long + 1) * sizeof(unsigned long); bytes_updated = max(bytes, t->io_bitmap_max); t->io_bitmap_max = bytes; +#ifdef CONFIG_X86_32 /* * Sets the lazy trigger so that the next I/O operation will * reload the correct bitmap. @@ -113,6 +93,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) */ tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; tss->io_bitmap_owner = NULL; +#else + /* Update the TSS: */ + memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); +#endif put_cpu(); @@ -124,18 +108,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * beyond the 0x3ff range: to get the full 65536 ports bitmapped * you'd need 8kB of bitmaps/process, which is a bit excessive. * - * Here we just change the eflags value on the stack: we allow + * Here we just change the flags value on the stack: we allow * only the super-user to do it. This depends on the stack-layout * on system-call entry - see also fork() and the signal handling * code. */ - -asmlinkage long sys_iopl(unsigned long unused) +static int do_iopl(unsigned int level, struct pt_regs *regs) { - volatile struct pt_regs * regs = (struct pt_regs *) &unused; - unsigned int level = regs->ebx; - unsigned int old = (regs->eflags >> 12) & 3; - struct thread_struct *t = ¤t->thread; + unsigned int old = (regs->flags >> 12) & 3; if (level > 3) return -EINVAL; @@ -144,8 +124,31 @@ asmlinkage long sys_iopl(unsigned long unused) if (!capable(CAP_SYS_RAWIO)) return -EPERM; } + regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); + + return 0; +} + +#ifdef CONFIG_X86_32 +asmlinkage long sys_iopl(unsigned long regsp) +{ + struct pt_regs *regs = (struct pt_regs *)®sp; + unsigned int level = regs->bx; + struct thread_struct *t = ¤t->thread; + int rc; + + rc = do_iopl(level, regs); + if (rc < 0) + goto out; + t->iopl = level << 12; - regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; set_iopl_mask(t->iopl); - return 0; +out: + return rc; +} +#else +asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) +{ + return do_iopl(level, regs); } +#endif diff --git a/arch/x86/kernel/ioport_64.c b/arch/x86/kernel/ioport_64.c deleted file mode 100644 index 5f62fad64da..00000000000 --- a/arch/x86/kernel/ioport_64.c +++ /dev/null @@ -1,117 +0,0 @@ -/* - * This contains the io-permission bitmap code - written by obz, with changes - * by Linus. - */ - -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/ioport.h> -#include <linux/smp.h> -#include <linux/stddef.h> -#include <linux/slab.h> -#include <linux/thread_info.h> -#include <linux/syscalls.h> - -/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ -static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) -{ - int i; - if (new_value) - for (i = base; i < base + extent; i++) - __set_bit(i, bitmap); - else - for (i = base; i < base + extent; i++) - clear_bit(i, bitmap); -} - -/* - * this changes the io permissions bitmap in the current task. - */ -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) -{ - unsigned int i, max_long, bytes, bytes_updated; - struct thread_struct * t = ¤t->thread; - struct tss_struct * tss; - unsigned long *bitmap; - - if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) - return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->io_bitmap_ptr) { - bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); - if (!bitmap) - return -ENOMEM; - - memset(bitmap, 0xff, IO_BITMAP_BYTES); - t->io_bitmap_ptr = bitmap; - set_thread_flag(TIF_IO_BITMAP); - } - - /* - * do it in the per-thread copy and in the TSS ... - * - * Disable preemption via get_cpu() - we must not switch away - * because the ->io_bitmap_max value must match the bitmap - * contents: - */ - tss = &per_cpu(init_tss, get_cpu()); - - set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); - - /* - * Search for a (possibly new) maximum. This is simple and stupid, - * to keep it obviously correct: - */ - max_long = 0; - for (i = 0; i < IO_BITMAP_LONGS; i++) - if (t->io_bitmap_ptr[i] != ~0UL) - max_long = i; - - bytes = (max_long + 1) * sizeof(long); - bytes_updated = max(bytes, t->io_bitmap_max); - - t->io_bitmap_max = bytes; - - /* Update the TSS: */ - memcpy(tss->io_bitmap, t->io_bitmap_ptr, bytes_updated); - - put_cpu(); - - return 0; -} - -/* - * sys_iopl has to be used when you want to access the IO ports - * beyond the 0x3ff range: to get the full 65536 ports bitmapped - * you'd need 8kB of bitmaps/process, which is a bit excessive. - * - * Here we just change the eflags value on the stack: we allow - * only the super-user to do it. This depends on the stack-layout - * on system-call entry - see also fork() and the signal handling - * code. - */ - -asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs) -{ - unsigned int old = (regs->eflags >> 12) & 3; - - if (level > 3) - return -EINVAL; - /* Trying to gain more privileges? */ - if (level > old) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - } - regs->eflags = (regs->eflags &~ X86_EFLAGS_IOPL) | (level << 12); - return 0; -} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index d3fde94f734..cef054b09d2 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -66,11 +66,11 @@ static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int irq = ~regs->orig_eax; + int irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; @@ -88,13 +88,13 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { - long esp; + long sp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (THREAD_SIZE - 1)); - if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + "=r" (sp) : "0" (THREAD_SIZE - 1)); + if (unlikely(sp < (sizeof(struct thread_info) + STACK_WARN))) { printk("do_IRQ: stack overflow: %ld\n", - esp - sizeof(struct thread_info)); + sp - sizeof(struct thread_info)); dump_stack(); } } @@ -112,7 +112,7 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, bx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -128,10 +128,10 @@ fastcall unsigned int do_IRQ(struct pt_regs *regs) (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call *%%edi \n" - " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) : "memory", "cc" diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 6b5c730d67b..3aac15466a9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -20,6 +20,26 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But don't ack when the APIC is disabled. -AK + */ + if (!disable_apic) + ack_APIC_irq(); +} + #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: @@ -33,11 +53,11 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 curbase = (u64)task_stack_page(current); static unsigned long warned = -60*HZ; - if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && - regs->rsp < curbase + sizeof(struct thread_info) + 128 && + if (regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && + regs->sp < curbase + sizeof(struct thread_info) + 128 && time_after(jiffies, warned + 60*HZ)) { - printk("do_IRQ: %s near stack overflow (cur:%Lx,rsp:%lx)\n", - current->comm, curbase, regs->rsp); + printk("do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n", + current->comm, curbase, regs->sp); show_stack(NULL,NULL); warned = jiffies; } @@ -142,7 +162,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs = set_irq_regs(regs); /* high bit used in ret_from_ code */ - unsigned vector = ~regs->orig_rax; + unsigned vector = ~regs->orig_ax; unsigned irq; exit_idle(); diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c new file mode 100644 index 00000000000..73354302fda --- /dev/null +++ b/arch/x86/kernel/kdebugfs.c @@ -0,0 +1,65 @@ +/* + * Architecture specific debugfs files + * + * Copyright (C) 2007, Intel Corp. + * Huang Ying <ying.huang@intel.com> + * + * This file is released under the GPLv2. + */ + +#include <linux/debugfs.h> +#include <linux/stat.h> +#include <linux/init.h> + +#include <asm/setup.h> + +#ifdef CONFIG_DEBUG_BOOT_PARAMS +static struct debugfs_blob_wrapper boot_params_blob = { + .data = &boot_params, + .size = sizeof(boot_params), +}; + +static int __init boot_params_kdebugfs_init(void) +{ + int error; + struct dentry *dbp, *version, *data; + + dbp = debugfs_create_dir("boot_params", NULL); + if (!dbp) { + error = -ENOMEM; + goto err_return; + } + version = debugfs_create_x16("version", S_IRUGO, dbp, + &boot_params.hdr.version); + if (!version) { + error = -ENOMEM; + goto err_dir; + } + data = debugfs_create_blob("data", S_IRUGO, dbp, + &boot_params_blob); + if (!data) { + error = -ENOMEM; + goto err_version; + } + return 0; +err_version: + debugfs_remove(version); +err_dir: + debugfs_remove(dbp); +err_return: + return error; +} +#endif + +static int __init arch_kdebugfs_init(void) +{ + int error = 0; + +#ifdef CONFIG_DEBUG_BOOT_PARAMS + error = boot_params_kdebugfs_init(); +#endif + + return error; +} + +arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c new file mode 100644 index 00000000000..a99e764fd66 --- /dev/null +++ b/arch/x86/kernel/kprobes.c @@ -0,0 +1,1066 @@ +/* + * Kernel Probes (KProbes) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes + * interface to access function arguments. + * 2004-Oct Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> adapted for x86_64 from i386. + * 2005-Mar Roland McGrath <roland@redhat.com> + * Fixed to handle %rip-relative addressing mode correctly. + * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston + * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> added function-return probes. + * 2005-May Rusty Lynch <rusty.lynch@intel.com> + * Added function return probes functionality + * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added + * kprobe-booster and kretprobe-booster for i386. + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster + * and kretprobe-booster for x86-64 + * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven + * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> + * unified x86 kprobes code. + */ + +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/hardirq.h> +#include <linux/preempt.h> +#include <linux/module.h> +#include <linux/kdebug.h> + +#include <asm/cacheflush.h> +#include <asm/desc.h> +#include <asm/pgtable.h> +#include <asm/uaccess.h> +#include <asm/alternative.h> + +void jprobe_return_end(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +#ifdef CONFIG_X86_64 +#define stack_addr(regs) ((unsigned long *)regs->sp) +#else +/* + * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs + * don't save the ss and esp registers if the CPU is already in kernel + * mode when it traps. So for kprobes, regs->sp and regs->ss are not + * the [nonexistent] saved stack pointer and ss register, but rather + * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to + * point to the top of the pre-int3 stack. + */ +#define stack_addr(regs) ((unsigned long *)®s->sp) +#endif + +#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not boost. + */ +static const u32 twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ---------------------------------------------- */ + W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ + W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */ + W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */ + W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */ + W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */ + W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */ + W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */ + W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u32 onebyte_has_modrm[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ----------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ + W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ + W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ + W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ + W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ + W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ + W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ + W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ + W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ + W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ + W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ + W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ + W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ + W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ + W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ + W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +static const u32 twobyte_has_modrm[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ----------------------------------------------- */ + W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ + W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ + W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ + W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ + W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ + W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ + W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ + W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ + W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ + W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ + W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ + W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ + W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ + W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ + W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ + W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ + /* ----------------------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ +}; +#undef W + +struct kretprobe_blackpoint kretprobe_blacklist[] = { + {"__switch_to", }, /* This function switches only current task, but + doesn't switch kernel stack.*/ + {NULL, NULL} /* Terminator */ +}; +const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes set_jmp_op(void *from, void *to) +{ + struct __arch_jmp_op { + char op; + s32 raddr; + } __attribute__((packed)) * jop; + jop = (struct __arch_jmp_op *)from; + jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); + jop->op = RELATIVEJUMP_INSTRUCTION; +} + +/* + * Check for the REX prefix which can only exist on X86_64 + * X86_32 always returns 0 + */ +static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +{ +#ifdef CONFIG_X86_64 + if ((*insn & 0xf0) == 0x40) + return 1; +#endif + return 0; +} + +/* + * Returns non-zero if opcode is boostable. + * RIP relative instructions are adjusted at copying time in 64 bits mode + */ +static int __kprobes can_boost(kprobe_opcode_t *opcodes) +{ + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; + +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, + (unsigned long *)twobyte_is_boostable); + } + + switch (opcode & 0xf0) { +#ifdef CONFIG_X86_64 + case 0x40: + goto retry; /* REX prefix is boostable */ +#endif + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); + case 0x70: + return 0; /* can't boost conditional jump */ + case 0xc0: + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; + case 0xd0: + /* can boost AA* and XLAT */ + return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); + case 0xe0: + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); + case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ + /* clear and set flags are boostable */ + return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + default: + /* segment override prefixes are boostable */ + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* CS override prefix and call are not boostable */ + return (opcode != 0x2e && opcode != 0x9a); + } +} + +/* + * Returns non-zero if opcode modifies the interrupt flag. + */ +static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) +{ + switch (*insn) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + + /* + * on X86_64, 0x40-0x4f are REX prefixes so we need to look + * at the next byte instead.. but of course not recurse infinitely + */ + if (is_REX_prefix(insn)) + return is_IF_modifier(++insn); + + return 0; +} + +/* + * Adjust the displacement if the instruction uses the %rip-relative + * addressing mode. + * If it does, Return the address of the 32-bit displacement word. + * If not, return null. + * Only applicable to 64-bit x86. + */ +static void __kprobes fix_riprel(struct kprobe *p) +{ +#ifdef CONFIG_X86_64 + u8 *insn = p->ainsn.insn; + s64 disp; + int need_modrm; + + /* Skip legacy instruction prefixes. */ + while (1) { + switch (*insn) { + case 0x66: + case 0x67: + case 0x2e: + case 0x3e: + case 0x26: + case 0x64: + case 0x65: + case 0x36: + case 0xf0: + case 0xf3: + case 0xf2: + ++insn; + continue; + } + break; + } + + /* Skip REX instruction prefix. */ + if (is_REX_prefix(insn)) + ++insn; + + if (*insn == 0x0f) { + /* Two-byte opcode. */ + ++insn; + need_modrm = test_bit(*insn, + (unsigned long *)twobyte_has_modrm); + } else + /* One-byte opcode. */ + need_modrm = test_bit(*insn, + (unsigned long *)onebyte_has_modrm); + + if (need_modrm) { + u8 modrm = *++insn; + if ((modrm & 0xc7) == 0x05) { + /* %rip+disp32 addressing mode */ + /* Displacement follows ModRM byte. */ + ++insn; + /* + * The copied instruction uses the %rip-relative + * addressing mode. Adjust the displacement for the + * difference between the original location of this + * instruction and the location of the copy that will + * actually be run. The tricky bit here is making sure + * that the sign extension happens correctly in this + * calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the + * %rip value and yield the same 64-bit result that the + * sign-extension of the original signed 32-bit + * displacement would have given. + */ + disp = (u8 *) p->addr + *((s32 *) insn) - + (u8 *) p->ainsn.insn; + BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ + *(s32 *)insn = (s32) disp; + } + } +#endif +} + +static void __kprobes arch_copy_kprobe(struct kprobe *p) +{ + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + + fix_riprel(p); + + if (can_boost(p->addr)) + p->ainsn.boostable = 0; + else + p->ainsn.boostable = -1; + + p->opcode = *p->addr; +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on x86. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + arch_copy_kprobe(p); + return 0; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, &p->opcode, 1); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + mutex_lock(&kprobe_mutex); + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + mutex_unlock(&kprobe_mutex); +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags; + kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; + kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_flags = kcb->kprobe_old_flags + = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + if (is_IF_modifier(p->ainsn.insn)) + kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF; +} + +static void __kprobes clear_btf(void) +{ + if (test_thread_flag(TIF_DEBUGCTLMSR)) + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); +} + +static void __kprobes restore_btf(void) +{ + if (test_thread_flag(TIF_DEBUGCTLMSR)) + wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr); +} + +static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + unsigned long *sara = stack_addr(regs); + + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} + +static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) + if (p->ainsn.boostable == 1 && !p->post_handler) { + /* Boost up -- we can execute copied instructions directly */ + reset_current_kprobe(); + regs->ip = (unsigned long)p->ainsn.insn; + preempt_enable_no_resched(); + return; + } +#endif + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; +} + +/* + * We have reentered the kprobe_handler(), since another probe was hit while + * within the handler. We save the original kprobes variables and just single + * step on the instruction of the new probe without calling any user handlers. + */ +static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + switch (kcb->kprobe_status) { + case KPROBE_HIT_SSDONE: +#ifdef CONFIG_X86_64 + /* TODO: Provide re-entrancy from post_kprobes_handler() and + * avoid exception stack corruption while single-stepping on + * the instruction of the new probe. + */ + arch_disarm_kprobe(p); + regs->ip = (unsigned long)p->addr; + reset_current_kprobe(); + preempt_enable_no_resched(); + break; +#endif + case KPROBE_HIT_ACTIVE: + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + break; + case KPROBE_HIT_SS: + if (p == kprobe_running()) { + regs->flags &= ~TF_MASK; + regs->flags |= kcb->kprobe_saved_flags; + return 0; + } else { + /* A probe has been hit in the codepath leading up + * to, or just after, single-stepping of a probed + * instruction. This entire codepath should strictly + * reside in .kprobes.text section. Raise a warning + * to highlight this peculiar case. + */ + } + default: + /* impossible cases */ + WARN_ON(1); + return 0; + } + + return 1; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +static int __kprobes kprobe_handler(struct pt_regs *regs) +{ + kprobe_opcode_t *addr; + struct kprobe *p; + struct kprobe_ctlblk *kcb; + + addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + return 1; + } + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing. We conditionally + * re-enable preemption at the end of this function, + * and also in reenter_kprobe() and setup_singlestep(). + */ + preempt_disable(); + + kcb = get_kprobe_ctlblk(); + p = get_kprobe(addr); + + if (p) { + if (kprobe_running()) { + if (reenter_kprobe(p, regs, kcb)) + return 1; + } else { + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + /* + * If we have no pre-handler or it returned 0, we + * continue with normal processing. If we have a + * pre-handler and it returned non-zero, it prepped + * for calling the break_handler below on re-entry + * for jprobe processing, so get out doing nothing + * more here. + */ + if (!p->pre_handler || !p->pre_handler(p, regs)) + setup_singlestep(p, regs, kcb); + return 1; + } + } else if (kprobe_running()) { + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + setup_singlestep(p, regs, kcb); + return 1; + } + } /* else: not a kprobe fault; let the kernel handle it */ + + preempt_enable_no_resched(); + return 0; +} + +/* + * When a retprobed function returns, this code saves registers and + * calls trampoline_handler() runs, which calls the kretprobe's handler. + */ +void __kprobes kretprobe_trampoline_holder(void) +{ + asm volatile ( + ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + /* + * Skip cs, ip, orig_ax. + * trampoline_handler() will plug in these values + */ + " subq $24, %rsp\n" + " pushq %rdi\n" + " pushq %rsi\n" + " pushq %rdx\n" + " pushq %rcx\n" + " pushq %rax\n" + " pushq %r8\n" + " pushq %r9\n" + " pushq %r10\n" + " pushq %r11\n" + " pushq %rbx\n" + " pushq %rbp\n" + " pushq %r12\n" + " pushq %r13\n" + " pushq %r14\n" + " pushq %r15\n" + " movq %rsp, %rdi\n" + " call trampoline_handler\n" + /* Replace saved sp with true return address. */ + " movq %rax, 152(%rsp)\n" + " popq %r15\n" + " popq %r14\n" + " popq %r13\n" + " popq %r12\n" + " popq %rbp\n" + " popq %rbx\n" + " popq %r11\n" + " popq %r10\n" + " popq %r9\n" + " popq %r8\n" + " popq %rax\n" + " popq %rcx\n" + " popq %rdx\n" + " popq %rsi\n" + " popq %rdi\n" + /* Skip orig_ax, ip, cs */ + " addq $24, %rsp\n" + " popfq\n" +#else + " pushf\n" + /* + * Skip cs, ip, orig_ax. + * trampoline_handler() will plug in these values + */ + " subl $12, %esp\n" + " pushl %fs\n" + " pushl %ds\n" + " pushl %es\n" + " pushl %eax\n" + " pushl %ebp\n" + " pushl %edi\n" + " pushl %esi\n" + " pushl %edx\n" + " pushl %ecx\n" + " pushl %ebx\n" + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* Move flags to cs */ + " movl 52(%esp), %edx\n" + " movl %edx, 48(%esp)\n" + /* Replace saved flags with true return address. */ + " movl %eax, 52(%esp)\n" + " popl %ebx\n" + " popl %ecx\n" + " popl %edx\n" + " popl %esi\n" + " popl %edi\n" + " popl %ebp\n" + " popl %eax\n" + /* Skip ip, orig_ax, es, ds, fs */ + " addl $20, %esp\n" + " popf\n" +#endif + " ret\n"); +} + +/* + * Called from kretprobe_trampoline + */ +void * __kprobes trampoline_handler(struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + /* fixup registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); +#endif + regs->ip = trampoline_address; + regs->orig_ax = ~0UL; + + /* + * It is possible to have multiple instances associated with a given + * task either because multiple functions in the call path have + * return probes installed on them, and/or more then one + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always pushed into the head of the list + * - when multiple return probes are registered for the same + * function, the (chronologically) first instance's ret_addr + * will be the real return address, and all the rest will + * point to kretprobe_trampoline. + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) { + __get_cpu_var(current_kprobe) = &ri->rp->kp; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->rp->handler(ri, regs); + __get_cpu_var(current_kprobe) = NULL; + } + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + return (void *)orig_ret_address; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new ip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed flags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * If this is the first time we've single-stepped the instruction at + * this probepoint, and the instruction is boostable, boost it: add a + * jump instruction after the copied instruction, that jumps to the next + * instruction after the probepoint. + */ +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, struct kprobe_ctlblk *kcb) +{ + unsigned long *tos = stack_addr(regs); + unsigned long copy_ip = (unsigned long)p->ainsn.insn; + unsigned long orig_ip = (unsigned long)p->addr; + kprobe_opcode_t *insn = p->ainsn.insn; + + /*skip the REX prefix*/ + if (is_REX_prefix(insn)) + insn++; + + regs->flags &= ~X86_EFLAGS_TF; + switch (*insn) { + case 0x9c: /* pushfl */ + *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF); + *tos |= kcb->kprobe_old_flags; + break; + case 0xc2: /* iret/ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xcf: + case 0xea: /* jmp absolute -- ip is correct */ + /* ip is already adjusted, no more changes required */ + p->ainsn.boostable = 1; + goto no_change; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_ip + (*tos - copy_ip); + break; +#ifdef CONFIG_X86_32 + case 0x9a: /* call absolute -- same as call absolute, indirect */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; +#endif + case 0xff: + if ((insn[1] & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; ip is correct. + * But this is not boostable + */ + *tos = orig_ip + (*tos - copy_ip); + goto no_change; + } else if (((insn[1] & 0x31) == 0x20) || + ((insn[1] & 0x31) == 0x21)) { + /* + * jmp near and far, absolute indirect + * ip is correct. And this is boostable + */ + p->ainsn.boostable = 1; + goto no_change; + } + default: + break; + } + + if (p->ainsn.boostable == 0) { + if ((regs->ip > copy_ip) && + (regs->ip - copy_ip) + 5 < MAX_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + set_jmp_op((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); + p->ainsn.boostable = 1; + } else { + p->ainsn.boostable = -1; + } + } + + regs->ip += orig_ip - copy_ip; + +no_change: + restore_btf(); +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. + */ +static int __kprobes post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs, kcb); + regs->flags |= kcb->kprobe_saved_flags; + trace_hardirqs_fixup_flags(regs->flags); + + /* Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + return 1; +} + +int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch (kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the ip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->ip = (unsigned long)cur->addr; + regs->flags |= kcb->kprobe_old_flags; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accounting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + + /* + * fixup routine could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} + +/* + * Wrapper routine for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode_vm(args->regs)) + return ret; + + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_GPF: + /* + * To be potentially processing a kprobe fault and to + * trust the result from kprobe_running(), we have + * be non-preemptible. + */ + if (!preemptible() && kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_sp = stack_addr(regs); + addr = (unsigned long)(kcb->jprobe_saved_sp); + + /* + * As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); + regs->flags &= ~X86_EFLAGS_IF; + trace_hardirqs_off(); + regs->ip = (unsigned long)(jp->entry); + return 1; +} + +void __kprobes jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + asm volatile ( +#ifdef CONFIG_X86_64 + " xchg %%rbx,%%rsp \n" +#else + " xchgl %%ebx,%%esp \n" +#endif + " int3 \n" + " .globl jprobe_return_end\n" + " jprobe_return_end: \n" + " nop \n"::"b" + (kcb->jprobe_saved_sp):"memory"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + u8 *addr = (u8 *) (regs->ip - 1); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && + (addr < (u8 *) jprobe_return_end)) { + if (stack_addr(regs) != kcb->jprobe_saved_sp) { + struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; + printk(KERN_ERR + "current sp %p does not match saved sp %p\n", + stack_addr(regs), kcb->jprobe_saved_sp); + printk(KERN_ERR "Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk(KERN_ERR "Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), + kcb->jprobes_stack, + MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c deleted file mode 100644 index 3a020f79f82..00000000000 --- a/arch/x86/kernel/kprobes_32.c +++ /dev/null @@ -1,756 +0,0 @@ -/* - * Kernel Probes (KProbes) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston - * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> added function-return probes. - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/preempt.h> -#include <linux/kdebug.h> -#include <asm/cacheflush.h> -#include <asm/desc.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -struct kretprobe_blackpoint kretprobe_blacklist[] = { - {"__switch_to", }, /* This function switches only current task, but - doesn't switch kernel stack.*/ - {NULL, NULL} /* Terminator */ -}; -const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); - -/* insert a jmp code */ -static __always_inline void set_jmp_op(void *from, void *to) -{ - struct __arch_jmp_op { - char op; - long raddr; - } __attribute__((packed)) *jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (long)(to) - ((long)(from) + 5); - jop->op = RELATIVEJUMP_INSTRUCTION; -} - -/* - * returns non-zero if opcodes can be boosted. - */ -static __always_inline int can_boost(kprobe_opcode_t *opcodes) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 32)) - /* - * Undefined/reserved opcodes, conditional jump, Opcode Extension - * Groups, and some special opcodes can not be boost. - */ - static const unsigned long twobyte_is_boostable[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ - W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ - W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ - W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ - W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ - W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ - W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ - W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ - W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - kprobe_opcode_t opcode; - kprobe_opcode_t *orig_opcodes = opcodes; -retry: - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - opcode = *(opcodes++); - - /* 2nd-byte opcode */ - if (opcode == 0x0f) { - if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) - return 0; - return test_bit(*opcodes, twobyte_is_boostable); - } - - switch (opcode & 0xf0) { - case 0x60: - if (0x63 < opcode && opcode < 0x67) - goto retry; /* prefixes */ - /* can't boost Address-size override and bound */ - return (opcode != 0x62 && opcode != 0x67); - case 0x70: - return 0; /* can't boost conditional jump */ - case 0xc0: - /* can't boost software-interruptions */ - return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; - case 0xd0: - /* can boost AA* and XLAT */ - return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); - case 0xe0: - /* can boost in/out and absolute jmps */ - return ((opcode & 0x04) || opcode == 0xea); - case 0xf0: - if ((opcode & 0x0c) == 0 && opcode != 0xf1) - goto retry; /* lock/rep(ne) prefix */ - /* clear and set flags can be boost */ - return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); - default: - if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) - goto retry; /* prefixes */ - /* can't boost CS override and call */ - return (opcode != 0x2e && opcode != 0x9a); - } -} - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static int __kprobes is_IF_modifier(kprobe_opcode_t opcode) -{ - switch (opcode) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on i386. */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) - return -ENOMEM; - - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - p->opcode = *p->addr; - if (can_boost(p->addr)) { - p->ainsn.boostable = 0; - } else { - p->ainsn.boostable = -1; - } - return 0; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags; - kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags; - kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->opcode)) - kcb->kprobe_saved_eflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->eip = (unsigned long)p->addr; - else - regs->eip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)®s->esp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate and they - * remain disabled thorough out this function. - */ -static int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr; - struct kprobe_ctlblk *kcb; - - addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_eflags; - goto no_kprobe; - } - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the handler. - * We here save the original kprobes variables and - * just single step on the instruction of the new probe - * without calling any user handlers. - */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->eip -= sizeof(kprobe_opcode_t); - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->eip -= sizeof(kprobe_opcode_t); - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) - if (p->ainsn.boostable == 1 && !p->post_handler){ - /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); - regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); - return 1; - } -#endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void __kprobes kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - " pushf\n" - /* skip cs, eip, orig_eax */ - " subl $12, %esp\n" - " pushl %fs\n" - " pushl %ds\n" - " pushl %es\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" - " movl %esp, %eax\n" - " call trampoline_handler\n" - /* move eflags to cs */ - " movl 52(%esp), %edx\n" - " movl %edx, 48(%esp)\n" - /* save true return address on eflags */ - " movl %eax, 52(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* skip eip, orig_eax, es, ds, fs */ - " addl $20, %esp\n" - " popf\n" - " ret\n"); -} - -/* - * Called from kretprobe_trampoline - */ -fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - /* fixup registers */ - regs->xcs = __KERNEL_CS | get_kernel_rpl(); - regs->eip = trampoline_address; - regs->orig_eax = 0xffffffff; - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler){ - __get_cpu_var(current_kprobe) = &ri->rp->kp; - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; - ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; - } - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - return (void*)orig_ret_address; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new eip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - * - * This function also checks instruction size for preparing direct execution. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)®s->esp; - unsigned long copy_eip = (unsigned long)p->ainsn.insn; - unsigned long orig_eip = (unsigned long)p->addr; - - regs->eflags &= ~TF_MASK; - switch (p->ainsn.insn[0]) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_eflags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- eip is correct */ - /* eip is already adjusted, no more changes required */ - p->ainsn.boostable = 1; - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_eip + (*tos - copy_eip); - break; - case 0x9a: /* call absolute -- same as call absolute, indirect */ - *tos = orig_eip + (*tos - copy_eip); - goto no_change; - case 0xff: - if ((p->ainsn.insn[1] & 0x30) == 0x10) { - /* - * call absolute, indirect - * Fix return addr; eip is correct. - * But this is not boostable - */ - *tos = orig_eip + (*tos - copy_eip); - goto no_change; - } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* eip is correct. And this is boostable */ - p->ainsn.boostable = 1; - goto no_change; - } - default: - break; - } - - if (p->ainsn.boostable == 0) { - if ((regs->eip > copy_eip) && - (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) { - /* - * These instructions can be executed directly if it - * jumps back to correct address. - */ - set_jmp_op((void *)regs->eip, - (void *)orig_eip + (regs->eip - copy_eip)); - p->ainsn.boostable = 1; - } else { - p->ainsn.boostable = -1; - } - } - - regs->eip = orig_eip + (regs->eip - copy_eip); - -no_change: - return; -} - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate and they - * remain disabled thoroughout this function. - */ -static int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_eflags; - trace_hardirqs_fixup_flags(regs->eflags); - - /*Restore back the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the eip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->eip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_eflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - if (fixup_exception(regs)) - return 1; - - /* - * fixup_exception() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine to for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode_vm(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_esp = ®s->esp; - addr = (unsigned long)(kcb->jprobe_saved_esp); - - /* - * TBD: As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - trace_hardirqs_off(); - regs->eip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchgl %%ebx,%%esp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_esp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->eip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (®s->esp != kcb->jprobe_saved_esp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current esp %p does not match saved esp %p\n", - ®s->esp, kcb->jprobe_saved_esp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - return 0; -} - -int __init arch_init_kprobes(void) -{ - return 0; -} diff --git a/arch/x86/kernel/kprobes_64.c b/arch/x86/kernel/kprobes_64.c deleted file mode 100644 index 5df19a9f923..00000000000 --- a/arch/x86/kernel/kprobes_64.c +++ /dev/null @@ -1,749 +0,0 @@ -/* - * Kernel Probes (KProbes) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2002, 2004 - * - * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel - * Probes initial implementation ( includes contributions from - * Rusty Russell). - * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes - * interface to access function arguments. - * 2004-Oct Jim Keniston <kenistoj@us.ibm.com> and Prasanna S Panchamukhi - * <prasanna@in.ibm.com> adapted for x86_64 - * 2005-Mar Roland McGrath <roland@redhat.com> - * Fixed to handle %rip-relative addressing mode correctly. - * 2005-May Rusty Lynch <rusty.lynch@intel.com> - * Added function return probes functionality - */ - -#include <linux/kprobes.h> -#include <linux/ptrace.h> -#include <linux/string.h> -#include <linux/slab.h> -#include <linux/preempt.h> -#include <linux/module.h> -#include <linux/kdebug.h> - -#include <asm/pgtable.h> -#include <asm/uaccess.h> -#include <asm/alternative.h> - -void jprobe_return_end(void); -static void __kprobes arch_copy_kprobe(struct kprobe *p); - -DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; -DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); - -struct kretprobe_blackpoint kretprobe_blacklist[] = { - {"__switch_to", }, /* This function switches only current task, but - doesn't switch kernel stack.*/ - {NULL, NULL} /* Terminator */ -}; -const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); - -/* - * returns non-zero if opcode modifies the interrupt flag. - */ -static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) -{ - switch (*insn) { - case 0xfa: /* cli */ - case 0xfb: /* sti */ - case 0xcf: /* iret/iretd */ - case 0x9d: /* popf/popfd */ - return 1; - } - - if (*insn >= 0x40 && *insn <= 0x4f && *++insn == 0xcf) - return 1; - return 0; -} - -int __kprobes arch_prepare_kprobe(struct kprobe *p) -{ - /* insn: must be on special executable page on x86_64. */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) { - return -ENOMEM; - } - arch_copy_kprobe(p); - return 0; -} - -/* - * Determine if the instruction uses the %rip-relative addressing mode. - * If it does, return the address of the 32-bit displacement word. - * If not, return null. - */ -static s32 __kprobes *is_riprel(u8 *insn) -{ -#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ - (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ - (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ - (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ - (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ - << (row % 64)) - static const u64 onebyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 00 */ - W(0x10, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 10 */ - W(0x20, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0)| /* 20 */ - W(0x30, 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0), /* 30 */ - W(0x40, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 40 */ - W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 50 */ - W(0x60, 0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0)| /* 60 */ - W(0x70, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 70 */ - W(0x80, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 80 */ - W(0x90, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 90 */ - W(0xa0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* a0 */ - W(0xb0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* b0 */ - W(0xc0, 1,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0)| /* c0 */ - W(0xd0, 1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1)| /* d0 */ - W(0xe0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* e0 */ - W(0xf0, 0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1) /* f0 */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; - static const u64 twobyte_has_modrm[256 / 64] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ------------------------------- */ - W(0x00, 1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1)| /* 0f */ - W(0x10, 1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0)| /* 1f */ - W(0x20, 1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1)| /* 2f */ - W(0x30, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 3f */ - W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 4f */ - W(0x50, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 5f */ - W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 6f */ - W(0x70, 1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,1), /* 7f */ - W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 8f */ - W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 9f */ - W(0xa0, 0,0,0,1,1,1,1,1,0,0,0,1,1,1,1,1)| /* af */ - W(0xb0, 1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1), /* bf */ - W(0xc0, 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0)| /* cf */ - W(0xd0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* df */ - W(0xe0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* ef */ - W(0xf0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0) /* ff */ - /* ------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - }; -#undef W - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } - - /* Skip REX instruction prefix. */ - if ((*insn & 0xf0) == 0x40) - ++insn; - - if (*insn == 0x0f) { /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, twobyte_has_modrm); - } else { /* One-byte opcode. */ - need_modrm = test_bit(*insn, onebyte_has_modrm); - } - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - return (s32 *) ++insn; - } - } - - /* No %rip-relative addressing mode here. */ - return NULL; -} - -static void __kprobes arch_copy_kprobe(struct kprobe *p) -{ - s32 *ripdisp; - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE); - ripdisp = is_riprel(p->ainsn.insn); - if (ripdisp) { - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - s64 disp = (u8 *) p->addr + *ripdisp - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *ripdisp = disp; - } - p->opcode = *p->addr; -} - -void __kprobes arch_arm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); -} - -void __kprobes arch_disarm_kprobe(struct kprobe *p) -{ - text_poke(p->addr, &p->opcode, 1); -} - -void __kprobes arch_remove_kprobe(struct kprobe *p) -{ - mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn, 0); - mutex_unlock(&kprobe_mutex); -} - -static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - kcb->prev_kprobe.kp = kprobe_running(); - kcb->prev_kprobe.status = kcb->kprobe_status; - kcb->prev_kprobe.old_rflags = kcb->kprobe_old_rflags; - kcb->prev_kprobe.saved_rflags = kcb->kprobe_saved_rflags; -} - -static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; - kcb->kprobe_status = kcb->prev_kprobe.status; - kcb->kprobe_old_rflags = kcb->prev_kprobe.old_rflags; - kcb->kprobe_saved_rflags = kcb->prev_kprobe.saved_rflags; -} - -static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) -{ - __get_cpu_var(current_kprobe) = p; - kcb->kprobe_saved_rflags = kcb->kprobe_old_rflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->ainsn.insn)) - kcb->kprobe_saved_rflags &= ~IF_MASK; -} - -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - regs->eflags |= TF_MASK; - regs->eflags &= ~IF_MASK; - /*single step inline if the instruction is an int3*/ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->rip = (unsigned long)p->addr; - else - regs->rip = (unsigned long)p->ainsn.insn; -} - -/* Called with kretprobe_lock held */ -void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, - struct pt_regs *regs) -{ - unsigned long *sara = (unsigned long *)regs->rsp; - - ri->ret_addr = (kprobe_opcode_t *) *sara; - /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; -} - -int __kprobes kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *p; - int ret = 0; - kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t)); - struct kprobe_ctlblk *kcb; - - /* - * We don't want to be preempted for the entire - * duration of kprobe processing - */ - preempt_disable(); - kcb = get_kprobe_ctlblk(); - - /* Check we're not actually recursing */ - if (kprobe_running()) { - p = get_kprobe(addr); - if (p) { - if (kcb->kprobe_status == KPROBE_HIT_SS && - *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { - regs->eflags &= ~TF_MASK; - regs->eflags |= kcb->kprobe_saved_rflags; - goto no_kprobe; - } else if (kcb->kprobe_status == KPROBE_HIT_SSDONE) { - /* TODO: Provide re-entrancy from - * post_kprobes_handler() and avoid exception - * stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->rip = (unsigned long)p->addr; - reset_current_kprobe(); - ret = 1; - } else { - /* We have reentered the kprobe_handler(), since - * another probe was hit while within the - * handler. We here save the original kprobe - * variables and just single step on instruction - * of the new probe without calling any user - * handlers. - */ - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); - kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; - return 1; - } - } else { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* The breakpoint instruction was removed by - * another cpu right after we hit, no further - * handling of this interrupt is appropriate - */ - regs->rip = (unsigned long)addr; - ret = 1; - goto no_kprobe; - } - p = __get_cpu_var(current_kprobe); - if (p->break_handler && p->break_handler(p, regs)) { - goto ss_probe; - } - } - goto no_kprobe; - } - - p = get_kprobe(addr); - if (!p) { - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->rip = (unsigned long)addr; - ret = 1; - } - /* Not one of ours: let kernel handle it */ - goto no_kprobe; - } - - set_current_kprobe(p, regs, kcb); - kcb->kprobe_status = KPROBE_HIT_ACTIVE; - - if (p->pre_handler && p->pre_handler(p, regs)) - /* handler has already set things up, so skip ss setup */ - return 1; - -ss_probe: - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; - return 1; - -no_kprobe: - preempt_enable_no_resched(); - return ret; -} - -/* - * For function-return probes, init_kprobes() establishes a probepoint - * here. When a retprobed function returns, this probe is hit and - * trampoline_probe_handler() runs, calling the kretprobe's handler. - */ - void kretprobe_trampoline_holder(void) - { - asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" - "nop\n"); - } - -/* - * Called when we hit the probe point at kretprobe_trampoline - */ -int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kretprobe_instance *ri = NULL; - struct hlist_head *head, empty_rp; - struct hlist_node *node, *tmp; - unsigned long flags, orig_ret_address = 0; - unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; - - INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); - - /* - * It is possible to have multiple instances associated with a given - * task either because an multiple functions in the call path - * have a return probe installed on them, and/or more then one return - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always inserted at the head of the list - * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the - * real return address, and all the rest will point to - * kretprobe_trampoline - */ - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - - if (ri->rp && ri->rp->handler) - ri->rp->handler(ri, regs); - - orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri, &empty_rp); - - if (orig_ret_address != trampoline_address) - /* - * This is the real return address. Any other - * instances associated with this task are for - * other calls deeper on the call stack - */ - break; - } - - kretprobe_assert(ri, orig_ret_address, trampoline_address); - regs->rip = orig_ret_address; - - reset_current_kprobe(); - spin_unlock_irqrestore(&kretprobe_lock, flags); - preempt_enable_no_resched(); - - hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { - hlist_del(&ri->hlist); - kfree(ri); - } - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; -} - -/* - * Called after single-stepping. p->addr is the address of the - * instruction whose first byte has been replaced by the "int 3" - * instruction. To avoid the SMP problems that can occur when we - * temporarily put back the original opcode to single-step, we - * single-stepped a copy of the instruction. The address of this - * copy is p->ainsn.insn. - * - * This function prepares to return from the post-single-step - * interrupt. We have to fix up the stack as follows: - * - * 0) Except in the case of absolute or indirect jump or call instructions, - * the new rip is relative to the copied instruction. We need to make - * it relative to the original instruction. - * - * 1) If the single-stepped instruction was pushfl, then the TF and IF - * flags are set in the just-pushed eflags, and may need to be cleared. - * - * 2) If the single-stepped instruction was a call, the return address - * that is atop the stack is the address following the copied instruction. - * We need to make it the address following the original instruction. - */ -static void __kprobes resume_execution(struct kprobe *p, - struct pt_regs *regs, struct kprobe_ctlblk *kcb) -{ - unsigned long *tos = (unsigned long *)regs->rsp; - unsigned long copy_rip = (unsigned long)p->ainsn.insn; - unsigned long orig_rip = (unsigned long)p->addr; - kprobe_opcode_t *insn = p->ainsn.insn; - - /*skip the REX prefix*/ - if (*insn >= 0x40 && *insn <= 0x4f) - insn++; - - regs->eflags &= ~TF_MASK; - switch (*insn) { - case 0x9c: /* pushfl */ - *tos &= ~(TF_MASK | IF_MASK); - *tos |= kcb->kprobe_old_rflags; - break; - case 0xc2: /* iret/ret/lret */ - case 0xc3: - case 0xca: - case 0xcb: - case 0xcf: - case 0xea: /* jmp absolute -- ip is correct */ - /* ip is already adjusted, no more changes required */ - goto no_change; - case 0xe8: /* call relative - Fix return addr */ - *tos = orig_rip + (*tos - copy_rip); - break; - case 0xff: - if ((insn[1] & 0x30) == 0x10) { - /* call absolute, indirect */ - /* Fix return addr; ip is correct. */ - *tos = orig_rip + (*tos - copy_rip); - goto no_change; - } else if (((insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ - ((insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ - /* ip is correct. */ - goto no_change; - } - default: - break; - } - - regs->rip = orig_rip + (regs->rip - copy_rip); -no_change: - - return; -} - -int __kprobes post_kprobe_handler(struct pt_regs *regs) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - if (!cur) - return 0; - - if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { - kcb->kprobe_status = KPROBE_HIT_SSDONE; - cur->post_handler(cur, regs, 0); - } - - resume_execution(cur, regs, kcb); - regs->eflags |= kcb->kprobe_saved_rflags; - trace_hardirqs_fixup_flags(regs->eflags); - - /* Restore the original saved kprobes variables and continue. */ - if (kcb->kprobe_status == KPROBE_REENTER) { - restore_previous_kprobe(kcb); - goto out; - } - reset_current_kprobe(); -out: - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, eflags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (regs->eflags & TF_MASK) - return 0; - - return 1; -} - -int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) -{ - struct kprobe *cur = kprobe_running(); - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - const struct exception_table_entry *fixup; - - switch(kcb->kprobe_status) { - case KPROBE_HIT_SS: - case KPROBE_REENTER: - /* - * We are here because the instruction being single - * stepped caused a page fault. We reset the current - * kprobe and the rip points back to the probe address - * and allow the page fault handler to continue as a - * normal page fault. - */ - regs->rip = (unsigned long)cur->addr; - regs->eflags |= kcb->kprobe_old_rflags; - if (kcb->kprobe_status == KPROBE_REENTER) - restore_previous_kprobe(kcb); - else - reset_current_kprobe(); - preempt_enable_no_resched(); - break; - case KPROBE_HIT_ACTIVE: - case KPROBE_HIT_SSDONE: - /* - * We increment the nmissed count for accounting, - * we can also use npre/npostfault count for accouting - * these specific fault cases. - */ - kprobes_inc_nmissed_count(cur); - - /* - * We come here because instructions in the pre/post - * handler caused the page_fault, this could happen - * if handler tries to access user space by - * copy_from_user(), get_user() etc. Let the - * user-specified handler try to fix it first. - */ - if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) - return 1; - - /* - * In case the user-specified fault handler returned - * zero, try to fix up. - */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return 1; - } - - /* - * fixup() could not handle it, - * Let do_page_fault() fix it. - */ - break; - default: - break; - } - return 0; -} - -/* - * Wrapper routine for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct die_args *args = (struct die_args *)data; - int ret = NOTIFY_DONE; - - if (args->regs && user_mode(args->regs)) - return ret; - - switch (val) { - case DIE_INT3: - if (kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_DEBUG: - if (post_kprobe_handler(args->regs)) - ret = NOTIFY_STOP; - break; - case DIE_GPF: - /* kprobe_running() needs smp_processor_id() */ - preempt_disable(); - if (kprobe_running() && - kprobe_fault_handler(args->regs, args->trapnr)) - ret = NOTIFY_STOP; - preempt_enable(); - break; - default: - break; - } - return ret; -} - -int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct jprobe *jp = container_of(p, struct jprobe, kp); - unsigned long addr; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - kcb->jprobe_saved_regs = *regs; - kcb->jprobe_saved_rsp = (long *) regs->rsp; - addr = (unsigned long)(kcb->jprobe_saved_rsp); - /* - * As Linus pointed out, gcc assumes that the callee - * owns the argument space and could overwrite it, e.g. - * tailcall optimization. So, to be absolutely safe - * we also save and restore enough stack bytes to cover - * the argument area. - */ - memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, - MIN_STACK_SIZE(addr)); - regs->eflags &= ~IF_MASK; - trace_hardirqs_off(); - regs->rip = (unsigned long)(jp->entry); - return 1; -} - -void __kprobes jprobe_return(void) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - - asm volatile (" xchg %%rbx,%%rsp \n" - " int3 \n" - " .globl jprobe_return_end \n" - " jprobe_return_end: \n" - " nop \n"::"b" - (kcb->jprobe_saved_rsp):"memory"); -} - -int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) -{ - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - u8 *addr = (u8 *) (regs->rip - 1); - unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_rsp); - struct jprobe *jp = container_of(p, struct jprobe, kp); - - if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if ((unsigned long *)regs->rsp != kcb->jprobe_saved_rsp) { - struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; - printk("current rsp %p does not match saved rsp %p\n", - (long *)regs->rsp, kcb->jprobe_saved_rsp); - printk("Saved registers for jprobe %p\n", jp); - show_registers(saved_regs); - printk("Current registers\n"); - show_registers(regs); - BUG(); - } - *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, - MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); - return 1; - } - return 0; -} - -static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, - .pre_handler = trampoline_probe_handler -}; - -int __init arch_init_kprobes(void) -{ - return register_kprobe(&trampoline_p); -} - -int __kprobes arch_trampoline_kprobe(struct kprobe *p) -{ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) - return 1; - - return 0; -} diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt.c index 9ff90a27c45..8a7660c8394 100644 --- a/arch/x86/kernel/ldt_32.c +++ b/arch/x86/kernel/ldt.c @@ -1,6 +1,9 @@ /* * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2002 Andi Kleen + * + * This handles calls from both 32bit and 64bit mode. */ #include <linux/errno.h> @@ -9,7 +12,6 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/vmalloc.h> -#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -17,7 +19,7 @@ #include <asm/desc.h> #include <asm/mmu_context.h> -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +#ifdef CONFIG_SMP static void flush_ldt(void *null) { if (current->active_mm) @@ -27,26 +29,31 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; + void *oldldt, *newldt; int oldsize; if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + mincount = (mincount + 511) & (~511); + if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + newldt = (void *)__get_free_page(GFP_KERNEL); if (!newldt) return -ENOMEM; if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE); oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, + (mincount - oldsize) * LDT_ENTRY_SIZE); + +#ifdef CONFIG_X86_64 + /* CHECKME: Do we really need this ? */ + wmb(); +#endif pc->ldt = newldt; wmb(); pc->size = mincount; @@ -55,6 +62,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP cpumask_t mask; + preempt_disable(); load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); @@ -66,10 +74,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else - kfree(oldldt); + put_page(virt_to_page(oldldt)); } return 0; } @@ -77,9 +85,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); return 0; } @@ -89,7 +98,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) */ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - struct mm_struct * old_mm; + struct mm_struct *old_mm; int retval = 0; mutex_init(&mm->context.lock); @@ -105,33 +114,38 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) /* * No need to lock the MM as we are the last user + * + * 64bit: Don't touch the LDT register - we're already in the next thread. */ void destroy_context(struct mm_struct *mm) { if (mm->context.size) { +#ifdef CONFIG_X86_32 + /* CHECKME: Can this ever happen ? */ if (mm == current->active_mm) clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) +#endif + if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else - kfree(mm->context.ldt); + put_page(virt_to_page(mm->context.ldt)); mm->context.size = 0; } } -static int read_ldt(void __user * ptr, unsigned long bytecount) +static int read_ldt(void __user *ptr, unsigned long bytecount) { int err; unsigned long size; - struct mm_struct * mm = current->mm; + struct mm_struct *mm = current->mm; if (!mm->context.size) return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES; mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; + size = mm->context.size * LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -143,7 +157,7 @@ static int read_ldt(void __user * ptr, unsigned long bytecount) goto error_return; if (size != bytecount) { /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { + if (clear_user(ptr + size, bytecount - size) != 0) { err = -EFAULT; goto error_return; } @@ -153,34 +167,32 @@ error_return: return err; } -static int read_default_ldt(void __user * ptr, unsigned long bytecount) +static int read_default_ldt(void __user *ptr, unsigned long bytecount) { - int err; - unsigned long size; - - err = 0; - size = 5*sizeof(struct desc_struct); - if (size > bytecount) - size = bytecount; - - err = size; - if (clear_user(ptr, size)) - err = -EFAULT; - - return err; + /* CHECKME: Can we use _one_ random number ? */ +#ifdef CONFIG_X86_32 + unsigned long size = 5 * sizeof(struct desc_struct); +#else + unsigned long size = 128; +#endif + if (bytecount > size) + bytecount = size; + if (clear_user(ptr, bytecount)) + return -EFAULT; + return bytecount; } -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { - struct mm_struct * mm = current->mm; - __u32 entry_1, entry_2; + struct mm_struct *mm = current->mm; + struct desc_struct ldt; int error; struct user_desc ldt_info; error = -EINVAL; if (bytecount != sizeof(ldt_info)) goto out; - error = -EFAULT; + error = -EFAULT; if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) goto out; @@ -196,28 +208,27 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) mutex_lock(&mm->context.lock); if (ldt_info.entry_number >= mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + error = alloc_ldt(¤t->mm->context, + ldt_info.entry_number + 1, 1); if (error < 0) goto out_unlock; } - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; + memset(&ldt, 0, sizeof(ldt)); goto install; } } - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); + fill_ldt(&ldt, &ldt_info); if (oldmode) - entry_2 &= ~(1 << 20); + ldt.avl = 0; /* Install the new entry ... */ install: - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt); error = 0; out_unlock: @@ -226,7 +237,8 @@ out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +asmlinkage int sys_modify_ldt(int func, void __user *ptr, + unsigned long bytecount) { int ret = -ENOSYS; diff --git a/arch/x86/kernel/ldt_64.c b/arch/x86/kernel/ldt_64.c deleted file mode 100644 index 60e57abb8e9..00000000000 --- a/arch/x86/kernel/ldt_64.c +++ /dev/null @@ -1,250 +0,0 @@ -/* - * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2002 Andi Kleen - * - * This handles calls from both 32bit and 64bit mode. - */ - -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <linux/slab.h> - -#include <asm/uaccess.h> -#include <asm/system.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> - -#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ -static void flush_ldt(void *null) -{ - if (current->active_mm) - load_LDT(¤t->active_mm->context); -} -#endif - -static int alloc_ldt(mm_context_t *pc, unsigned mincount, int reload) -{ - void *oldldt; - void *newldt; - unsigned oldsize; - - if (mincount <= (unsigned)pc->size) - return 0; - oldsize = pc->size; - mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - wmb(); - pc->ldt = newldt; - wmb(); - pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - cpumask_t mask; - - preempt_disable(); - mask = cpumask_of_cpu(smp_processor_id()); - load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); - preempt_enable(); -#else - load_LDT(pc); -#endif - } - if (oldsize) { - if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } - return 0; -} - -static inline int copy_ldt(mm_context_t *new, mm_context_t *old) -{ - int err = alloc_ldt(new, old->size, 0); - if (err < 0) - return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); - return 0; -} - -/* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm) -{ - struct mm_struct * old_mm; - int retval = 0; - - mutex_init(&mm->context.lock); - mm->context.size = 0; - old_mm = current->mm; - if (old_mm && old_mm->context.size > 0) { - mutex_lock(&old_mm->context.lock); - retval = copy_ldt(&mm->context, &old_mm->context); - mutex_unlock(&old_mm->context.lock); - } - return retval; -} - -/* - * - * Don't touch the LDT register - we're already in the next thread. - */ -void destroy_context(struct mm_struct *mm) -{ - if (mm->context.size) { - if ((unsigned)mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } -} - -static int read_ldt(void __user * ptr, unsigned long bytecount) -{ - int err; - unsigned long size; - struct mm_struct * mm = current->mm; - - if (!mm->context.size) - return 0; - if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) - bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; - - mutex_lock(&mm->context.lock); - size = mm->context.size*LDT_ENTRY_SIZE; - if (size > bytecount) - size = bytecount; - - err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; - mutex_unlock(&mm->context.lock); - if (err < 0) - goto error_return; - if (size != bytecount) { - /* zero-fill the rest */ - if (clear_user(ptr+size, bytecount-size) != 0) { - err = -EFAULT; - goto error_return; - } - } - return bytecount; -error_return: - return err; -} - -static int read_default_ldt(void __user * ptr, unsigned long bytecount) -{ - /* Arbitrary number */ - /* x86-64 default LDT is all zeros */ - if (bytecount > 128) - bytecount = 128; - if (clear_user(ptr, bytecount)) - return -EFAULT; - return bytecount; -} - -static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) -{ - struct task_struct *me = current; - struct mm_struct * mm = me->mm; - __u32 entry_1, entry_2, *lp; - int error; - struct user_desc ldt_info; - - error = -EINVAL; - - if (bytecount != sizeof(ldt_info)) - goto out; - error = -EFAULT; - if (copy_from_user(&ldt_info, ptr, bytecount)) - goto out; - - error = -EINVAL; - if (ldt_info.entry_number >= LDT_ENTRIES) - goto out; - if (ldt_info.contents == 3) { - if (oldmode) - goto out; - if (ldt_info.seg_not_present == 0) - goto out; - } - - mutex_lock(&mm->context.lock); - if (ldt_info.entry_number >= (unsigned)mm->context.size) { - error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); - if (error < 0) - goto out_unlock; - } - - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); - - /* Allow LDTs to be cleared by the user. */ - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { - if (oldmode || LDT_empty(&ldt_info)) { - entry_1 = 0; - entry_2 = 0; - goto install; - } - } - - entry_1 = LDT_entry_a(&ldt_info); - entry_2 = LDT_entry_b(&ldt_info); - if (oldmode) - entry_2 &= ~(1 << 20); - - /* Install the new entry ... */ -install: - *lp = entry_1; - *(lp+1) = entry_2; - error = 0; - -out_unlock: - mutex_unlock(&mm->context.lock); -out: - return error; -} - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) -{ - int ret = -ENOSYS; - - switch (func) { - case 0: - ret = read_ldt(ptr, bytecount); - break; - case 1: - ret = write_ldt(ptr, bytecount, 1); - break; - case 2: - ret = read_default_ldt(ptr, bytecount); - break; - case 0x11: - ret = write_ldt(ptr, bytecount, 0); - break; - } - return ret; -} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 11b935f4f88..c1cfd60639d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -32,7 +32,7 @@ static u32 kexec_pte1[1024] PAGE_ALIGNED; static void set_idt(void *newidt, __u16 limit) { - struct Xgt_desc_struct curidt; + struct desc_ptr curidt; /* ia32 supports unaliged loads & stores */ curidt.size = limit; @@ -44,7 +44,7 @@ static void set_idt(void *newidt, __u16 limit) static void set_gdt(void *newgdt, __u16 limit) { - struct Xgt_desc_struct curgdt; + struct desc_ptr curgdt; /* ia32 supports unaligned loads & stores */ curgdt.size = limit; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index aa3d2c8f773..a1fef42f8cd 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -234,10 +234,5 @@ NORET_TYPE void machine_kexec(struct kimage *image) void arch_crash_save_vmcoreinfo(void) { VMCOREINFO_SYMBOL(init_level4_pgt); - -#ifdef CONFIG_ARCH_DISCONTIGMEM_ENABLE - VMCOREINFO_SYMBOL(node_data); - VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); -#endif } diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 3960ab7e149..219f86eb612 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -63,6 +63,21 @@ static int __init mfgpt_disable(char *s) } __setup("nomfgpt", mfgpt_disable); +/* Reset the MFGPT timers. This is required by some broken BIOSes which already + * do the same and leave the system in an unstable state. TinyBIOS 0.98 is + * affected at least (0.99 is OK with MFGPT workaround left to off). + */ +static int __init mfgpt_fix(char *s) +{ + u32 val, dummy; + + /* The following udocumented bit resets the MFGPT timers */ + val = 0xFF; dummy = 0; + wrmsr(0x5140002B, val, dummy); + return 1; +} +__setup("mfgptfix", mfgpt_fix); + /* * Check whether any MFGPTs are available for the kernel to use. In most * cases, firmware that uses AMD's VSA code will claim all timers during diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 40cfd548871..6ff447f9fda 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -244,8 +244,8 @@ static int microcode_sanity_check(void *mc) return 0; /* check extended signature checksum */ for (i = 0; i < ext_sigcount; i++) { - ext_sig = (struct extended_signature *)((void *)ext_header - + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; sum = orig_sum - (mc_header->sig + mc_header->pf + mc_header->cksum) + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); @@ -279,11 +279,9 @@ static int get_maching_microcode(void *mc, int cpu) if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) return 0; - ext_header = (struct extended_sigtable *)(mc + - get_datasize(mc_header) + MC_HEADER_SIZE); + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; ext_sigcount = ext_header->count; - ext_sig = (struct extended_signature *)((void *)ext_header - + EXT_HEADER_SIZE); + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for (i = 0; i < ext_sigcount; i++) { if (microcode_update_match(cpu, mc_header, ext_sig->sig, ext_sig->pf)) @@ -539,7 +537,7 @@ static int cpu_request_microcode(int cpu) pr_debug("ucode data file %s load failed\n", name); return error; } - buf = (void *)firmware->data; + buf = firmware->data; size = firmware->size; while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) > 0) { diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c index 7a05a7f6099..67009cdd5ec 100644 --- a/arch/x86/kernel/mpparse_32.c +++ b/arch/x86/kernel/mpparse_32.c @@ -68,7 +68,7 @@ unsigned int def_to_bigsmp = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; /* Internal processor count */ -unsigned int __cpuinitdata num_processors; +unsigned int num_processors; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map; @@ -258,7 +258,7 @@ static void __init MP_ioapic_info (struct mpc_config_ioapic *m) if (!(m->mpc_flags & MPC_APIC_USABLE)) return; - printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); if (nr_ioapics >= MAX_IO_APICS) { printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", @@ -405,9 +405,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) mps_oem_check(mpc, oem, str); - printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + printk("APIC at: 0x%X\n", mpc->mpc_lapic); - /* + /* * Save the local APIC address (it might be non-default) -- but only * if we're not using ACPI. */ @@ -721,7 +721,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) unsigned long *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length); if (sizeof(*mpf) != 16) printk("Error: MPF size\n"); @@ -734,8 +734,8 @@ static int __init smp_scan_config (unsigned long base, unsigned long length) || (mpf->mpf_specification == 4)) ) { smp_found_config = 1; - printk(KERN_INFO "found SMP MP-table at %08lx\n", - virt_to_phys(mpf)); + printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n", + mpf, virt_to_phys(mpf)); reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); if (mpf->mpf_physptr) { /* @@ -918,14 +918,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) */ mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; mp_ioapic_routing[idx].gsi_base = gsi_base; - mp_ioapic_routing[idx].gsi_end = gsi_base + + mp_ioapic_routing[idx].gsi_end = gsi_base + io_apic_get_redir_entries(idx); - printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " - "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, - mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, - mp_ioapic_routing[idx].gsi_base, - mp_ioapic_routing[idx].gsi_end); + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); } void __init @@ -1041,15 +1041,16 @@ void __init mp_config_acpi_legacy_irqs (void) } #define MAX_GSI_NUM 4096 +#define IRQ_COMPRESSION_START 64 int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; + static int pci_irq = IRQ_COMPRESSION_START; /* - * Mapping between Global System Interrups, which + * Mapping between Global System Interrupts, which * represent all possible interrupts, and IRQs * assigned to actual devices. */ @@ -1086,12 +1087,16 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi_to_irq[gsi]; + return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); - if (triggering == ACPI_LEVEL_SENSITIVE) { + /* + * For GSI >= 64, use IRQ compression + */ + if ((gsi >= IRQ_COMPRESSION_START) + && (triggering == ACPI_LEVEL_SENSITIVE)) { /* * For PCI devices assign IRQs in order, avoiding gaps * due to unused I/O APIC pins. diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c index ef4aab12358..72ab1403fed 100644 --- a/arch/x86/kernel/mpparse_64.c +++ b/arch/x86/kernel/mpparse_64.c @@ -60,14 +60,18 @@ unsigned int boot_cpu_id = -1U; EXPORT_SYMBOL(boot_cpu_id); /* Internal processor count */ -unsigned int num_processors __cpuinitdata = 0; +unsigned int num_processors; unsigned disabled_cpus __cpuinitdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata + = { [0 ... NR_CPUS-1] = BAD_APICID }; +void *x86_bios_cpu_apicid_early_ptr; +DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID; +EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid); /* @@ -118,24 +122,22 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) physid_set(m->mpc_apicid, phys_cpu_present_map); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { /* - * bios_cpu_apicid is required to have processors listed + * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; } - bios_cpu_apicid[cpu] = m->mpc_apicid; - /* - * We get called early in the the start_kernel initialization - * process when the per_cpu data area is not yet setup, so we - * use a static array that is removed after the per_cpu data - * area is created. - */ - if (x86_cpu_to_apicid_ptr) { - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr; - x86_cpu_to_apicid[cpu] = m->mpc_apicid; + /* are we being called early in kernel startup? */ + if (x86_cpu_to_apicid_early_ptr) { + u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr; + u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr; + + cpu_to_apicid[cpu] = m->mpc_apicid; + bios_cpu_apicid[cpu] = m->mpc_apicid; } else { per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid; } cpu_set(cpu, cpu_possible_map); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 4f4bfd3a88b..edd413650b3 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -51,13 +51,13 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); static int endflag __initdata = 0; +#ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all * CPUs during the test make them busy. */ static __init void nmi_cpu_busy(void *data) { -#ifdef CONFIG_SMP local_irq_enable_in_hardirq(); /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, @@ -67,8 +67,8 @@ static __init void nmi_cpu_busy(void *data) care if they get somewhat less cycles. */ while (endflag == 0) mb(); -#endif } +#endif static int __init check_nmi_watchdog(void) { @@ -87,11 +87,13 @@ static int __init check_nmi_watchdog(void) printk(KERN_INFO "Testing NMI watchdog ... "); +#ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); +#endif for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; + prev_nmi_count[cpu] = nmi_count(cpu); local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks @@ -237,10 +239,10 @@ void acpi_nmi_disable(void) on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); } -void setup_apic_nmi_watchdog (void *unused) +void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) - return; + return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ @@ -329,7 +331,7 @@ __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) unsigned int sum; int touched = 0; int cpu = smp_processor_id(); - int rc=0; + int rc = 0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index c3d1476b6a1..fb99484d21c 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -39,7 +39,7 @@ static cpumask_t backtrace_mask = CPU_MASK_NONE; * 0: the lapic NMI watchdog is disabled, but can be enabled */ atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -int panic_on_timeout; +static int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; @@ -78,22 +78,22 @@ static __init void nmi_cpu_busy(void *data) } #endif -int __init check_nmi_watchdog (void) +int __init check_nmi_watchdog(void) { - int *counts; + int *prev_nmi_count; int cpu; - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) return 0; if (!atomic_read(&nmi_active)) return 0; - counts = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); - if (!counts) + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) return -1; - printk(KERN_INFO "testing NMI watchdog ... "); + printk(KERN_INFO "Testing NMI watchdog ... "); #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) @@ -101,30 +101,29 @@ int __init check_nmi_watchdog (void) #endif for (cpu = 0; cpu < NR_CPUS; cpu++) - counts[cpu] = cpu_pda(cpu)->__nmi_count; + prev_nmi_count[cpu] = cpu_pda(cpu)->__nmi_count; local_irq_enable(); mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { + if (cpu_pda(cpu)->__nmi_count - prev_nmi_count[cpu] <= 5) { printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", - cpu, - counts[cpu], - cpu_pda(cpu)->__nmi_count); + cpu, + prev_nmi_count[cpu], + cpu_pda(cpu)->__nmi_count); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } + endflag = 1; if (!atomic_read(&nmi_active)) { - kfree(counts); + kfree(prev_nmi_count); atomic_set(&nmi_active, -1); - endflag = 1; return -1; } - endflag = 1; printk("OK.\n"); /* now that we know it works we can reduce NMI frequency to @@ -132,11 +131,11 @@ int __init check_nmi_watchdog (void) if (nmi_watchdog == NMI_LOCAL_APIC) nmi_hz = lapic_adjust_nmi_hz(1); - kfree(counts); + kfree(prev_nmi_count); return 0; } -int __init setup_nmi_watchdog(char *str) +static int __init setup_nmi_watchdog(char *str) { int nmi; @@ -159,34 +158,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); -} - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); -} #ifdef CONFIG_PM static int nmi_pm_active; /* nmi_active before suspend */ @@ -217,7 +188,7 @@ static struct sysdev_class nmi_sysclass = { }; static struct sys_device device_lapic_nmi = { - .id = 0, + .id = 0, .cls = &nmi_sysclass, }; @@ -231,7 +202,7 @@ static int __init init_lapic_nmi_sysfs(void) if (nmi_watchdog != NMI_LOCAL_APIC) return 0; - if ( atomic_read(&nmi_active) < 0 ) + if (atomic_read(&nmi_active) < 0) return 0; error = sysdev_class_register(&nmi_sysclass); @@ -244,9 +215,37 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ +static void __acpi_nmi_enable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + void setup_apic_nmi_watchdog(void *unused) { - if (__get_cpu_var(wd_enabled) == 1) + if (__get_cpu_var(wd_enabled)) return; /* cheap hack to support suspend/resume */ @@ -311,8 +310,9 @@ void touch_nmi_watchdog(void) } } - touch_softlockup_watchdog(); + touch_softlockup_watchdog(); } +EXPORT_SYMBOL(touch_nmi_watchdog); int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { @@ -479,4 +479,3 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 9000d82c6dc..e65281b1634 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -82,7 +82,7 @@ static int __init numaq_tsc_disable(void) { if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } return 0; } diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt.c index f5000799f8e..075962cc75a 100644 --- a/arch/x86/kernel/paravirt_32.c +++ b/arch/x86/kernel/paravirt.c @@ -14,7 +14,10 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ + #include <linux/errno.h> #include <linux/module.h> #include <linux/efi.h> @@ -55,59 +58,9 @@ char *memory_setup(void) extern const char start_##ops##_##name[], end_##ops##_##name[]; \ asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":") -DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); -DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); -DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); -DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); -DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); -DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); -DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); -DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); -DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); - /* Undefined instruction for dealing with missing ops pointers. */ static const unsigned char ud2a[] = { 0x0f, 0x0b }; -static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, - unsigned long addr, unsigned len) -{ - const unsigned char *start, *end; - unsigned ret; - - switch(type) { -#define SITE(ops, x) \ - case PARAVIRT_PATCH(ops.x): \ - start = start_##ops##_##x; \ - end = end_##ops##_##x; \ - goto patch_site - - SITE(pv_irq_ops, irq_disable); - SITE(pv_irq_ops, irq_enable); - SITE(pv_irq_ops, restore_fl); - SITE(pv_irq_ops, save_fl); - SITE(pv_cpu_ops, iret); - SITE(pv_cpu_ops, irq_enable_sysexit); - SITE(pv_mmu_ops, read_cr2); - SITE(pv_mmu_ops, read_cr3); - SITE(pv_mmu_ops, write_cr3); - SITE(pv_cpu_ops, clts); - SITE(pv_cpu_ops, read_tsc); -#undef SITE - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; - - default: - ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); - break; - } - - return ret; -} - unsigned paravirt_patch_nop(void) { return 0; @@ -186,7 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, /* If the operation is a nop, then nop the callsite */ ret = paravirt_patch_nop(); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit)) + type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); else @@ -237,7 +190,7 @@ static void native_flush_tlb_single(unsigned long addr) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); +extern void native_irq_enable_syscall_ret(void); static int __init print_banner(void) { @@ -285,18 +238,18 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(x86_read_percpu(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); BUG_ON(preemptible()); - x86_write_percpu(paravirt_lazy_mode, mode); + __get_cpu_var(paravirt_lazy_mode) = mode; } void paravirt_leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(x86_read_percpu(paravirt_lazy_mode) != mode); + BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); - x86_write_percpu(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); + __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } void paravirt_enter_lazy_mmu(void) @@ -321,7 +274,7 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { - return x86_read_percpu(paravirt_lazy_mode); + return __get_cpu_var(paravirt_lazy_mode); } struct pv_info pv_info = { @@ -366,11 +319,16 @@ struct pv_cpu_ops pv_cpu_ops = { .read_cr4 = native_read_cr4, .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, +#ifdef CONFIG_X86_64 + .read_cr8 = native_read_cr8, + .write_cr8 = native_write_cr8, +#endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, + .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, @@ -379,13 +337,14 @@ struct pv_cpu_ops pv_cpu_ops = { .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, - .write_ldt_entry = write_dt_entry, - .write_gdt_entry = write_dt_entry, - .write_idt_entry = write_dt_entry, - .load_esp0 = native_load_esp0, + .write_ldt_entry = native_write_ldt_entry, + .write_gdt_entry = native_write_gdt_entry, + .write_idt_entry = native_write_idt_entry, + .load_sp0 = native_load_sp0, - .irq_enable_sysexit = native_irq_enable_sysexit, + .irq_enable_syscall_ret = native_irq_enable_syscall_ret, .iret = native_iret, + .swapgs = native_swapgs, .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, @@ -408,8 +367,10 @@ struct pv_apic_ops pv_apic_ops = { }; struct pv_mmu_ops pv_mmu_ops = { +#ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#endif .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, @@ -437,16 +398,23 @@ struct pv_mmu_ops pv_mmu_ops = { .kmap_atomic_pte = kmap_atomic, #endif +#if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, .set_pte_present = native_set_pte_present, - .set_pud = native_set_pud, .pte_clear = native_pte_clear, .pmd_clear = native_pmd_clear, - +#endif + .set_pud = native_set_pud, .pmd_val = native_pmd_val, .make_pmd = native_make_pmd, + +#if PAGETABLE_LEVELS == 4 + .pud_val = native_pud_val, + .make_pud = native_make_pud, + .set_pgd = native_set_pgd, #endif +#endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, .pgd_val = native_pgd_val, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c new file mode 100644 index 00000000000..82fc5fcab4f --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -0,0 +1,49 @@ +#include <asm/paravirt.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); +DEF_NATIVE(pv_cpu_ops, iret, "iret"); +DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_cpu_ops, read_tsc); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c new file mode 100644 index 00000000000..7d904e138d7 --- /dev/null +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -0,0 +1,57 @@ +#include <asm/paravirt.h> +#include <asm/asm-offsets.h> +#include <linux/stringify.h> + +DEF_NATIVE(pv_irq_ops, irq_disable, "cli"); +DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); +DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq"); +DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); +DEF_NATIVE(pv_cpu_ops, iret, "iretq"); +DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); +DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); +DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); +DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); +DEF_NATIVE(pv_cpu_ops, clts, "clts"); +DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); + +/* the three commands give us more control to how to return from a syscall */ +DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "movq %gs:" __stringify(pda_oldrsp) ", %rsp; swapgs; sysretq;"); +DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); + +unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + +#define PATCH_SITE(ops, x) \ + case PARAVIRT_PATCH(ops.x): \ + start = start_##ops##_##x; \ + end = end_##ops##_##x; \ + goto patch_site + switch(type) { + PATCH_SITE(pv_irq_ops, restore_fl); + PATCH_SITE(pv_irq_ops, save_fl); + PATCH_SITE(pv_irq_ops, irq_enable); + PATCH_SITE(pv_irq_ops, irq_disable); + PATCH_SITE(pv_cpu_ops, iret); + PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret); + PATCH_SITE(pv_cpu_ops, swapgs); + PATCH_SITE(pv_mmu_ops, read_cr2); + PATCH_SITE(pv_mmu_ops, read_cr3); + PATCH_SITE(pv_mmu_ops, write_cr3); + PATCH_SITE(pv_cpu_ops, clts); + PATCH_SITE(pv_mmu_ops, flush_tlb_single); + PATCH_SITE(pv_cpu_ops, wbinvd); + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } +#undef PATCH_SITE + return ret; +} diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6bf1f716909..21f34db2c03 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -30,7 +30,6 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/dma-mapping.h> -#include <linux/init.h> #include <linux/bitops.h> #include <linux/pci_ids.h> #include <linux/pci.h> @@ -183,7 +182,7 @@ static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG -int debugging __read_mostly = 1; +static int debugging = 1; static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) @@ -202,7 +201,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, return ~0UL; } #else /* debugging is disabled */ -int debugging __read_mostly = 0; +static int debugging; static inline unsigned long verify_bit_range(unsigned long* bitmap, int expected, unsigned long start, unsigned long end) diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma_64.c index 5552d23d23c..a82473d192a 100644 --- a/arch/x86/kernel/pci-dma_64.c +++ b/arch/x86/kernel/pci-dma_64.c @@ -13,7 +13,6 @@ #include <asm/calgary.h> int iommu_merge __read_mostly = 0; -EXPORT_SYMBOL(iommu_merge); dma_addr_t bad_dma_address __read_mostly; EXPORT_SYMBOL(bad_dma_address); @@ -230,7 +229,7 @@ EXPORT_SYMBOL(dma_set_mask); * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter * documentation. */ -__init int iommu_setup(char *p) +static __init int iommu_setup(char *p) { iommu_merge = 1; diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 06bcba53604..4d5cc718198 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -1,12 +1,12 @@ /* * Dynamic DMA mapping support for AMD Hammer. - * + * * Use the integrated AGP GART in the Hammer northbridge as an IOMMU for PCI. * This allows to use PCI devices that only support 32bit addresses on systems - * with more than 4GB. + * with more than 4GB. * * See Documentation/DMA-mapping.txt for the interface specification. - * + * * Copyright 2002 Andi Kleen, SuSE Labs. * Subject to the GNU General Public License v2 only. */ @@ -37,23 +37,26 @@ #include <asm/k8.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ -static unsigned long iommu_size; /* size of remapping area bytes */ +static unsigned long iommu_size; /* size of remapping area bytes */ static unsigned long iommu_pages; /* .. and in pages */ -static u32 *iommu_gatt_base; /* Remapping table */ +static u32 *iommu_gatt_base; /* Remapping table */ -/* If this is disabled the IOMMU will use an optimized flushing strategy - of only flushing when an mapping is reused. With it true the GART is flushed - for every mapping. Problem is that doing the lazy flush seems to trigger - bugs with some popular PCI cards, in particular 3ware (but has been also - also seen with Qlogic at least). */ +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ int iommu_fullflush = 1; -/* Allocation bitmap for the remapping area */ +/* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); -static unsigned long *iommu_gart_bitmap; /* guarded by iommu_bitmap_lock */ +/* Guarded by iommu_bitmap_lock: */ +static unsigned long *iommu_gart_bitmap; -static u32 gart_unmapped_entry; +static u32 gart_unmapped_entry; #define GPTE_VALID 1 #define GPTE_COHERENT 2 @@ -61,10 +64,10 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr,size) \ +#define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) -#define EMERGENCY_PAGES 32 /* = 128KB */ +#define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP #define AGPEXTERN extern @@ -77,130 +80,152 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static int need_flush; /* global flush state. set for each gart wrap */ -static unsigned long alloc_iommu(int size) -{ +static unsigned long alloc_iommu(int size) +{ unsigned long offset, flags; - spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = find_next_zero_string(iommu_gart_bitmap,next_bit,iommu_pages,size); + spin_lock_irqsave(&iommu_bitmap_lock, flags); + offset = find_next_zero_string(iommu_gart_bitmap, next_bit, + iommu_pages, size); if (offset == -1) { need_flush = 1; - offset = find_next_zero_string(iommu_gart_bitmap,0,iommu_pages,size); + offset = find_next_zero_string(iommu_gart_bitmap, 0, + iommu_pages, size); } - if (offset != -1) { - set_bit_string(iommu_gart_bitmap, offset, size); - next_bit = offset+size; - if (next_bit >= iommu_pages) { + if (offset != -1) { + set_bit_string(iommu_gart_bitmap, offset, size); + next_bit = offset+size; + if (next_bit >= iommu_pages) { next_bit = 0; need_flush = 1; - } - } + } + } if (iommu_fullflush) need_flush = 1; - spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + spin_unlock_irqrestore(&iommu_bitmap_lock, flags); + return offset; -} +} static void free_iommu(unsigned long offset, int size) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); __clear_bit_string(iommu_gart_bitmap, offset, size); spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} -/* +/* * Use global flush state to avoid races with multiple flushers. */ static void flush_gart(void) -{ +{ unsigned long flags; + spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); need_flush = 0; - } + } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); -} +} #ifdef CONFIG_IOMMU_LEAK -#define SET_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0); -#define CLEAR_LEAK(x) if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; +#define SET_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = __builtin_return_address(0);\ + } while (0) + +#define CLEAR_LEAK(x) \ + do { \ + if (iommu_leak_tab) \ + iommu_leak_tab[x] = NULL; \ + } while (0) /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; +static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; + static void dump_leak(void) { int i; - static int dump; - if (dump || !iommu_leak_tab) return; + static int dump; + + if (dump || !iommu_leak_tab) + return; dump = 1; - show_stack(NULL,NULL); - /* Very crude. dump some from the end of the table too */ - printk("Dumping %d pages from end of IOMMU:\n", iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i+=2) { - printk("%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i]); - printk("%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk("\n"); + show_stack(NULL, NULL); + + /* Very crude. dump some from the end of the table too */ + printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", + iommu_leak_pages); + for (i = 0; i < iommu_leak_pages; i += 2) { + printk(KERN_DEBUG "%lu: ", iommu_pages-i); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); + printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); + } + printk(KERN_DEBUG "\n"); } #else -#define SET_LEAK(x) -#define CLEAR_LEAK(x) +# define SET_LEAK(x) +# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) { - /* + /* * Ran out of IOMMU space for this operation. This is very bad. * Unfortunately the drivers cannot handle this operation properly. - * Return some non mapped prereserved space in the aperture and + * Return some non mapped prereserved space in the aperture and * let the Northbridge deal with it. This will result in garbage * in the IO operation. When the size exceeds the prereserved space - * memory corruption will occur or random memory will be DMAed + * memory corruption will occur or random memory will be DMAed * out. Hopefully no network devices use single mappings that big. - */ - - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + */ + + printk(KERN_ERR + "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", + size, dev->bus_id); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) panic("PCI-DMA: Memory would be corrupted\n"); - if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) - panic(KERN_ERR "PCI-DMA: Random memory would be DMAed\n"); - } - + if (dir == PCI_DMA_TODEVICE || dir == PCI_DMA_BIDIRECTIONAL) + panic(KERN_ERR + "PCI-DMA: Random memory would be DMAed\n"); + } #ifdef CONFIG_IOMMU_LEAK - dump_leak(); + dump_leak(); #endif -} +} -static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +need_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - if (force_iommu) - mmu = 1; - return mmu; + + if (force_iommu) + mmu = 1; + + return mmu; } -static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) -{ +static inline int +nonforced_iommu(struct device *dev, unsigned long addr, size_t size) +{ u64 mask = *dev->dma_mask; int high = addr + size > mask; int mmu = high; - return mmu; + + return mmu; } /* Map a single continuous physical area into the IOMMU. @@ -208,13 +233,14 @@ static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) -{ +{ unsigned long npages = to_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(npages); int i; + if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; if (panic_on_overflow) panic("dma_map_area overflow %lu bytes\n", size); iommu_full(dev, size, dir); @@ -229,35 +255,39 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t gart_map_simple(struct device *dev, char *buf, - size_t size, int dir) +static dma_addr_t +gart_map_simple(struct device *dev, char *buf, size_t size, int dir) { dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir); + flush_gart(); + return map; } /* Map a single area into the IOMMU */ -static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t +gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; if (!dev) dev = &fallback_dev; - phys_mem = virt_to_phys(addr); + phys_mem = virt_to_phys(addr); if (!need_iommu(dev, phys_mem, size)) - return phys_mem; + return phys_mem; bus = gart_map_simple(dev, addr, size, dir); - return bus; + + return bus; } /* * Free a DMA mapping. */ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, - size_t size, int direction) + size_t size, int direction) { unsigned long iommu_page; int npages; @@ -266,6 +296,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, if (dma_addr < iommu_bus_base + EMERGENCY_PAGES*PAGE_SIZE || dma_addr >= iommu_bus_base + iommu_size) return; + iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; npages = to_pages(dma_addr, size); for (i = 0; i < npages; i++) { @@ -278,7 +309,8 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. */ -static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void +gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { struct scatterlist *s; int i; @@ -303,12 +335,13 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, for_each_sg(sg, s, nents, i) { unsigned long addr = sg_phys(s); - if (nonforced_iommu(dev, addr, s->length)) { + + if (nonforced_iommu(dev, addr, s->length)) { addr = dma_map_area(dev, addr, s->length, dir); - if (addr == bad_dma_address) { - if (i > 0) + if (addr == bad_dma_address) { + if (i > 0) gart_unmap_sg(dev, sg, i, dir); - nents = 0; + nents = 0; sg[0].dma_length = 0; break; } @@ -317,15 +350,16 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } flush_gart(); + return nents; } /* Map multiple scatterlist entries continuous into the first. */ static int __dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, unsigned long pages) + struct scatterlist *sout, unsigned long pages) { unsigned long iommu_start = alloc_iommu(pages); - unsigned long iommu_page = iommu_start; + unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; @@ -335,32 +369,33 @@ static int __dma_map_cont(struct scatterlist *start, int nelems, for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; - + BUG_ON(s != start && s->offset); if (s == start) { sout->dma_address = iommu_bus_base; sout->dma_address += iommu_page*PAGE_SIZE + s->offset; sout->dma_length = s->length; - } else { - sout->dma_length += s->length; + } else { + sout->dma_length += s->length; } addr = phys_addr; - pages = to_pages(s->offset, s->length); - while (pages--) { - iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); + pages = to_pages(s->offset, s->length); + while (pages--) { + iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } - } - BUG_ON(iommu_page - iommu_start != pages); + } + BUG_ON(iommu_page - iommu_start != pages); + return 0; } -static inline int dma_map_cont(struct scatterlist *start, int nelems, - struct scatterlist *sout, - unsigned long pages, int need) +static inline int +dma_map_cont(struct scatterlist *start, int nelems, struct scatterlist *sout, + unsigned long pages, int need) { if (!need) { BUG_ON(nelems != 1); @@ -370,22 +405,19 @@ static inline int dma_map_cont(struct scatterlist *start, int nelems, } return __dma_map_cont(start, nelems, sout, pages); } - + /* * DMA map all entries in a scatterlist. - * Merge chunks that have page aligned sizes into a continuous mapping. + * Merge chunks that have page aligned sizes into a continuous mapping. */ -static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - int dir) +static int +gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { - int i; - int out; - int start; - unsigned long pages = 0; - int need = 0, nextneed; struct scatterlist *s, *ps, *start_sg, *sgmap; + int need = 0, nextneed, i, out, start; + unsigned long pages = 0; - if (nents == 0) + if (nents == 0) return 0; if (!dev) @@ -397,15 +429,19 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, ps = NULL; /* shut up gcc */ for_each_sg(sg, s, nents, i) { dma_addr_t addr = sg_phys(s); + s->dma_address = addr; - BUG_ON(s->length == 0); + BUG_ON(s->length == 0); - nextneed = need_iommu(dev, addr, s->length); + nextneed = need_iommu(dev, addr, s->length); /* Handle the previous not yet processed entries */ if (i > start) { - /* Can only merge when the last chunk ends on a page - boundary and the new one doesn't have an offset. */ + /* + * Can only merge when the last chunk ends on a + * page boundary and the new one doesn't have an + * offset. + */ if (!iommu_merge || !nextneed || !need || s->offset || (ps->offset + ps->length) % PAGE_SIZE) { if (dma_map_cont(start_sg, i - start, sgmap, @@ -436,6 +472,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); gart_unmap_sg(dev, sg, out, dir); + /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { out = dma_map_sg_nonforce(dev, sg, nents, dir); @@ -444,64 +481,68 @@ error: } if (panic_on_overflow) panic("dma_map_sg: overflow on %lu pages\n", pages); + iommu_full(dev, pages << PAGE_SHIFT, dir); for_each_sg(sg, s, nents, i) s->dma_address = bad_dma_address; return 0; -} +} static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) -{ - unsigned long a; - if (!iommu_size) { - iommu_size = aper_size; - if (!no_agp) - iommu_size /= 2; - } - - a = aper + iommu_size; +{ + unsigned long a; + + if (!iommu_size) { + iommu_size = aper_size; + if (!no_agp) + iommu_size /= 2; + } + + a = aper + iommu_size; iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; - if (iommu_size < 64*1024*1024) + if (iommu_size < 64*1024*1024) { printk(KERN_WARNING - "PCI-DMA: Warning: Small IOMMU %luMB. Consider increasing the AGP aperture in BIOS\n",iommu_size>>20); - + "PCI-DMA: Warning: Small IOMMU %luMB." + " Consider increasing the AGP aperture in BIOS\n", + iommu_size >> 20); + } + return iommu_size; -} +} -static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) -{ - unsigned aper_size = 0, aper_base_32; +static __init unsigned read_aperture(struct pci_dev *dev, u32 *size) +{ + unsigned aper_size = 0, aper_base_32, aper_order; u64 aper_base; - unsigned aper_order; - pci_read_config_dword(dev, 0x94, &aper_base_32); + pci_read_config_dword(dev, 0x94, &aper_base_32); pci_read_config_dword(dev, 0x90, &aper_order); - aper_order = (aper_order >> 1) & 7; + aper_order = (aper_order >> 1) & 7; - aper_base = aper_base_32 & 0x7fff; + aper_base = aper_base_32 & 0x7fff; aper_base <<= 25; - aper_size = (32 * 1024 * 1024) << aper_order; - if (aper_base + aper_size > 0x100000000UL || !aper_size) + aper_size = (32 * 1024 * 1024) << aper_order; + if (aper_base + aper_size > 0x100000000UL || !aper_size) aper_base = 0; *size = aper_size; return aper_base; -} +} -/* +/* * Private Northbridge GATT initialization in case we cannot use the - * AGP driver for some reason. + * AGP driver for some reason. */ static __init int init_k8_gatt(struct agp_kern_info *info) -{ +{ + unsigned aper_size, gatt_size, new_aper_size; + unsigned aper_base, new_aper_base; struct pci_dev *dev; void *gatt; - unsigned aper_base, new_aper_base; - unsigned aper_size, gatt_size, new_aper_size; int i; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); @@ -509,75 +550,75 @@ static __init int init_k8_gatt(struct agp_kern_info *info) dev = NULL; for (i = 0; i < num_k8_northbridges; i++) { dev = k8_northbridges[i]; - new_aper_base = read_aperture(dev, &new_aper_size); - if (!new_aper_base) - goto nommu; - - if (!aper_base) { + new_aper_base = read_aperture(dev, &new_aper_size); + if (!new_aper_base) + goto nommu; + + if (!aper_base) { aper_size = new_aper_size; aper_base = new_aper_base; - } - if (aper_size != new_aper_size || aper_base != new_aper_base) + } + if (aper_size != new_aper_size || aper_base != new_aper_base) goto nommu; } if (!aper_base) - goto nommu; + goto nommu; info->aper_base = aper_base; - info->aper_size = aper_size>>20; + info->aper_size = aper_size >> 20; - gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); - if (!gatt) + gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); + gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + if (!gatt) panic("Cannot allocate GATT table"); - if (change_page_attr_addr((unsigned long)gatt, gatt_size >> PAGE_SHIFT, PAGE_KERNEL_NOCACHE)) + if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) panic("Could not set GART PTEs to uncacheable pages"); - global_flush_tlb(); - memset(gatt, 0, gatt_size); + memset(gatt, 0, gatt_size); agp_gatt_table = gatt; for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; - u32 gatt_reg; + u32 gatt_reg; + u32 ctl; dev = k8_northbridges[i]; - gatt_reg = __pa(gatt) >> 12; - gatt_reg <<= 4; + gatt_reg = __pa(gatt) >> 12; + gatt_reg <<= 4; pci_write_config_dword(dev, 0x98, gatt_reg); - pci_read_config_dword(dev, 0x90, &ctl); + pci_read_config_dword(dev, 0x90, &ctl); ctl |= 1; ctl &= ~((1<<4) | (1<<5)); - pci_write_config_dword(dev, 0x90, ctl); + pci_write_config_dword(dev, 0x90, ctl); } flush_gart(); - - printk("PCI-DMA: aperture base @ %x size %u KB\n",aper_base, aper_size>>10); + + printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", + aper_base, aper_size>>10); return 0; nommu: - /* Should not happen anymore */ + /* Should not happen anymore */ printk(KERN_ERR "PCI-DMA: More than 4GB of RAM and no IOMMU\n" KERN_ERR "PCI-DMA: 32bit PCI IO may malfunction.\n"); - return -1; -} + return -1; +} extern int agp_amd64_init(void); static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, - .map_single = gart_map_single, - .map_simple = gart_map_simple, - .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, - .map_sg = gart_map_sg, - .unmap_sg = gart_unmap_sg, + .mapping_error = NULL, + .map_single = gart_map_single, + .map_simple = gart_map_simple, + .unmap_single = gart_unmap_single, + .sync_single_for_cpu = NULL, + .sync_single_for_device = NULL, + .sync_single_range_for_cpu = NULL, + .sync_single_range_for_device = NULL, + .sync_sg_for_cpu = NULL, + .sync_sg_for_device = NULL, + .map_sg = gart_map_sg, + .unmap_sg = gart_unmap_sg, }; void gart_iommu_shutdown(void) @@ -588,23 +629,23 @@ void gart_iommu_shutdown(void) if (no_agp && (dma_ops != &gart_dma_ops)) return; - for (i = 0; i < num_k8_northbridges; i++) { - u32 ctl; + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; - dev = k8_northbridges[i]; - pci_read_config_dword(dev, 0x90, &ctl); + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); - ctl &= ~1; + ctl &= ~1; - pci_write_config_dword(dev, 0x90, ctl); - } + pci_write_config_dword(dev, 0x90, ctl); + } } void __init gart_iommu_init(void) -{ +{ struct agp_kern_info info; - unsigned long aper_size; unsigned long iommu_start; + unsigned long aper_size; unsigned long scratch; long i; @@ -614,14 +655,14 @@ void __init gart_iommu_init(void) } #ifndef CONFIG_AGP_AMD64 - no_agp = 1; + no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ /* Add other K8 AGP bridge drivers here */ - no_agp = no_agp || - (agp_amd64_init() < 0) || + no_agp = no_agp || + (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); -#endif +#endif if (swiotlb) return; @@ -643,77 +684,78 @@ void __init gart_iommu_init(void) } printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); - aper_size = info.aper_size * 1024 * 1024; - iommu_size = check_iommu_size(info.aper_base, aper_size); - iommu_pages = iommu_size >> PAGE_SHIFT; - - iommu_gart_bitmap = (void*)__get_free_pages(GFP_KERNEL, - get_order(iommu_pages/8)); - if (!iommu_gart_bitmap) - panic("Cannot allocate iommu bitmap\n"); + aper_size = info.aper_size * 1024 * 1024; + iommu_size = check_iommu_size(info.aper_base, aper_size); + iommu_pages = iommu_size >> PAGE_SHIFT; + + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + get_order(iommu_pages/8)); + if (!iommu_gart_bitmap) + panic("Cannot allocate iommu bitmap\n"); memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK - if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + if (leak_trace) { + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); + if (iommu_leak_tab) + memset(iommu_leak_tab, 0, iommu_pages * 8); else - printk("PCI-DMA: Cannot allocate leak trace area\n"); - } + printk(KERN_DEBUG + "PCI-DMA: Cannot allocate leak trace area\n"); + } #endif - /* + /* * Out of IOMMU space handling. - * Reserve some invalid pages at the beginning of the GART. - */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + * Reserve some invalid pages at the beginning of the GART. + */ + set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); - agp_memory_reserved = iommu_size; + agp_memory_reserved = iommu_size; printk(KERN_INFO "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n", - iommu_size>>20); + iommu_size >> 20); - iommu_start = aper_size - iommu_size; - iommu_bus_base = info.aper_base + iommu_start; + iommu_start = aper_size - iommu_size; + iommu_bus_base = info.aper_base + iommu_start; bad_dma_address = iommu_bus_base; iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); - /* + /* * Unmap the IOMMU part of the GART. The alias of the page is * always mapped with cache enabled and there is no full cache * coherency across the GART remapping. The unmapping avoids * automatic prefetches from the CPU allocating cache lines in * there. All CPU accesses are done via the direct mapping to * the backing memory. The GART address is only used by PCI - * devices. + * devices. */ clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); - /* - * Try to workaround a bug (thanks to BenH) - * Set unmapped entries to a scratch page instead of 0. + /* + * Try to workaround a bug (thanks to BenH) + * Set unmapped entries to a scratch page instead of 0. * Any prefetches that hit unmapped entries won't get an bus abort * then. */ - scratch = get_zeroed_page(GFP_KERNEL); - if (!scratch) + scratch = get_zeroed_page(GFP_KERNEL); + if (!scratch) panic("Cannot allocate iommu scratch page"); gart_unmapped_entry = GPTE_ENCODE(__pa(scratch)); - for (i = EMERGENCY_PAGES; i < iommu_pages; i++) + for (i = EMERGENCY_PAGES; i < iommu_pages; i++) iommu_gatt_base[i] = gart_unmapped_entry; flush_gart(); dma_ops = &gart_dma_ops; -} +} void __init gart_parse_options(char *p) { int arg; #ifdef CONFIG_IOMMU_LEAK - if (!strncmp(p,"leak",4)) { + if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; if (*p == '=') ++p; @@ -723,18 +765,18 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush",8)) + if (!strncmp(p, "fullflush", 8)) iommu_fullflush = 1; - if (!strncmp(p, "nofullflush",11)) + if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; - if (!strncmp(p,"noagp",5)) + if (!strncmp(p, "noagp", 5)) no_agp = 1; - if (!strncmp(p, "noaperture",10)) + if (!strncmp(p, "noaperture", 10)) fix_aperture = 0; /* duplicated from pci-dma.c */ - if (!strncmp(p,"force",5)) + if (!strncmp(p, "force", 5)) gart_iommu_aperture_allowed = 1; - if (!strncmp(p,"allowed",7)) + if (!strncmp(p, "allowed", 7)) gart_iommu_aperture_allowed = 1; if (!strncmp(p, "memaper", 7)) { fallback_aper_force = 1; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 102866d729a..82a0a674a00 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -10,7 +10,6 @@ #include <asm/dma.h> int swiotlb __read_mostly; -EXPORT_SYMBOL(swiotlb); const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c index ae8f91214f1..b112406f199 100644 --- a/arch/x86/kernel/pmtimer_64.c +++ b/arch/x86/kernel/pmtimer_64.c @@ -19,13 +19,13 @@ #include <linux/time.h> #include <linux/init.h> #include <linux/cpumask.h> +#include <linux/acpi_pmtmr.h> + #include <asm/io.h> #include <asm/proto.h> #include <asm/msr.h> #include <asm/vsyscall.h> -#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ - static inline u32 cyc2us(u32 cycles) { /* The Power Management Timer ticks at 3.579545 ticks per microsecond. diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 46d391d49de..968371ab223 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,7 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> +#include <asm/kdebug.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -74,7 +75,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - return ((unsigned long *)tsk->thread.esp)[3]; + return ((unsigned long *)tsk->thread.sp)[3]; } /* @@ -113,10 +114,19 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched()) { + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); safe_halt(); /* enables interrupts racelessly */ - else - local_irq_enable(); + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { /* loop is done by the caller */ @@ -132,7 +142,7 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { cpu_relax(); } @@ -188,6 +198,9 @@ void cpu_idle(void) rmb(); idle = pm_idle; + if (rcu_pending(cpu)) + rcu_check_callbacks(cpu, 0); + if (!idle) idle = default_idle; @@ -255,13 +268,13 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -272,19 +285,37 @@ static void mwait_idle(void) mwait_idle_with_hints(0, 0); } +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - if (cpu_has(c, X86_FEATURE_MWAIT)) { - printk("monitor/mwait feature present.\n"); + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * Skip, if setup has overridden idle. * One CPU supports mwait => All CPUs supports mwait */ if (!pm_idle) { - printk("using mwait in idle threads.\n"); + printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; } } + selected = 1; } static int __init idle_setup(char *str) @@ -292,10 +323,6 @@ static int __init idle_setup(char *str) if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; -#ifdef CONFIG_X86_SMP - if (smp_num_siblings > 1) - printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); -#endif } else if (!strcmp(str, "mwait")) force_mwait = 1; else @@ -310,15 +337,15 @@ void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; - unsigned long esp; + unsigned long sp; unsigned short ss, gs; if (user_mode_vm(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; + sp = regs->sp; + ss = regs->ss & 0xffff; savesegment(gs, gs); } else { - esp = (unsigned long) (®s->esp); + sp = (unsigned long) (®s->sp); savesegment(ss, ss); savesegment(gs, gs); } @@ -331,17 +358,17 @@ void __show_registers(struct pt_regs *regs, int all) init_utsname()->version); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - 0xffff & regs->xcs, regs->eip, regs->eflags, + 0xffff & regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->eip); + print_symbol("EIP is at %s\n", regs->ip); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", - regs->eax, regs->ebx, regs->ecx, regs->edx); + regs->ax, regs->bx, regs->cx, regs->dx); printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", - regs->esi, regs->edi, regs->ebp, esp); + regs->si, regs->di, regs->bp, sp); printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, - regs->xfs & 0xffff, gs, ss); + regs->ds & 0xffff, regs->es & 0xffff, + regs->fs & 0xffff, gs, ss); if (!all) return; @@ -369,12 +396,12 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { __show_registers(regs, 1); - show_trace(NULL, regs, ®s->esp); + show_trace(NULL, regs, ®s->sp, regs->bp); } /* - * This gets run with %ebx containing the - * function to call, and %edx containing + * This gets run with %bx containing the + * function to call, and %dx containing * the "args". */ extern void kernel_thread_helper(void); @@ -388,16 +415,16 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) memset(®s, 0, sizeof(regs)); - regs.ebx = (unsigned long) fn; - regs.edx = (unsigned long) arg; + regs.bx = (unsigned long) fn; + regs.dx = (unsigned long) arg; - regs.xds = __USER_DS; - regs.xes = __USER_DS; - regs.xfs = __KERNEL_PERCPU; - regs.orig_eax = -1; - regs.eip = (unsigned long) kernel_thread_helper; - regs.xcs = __KERNEL_CS | get_kernel_rpl(); - regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + regs.ds = __USER_DS; + regs.es = __USER_DS; + regs.fs = __KERNEL_PERCPU; + regs.orig_ax = -1; + regs.ip = (unsigned long) kernel_thread_helper; + regs.cs = __KERNEL_CS | get_kernel_rpl(); + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); @@ -435,7 +462,12 @@ void flush_thread(void) { struct task_struct *tsk = current; - memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + tsk->thread.debugreg0 = 0; + tsk->thread.debugreg1 = 0; + tsk->thread.debugreg2 = 0; + tsk->thread.debugreg3 = 0; + tsk->thread.debugreg6 = 0; + tsk->thread.debugreg7 = 0; memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); clear_tsk_thread_flag(tsk, TIF_DEBUG); /* @@ -460,7 +492,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -470,15 +502,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, childregs = task_pt_regs(p); *childregs = *regs; - childregs->eax = 0; - childregs->esp = esp; + childregs->ax = 0; + childregs->sp = sp; - p->thread.esp = (unsigned long) childregs; - p->thread.esp0 = (unsigned long) (childregs+1); + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + p->thread.ip = (unsigned long) ret_from_fork; - savesegment(gs,p->thread.gs); + savesegment(gs, p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -491,32 +523,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, set_tsk_thread_flag(p, TIF_IO_BITMAP); } + err = 0; + /* * Set a new TLS for the child thread? */ - if (clone_flags & CLONE_SETTLS) { - struct desc_struct *desc; - struct user_desc info; - int idx; - - err = -EFAULT; - if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) - goto out; - err = -EINVAL; - if (LDT_empty(&info)) - goto out; - - idx = info.entry_number; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - goto out; - - desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } + if (clone_flags & CLONE_SETTLS) + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); - err = 0; - out: if (err && p->thread.io_bitmap_ptr) { kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; @@ -529,62 +544,52 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, */ void dump_thread(struct pt_regs * regs, struct user * dump) { - int i; + u16 gs; /* changed the size calculations - should hopefully work better. lbt */ dump->magic = CMAGIC; dump->start_code = 0; - dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; dump->u_dsize -= dump->u_tsize; dump->u_ssize = 0; - for (i = 0; i < 8; i++) - dump->u_debugreg[i] = current->thread.debugreg[i]; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; if (dump->start_stack < TASK_SIZE) dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; - dump->regs.ebx = regs->ebx; - dump->regs.ecx = regs->ecx; - dump->regs.edx = regs->edx; - dump->regs.esi = regs->esi; - dump->regs.edi = regs->edi; - dump->regs.ebp = regs->ebp; - dump->regs.eax = regs->eax; - dump->regs.ds = regs->xds; - dump->regs.es = regs->xes; - dump->regs.fs = regs->xfs; - savesegment(gs,dump->regs.gs); - dump->regs.orig_eax = regs->orig_eax; - dump->regs.eip = regs->eip; - dump->regs.cs = regs->xcs; - dump->regs.eflags = regs->eflags; - dump->regs.esp = regs->esp; - dump->regs.ss = regs->xss; + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; + savesegment(gs,gs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; + dump->regs.ss = (u16)regs->ss; dump->u_fpvalid = dump_fpu (regs, &dump->i387); } EXPORT_SYMBOL(dump_thread); -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs ptregs = *task_pt_regs(tsk); - ptregs.xcs &= 0xffff; - ptregs.xds &= 0xffff; - ptregs.xes &= 0xffff; - ptregs.xss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; -} - #ifdef CONFIG_SECCOMP -void hard_disable_TSC(void) +static void hard_disable_TSC(void) { write_cr4(read_cr4() | X86_CR4_TSD); } @@ -599,7 +604,7 @@ void disable_TSC(void) hard_disable_TSC(); preempt_enable(); } -void hard_enable_TSC(void) +static void hard_enable_TSC(void) { write_cr4(read_cr4() & ~X86_CR4_TSD); } @@ -609,18 +614,32 @@ static noinline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss) { - struct thread_struct *next; + struct thread_struct *prev, *next; + unsigned long debugctl; + prev = &prev_p->thread; next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); + } + + if (next->debugctlmsr != debugctl) + wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { - set_debugreg(next->debugreg[0], 0); - set_debugreg(next->debugreg[1], 1); - set_debugreg(next->debugreg[2], 2); - set_debugreg(next->debugreg[3], 3); + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); + set_debugreg(next->debugreg2, 2); + set_debugreg(next->debugreg3, 3); /* no 4 and 5 */ - set_debugreg(next->debugreg[6], 6); - set_debugreg(next->debugreg[7], 7); + set_debugreg(next->debugreg6, 6); + set_debugreg(next->debugreg7, 7); } #ifdef CONFIG_SECCOMP @@ -634,6 +653,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, } #endif + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); + + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -687,11 +713,11 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * More important, however, is the fact that this allows us much * more flexibility. * - * The return value (in %eax) will be the "prev" task after + * The return value (in %ax) will be the "prev" task after * the task-switch, and shows up in ret_from_fork in entry.S, * for example. */ -struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; @@ -710,7 +736,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Reload esp0. */ - load_esp0(tss, next); + load_sp0(tss, next); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -774,7 +800,7 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas asmlinkage int sys_fork(struct pt_regs regs) { - return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } asmlinkage int sys_clone(struct pt_regs regs) @@ -783,12 +809,12 @@ asmlinkage int sys_clone(struct pt_regs regs) unsigned long newsp; int __user *parent_tidptr, *child_tidptr; - clone_flags = regs.ebx; - newsp = regs.ecx; - parent_tidptr = (int __user *)regs.edx; - child_tidptr = (int __user *)regs.edi; + clone_flags = regs.bx; + newsp = regs.cx; + parent_tidptr = (int __user *)regs.dx; + child_tidptr = (int __user *)regs.di; if (!newsp) - newsp = regs.esp; + newsp = regs.sp; return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); } @@ -804,7 +830,7 @@ asmlinkage int sys_clone(struct pt_regs regs) */ asmlinkage int sys_vfork(struct pt_regs regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.sp, ®s, 0, NULL, NULL); } /* @@ -815,18 +841,15 @@ asmlinkage int sys_execve(struct pt_regs regs) int error; char * filename; - filename = getname((char __user *) regs.ebx); + filename = getname((char __user *) regs.bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) regs.ecx, - (char __user * __user *) regs.edx, + (char __user * __user *) regs.cx, + (char __user * __user *) regs.dx, ®s); if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); /* Make sure we don't return using sysenter.. */ set_thread_flag(TIF_IRET); } @@ -840,145 +863,37 @@ out: unsigned long get_wchan(struct task_struct *p) { - unsigned long ebp, esp, eip; + unsigned long bp, sp, ip; unsigned long stack_page; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; stack_page = (unsigned long)task_stack_page(p); - esp = p->thread.esp; - if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + sp = p->thread.sp; + if (!stack_page || sp < stack_page || sp > top_esp+stack_page) return 0; - /* include/asm-i386/system.h:switch_to() pushes ebp last. */ - ebp = *(unsigned long *) esp; + /* include/asm-i386/system.h:switch_to() pushes bp last. */ + bp = *(unsigned long *) sp; do { - if (ebp < stack_page || ebp > top_ebp+stack_page) + if (bp < stack_page || bp > top_ebp+stack_page) return 0; - eip = *(unsigned long *) (ebp+4); - if (!in_sched_functions(eip)) - return eip; - ebp = *(unsigned long *) ebp; + ip = *(unsigned long *) (bp+4); + if (!in_sched_functions(ip)) + return ip; + bp = *(unsigned long *) bp; } while (count++ < 16); return 0; } -/* - * sys_alloc_thread_area: get a yet unused TLS descriptor index. - */ -static int get_free_idx(void) -{ - struct thread_struct *t = ¤t->thread; - int idx; - - for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) - if (desc_empty(t->tls_array + idx)) - return idx + GDT_ENTRY_TLS_MIN; - return -ESRCH; -} - -/* - * Set a given TLS descriptor: - */ -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) -{ - struct thread_struct *t = ¤t->thread; - struct user_desc info; - struct desc_struct *desc; - int cpu, idx; - - if (copy_from_user(&info, u_info, sizeof(info))) - return -EFAULT; - idx = info.entry_number; - - /* - * index -1 means the kernel should try to find and - * allocate an empty descriptor: - */ - if (idx == -1) { - idx = get_free_idx(); - if (idx < 0) - return idx; - if (put_user(idx, &u_info->entry_number)) - return -EFAULT; - } - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; - - /* - * We must not get preempted while modifying the TLS. - */ - cpu = get_cpu(); - - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - load_TLS(t, cpu); - - put_cpu(); - - return 0; -} - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) -{ - struct user_desc info; - struct desc_struct *desc; - int idx; - - if (get_user(idx, &u_info->entry_number)) - return -EFAULT; - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - memset(&info, 0, sizeof(info)); - - desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(u_info, &info, sizeof(info))) - return -EFAULT; - return 0; -} - unsigned long arch_align_stack(unsigned long sp) { if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ab79e1dfa02..137a86171c3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -3,7 +3,7 @@ * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * + * * X86-64 port * Andi Kleen. * @@ -19,19 +19,19 @@ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> -#include <linux/fs.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> -#include <linux/module.h> #include <linux/a.out.h> #include <linux/interrupt.h> +#include <linux/utsname.h> #include <linux/delay.h> +#include <linux/module.h> #include <linux/ptrace.h> -#include <linux/utsname.h> #include <linux/random.h> #include <linux/notifier.h> #include <linux/kprobes.h> @@ -72,13 +72,6 @@ void idle_notifier_register(struct notifier_block *n) { atomic_notifier_chain_register(&idle_notifier, n); } -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL(idle_notifier_unregister); void enter_idle(void) { @@ -106,7 +99,7 @@ void exit_idle(void) * We use this if we don't have any better * idle routine.. */ -static void default_idle(void) +void default_idle(void) { current_thread_info()->status &= ~TS_POLLING; /* @@ -116,11 +109,18 @@ static void default_idle(void) smp_mb(); local_irq_disable(); if (!need_resched()) { - /* Enables interrupts one instruction before HLT. - x86 special cases this so there is no race. */ - safe_halt(); - } else - local_irq_enable(); + ktime_t t0, t1; + u64 t0n, t1n; + + t0 = ktime_get(); + t0n = ktime_to_ns(t0); + safe_halt(); /* enables interrupts racelessly */ + local_irq_disable(); + t1 = ktime_get(); + t1n = ktime_to_ns(t1); + sched_clock_idle_wakeup_event(t1n - t0n); + } + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -129,54 +129,12 @@ static void default_idle(void) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); cpu_relax(); } -static void do_nothing(void *unused) -{ -} - -void cpu_idle_wait(void) -{ - unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map, tmp = current->cpus_allowed; - - set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); - put_cpu(); - - cpus_clear(map); - for_each_online_cpu(cpu) { - per_cpu(cpu_idle_state, cpu) = 1; - cpu_set(cpu, map); - } - - __get_cpu_var(cpu_idle_state) = 0; - - wmb(); - do { - ssleep(1); - for_each_online_cpu(cpu) { - if (cpu_isset(cpu, map) && - !per_cpu(cpu_idle_state, cpu)) - cpu_clear(cpu, map); - } - cpus_and(map, map, cpu_online_map); - /* - * We waited 1 sec, if a CPU still did not call idle - * it may be because it is in idle and not waking up - * because it has nothing to do. - * Give all the remaining CPUS a kick. - */ - smp_call_function_mask(map, do_nothing, 0, 0); - } while (!cpus_empty(map)); - - set_cpus_allowed(current, tmp); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - #ifdef CONFIG_HOTPLUG_CPU DECLARE_PER_CPU(int, cpu_state); @@ -207,19 +165,18 @@ static inline void play_dead(void) * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { + tick_nohz_stop_sched_tick(); while (!need_resched()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; - tick_nohz_stop_sched_tick(); - rmb(); idle = pm_idle; if (!idle) @@ -247,6 +204,47 @@ void cpu_idle (void) } } +static void do_nothing(void *unused) +{ +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map, tmp = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + /* + * We waited 1 sec, if a CPU still did not call idle + * it may be because it is in idle and not waking up + * because it has nothing to do. + * Give all the remaining CPUS a kick. + */ + smp_call_function_mask(map, do_nothing, 0, 0); + } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + /* * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, * which can obviate IPI to trigger checking of need_resched. @@ -257,13 +255,13 @@ void cpu_idle (void) * New with Core Duo processors, MWAIT can take some hints based on CPU * capability. */ -void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { if (!need_resched()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); if (!need_resched()) - __mwait(eax, ecx); + __mwait(ax, cx); } } @@ -282,25 +280,41 @@ static void mwait_idle(void) } } + +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { - static int printed; - if (cpu_has(c, X86_FEATURE_MWAIT)) { + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { /* * Skip, if setup has overridden idle. * One CPU supports mwait => All CPUs supports mwait */ if (!pm_idle) { - if (!printed) { - printk(KERN_INFO "using mwait in idle threads.\n"); - printed = 1; - } + printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; } } + selected = 1; } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); @@ -315,13 +329,13 @@ static int __init idle_setup (char *str) } early_param("idle", idle_setup); -/* Prints also some state that isn't saved in the pt_regs */ +/* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; - unsigned int fsindex,gsindex; - unsigned int ds,cs,es; + unsigned int fsindex, gsindex; + unsigned int ds, cs, es; printk("\n"); print_modules(); @@ -330,16 +344,16 @@ void __show_regs(struct pt_regs * regs) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); - printk_address(regs->rip); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, - regs->eflags); + printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk_address(regs->ip, 1); + printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, + regs->flags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", - regs->rax, regs->rbx, regs->rcx); + regs->ax, regs->bx, regs->cx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", - regs->rdx, regs->rsi, regs->rdi); + regs->dx, regs->si, regs->di); printk("RBP: %016lx R08: %016lx R09: %016lx\n", - regs->rbp, regs->r8, regs->r9); + regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", @@ -379,7 +393,7 @@ void show_regs(struct pt_regs *regs) { printk("CPU %d:", smp_processor_id()); __show_regs(regs); - show_trace(NULL, regs, (void *)(regs + 1)); + show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } /* @@ -390,7 +404,7 @@ void exit_thread(void) struct task_struct *me = current; struct thread_struct *t = &me->thread; - if (me->thread.io_bitmap_ptr) { + if (me->thread.io_bitmap_ptr) { struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); kfree(t->io_bitmap_ptr); @@ -426,7 +440,7 @@ void flush_thread(void) tsk->thread.debugreg3 = 0; tsk->thread.debugreg6 = 0; tsk->thread.debugreg7 = 0; - memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* * Forget coprocessor state.. */ @@ -449,26 +463,21 @@ void release_thread(struct task_struct *dead_task) static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr) { - struct user_desc ud = { + struct user_desc ud = { .base_addr = addr, .limit = 0xfffff, .seg_32bit = 1, .limit_in_pages = 1, .useable = 1, }; - struct n_desc_struct *desc = (void *)t->thread.tls_array; + struct desc_struct *desc = t->thread.tls_array; desc += tls; - desc->a = LDT_entry_a(&ud); - desc->b = LDT_entry_b(&ud); + fill_ldt(desc, &ud); } static inline u32 read_32bit_tls(struct task_struct *t, int tls) { - struct desc_struct *desc = (void *)t->thread.tls_array; - desc += tls; - return desc->base0 | - (((u32)desc->base1) << 16) | - (((u32)desc->base2) << 24); + return get_desc_base(&t->thread.tls_array[tls]); } /* @@ -480,7 +489,7 @@ void prepare_to_copy(struct task_struct *tsk) unlazy_fpu(tsk); } -int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, +int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { @@ -492,14 +501,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, (THREAD_SIZE + task_stack_page(p))) - 1; *childregs = *regs; - childregs->rax = 0; - childregs->rsp = rsp; - if (rsp == ~0UL) - childregs->rsp = (unsigned long)childregs; + childregs->ax = 0; + childregs->sp = sp; + if (sp == ~0UL) + childregs->sp = (unsigned long)childregs; - p->thread.rsp = (unsigned long) childregs; - p->thread.rsp0 = (unsigned long) (childregs+1); - p->thread.userrsp = me->thread.userrsp; + p->thread.sp = (unsigned long) childregs; + p->thread.sp0 = (unsigned long) (childregs+1); + p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); @@ -520,7 +529,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); - } + } /* * Set a new TLS for the child thread? @@ -528,7 +537,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) - err = ia32_child_tls(p, childregs); + err = do_set_thread_area(p, -1, + (struct user_desc __user *)childregs->si, 0); else #endif err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); @@ -547,17 +557,30 @@ out: /* * This special macro can be used to load a debugging register */ -#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r) +#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r) static inline void __switch_to_xtra(struct task_struct *prev_p, - struct task_struct *next_p, - struct tss_struct *tss) + struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *prev, *next; + unsigned long debugctl; prev = &prev_p->thread, next = &next_p->thread; + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS + * is not in use when we change it */ + debugctl = 0; + wrmsrl(MSR_IA32_DEBUGCTLMSR, 0); + wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + } + + if (next->debugctlmsr != debugctl) + wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); @@ -581,12 +604,18 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + + if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); + + if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) + ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); } /* * switch_to(x,y) should switch tasks from x to y. * - * This could still be optimized: + * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * @@ -597,7 +626,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; - int cpu = smp_processor_id(); + int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); /* we're going to use this soon, after a few expensive things */ @@ -607,7 +636,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Reload esp0, LDT and the page table pointer: */ - tss->rsp0 = next->rsp0; + load_sp0(tss, next); /* * Switch DS and ES. @@ -666,8 +695,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->userrsp = read_pda(oldrsp); - write_pda(oldrsp, next->userrsp); + prev->usersp = read_pda(oldrsp); + write_pda(oldrsp, next->usersp); write_pda(pcurrent, next_p); write_pda(kernelstack, @@ -684,8 +713,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Now maybe reload the debug registers and handle I/O bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT || + task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); /* If the task has used fpu the last 5 timeslices, just do a full @@ -700,7 +729,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * sys_execve() executes a new program. */ -asmlinkage +asmlinkage long sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs regs) { @@ -712,11 +741,6 @@ long sys_execve(char __user *name, char __user * __user *argv, if (IS_ERR(filename)) return error; error = do_execve(filename, argv, envp, ®s); - if (error == 0) { - task_lock(current); - current->ptrace &= ~PT_DTRACE; - task_unlock(current); - } putname(filename); return error; } @@ -726,18 +750,18 @@ void set_personality_64bit(void) /* inherit personality from parent */ /* Make sure to be in 64bit mode */ - clear_thread_flag(TIF_IA32); + clear_thread_flag(TIF_IA32); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that - 32bit childs are affected again. */ + 32bit childs are affected again. */ current->personality &= ~READ_IMPLIES_EXEC; } asmlinkage long sys_fork(struct pt_regs *regs) { - return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL); + return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL); } asmlinkage long @@ -745,7 +769,7 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, void __user *parent_tid, void __user *child_tid, struct pt_regs *regs) { if (!newsp) - newsp = regs->rsp; + newsp = regs->sp; return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } @@ -761,29 +785,29 @@ sys_clone(unsigned long clone_flags, unsigned long newsp, */ asmlinkage long sys_vfork(struct pt_regs *regs) { - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0, + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0, NULL, NULL); } unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,rip; + u64 fp,ip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) return 0; - fp = *(u64 *)(p->thread.rsp); + fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) return 0; - rip = *(u64 *)(fp+8); - if (!in_sched_functions(rip)) - return rip; + ip = *(u64 *)(fp+8); + if (!in_sched_functions(ip)) + return ip; fp = *(u64 *)fp; } while (count++ < 16); return 0; @@ -824,19 +848,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* Not strictly needed for fs, but do it for symmetry with gs */ if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. */ - if (addr <= 0xffffffff) { + if (addr <= 0xffffffff) { set_32bit_tls(task, FS_TLS, addr); - if (doit) { - load_TLS(&task->thread, cpu); + if (doit) { + load_TLS(&task->thread, cpu); asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL)); } task->thread.fsindex = FS_TLS_SEL; task->thread.fs = 0; - } else { + } else { task->thread.fsindex = 0; task->thread.fs = addr; if (doit) { @@ -848,24 +872,24 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } put_cpu(); break; - case ARCH_GET_FS: { - unsigned long base; + case ARCH_GET_FS: { + unsigned long base; if (task->thread.fsindex == FS_TLS_SEL) base = read_32bit_tls(task, FS_TLS); else if (doit) rdmsrl(MSR_FS_BASE, base); else base = task->thread.fs; - ret = put_user(base, (unsigned long __user *)addr); - break; + ret = put_user(base, (unsigned long __user *)addr); + break; } - case ARCH_GET_GS: { + case ARCH_GET_GS: { unsigned long base; unsigned gsindex; if (task->thread.gsindex == GS_TLS_SEL) base = read_32bit_tls(task, GS_TLS); else if (doit) { - asm("movl %%gs,%0" : "=r" (gsindex)); + asm("movl %%gs,%0" : "=r" (gsindex)); if (gsindex) rdmsrl(MSR_KERNEL_GS_BASE, base); else @@ -873,39 +897,21 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) } else base = task->thread.gs; - ret = put_user(base, (unsigned long __user *)addr); + ret = put_user(base, (unsigned long __user *)addr); break; } default: ret = -EINVAL; break; - } + } - return ret; -} + return ret; +} long sys_arch_prctl(int code, unsigned long addr) { return do_arch_prctl(current, code, addr); -} - -/* - * Capture the user space registers if the task is not running (in user space) - */ -int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) -{ - struct pt_regs *pp, ptregs; - - pp = task_pt_regs(tsk); - - ptregs = *pp; - ptregs.cs &= 0xffff; - ptregs.ss &= 0xffff; - - elf_core_copy_regs(regs, &ptregs); - - return 1; } unsigned long arch_align_stack(unsigned long sp) @@ -914,3 +920,9 @@ unsigned long arch_align_stack(unsigned long sp) sp -= get_random_int() % 8192; return sp & ~0xf; } + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c new file mode 100644 index 00000000000..96286df1bb8 --- /dev/null +++ b/arch/x86/kernel/ptrace.c @@ -0,0 +1,1545 @@ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes <gareth@valinux.com>, May 2000 + * + * BTS tracing + * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 + */ + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/ptrace.h> +#include <linux/regset.h> +#include <linux/user.h> +#include <linux/elf.h> +#include <linux/security.h> +#include <linux/audit.h> +#include <linux/seccomp.h> +#include <linux/signal.h> + +#include <asm/uaccess.h> +#include <asm/pgtable.h> +#include <asm/system.h> +#include <asm/processor.h> +#include <asm/i387.h> +#include <asm/debugreg.h> +#include <asm/ldt.h> +#include <asm/desc.h> +#include <asm/prctl.h> +#include <asm/proto.h> +#include <asm/ds.h> + +#include "tls.h" + +enum x86_regset { + REGSET_GENERAL, + REGSET_FP, + REGSET_XFP, + REGSET_TLS, +}; + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. + */ +#define FLAG_MASK_32 ((unsigned long) \ + (X86_EFLAGS_CF | X86_EFLAGS_PF | \ + X86_EFLAGS_AF | X86_EFLAGS_ZF | \ + X86_EFLAGS_SF | X86_EFLAGS_TF | \ + X86_EFLAGS_DF | X86_EFLAGS_OF | \ + X86_EFLAGS_RF | X86_EFLAGS_AC)) + +/* + * Determines whether a value may be installed in a segment register. + */ +static inline bool invalid_selector(u16 value) +{ + return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); +} + +#ifdef CONFIG_X86_32 + +#define FLAG_MASK FLAG_MASK_32 + +static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); + regno >>= 2; + if (regno > FS) + --regno; + return ®s->bx + regno; +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int retval; + if (offset != offsetof(struct user_regs_struct, gs)) + retval = *pt_regs_access(task_pt_regs(task), offset); + else { + retval = task->thread.gs; + if (task == current) + savesegment(gs, retval); + } + return retval; +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + if (offset != offsetof(struct user_regs_struct, gs)) + *pt_regs_access(task_pt_regs(task), offset) = value; + else { + task->thread.gs = value; + if (task == current) + /* + * The user-mode %gs is not affected by + * kernel entry, so we must update the CPU. + */ + loadsegment(gs, value); + } + + return 0; +} + +static unsigned long debugreg_addr_limit(struct task_struct *task) +{ + return TASK_SIZE - 3; +} + +#else /* CONFIG_X86_64 */ + +#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) + +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) +{ + BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); + return ®s->r15 + (offset / sizeof(regs->r15)); +} + +static u16 get_segment_reg(struct task_struct *task, unsigned long offset) +{ + /* + * Returning the value truncates it to 16 bits. + */ + unsigned int seg; + + switch (offset) { + case offsetof(struct user_regs_struct, fs): + if (task == current) { + /* Older gas can't assemble movq %?s,%r?? */ + asm("movl %%fs,%0" : "=r" (seg)); + return seg; + } + return task->thread.fsindex; + case offsetof(struct user_regs_struct, gs): + if (task == current) { + asm("movl %%gs,%0" : "=r" (seg)); + return seg; + } + return task->thread.gsindex; + case offsetof(struct user_regs_struct, ds): + if (task == current) { + asm("movl %%ds,%0" : "=r" (seg)); + return seg; + } + return task->thread.ds; + case offsetof(struct user_regs_struct, es): + if (task == current) { + asm("movl %%es,%0" : "=r" (seg)); + return seg; + } + return task->thread.es; + + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ss): + break; + } + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int set_segment_reg(struct task_struct *task, + unsigned long offset, u16 value) +{ + /* + * The value argument was already truncated to 16 bits. + */ + if (invalid_selector(value)) + return -EIO; + + switch (offset) { + case offsetof(struct user_regs_struct,fs): + /* + * If this is setting fs as for normal 64-bit use but + * setting fs_base has implicitly changed it, leave it. + */ + if ((value == FS_TLS_SEL && task->thread.fsindex == 0 && + task->thread.fs != 0) || + (value == 0 && task->thread.fsindex == FS_TLS_SEL && + task->thread.fs == 0)) + break; + task->thread.fsindex = value; + if (task == current) + loadsegment(fs, task->thread.fsindex); + break; + case offsetof(struct user_regs_struct,gs): + /* + * If this is setting gs as for normal 64-bit use but + * setting gs_base has implicitly changed it, leave it. + */ + if ((value == GS_TLS_SEL && task->thread.gsindex == 0 && + task->thread.gs != 0) || + (value == 0 && task->thread.gsindex == GS_TLS_SEL && + task->thread.gs == 0)) + break; + task->thread.gsindex = value; + if (task == current) + load_gs_index(task->thread.gsindex); + break; + case offsetof(struct user_regs_struct,ds): + task->thread.ds = value; + if (task == current) + loadsegment(ds, task->thread.ds); + break; + case offsetof(struct user_regs_struct,es): + task->thread.es = value; + if (task == current) + loadsegment(es, task->thread.es); + break; + + /* + * Can't actually change these in 64-bit mode. + */ + case offsetof(struct user_regs_struct,cs): +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->cs = value; +#endif + break; + case offsetof(struct user_regs_struct,ss): +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + task_pt_regs(task)->ss = value; +#endif + break; + } + + return 0; +} + +static unsigned long debugreg_addr_limit(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) + return IA32_PAGE_OFFSET - 3; +#endif + return TASK_SIZE64 - 7; +} + +#endif /* CONFIG_X86_32 */ + +static unsigned long get_flags(struct task_struct *task) +{ + unsigned long retval = task_pt_regs(task)->flags; + + /* + * If the debugger set TF, hide it from the readout. + */ + if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + retval &= ~X86_EFLAGS_TF; + + return retval; +} + +static int set_flags(struct task_struct *task, unsigned long value) +{ + struct pt_regs *regs = task_pt_regs(task); + + /* + * If the user value contains TF, mark that + * it was not "us" (the debugger) that set it. + * If not, make sure it stays set if we had. + */ + if (value & X86_EFLAGS_TF) + clear_tsk_thread_flag(task, TIF_FORCED_TF); + else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) + value |= X86_EFLAGS_TF; + + regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); + + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long offset, unsigned long value) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return set_segment_reg(child, offset, value); + + case offsetof(struct user_regs_struct, flags): + return set_flags(child, value); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct,fs_base): + if (value >= TASK_SIZE_OF(child)) + return -EIO; + /* + * When changing the segment base, use do_arch_prctl + * to set either thread.fs or thread.fsindex and the + * corresponding GDT slot. + */ + if (child->thread.fs != value) + return do_arch_prctl(child, ARCH_SET_FS, value); + return 0; + case offsetof(struct user_regs_struct,gs_base): + /* + * Exactly the same here as the %fs handling above. + */ + if (value >= TASK_SIZE_OF(child)) + return -EIO; + if (child->thread.gs != value) + return do_arch_prctl(child, ARCH_SET_GS, value); + return 0; +#endif + } + + *pt_regs_access(task_pt_regs(child), offset) = value; + return 0; +} + +static unsigned long getreg(struct task_struct *task, unsigned long offset) +{ + switch (offset) { + case offsetof(struct user_regs_struct, cs): + case offsetof(struct user_regs_struct, ds): + case offsetof(struct user_regs_struct, es): + case offsetof(struct user_regs_struct, fs): + case offsetof(struct user_regs_struct, gs): + case offsetof(struct user_regs_struct, ss): + return get_segment_reg(task, offset); + + case offsetof(struct user_regs_struct, flags): + return get_flags(task); + +#ifdef CONFIG_X86_64 + case offsetof(struct user_regs_struct, fs_base): { + /* + * do_arch_prctl may have used a GDT slot instead of + * the MSR. To userland, it appears the same either + * way, except the %fs segment selector might not be 0. + */ + unsigned int seg = task->thread.fsindex; + if (task->thread.fs != 0) + return task->thread.fs; + if (task == current) + asm("movl %%fs,%0" : "=r" (seg)); + if (seg != FS_TLS_SEL) + return 0; + return get_desc_base(&task->thread.tls_array[FS_TLS]); + } + case offsetof(struct user_regs_struct, gs_base): { + /* + * Exactly the same here as the %fs handling above. + */ + unsigned int seg = task->thread.gsindex; + if (task->thread.gs != 0) + return task->thread.gs; + if (task == current) + asm("movl %%gs,%0" : "=r" (seg)); + if (seg != GS_TLS_SEL) + return 0; + return get_desc_base(&task->thread.tls_array[GS_TLS]); + } +#endif + } + + return *pt_regs_access(task_pt_regs(task), offset); +} + +static int genregs_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + unsigned long *k = kbuf; + while (count > 0) { + *k++ = getreg(target, pos); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + unsigned long __user *u = ubuf; + while (count > 0) { + if (__put_user(getreg(target, pos), u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const unsigned long *k = kbuf; + while (count > 0 && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const unsigned long __user *u = ubuf; + while (count > 0 && !ret) { + unsigned long word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +/* + * This function is trivial and will be inlined by the compiler. + * Having it separates the implementation details of debug + * registers from the interface details of ptrace. + */ +static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) +{ + switch (n) { + case 0: return child->thread.debugreg0; + case 1: return child->thread.debugreg1; + case 2: return child->thread.debugreg2; + case 3: return child->thread.debugreg3; + case 6: return child->thread.debugreg6; + case 7: return child->thread.debugreg7; + } + return 0; +} + +static int ptrace_set_debugreg(struct task_struct *child, + int n, unsigned long data) +{ + int i; + + if (unlikely(n == 4 || n == 5)) + return -EIO; + + if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) + return -EIO; + + switch (n) { + case 0: child->thread.debugreg0 = data; break; + case 1: child->thread.debugreg1 = data; break; + case 2: child->thread.debugreg2 = data; break; + case 3: child->thread.debugreg3 = data; break; + + case 6: + if ((data & ~0xffffffffUL) != 0) + return -EIO; + child->thread.debugreg6 = data; + break; + + case 7: + /* + * Sanity-check data. Take one half-byte at once with + * check = (val >> (16 + 4*i)) & 0xf. It contains the + * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits + * 2 and 3 are LENi. Given a list of invalid values, + * we do mask |= 1 << invalid_value, so that + * (mask >> check) & 1 is a correct test for invalid + * values. + * + * R/Wi contains the type of the breakpoint / + * watchpoint, LENi contains the length of the watched + * data in the watchpoint case. + * + * The invalid values are: + * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit] + * - R/Wi == 0x10 (break on I/O reads or writes), so + * mask |= 0x4444. + * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= + * 0x1110. + * + * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. + * + * See the Intel Manual "System Programming Guide", + * 15.2.4 + * + * Note that LENi == 0x10 is defined on x86_64 in long + * mode (i.e. even for 32-bit userspace software, but + * 64-bit kernel), so the x86_64 mask value is 0x5454. + * See the AMD manual no. 24593 (AMD64 System Programming) + */ +#ifdef CONFIG_X86_32 +#define DR7_MASK 0x5f54 +#else +#define DR7_MASK 0x5554 +#endif + data &= ~DR_CONTROL_RESERVED; + for (i = 0; i < 4; i++) + if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) + return -EIO; + child->thread.debugreg7 = data; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); + break; + } + + return 0; +} + +static int ptrace_bts_get_size(struct task_struct *child) +{ + if (!child->thread.ds_area_msr) + return -ENXIO; + + return ds_get_bts_index((void *)child->thread.ds_area_msr); +} + +static int ptrace_bts_read_record(struct task_struct *child, + long index, + struct bts_struct __user *out) +{ + struct bts_struct ret; + int retval; + int bts_end; + int bts_index; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + if (index < 0) + return -EINVAL; + + bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); + if (bts_end <= index) + return -EINVAL; + + /* translate the ptrace bts index into the ds bts index */ + bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); + bts_index -= (index + 1); + if (bts_index < 0) + bts_index += bts_end; + + retval = ds_read_bts((void *)child->thread.ds_area_msr, + bts_index, &ret); + if (retval < 0) + return retval; + + if (copy_to_user(out, &ret, sizeof(ret))) + return -EFAULT; + + return sizeof(ret); +} + +static int ptrace_bts_write_record(struct task_struct *child, + const struct bts_struct *in) +{ + int retval; + + if (!child->thread.ds_area_msr) + return -ENXIO; + + retval = ds_write_bts((void *)child->thread.ds_area_msr, in); + if (retval) + return retval; + + return sizeof(*in); +} + +static int ptrace_bts_clear(struct task_struct *child) +{ + if (!child->thread.ds_area_msr) + return -ENXIO; + + return ds_clear((void *)child->thread.ds_area_msr); +} + +static int ptrace_bts_drain(struct task_struct *child, + long size, + struct bts_struct __user *out) +{ + int end, i; + void *ds = (void *)child->thread.ds_area_msr; + + if (!ds) + return -ENXIO; + + end = ds_get_bts_index(ds); + if (end <= 0) + return end; + + if (size < (end * sizeof(struct bts_struct))) + return -EIO; + + for (i = 0; i < end; i++, out++) { + struct bts_struct ret; + int retval; + + retval = ds_read_bts(ds, i, &ret); + if (retval < 0) + return retval; + + if (copy_to_user(out, &ret, sizeof(ret))) + return -EFAULT; + } + + ds_clear(ds); + + return end; +} + +static int ptrace_bts_realloc(struct task_struct *child, + int size, int reduce_size) +{ + unsigned long rlim, vm; + int ret, old_size; + + if (size < 0) + return -EINVAL; + + old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); + if (old_size < 0) + return old_size; + + ret = ds_free((void **)&child->thread.ds_area_msr); + if (ret < 0) + goto out; + + size >>= PAGE_SHIFT; + old_size >>= PAGE_SHIFT; + + current->mm->total_vm -= old_size; + current->mm->locked_vm -= old_size; + + if (size == 0) + goto out; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->total_vm; + if (size <= 0) + goto out; + } + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + size; + if (rlim < vm) { + ret = -ENOMEM; + + if (!reduce_size) + goto out; + + size = rlim - current->mm->locked_vm; + if (size <= 0) + goto out; + } + + ret = ds_allocate((void **)&child->thread.ds_area_msr, + size << PAGE_SHIFT); + if (ret < 0) + goto out; + + current->mm->total_vm += size; + current->mm->locked_vm += size; + +out: + if (child->thread.ds_area_msr) + set_tsk_thread_flag(child, TIF_DS_AREA_MSR); + else + clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); + + return ret; +} + +static int ptrace_bts_config(struct task_struct *child, + long cfg_size, + const struct ptrace_bts_config __user *ucfg) +{ + struct ptrace_bts_config cfg; + int bts_size, ret = 0; + void *ds; + + if (cfg_size < sizeof(cfg)) + return -EIO; + + if (copy_from_user(&cfg, ucfg, sizeof(cfg))) + return -EFAULT; + + if ((int)cfg.size < 0) + return -EINVAL; + + bts_size = 0; + ds = (void *)child->thread.ds_area_msr; + if (ds) { + bts_size = ds_get_bts_size(ds); + if (bts_size < 0) + return bts_size; + } + cfg.size = PAGE_ALIGN(cfg.size); + + if (bts_size != cfg.size) { + ret = ptrace_bts_realloc(child, cfg.size, + cfg.flags & PTRACE_BTS_O_CUT_SIZE); + if (ret < 0) + goto errout; + + ds = (void *)child->thread.ds_area_msr; + } + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) + ret = ds_set_overflow(ds, DS_O_SIGNAL); + else + ret = ds_set_overflow(ds, DS_O_WRAP); + if (ret < 0) + goto errout; + + if (cfg.flags & PTRACE_BTS_O_TRACE) + child->thread.debugctlmsr |= ds_debugctl_mask(); + else + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + + if (cfg.flags & PTRACE_BTS_O_SCHED) + set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + else + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + + ret = sizeof(cfg); + +out: + if (child->thread.debugctlmsr) + set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + return ret; + +errout: + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + goto out; +} + +static int ptrace_bts_status(struct task_struct *child, + long cfg_size, + struct ptrace_bts_config __user *ucfg) +{ + void *ds = (void *)child->thread.ds_area_msr; + struct ptrace_bts_config cfg; + + if (cfg_size < sizeof(cfg)) + return -EIO; + + memset(&cfg, 0, sizeof(cfg)); + + if (ds) { + cfg.size = ds_get_bts_size(ds); + + if (ds_get_overflow(ds) == DS_O_SIGNAL) + cfg.flags |= PTRACE_BTS_O_SIGNAL; + + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & ds_debugctl_mask()) + cfg.flags |= PTRACE_BTS_O_TRACE; + + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; + } + + cfg.bts_size = sizeof(struct bts_struct); + + if (copy_to_user(ucfg, &cfg, sizeof(cfg))) + return -EFAULT; + + return sizeof(cfg); +} + +void ptrace_bts_take_timestamp(struct task_struct *tsk, + enum bts_qualifier qualifier) +{ + struct bts_struct rec = { + .qualifier = qualifier, + .variant.jiffies = jiffies_64 + }; + + ptrace_bts_write_record(tsk, &rec); +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + user_disable_single_step(child); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif + if (child->thread.ds_area_msr) { + ptrace_bts_realloc(child, 0, 0); + child->thread.debugctlmsr &= ~ds_debugctl_mask(); + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); + } +} + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset_view user_x86_32_view; /* Initialized below. */ +#endif + +long arch_ptrace(struct task_struct *child, long request, long addr, long data) +{ + int ret; + unsigned long __user *datap = (unsigned long __user *)data; + + switch (request) { + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr < 0 || + addr >= sizeof(struct user)) + break; + + tmp = 0; /* Default return condition */ + if (addr < sizeof(struct user_regs_struct)) + tmp = getreg(child, addr); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + tmp = ptrace_get_debugreg(child, addr / sizeof(data)); + } + ret = put_user(tmp, datap); + break; + } + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & (sizeof(data) - 1)) || addr < 0 || + addr >= sizeof(struct user)) + break; + + if (addr < sizeof(struct user_regs_struct)) + ret = putreg(child, addr, data); + else if (addr >= offsetof(struct user, u_debugreg[0]) && + addr <= offsetof(struct user, u_debugreg[7])) { + addr -= offsetof(struct user, u_debugreg[0]); + ret = ptrace_set_debugreg(child, + addr / sizeof(data), data); + } + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user(child, + task_user_regset_view(current), + REGSET_FP, + 0, sizeof(struct user_i387_struct), + datap); + +#ifdef CONFIG_X86_32 + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, + 0, sizeof(struct user_fxsr_struct), + datap); +#endif + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + case PTRACE_GET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_get_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + case PTRACE_SET_THREAD_AREA: + if (addr < 0) + return -EIO; + ret = do_set_thread_area(child, addr, + (struct user_desc __user *) data, 0); + break; +#endif + +#ifdef CONFIG_X86_64 + /* normal 64bit interface to access TLS data. + Works just like arch_prctl, except that the arguments + are reversed. */ + case PTRACE_ARCH_PRCTL: + ret = do_arch_prctl(child, data, addr); + break; +#endif + + case PTRACE_BTS_CONFIG: + ret = ptrace_bts_config + (child, data, (struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_STATUS: + ret = ptrace_bts_status + (child, data, (struct ptrace_bts_config __user *)addr); + break; + + case PTRACE_BTS_SIZE: + ret = ptrace_bts_get_size(child); + break; + + case PTRACE_BTS_GET: + ret = ptrace_bts_read_record + (child, data, (struct bts_struct __user *) addr); + break; + + case PTRACE_BTS_CLEAR: + ret = ptrace_bts_clear(child); + break; + + case PTRACE_BTS_DRAIN: + ret = ptrace_bts_drain + (child, data, (struct bts_struct __user *) addr); + break; + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + + return ret; +} + +#ifdef CONFIG_IA32_EMULATION + +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <asm/ia32.h> +#include <asm/user32.h> + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + regs->q = value; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + return set_segment_reg(child, \ + offsetof(struct user_regs_struct, rs), \ + value); \ + break + +static int putreg32(struct task_struct *child, unsigned regno, u32 value) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(cs); + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + SEG32(ss); + + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + return set_flags(child, value); + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + return ptrace_set_debugreg(child, regno / 4, value); + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +#define R32(l,q) \ + case offsetof(struct user32, regs.l): \ + *val = regs->q; break + +#define SEG32(rs) \ + case offsetof(struct user32, regs.rs): \ + *val = get_segment_reg(child, \ + offsetof(struct user_regs_struct, rs)); \ + break + +static int getreg32(struct task_struct *child, unsigned regno, u32 *val) +{ + struct pt_regs *regs = task_pt_regs(child); + + switch (regno) { + + SEG32(ds); + SEG32(es); + SEG32(fs); + SEG32(gs); + + R32(cs, cs); + R32(ss, ss); + R32(ebx, bx); + R32(ecx, cx); + R32(edx, dx); + R32(edi, di); + R32(esi, si); + R32(ebp, bp); + R32(eax, ax); + R32(orig_eax, orig_ax); + R32(eip, ip); + R32(esp, sp); + + case offsetof(struct user32, regs.eflags): + *val = get_flags(child); + break; + + case offsetof(struct user32, u_debugreg[0]) ... + offsetof(struct user32, u_debugreg[7]): + regno -= offsetof(struct user32, u_debugreg[0]); + *val = ptrace_get_debugreg(child, regno / 4); + break; + + default: + if (regno > sizeof(struct user32) || (regno & 3)) + return -EIO; + + /* + * Other dummy fields in the virtual user structure + * are ignored + */ + *val = 0; + break; + } + return 0; +} + +#undef R32 +#undef SEG32 + +static int genregs32_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (kbuf) { + compat_ulong_t *k = kbuf; + while (count > 0) { + getreg32(target, pos, k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + compat_ulong_t __user *u = ubuf; + while (count > 0) { + compat_ulong_t word; + getreg32(target, pos, &word); + if (__put_user(word, u++)) + return -EFAULT; + count -= sizeof(*u); + pos += sizeof(*u); + } + } + + return 0; +} + +static int genregs32_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret = 0; + if (kbuf) { + const compat_ulong_t *k = kbuf; + while (count > 0 && !ret) { + ret = putreg(target, pos, *k++); + count -= sizeof(*k); + pos += sizeof(*k); + } + } else { + const compat_ulong_t __user *u = ubuf; + while (count > 0 && !ret) { + compat_ulong_t word; + ret = __get_user(word, u++); + if (ret) + break; + ret = putreg(target, pos, word); + count -= sizeof(*u); + pos += sizeof(*u); + } + } + return ret; +} + +static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +{ + siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); + compat_siginfo_t __user *si32 = compat_ptr(data); + siginfo_t ssi; + int ret; + + if (request == PTRACE_SETSIGINFO) { + memset(&ssi, 0, sizeof(siginfo_t)); + ret = copy_siginfo_from_user32(&ssi, si32); + if (ret) + return ret; + if (copy_to_user(si, &ssi, sizeof(siginfo_t))) + return -EFAULT; + } + ret = sys_ptrace(request, pid, addr, (unsigned long)si); + if (ret) + return ret; + if (request == PTRACE_GETSIGINFO) { + if (copy_from_user(&ssi, si, sizeof(siginfo_t))) + return -EFAULT; + ret = copy_siginfo_to_user32(si32, &ssi); + } + return ret; +} + +asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) +{ + struct task_struct *child; + struct pt_regs *childregs; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_TRACEME: + case PTRACE_ATTACH: + case PTRACE_KILL: + case PTRACE_CONT: + case PTRACE_SINGLESTEP: + case PTRACE_SINGLEBLOCK: + case PTRACE_DETACH: + case PTRACE_SYSCALL: + case PTRACE_OLDSETOPTIONS: + case PTRACE_SETOPTIONS: + case PTRACE_SET_THREAD_AREA: + case PTRACE_GET_THREAD_AREA: + case PTRACE_BTS_CONFIG: + case PTRACE_BTS_STATUS: + case PTRACE_BTS_SIZE: + case PTRACE_BTS_GET: + case PTRACE_BTS_CLEAR: + case PTRACE_BTS_DRAIN: + return sys_ptrace(request, pid, addr, data); + + default: + return -EINVAL; + + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + case PTRACE_POKEDATA: + case PTRACE_POKETEXT: + case PTRACE_POKEUSR: + case PTRACE_PEEKUSR: + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_SETFPREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPXREGS: + case PTRACE_GETFPXREGS: + case PTRACE_GETEVENTMSG: + break; + + case PTRACE_SETSIGINFO: + case PTRACE_GETSIGINFO: + return ptrace32_siginfo(request, pid, addr, data); + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) + return PTR_ERR(child); + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out; + + childregs = task_pt_regs(child); + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + out: + put_task_struct(child); + return ret; +} + +#endif /* CONFIG_IA32_EMULATION */ + +#ifdef CONFIG_X86_64 + +static const struct user_regset x86_64_regsets[] = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .get = genregs_get, .set = genregs_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(long), + .size = sizeof(long), .align = sizeof(long), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, +}; + +static const struct user_regset_view user_x86_64_view = { + .name = "x86_64", .e_machine = EM_X86_64, + .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) +}; + +#else /* CONFIG_X86_32 */ + +#define user_regs_struct32 user_regs_struct +#define genregs32_get genregs_get +#define genregs32_set genregs_set + +#endif /* CONFIG_X86_64 */ + +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION +static const struct user_regset x86_32_regsets[] = { + [REGSET_GENERAL] = { + .core_note_type = NT_PRSTATUS, + .n = sizeof(struct user_regs_struct32) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .get = genregs32_get, .set = genregs32_set + }, + [REGSET_FP] = { + .core_note_type = NT_PRFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = fpregs_active, .get = fpregs_get, .set = fpregs_set + }, + [REGSET_XFP] = { + .core_note_type = NT_PRXFPREG, + .n = sizeof(struct user_i387_struct) / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set + }, + [REGSET_TLS] = { + .core_note_type = NT_386_TLS, + .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, + .size = sizeof(struct user_desc), + .align = sizeof(struct user_desc), + .active = regset_tls_active, + .get = regset_tls_get, .set = regset_tls_set + }, +}; + +static const struct user_regset_view user_x86_32_view = { + .name = "i386", .e_machine = EM_386, + .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) +}; +#endif + +const struct user_regset_view *task_user_regset_view(struct task_struct *task) +{ +#ifdef CONFIG_IA32_EMULATION + if (test_tsk_thread_flag(task, TIF_IA32)) +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + return &user_x86_32_view; +#endif +#ifdef CONFIG_X86_64 + return &user_x86_64_view; +#endif +} + +#ifdef CONFIG_X86_32 + +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +{ + struct siginfo info; + + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGTRAP; + info.si_code = TRAP_BRKPT; + + /* User-mode ip? */ + info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; + + /* Send us the fake SIGTRAP */ + force_sig_info(SIGTRAP, &info, tsk); +} + +/* notification of system call entry/exit + * - triggered by current->work.syscall_trace + */ +__attribute__((regparm(3))) +int do_syscall_trace(struct pt_regs *regs, int entryexit) +{ + int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); + /* + * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall + * interception + */ + int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); + int ret = 0; + + /* do the secure computing check first */ + if (!entryexit) + secure_computing(regs->orig_ax); + + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), + regs->ax); + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + * + * Note that when PTRACE_SYSEMU_SINGLESTEP is used, + * is_singlestep is false, despite his name, so we will still do + * the correct thing. + */ + else if (is_singlestep) + goto out; + } + + if (!(current->ptrace & PT_PTRACED)) + goto out; + + /* If a process stops on the 1st tracepoint with SYSCALL_TRACE + * and then is resumed with SYSEMU_SINGLESTEP, it will come in + * here. We have to check this and return */ + if (is_sysemu && entryexit) + return 0; + + /* Fake a debug trap */ + if (is_singlestep) + send_sigtrap(current, regs, 0); + + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) + goto out; + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); + + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } + ret = is_sysemu; +out: + if (unlikely(current->audit_context) && !entryexit) + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, + regs->bx, regs->cx, regs->dx, regs->si); + if (ret == 0) + return 0; + + regs->orig_ax = -1; /* force skip of syscall restarting */ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + return 1; +} + +#else /* CONFIG_X86_64 */ + +static void syscall_trace(struct pt_regs *regs) +{ + +#if 0 + printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", + current->comm, + regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), + current_thread_info()->flags, current->ptrace); +#endif + + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) + ? 0x80 : 0)); + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } +} + +asmlinkage void syscall_trace_enter(struct pt_regs *regs) +{ + /* do the secure computing check first */ + secure_computing(regs->orig_ax); + + if (test_thread_flag(TIF_SYSCALL_TRACE) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); + + if (unlikely(current->audit_context)) { + if (test_thread_flag(TIF_IA32)) { + audit_syscall_entry(AUDIT_ARCH_I386, + regs->orig_ax, + regs->bx, regs->cx, + regs->dx, regs->si); + } else { + audit_syscall_entry(AUDIT_ARCH_X86_64, + regs->orig_ax, + regs->di, regs->si, + regs->dx, regs->r10); + } + } +} + +asmlinkage void syscall_trace_leave(struct pt_regs *regs) +{ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + + if ((test_thread_flag(TIF_SYSCALL_TRACE) + || test_thread_flag(TIF_SINGLESTEP)) + && (current->ptrace & PT_PTRACED)) + syscall_trace(regs); +} + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c deleted file mode 100644 index ff5431cc03e..00000000000 --- a/arch/x86/kernel/ptrace_32.c +++ /dev/null @@ -1,717 +0,0 @@ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9). - * Also masks reserved bits (31-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x00050dd5 - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100 - -/* - * Offset of eflags on child stack.. - */ -#define EFL_OFFSET offsetof(struct pt_regs, eflags) - -static inline struct pt_regs *get_child_regs(struct task_struct *task) -{ - void *stack_top = (void *)task->thread.esp0; - return stack_top - sizeof(struct pt_regs); -} - -/* - * This routine will get a word off of the processes privileged stack. - * the offset is bytes into the pt_regs structure on the stack. - * This routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); - stack += offset; - return (*((int *)stack)); -} - -/* - * This routine will put a word on the processes privileged stack. - * the offset is bytes into the pt_regs structure on the stack. - * This routine assumes that all the privileged stacks are in our - * data space. - */ -static inline int put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - switch (regno >> 2) { - case GS: - if (value && (value & 3) != 3) - return -EIO; - child->thread.gs = value; - return 0; - case DS: - case ES: - case FS: - if (value && (value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case SS: - case CS: - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - case EFL: - value &= FLAG_MASK; - value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; - break; - } - if (regno > FS*4) - regno -= 1*4; - put_stack_long(child, regno, value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, - unsigned long regno) -{ - unsigned long retval = ~0UL; - - switch (regno >> 2) { - case GS: - retval = child->thread.gs; - break; - case DS: - case ES: - case FS: - case SS: - case CS: - retval = 0xffff; - /* fall through */ - default: - if (regno > FS*4) - regno -= 1*4; - retval &= get_stack_long(child, regno); - } - return retval; -} - -#define LDT_SEGMENT 4 - -static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->eip; - seg = regs->xcs & 0xffff; - if (regs->eflags & VM_MASK) { - addr = (addr & 0xffff) + (seg << 4); - return addr; - } - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - mutex_unlock(&child->mm->context.lock); - } - return addr; -} - -static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_eip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf0: case 0xf2: case 0xf3: - continue; - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = get_child_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = get_child_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); -} - -/* - * Perform get_thread_area on behalf of the traced child. - */ -static int -ptrace_get_thread_area(struct task_struct *child, - int idx, struct user_desc __user *user_desc) -{ - struct user_desc info; - struct desc_struct *desc; - -/* - * Get the current Thread-Local Storage area: - */ - -#define GET_BASE(desc) ( \ - (((desc)->a >> 16) & 0x0000ffff) | \ - (((desc)->b << 16) & 0x00ff0000) | \ - ( (desc)->b & 0xff000000) ) - -#define GET_LIMIT(desc) ( \ - ((desc)->a & 0x0ffff) | \ - ((desc)->b & 0xf0000) ) - -#define GET_32BIT(desc) (((desc)->b >> 22) & 1) -#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) -#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) -#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) -#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) -#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - - info.entry_number = idx; - info.base_addr = GET_BASE(desc); - info.limit = GET_LIMIT(desc); - info.seg_32bit = GET_32BIT(desc); - info.contents = GET_CONTENTS(desc); - info.read_exec_only = !GET_WRITABLE(desc); - info.limit_in_pages = GET_LIMIT_PAGES(desc); - info.seg_not_present = !GET_PRESENT(desc); - info.useable = GET_USEABLE(desc); - - if (copy_to_user(user_desc, &info, sizeof(info))) - return -EFAULT; - - return 0; -} - -/* - * Perform set_thread_area on behalf of the traced child. - */ -static int -ptrace_set_thread_area(struct task_struct *child, - int idx, struct user_desc __user *user_desc) -{ - struct user_desc info; - struct desc_struct *desc; - - if (copy_from_user(&info, user_desc, sizeof(info))) - return -EFAULT; - - if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) - return -EINVAL; - - desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; - if (LDT_empty(&info)) { - desc->a = 0; - desc->b = 0; - } else { - desc->a = LDT_entry_a(&info); - desc->b = LDT_entry_b(&info); - } - - return 0; -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - struct user * dummy = NULL; - int i, ret; - unsigned long __user *datap = (unsigned long __user *)data; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 3) || addr < 0 || - addr > sizeof(struct user) - 3) - break; - - tmp = 0; /* Default return condition */ - if(addr < FRAME_SIZE*sizeof(long)) - tmp = getreg(child, addr); - if(addr >= (long) &dummy->u_debugreg[0] && - addr <= (long) &dummy->u_debugreg[7]){ - addr -= (long) &dummy->u_debugreg[0]; - addr = addr >> 2; - tmp = child->thread.debugreg[addr]; - } - ret = put_user(tmp, datap); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - ret = -EIO; - if ((addr & 3) || addr < 0 || - addr > sizeof(struct user) - 3) - break; - - if (addr < FRAME_SIZE*sizeof(long)) { - ret = putreg(child, addr, data); - break; - } - /* We need to be very careful here. We implicitly - want to modify a portion of the task_struct, and we - have to be selective about what portions we allow someone - to modify. */ - - ret = -EIO; - if(addr >= (long) &dummy->u_debugreg[0] && - addr <= (long) &dummy->u_debugreg[7]){ - - if(addr == (long) &dummy->u_debugreg[4]) break; - if(addr == (long) &dummy->u_debugreg[5]) break; - if(addr < (long) &dummy->u_debugreg[4] && - ((unsigned long) data) >= TASK_SIZE-3) break; - - /* Sanity-check data. Take one half-byte at once with - * check = (val >> (16 + 4*i)) & 0xf. It contains the - * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits - * 2 and 3 are LENi. Given a list of invalid values, - * we do mask |= 1 << invalid_value, so that - * (mask >> check) & 1 is a correct test for invalid - * values. - * - * R/Wi contains the type of the breakpoint / - * watchpoint, LENi contains the length of the watched - * data in the watchpoint case. - * - * The invalid values are: - * - LENi == 0x10 (undefined), so mask |= 0x0f00. - * - R/Wi == 0x10 (break on I/O reads or writes), so - * mask |= 0x4444. - * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= - * 0x1110. - * - * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. - * - * See the Intel Manual "System Programming Guide", - * 15.2.4 - * - * Note that LENi == 0x10 is defined on x86_64 in long - * mode (i.e. even for 32-bit userspace software, but - * 64-bit kernel), so the x86_64 mask value is 0x5454. - * See the AMD manual no. 24593 (AMD64 System - * Programming)*/ - - if(addr == (long) &dummy->u_debugreg[7]) { - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - goto out_tsk; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - } - addr -= (long) &dummy->u_debugreg; - addr = addr >> 2; - child->thread.debugreg[addr] = data; - ret = 0; - } - break; - - case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSEMU) { - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } else if (request == PTRACE_SYSCALL) { - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - } else { - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */ - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - - if (request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); - else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); - - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) { - ret = -EIO; - break; - } - for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { - __put_user(getreg(child, i), datap); - datap++; - } - ret = 0; - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) { - ret = -EIO; - break; - } - for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { - __get_user(tmp, datap); - putreg(child, i, tmp); - datap++; - } - ret = 0; - break; - } - - case PTRACE_GETFPREGS: { /* Get the child FPU state. */ - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = 0; - if (!tsk_used_math(child)) - init_fpu(child); - get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child FPU state. */ - if (!access_ok(VERIFY_READ, datap, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - set_fpregs(child, (struct user_i387_struct __user *)data); - ret = 0; - break; - } - - case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, datap, - sizeof(struct user_fxsr_struct))) { - ret = -EIO; - break; - } - if (!tsk_used_math(child)) - init_fpu(child); - ret = get_fpxregs((struct user_fxsr_struct __user *)data, child); - break; - } - - case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, datap, - sizeof(struct user_fxsr_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data); - break; - } - - case PTRACE_GET_THREAD_AREA: - ret = ptrace_get_thread_area(child, addr, - (struct user_desc __user *) data); - break; - - case PTRACE_SET_THREAD_AREA: - ret = ptrace_set_thread_area(child, addr, - (struct user_desc __user *) data); - break; - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - out_tsk: - return ret; -} - -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) -{ - struct siginfo info; - - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; - - /* User-mode eip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL; - - /* Send us the fake SIGTRAP */ - force_sig_info(SIGTRAP, &info, tsk); -} - -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -__attribute__((regparm(3))) -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_eax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), - regs->eax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax, - regs->ebx, regs->ecx, regs->edx, regs->esi); - if (ret == 0) - return 0; - - regs->orig_eax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); - return 1; -} diff --git a/arch/x86/kernel/ptrace_64.c b/arch/x86/kernel/ptrace_64.c deleted file mode 100644 index 607085f3f08..00000000000 --- a/arch/x86/kernel/ptrace_64.c +++ /dev/null @@ -1,621 +0,0 @@ -/* By Ross Biro 1/23/92 */ -/* - * Pentium III FXSR, SSE support - * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * x86-64 port 2000-2002 Andi Kleen - */ - -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/ptrace.h> -#include <linux/user.h> -#include <linux/security.h> -#include <linux/audit.h> -#include <linux/seccomp.h> -#include <linux/signal.h> - -#include <asm/uaccess.h> -#include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/debugreg.h> -#include <asm/ldt.h> -#include <asm/desc.h> -#include <asm/proto.h> -#include <asm/ia32.h> - -/* - * does not yet catch signals sent when the child dies. - * in exit.c or in signal.c. - */ - -/* - * Determines which flags the user has access to [1 = access, 0 = no access]. - * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9). - * Also masks reserved bits (63-22, 15, 5, 3, 1). - */ -#define FLAG_MASK 0x54dd5UL - -/* set's the trap flag. */ -#define TRAP_FLAG 0x100UL - -/* - * eflags and offset of eflags on child stack.. - */ -#define EFLAGS offsetof(struct pt_regs, eflags) -#define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs))) - -/* - * this routine will get a word off of the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline unsigned long get_stack_long(struct task_struct *task, int offset) -{ - unsigned char *stack; - - stack = (unsigned char *)task->thread.rsp0; - stack += offset; - return (*((unsigned long *)stack)); -} - -/* - * this routine will put a word on the processes privileged stack. - * the offset is how far from the base addr as stored in the TSS. - * this routine assumes that all the privileged stacks are in our - * data space. - */ -static inline long put_stack_long(struct task_struct *task, int offset, - unsigned long data) -{ - unsigned char * stack; - - stack = (unsigned char *) task->thread.rsp0; - stack += offset; - *(unsigned long *) stack = data; - return 0; -} - -#define LDT_SEGMENT 4 - -unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs) -{ - unsigned long addr, seg; - - addr = regs->rip; - seg = regs->cs & 0xffff; - - /* - * We'll assume that the code segments in the GDT - * are all zero-based. That is largely true: the - * TLS segments are used for data, and the PNPBIOS - * and APM bios ones we just ignore here. - */ - if (seg & LDT_SEGMENT) { - u32 *desc; - unsigned long base; - - seg &= ~7UL; - - mutex_lock(&child->mm->context.lock); - if (unlikely((seg >> 3) >= child->mm->context.size)) - addr = -1L; /* bogus selector, access would fault */ - else { - desc = child->mm->context.ldt + seg; - base = ((desc[0] >> 16) | - ((desc[1] & 0xff) << 16) | - (desc[1] & 0xff000000)); - - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; - } - mutex_unlock(&child->mm->context.lock); - } - - return addr; -} - -static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) -{ - int i, copied; - unsigned char opcode[15]; - unsigned long addr = convert_rip_to_linear(child, regs); - - copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); - for (i = 0; i < copied; i++) { - switch (opcode[i]) { - /* popf and iret */ - case 0x9d: case 0xcf: - return 1; - - /* CHECKME: 64 65 */ - - /* opcode and address size prefixes */ - case 0x66: case 0x67: - continue; - /* irrelevant prefixes (segment overrides and repeats) */ - case 0x26: case 0x2e: - case 0x36: case 0x3e: - case 0x64: case 0x65: - case 0xf2: case 0xf3: - continue; - - case 0x40 ... 0x4f: - if (regs->cs != __USER_CS) - /* 32-bit mode: register increment */ - return 0; - /* 64-bit mode: REX prefix */ - continue; - - /* CHECKME: f2, f3 */ - - /* - * pushf: NOTE! We should probably not let - * the user see the TF bit being set. But - * it's more pain than it's worth to avoid - * it, and a debugger could emulate this - * all in user space if it _really_ cares. - */ - case 0x9c: - default: - return 0; - } - } - return 0; -} - -static void set_singlestep(struct task_struct *child) -{ - struct pt_regs *regs = task_pt_regs(child); - - /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also - * cause us to set TF when returning to user mode. - */ - set_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* - * If TF was already set, don't do anything else - */ - if (regs->eflags & TRAP_FLAG) - return; - - /* Set TF on the kernel stack.. */ - regs->eflags |= TRAP_FLAG; - - /* - * ..but if TF is changed by the instruction we will trace, - * don't mark it as being "us" that set it, so that we - * won't clear it by hand later. - */ - if (is_setting_trap_flag(child, regs)) - return; - - child->ptrace |= PT_DTRACE; -} - -static void clear_singlestep(struct task_struct *child) -{ - /* Always clear TIF_SINGLESTEP... */ - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - - /* But touch TF only if it was set by us.. */ - if (child->ptrace & PT_DTRACE) { - struct pt_regs *regs = task_pt_regs(child); - regs->eflags &= ~TRAP_FLAG; - child->ptrace &= ~PT_DTRACE; - } -} - -/* - * Called by kernel/ptrace.c when detaching.. - * - * Make sure the single step bit is not set. - */ -void ptrace_disable(struct task_struct *child) -{ - clear_singlestep(child); -} - -static int putreg(struct task_struct *child, - unsigned long regno, unsigned long value) -{ - unsigned long tmp; - - switch (regno) { - case offsetof(struct user_regs_struct,fs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.fsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,gs): - if (value && (value & 3) != 3) - return -EIO; - child->thread.gsindex = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ds): - if (value && (value & 3) != 3) - return -EIO; - child->thread.ds = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,es): - if (value && (value & 3) != 3) - return -EIO; - child->thread.es = value & 0xffff; - return 0; - case offsetof(struct user_regs_struct,ss): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - return 0; - case offsetof(struct user_regs_struct,fs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.fs = value; - return 0; - case offsetof(struct user_regs_struct,gs_base): - if (value >= TASK_SIZE_OF(child)) - return -EIO; - child->thread.gs = value; - return 0; - case offsetof(struct user_regs_struct, eflags): - value &= FLAG_MASK; - tmp = get_stack_long(child, EFL_OFFSET); - tmp &= ~FLAG_MASK; - value |= tmp; - break; - case offsetof(struct user_regs_struct,cs): - if ((value & 3) != 3) - return -EIO; - value &= 0xffff; - break; - } - put_stack_long(child, regno - sizeof(struct pt_regs), value); - return 0; -} - -static unsigned long getreg(struct task_struct *child, unsigned long regno) -{ - unsigned long val; - switch (regno) { - case offsetof(struct user_regs_struct, fs): - return child->thread.fsindex; - case offsetof(struct user_regs_struct, gs): - return child->thread.gsindex; - case offsetof(struct user_regs_struct, ds): - return child->thread.ds; - case offsetof(struct user_regs_struct, es): - return child->thread.es; - case offsetof(struct user_regs_struct, fs_base): - return child->thread.fs; - case offsetof(struct user_regs_struct, gs_base): - return child->thread.gs; - default: - regno = regno - sizeof(struct pt_regs); - val = get_stack_long(child, regno); - if (test_tsk_thread_flag(child, TIF_IA32)) - val &= 0xffffffff; - return val; - } - -} - -long arch_ptrace(struct task_struct *child, long request, long addr, long data) -{ - long i, ret; - unsigned ui; - - switch (request) { - /* when I and D space are separate, these will need to be fixed. */ - case PTRACE_PEEKTEXT: /* read word at location addr. */ - case PTRACE_PEEKDATA: - ret = generic_ptrace_peekdata(child, addr, data); - break; - - /* read the word at location addr in the USER area. */ - case PTRACE_PEEKUSR: { - unsigned long tmp; - - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - tmp = getreg(child, addr); - break; - case offsetof(struct user, u_debugreg[0]): - tmp = child->thread.debugreg0; - break; - case offsetof(struct user, u_debugreg[1]): - tmp = child->thread.debugreg1; - break; - case offsetof(struct user, u_debugreg[2]): - tmp = child->thread.debugreg2; - break; - case offsetof(struct user, u_debugreg[3]): - tmp = child->thread.debugreg3; - break; - case offsetof(struct user, u_debugreg[6]): - tmp = child->thread.debugreg6; - break; - case offsetof(struct user, u_debugreg[7]): - tmp = child->thread.debugreg7; - break; - default: - tmp = 0; - break; - } - ret = put_user(tmp,(unsigned long __user *) data); - break; - } - - /* when I and D space are separate, this will have to be fixed. */ - case PTRACE_POKETEXT: /* write the word at location addr. */ - case PTRACE_POKEDATA: - ret = generic_ptrace_pokedata(child, addr, data); - break; - - case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ - { - int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7; - ret = -EIO; - if ((addr & 7) || - addr > sizeof(struct user) - 7) - break; - - switch (addr) { - case 0 ... sizeof(struct user_regs_struct) - sizeof(long): - ret = putreg(child, addr, data); - break; - /* Disallows to set a breakpoint into the vsyscall */ - case offsetof(struct user, u_debugreg[0]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg0 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[1]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg1 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[2]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg2 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[3]): - if (data >= TASK_SIZE_OF(child) - dsize) break; - child->thread.debugreg3 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[6]): - if (data >> 32) - break; - child->thread.debugreg6 = data; - ret = 0; - break; - case offsetof(struct user, u_debugreg[7]): - /* See arch/i386/kernel/ptrace.c for an explanation of - * this awkward check.*/ - data &= ~DR_CONTROL_RESERVED; - for(i=0; i<4; i++) - if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1) - break; - if (i == 4) { - child->thread.debugreg7 = data; - if (data) - set_tsk_thread_flag(child, TIF_DEBUG); - else - clear_tsk_thread_flag(child, TIF_DEBUG); - ret = 0; - } - break; - } - break; - } - case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ - case PTRACE_CONT: /* restart after signal. */ - - ret = -EIO; - if (!valid_signal(data)) - break; - if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - else - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = data; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - ret = 0; - break; - -#ifdef CONFIG_IA32_EMULATION - /* This makes only sense with 32bit programs. Allow a - 64bit debugger to fully examine them too. Better - don't use it against 64bit processes, use - PTRACE_ARCH_PRCTL instead. */ - case PTRACE_SET_THREAD_AREA: { - struct user_desc __user *p; - int old; - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_set_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - case PTRACE_GET_THREAD_AREA: - p = (struct user_desc __user *)data; - get_user(old, &p->entry_number); - put_user(addr, &p->entry_number); - ret = do_get_thread_area(&child->thread, p); - put_user(old, &p->entry_number); - break; - } -#endif - /* normal 64bit interface to access TLS data. - Works just like arch_prctl, except that the arguments - are reversed. */ - case PTRACE_ARCH_PRCTL: - ret = do_arch_prctl(child, data, addr); - break; - -/* - * make the child exit. Best I can do is send it a sigkill. - * perhaps it should be put in the status that it wants to - * exit. - */ - case PTRACE_KILL: - ret = 0; - if (child->exit_state == EXIT_ZOMBIE) /* already dead */ - break; - clear_tsk_thread_flag(child, TIF_SINGLESTEP); - child->exit_code = SIGKILL; - /* make sure the single step bit is not set. */ - clear_singlestep(child); - wake_up_process(child); - break; - - case PTRACE_SINGLESTEP: /* set the trap flag. */ - ret = -EIO; - if (!valid_signal(data)) - break; - clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE); - set_singlestep(child); - child->exit_code = data; - /* give it a chance to run. */ - wake_up_process(child); - ret = 0; - break; - - case PTRACE_GETREGS: { /* Get all gp regs from the child. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret |= __put_user(getreg(child, ui),(unsigned long __user *) data); - data += sizeof(long); - } - break; - } - - case PTRACE_SETREGS: { /* Set all gp regs in the child. */ - unsigned long tmp; - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_regs_struct))) { - ret = -EIO; - break; - } - ret = 0; - for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) { - ret = __get_user(tmp, (unsigned long __user *) data); - if (ret) - break; - ret = putreg(child, ui, tmp); - if (ret) - break; - data += sizeof(long); - } - break; - } - - case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */ - if (!access_ok(VERIFY_WRITE, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - ret = get_fpregs((struct user_i387_struct __user *)data, child); - break; - } - - case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */ - if (!access_ok(VERIFY_READ, (unsigned __user *)data, - sizeof(struct user_i387_struct))) { - ret = -EIO; - break; - } - set_stopped_child_used_math(child); - ret = set_fpregs(child, (struct user_i387_struct __user *)data); - break; - } - - default: - ret = ptrace_request(child, request, addr, data); - break; - } - return ret; -} - -static void syscall_trace(struct pt_regs *regs) -{ - -#if 0 - printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} - -asmlinkage void syscall_trace_enter(struct pt_regs *regs) -{ - /* do the secure computing check first */ - secure_computing(regs->orig_rax); - - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); - - if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { - audit_syscall_entry(AUDIT_ARCH_I386, - regs->orig_rax, - regs->rbx, regs->rcx, - regs->rdx, regs->rsi); - } else { - audit_syscall_entry(AUDIT_ARCH_X86_64, - regs->orig_rax, - regs->rdi, regs->rsi, - regs->rdx, regs->r10); - } - } -} - -asmlinkage void syscall_trace_leave(struct pt_regs *regs) -{ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax); - - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) - syscall_trace(regs); -} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index fab30e13483..150ba29a0d3 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -162,6 +162,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_31, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH8_1, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH9_7, + ich_force_enable_hpet); static struct pci_dev *cached_dev; diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot.c index bb1a0f889c5..5818dc28167 100644 --- a/arch/x86/kernel/reboot_32.c +++ b/arch/x86/kernel/reboot.c @@ -1,64 +1,94 @@ -#include <linux/mm.h> #include <linux/module.h> -#include <linux/delay.h> #include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/mc146818rtc.h> -#include <linux/efi.h> -#include <linux/dmi.h> -#include <linux/ctype.h> -#include <linux/pm.h> #include <linux/reboot.h> -#include <asm/uaccess.h> +#include <linux/init.h> +#include <linux/pm.h> +#include <linux/efi.h> +#include <acpi/reboot.h> +#include <asm/io.h> #include <asm/apic.h> -#include <asm/hpet.h> #include <asm/desc.h> -#include "mach_reboot.h" +#include <asm/hpet.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> +#ifdef CONFIG_X86_32 +# include <linux/dmi.h> +# include <linux/ctype.h> +# include <linux/mc146818rtc.h> +# include <asm/pgtable.h> +#else +# include <asm/iommu.h> +#endif + /* * Power off function, if any */ void (*pm_power_off)(void); EXPORT_SYMBOL(pm_power_off); +static long no_idt[3]; static int reboot_mode; -static int reboot_thru_bios; +enum reboot_type reboot_type = BOOT_KBD; +int reboot_force; -#ifdef CONFIG_SMP +#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) static int reboot_cpu = -1; #endif + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] + warm Don't set the cold reboot flag + cold Set the cold reboot flag + bios Reboot by jumping through the BIOS (only for X86_32) + smp Reboot by executing reset on BSP or other CPU (only for X86_32) + triple Force a triple fault (init) + kbd Use the keyboard controller. cold reset (default) + acpi Use the RESET_REG in the FADT + efi Use efi reset_system runtime service + force Avoid anything that could hang. + */ static int __init reboot_setup(char *str) { - while(1) { + for (;;) { switch (*str) { - case 'w': /* "warm" reboot (no memory testing etc) */ + case 'w': reboot_mode = 0x1234; break; - case 'c': /* "cold" reboot (with memory testing etc) */ - reboot_mode = 0x0; - break; - case 'b': /* "bios" reboot by jumping through the BIOS */ - reboot_thru_bios = 1; - break; - case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */ - reboot_thru_bios = 0; + + case 'c': + reboot_mode = 0; break; + +#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP - case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ + case 's': if (isdigit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (isdigit(*(str+2))) reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); } - /* we will leave sorting out the final value - when we are ready to reboot, since we might not - have set up boot_cpu_id or smp_num_cpu */ + /* we will leave sorting out the final value + when we are ready to reboot, since we might not + have set up boot_cpu_id or smp_num_cpu */ break; +#endif /* CONFIG_SMP */ + + case 'b': #endif + case 'a': + case 'k': + case 't': + case 'e': + reboot_type = *str; + break; + + case 'f': + reboot_force = 1; + break; } - if((str = strchr(str,',')) != NULL) + + str = strchr(str, ','); + if (str) str++; else break; @@ -68,18 +98,21 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); + +#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) */ /* - * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. + * Some machines require the "reboot=b" commandline option, + * this quirk makes that automatic. */ static int __init set_bios_reboot(const struct dmi_system_id *d) { - if (!reboot_thru_bios) { - reboot_thru_bios = 1; + if (reboot_type != BOOT_BIOS) { + reboot_type = BOOT_BIOS; printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); } return 0; @@ -143,7 +176,6 @@ static int __init reboot_init(void) dmi_check_system(reboot_dmi_table); return 0; } - core_initcall(reboot_init); /* The following code and data reboots the machine by switching to real @@ -152,7 +184,6 @@ core_initcall(reboot_init); controller to pulse the CPU reset line, which is more thorough, but doesn't work with at least one type of 486 motherboard. It is easy to stop this code working; hence the copious comments. */ - static unsigned long long real_mode_gdt_entries [3] = { @@ -161,11 +192,9 @@ real_mode_gdt_entries [3] = 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ }; -static struct Xgt_desc_struct +static struct desc_ptr real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }, -no_idt = { 0, 0 }; - +real_mode_idt = { 0x3ff, 0 }; /* This is 16-bit protected mode code to disable paging and the cache, switch to real mode and jump to the BIOS reset code. @@ -185,7 +214,6 @@ no_idt = { 0, 0 }; More could be done here to set up the registers as if a CPU reset had occurred; hopefully real BIOSs don't assume much. */ - static unsigned char real_mode_switch [] = { 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ @@ -223,7 +251,6 @@ void machine_real_restart(unsigned char *code, int length) `outb_p' is needed instead of just `outb'. Use it to be on the safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) */ - spin_lock(&rtc_lock); CMOS_WRITE(0x00, 0x8f); spin_unlock(&rtc_lock); @@ -231,9 +258,8 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - - memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* * Use `swapper_pg_dir' as our page directory. @@ -245,7 +271,6 @@ void machine_real_restart(unsigned char *code, int length) boot)". This seems like a fairly standard thing that gets set by REBOOT.COM programs, and the previous reset routine did this too. */ - *((unsigned short *)0x472) = reboot_mode; /* For the switch to real mode, copy some code to low memory. It has @@ -253,19 +278,16 @@ void machine_real_restart(unsigned char *code, int length) has to have the same physical and virtual address, because it turns off paging. Copy it near the end of the first page, out of the way of BIOS variables. */ - - memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100), + memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), real_mode_switch, sizeof (real_mode_switch)); - memcpy ((void *) (0x1000 - 100), code, length); + memcpy((void *)(0x1000 - 100), code, length); /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); /* Set up a GDT from which we can load segment descriptors for real mode. The GDT is not used in real mode; it is just needed here to prepare the descriptors. */ - load_gdt(&real_mode_gdt); /* Load the data segment registers, and thus the descriptors ready for @@ -273,7 +295,6 @@ void machine_real_restart(unsigned char *code, int length) selector value being loaded here. This is so that the segment registers don't have to be reloaded after switching to real mode: the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" "\tmovl %%eax,%%ds\n" "\tmovl %%eax,%%es\n" @@ -284,130 +305,147 @@ void machine_real_restart(unsigned char *code, int length) /* Jump to the 16-bit code that we copied earlier. It disables paging and the cache, switches to real mode, and jumps to the BIOS reset entry point. */ - __asm__ __volatile__ ("ljmp $0x0008,%0" : - : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); + : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -static void native_machine_shutdown(void) +#endif /* CONFIG_X86_32 */ + +static inline void kb_wait(void) +{ + int i; + + for (i = 0; i < 0x10000; i++) { + if ((inb(0x64) & 0x02) == 0) + break; + udelay(2); + } +} + +void machine_emergency_restart(void) +{ + int i; + + /* Tell the BIOS if we want cold or warm reboot */ + *((unsigned short *)__va(0x472)) = reboot_mode; + + for (;;) { + /* Could also try the reset bit in the Hammer NB */ + switch (reboot_type) { + case BOOT_KBD: + for (i = 0; i < 10; i++) { + kb_wait(); + udelay(50); + outb(0xfe, 0x64); /* pulse reset low */ + udelay(50); + } + + case BOOT_TRIPLE: + load_idt((const struct desc_ptr *)&no_idt); + __asm__ __volatile__("int3"); + + reboot_type = BOOT_KBD; + break; + +#ifdef CONFIG_X86_32 + case BOOT_BIOS: + machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + + reboot_type = BOOT_KBD; + break; +#endif + + case BOOT_ACPI: + acpi_reboot(); + reboot_type = BOOT_KBD; + break; + + + case BOOT_EFI: + if (efi_enabled) + efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, + EFI_SUCCESS, 0, NULL); + + reboot_type = BOOT_KBD; + break; + } + } +} + +void machine_shutdown(void) { + /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ reboot_cpu_id = 0; +#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_isset(reboot_cpu, cpu_online_map)) { + cpu_isset(reboot_cpu, cpu_online_map)) reboot_cpu_id = reboot_cpu; - } +#endif - /* Make certain the cpu I'm rebooting on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + /* Make certain the cpu I'm about to reboot on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) reboot_cpu_id = smp_processor_id(); - } /* Make certain I only run on the appropriate processor */ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - /* O.K. Now that I'm on the appropriate processor, stop - * all of the others, and disable their local APICs. + /* O.K Now that I'm on the appropriate processor, + * stop all of the others. */ - smp_send_stop(); -#endif /* CONFIG_SMP */ +#endif lapic_shutdown(); #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif + #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif -} -void __attribute__((weak)) mach_reboot_fixups(void) -{ +#ifdef CONFIG_X86_64 + pci_iommu_shutdown(); +#endif } -static void native_machine_emergency_restart(void) +void machine_restart(char *__unused) { - if (!reboot_thru_bios) { - if (efi_enabled) { - efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); - load_idt(&no_idt); - __asm__ __volatile__("int3"); - } - /* rebooting needs to touch the page at absolute addr 0 */ - *((unsigned short *)__va(0x472)) = reboot_mode; - for (;;) { - mach_reboot_fixups(); /* for board specific fixups */ - mach_reboot(); - /* That didn't work - force a triple fault.. */ - load_idt(&no_idt); - __asm__ __volatile__("int3"); - } - } - if (efi_enabled) - efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL); + printk("machine restart\n"); - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); -} - -static void native_machine_restart(char * __unused) -{ - machine_shutdown(); + if (!reboot_force) + machine_shutdown(); machine_emergency_restart(); } -static void native_machine_halt(void) +void machine_halt(void) { } -static void native_machine_power_off(void) +void machine_power_off(void) { if (pm_power_off) { - machine_shutdown(); + if (!reboot_force) + machine_shutdown(); pm_power_off(); } } - struct machine_ops machine_ops = { - .power_off = native_machine_power_off, - .shutdown = native_machine_shutdown, - .emergency_restart = native_machine_emergency_restart, - .restart = native_machine_restart, - .halt = native_machine_halt, + .power_off = machine_power_off, + .shutdown = machine_shutdown, + .emergency_restart = machine_emergency_restart, + .restart = machine_restart, + .halt = machine_halt }; - -void machine_power_off(void) -{ - machine_ops.power_off(); -} - -void machine_shutdown(void) -{ - machine_ops.shutdown(); -} - -void machine_emergency_restart(void) -{ - machine_ops.emergency_restart(); -} - -void machine_restart(char *cmd) -{ - machine_ops.restart(cmd); -} - -void machine_halt(void) -{ - machine_ops.halt(); -} diff --git a/arch/x86/kernel/reboot_64.c b/arch/x86/kernel/reboot_64.c deleted file mode 100644 index 53620a92a8f..00000000000 --- a/arch/x86/kernel/reboot_64.c +++ /dev/null @@ -1,176 +0,0 @@ -/* Various gunk just to reboot the machine. */ -#include <linux/module.h> -#include <linux/reboot.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/string.h> -#include <linux/pm.h> -#include <linux/kdebug.h> -#include <linux/sched.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/desc.h> -#include <asm/hw_irq.h> -#include <asm/system.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/apic.h> -#include <asm/hpet.h> -#include <asm/gart.h> - -/* - * Power off function, if any - */ -void (*pm_power_off)(void); -EXPORT_SYMBOL(pm_power_off); - -static long no_idt[3]; -static enum { - BOOT_TRIPLE = 't', - BOOT_KBD = 'k' -} reboot_type = BOOT_KBD; -static int reboot_mode = 0; -int reboot_force; - -/* reboot=t[riple] | k[bd] [, [w]arm | [c]old] - warm Don't set the cold reboot flag - cold Set the cold reboot flag - triple Force a triple fault (init) - kbd Use the keyboard controller. cold reset (default) - force Avoid anything that could hang. - */ -static int __init reboot_setup(char *str) -{ - for (;;) { - switch (*str) { - case 'w': - reboot_mode = 0x1234; - break; - - case 'c': - reboot_mode = 0; - break; - - case 't': - case 'b': - case 'k': - reboot_type = *str; - break; - case 'f': - reboot_force = 1; - break; - } - if((str = strchr(str,',')) != NULL) - str++; - else - break; - } - return 1; -} - -__setup("reboot=", reboot_setup); - -static inline void kb_wait(void) -{ - int i; - - for (i=0; i<0x10000; i++) - if ((inb_p(0x64) & 0x02) == 0) - break; -} - -void machine_shutdown(void) -{ - unsigned long flags; - - /* Stop the cpus and apics */ -#ifdef CONFIG_SMP - int reboot_cpu_id; - - /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; - - /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { - reboot_cpu_id = smp_processor_id(); - } - - /* Make certain I only run on the appropriate processor */ - set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); - - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. - */ - smp_send_stop(); -#endif - - local_irq_save(flags); - -#ifndef CONFIG_SMP - disable_local_APIC(); -#endif - - disable_IO_APIC(); - -#ifdef CONFIG_HPET_TIMER - hpet_disable(); -#endif - local_irq_restore(flags); - - pci_iommu_shutdown(); -} - -void machine_emergency_restart(void) -{ - int i; - - /* Tell the BIOS if we want cold or warm reboot */ - *((unsigned short *)__va(0x472)) = reboot_mode; - - for (;;) { - /* Could also try the reset bit in the Hammer NB */ - switch (reboot_type) { - case BOOT_KBD: - for (i=0; i<10; i++) { - kb_wait(); - udelay(50); - outb(0xfe,0x64); /* pulse reset low */ - udelay(50); - } - - case BOOT_TRIPLE: - load_idt((const struct desc_ptr *)&no_idt); - __asm__ __volatile__("int3"); - - reboot_type = BOOT_KBD; - break; - } - } -} - -void machine_restart(char * __unused) -{ - printk("machine restart\n"); - - if (!reboot_force) { - machine_shutdown(); - } - machine_emergency_restart(); -} - -void machine_halt(void) -{ -} - -void machine_power_off(void) -{ - if (pm_power_off) { - if (!reboot_force) { - machine_shutdown(); - } - pm_power_off(); - } -} - diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index f452726c0fe..dec0b5ec25c 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -30,6 +30,19 @@ static void cs5536_warm_reset(struct pci_dev *dev) udelay(50); /* shouldn't get here but be safe and spin a while */ } +static void rdc321x_reset(struct pci_dev *dev) +{ + unsigned i; + /* Voluntary reset the watchdog timer */ + outl(0x80003840, 0xCF8); + /* Generate a CPU reset on next tick */ + i = inl(0xCFC); + /* Use the minimum timer resolution */ + i |= 0x1600; + outl(i, 0xCFC); + outb(1, 0x92); +} + struct device_fixup { unsigned int vendor; unsigned int device; @@ -40,6 +53,7 @@ static struct device_fixup fixups_table[] = { { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, { PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, }; /* diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c new file mode 100644 index 00000000000..eb9b1a198f5 --- /dev/null +++ b/arch/x86/kernel/rtc.c @@ -0,0 +1,204 @@ +/* + * RTC related functions + */ +#include <linux/acpi.h> +#include <linux/bcd.h> +#include <linux/mc146818rtc.h> + +#include <asm/time.h> +#include <asm/vsyscall.h> + +#ifdef CONFIG_X86_32 +# define CMOS_YEARS_OFFS 1900 +/* + * This is a special lock that is owned by the CPU and holds the index + * register we are working with. It is required for NMI access to the + * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. + */ +volatile unsigned long cmos_lock = 0; +EXPORT_SYMBOL(cmos_lock); +#else +/* + * x86-64 systems only exists since 2002. + * This will work up to Dec 31, 2100 + */ +# define CMOS_YEARS_OFFS 2000 +#endif + +DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +/* + * In order to set the CMOS clock precisely, set_rtc_mmss has to be + * called 500 ms after the second nowtime has started, because when + * nowtime is written into the registers of the CMOS clock, it will + * jump to the next second precisely 500 ms later. Check the Motorola + * MC146818A or Dallas DS12887 data sheet for details. + * + * BUG: This routine does not handle hour overflow properly; it just + * sets the minutes. Usually you'll only notice that after reboot! + */ +int mach_set_rtc_mmss(unsigned long nowtime) +{ + int retval = 0; + int real_seconds, real_minutes, cmos_minutes; + unsigned char save_control, save_freq_select; + + /* tell the clock it's being set */ + save_control = CMOS_READ(RTC_CONTROL); + CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); + + /* stop and reset prescaler */ + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + + cmos_minutes = CMOS_READ(RTC_MINUTES); + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + BCD_TO_BIN(cmos_minutes); + + /* + * since we're only adjusting minutes and seconds, + * don't interfere with hour overflow. This avoids + * messing with unknown time zones but requires your + * RTC not to be off by more than 15 minutes + */ + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + /* correct for half hour time zone */ + if (((abs(real_minutes - cmos_minutes) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + + if (abs(real_minutes - cmos_minutes) < 30) { + if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { + BIN_TO_BCD(real_seconds); + BIN_TO_BCD(real_minutes); + } + CMOS_WRITE(real_seconds,RTC_SECONDS); + CMOS_WRITE(real_minutes,RTC_MINUTES); + } else { + printk(KERN_WARNING + "set_rtc_mmss: can't update from %d to %d\n", + cmos_minutes, real_minutes); + retval = -1; + } + + /* The following flags have to be released exactly in this order, + * otherwise the DS12887 (popular MC146818A clone with integrated + * battery and quartz) will not reset the oscillator and will not + * update precisely 500 ms later. You won't find this mentioned in + * the Dallas Semiconductor data sheets, but who believes data + * sheets anyway ... -- Markus Kuhn + */ + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + + return retval; +} + +unsigned long mach_get_cmos_time(void) +{ + unsigned int year, mon, day, hour, min, sec, century = 0; + + /* + * If UIP is clear, then we have >= 244 microseconds before + * RTC registers will be updated. Spec sheet says that this + * is the reliable way to read RTC - registers. If UIP is set + * then the register access might be invalid. + */ + while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) + cpu_relax(); + + sec = CMOS_READ(RTC_SECONDS); + min = CMOS_READ(RTC_MINUTES); + hour = CMOS_READ(RTC_HOURS); + day = CMOS_READ(RTC_DAY_OF_MONTH); + mon = CMOS_READ(RTC_MONTH); + year = CMOS_READ(RTC_YEAR); + +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64) + /* CHECKME: Is this really 64bit only ??? */ + if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && + acpi_gbl_FADT.century) + century = CMOS_READ(acpi_gbl_FADT.century); +#endif + + if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) { + BCD_TO_BIN(sec); + BCD_TO_BIN(min); + BCD_TO_BIN(hour); + BCD_TO_BIN(day); + BCD_TO_BIN(mon); + BCD_TO_BIN(year); + } + + if (century) { + BCD_TO_BIN(century); + year += century * 100; + printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); + } else { + year += CMOS_YEARS_OFFS; + if (year < 1970) + year += 100; + } + + return mktime(year, mon, day, hour, min, sec); +} + +/* Routines for accessing the CMOS RAM/RTC. */ +unsigned char rtc_cmos_read(unsigned char addr) +{ + unsigned char val; + + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + val = inb_p(RTC_PORT(1)); + lock_cmos_suffix(addr); + return val; +} +EXPORT_SYMBOL(rtc_cmos_read); + +void rtc_cmos_write(unsigned char val, unsigned char addr) +{ + lock_cmos_prefix(addr); + outb_p(addr, RTC_PORT(0)); + outb_p(val, RTC_PORT(1)); + lock_cmos_suffix(addr); +} +EXPORT_SYMBOL(rtc_cmos_write); + +static int set_rtc_mmss(unsigned long nowtime) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&rtc_lock, flags); + retval = set_wallclock(nowtime); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +/* not static: needed by APM */ +unsigned long read_persistent_clock(void) +{ + unsigned long retval, flags; + + spin_lock_irqsave(&rtc_lock, flags); + retval = get_wallclock(); + spin_unlock_irqrestore(&rtc_lock, flags); + + return retval; +} + +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} + +unsigned long long native_read_tsc(void) +{ + return __native_read_tsc(); +} +EXPORT_SYMBOL(native_read_tsc); + diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c index 3558ac78c92..309366f8f60 100644 --- a/arch/x86/kernel/setup64.c +++ b/arch/x86/kernel/setup64.c @@ -24,7 +24,11 @@ #include <asm/sections.h> #include <asm/setup.h> +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; @@ -37,6 +41,8 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned"))); unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + static int do_not_nx __cpuinitdata = 0; /* noexec=on|off @@ -80,6 +86,43 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* + * Copy data used in early init routines from the initial arrays to the + * per cpu data areas. These arrays then become expendable and the + * *_early_ptr's are zeroed indicating that the static arrays are gone. + */ +static void __init setup_per_cpu_maps(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { +#ifdef CONFIG_SMP + if (per_cpu_offset(cpu)) { +#endif + per_cpu(x86_cpu_to_apicid, cpu) = + x86_cpu_to_apicid_init[cpu]; + per_cpu(x86_bios_cpu_apicid, cpu) = + x86_bios_cpu_apicid_init[cpu]; +#ifdef CONFIG_NUMA + per_cpu(x86_cpu_to_node_map, cpu) = + x86_cpu_to_node_map_init[cpu]; +#endif +#ifdef CONFIG_SMP + } + else + printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n", + cpu); +#endif + } + + /* indicate the early static arrays will soon be gone */ + x86_cpu_to_apicid_early_ptr = NULL; + x86_bios_cpu_apicid_early_ptr = NULL; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = NULL; +#endif +} + +/* * Great future plan: * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. * Always point %gs to its beginning @@ -100,18 +143,21 @@ void __init setup_per_cpu_areas(void) for_each_cpu_mask (i, cpu_possible_map) { char *ptr; - if (!NODE_DATA(cpu_to_node(i))) { + if (!NODE_DATA(early_cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(early_cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); cpu_pda(i)->data_offset = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); } + + /* setup percpu data maps early */ + setup_per_cpu_maps(); } void pda_init(int cpu) @@ -169,7 +215,8 @@ void syscall_init(void) #endif /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } void __cpuinit check_efer(void) @@ -227,7 +274,7 @@ void __cpuinit cpu_init (void) * and set up the GDT descriptor: */ if (cpu) - memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE); + memcpy(get_cpu_gdt_table(cpu), cpu_gdt_table, GDT_SIZE); cpu_gdt_descr[cpu].size = GDT_SIZE; load_gdt((const struct desc_ptr *)&cpu_gdt_descr[cpu]); @@ -257,10 +304,10 @@ void __cpuinit cpu_init (void) v, cpu); } estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->ist[v] = (unsigned long)estacks; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } - t->io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); /* * <= is required because the CPU will access up to * 8 bits beyond the end of the IO permission bitmap. diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 9c24b45b513..62adc5f20be 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -44,9 +44,12 @@ #include <linux/crash_dump.h> #include <linux/dmi.h> #include <linux/pfn.h> +#include <linux/pci.h> +#include <linux/init_ohci1394_dma.h> #include <video/edid.h> +#include <asm/mtrr.h> #include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> @@ -67,14 +70,83 @@ address, and must not be in the .bss segment! */ unsigned long init_pg_tables_end __initdata = ~0UL; -int disable_pse __cpuinitdata = 0; - /* * Machine setup.. */ -extern struct resource code_resource; -extern struct resource data_resource; -extern struct resource bss_resource; +static struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource bss_resource = { + .name = "Kernel bss", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; @@ -116,13 +188,17 @@ extern int root_mountflags; unsigned long saved_videomode; -#define RAMDISK_IMAGE_START_MASK 0x07FF +#define RAMDISK_IMAGE_START_MASK 0x07FF #define RAMDISK_PROMPT_FLAG 0x8000 -#define RAMDISK_LOAD_FLAG 0x4000 +#define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifndef CONFIG_DEBUG_BOOT_PARAMS struct boot_params __initdata boot_params; +#else +struct boot_params boot_params; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -166,8 +242,7 @@ static int __init parse_mem(char *arg) return -EINVAL; if (strcmp(arg, "nopentium") == 0) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); - disable_pse = 1; + setup_clear_cpu_cap(X86_FEATURE_PSE); } else { /* If the user specifies memory size, we * limit the BIOS-provided memory map to @@ -176,7 +251,7 @@ static int __init parse_mem(char *arg) * trim the existing memory map. */ unsigned long long mem_size; - + mem_size = memparse(arg, &arg); limit_regions(mem_size); user_defined_memmap = 1; @@ -315,7 +390,7 @@ static void __init reserve_ebda_region(void) unsigned int addr; addr = get_bios_ebda(); if (addr) - reserve_bootmem(addr, PAGE_SIZE); + reserve_bootmem(addr, PAGE_SIZE); } #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -420,6 +495,100 @@ static inline void __init reserve_crashkernel(void) {} #endif +#ifdef CONFIG_BLK_DEV_INITRD + +static bool do_relocate_initrd = false; + +static void __init reserve_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long ramdisk_end = ramdisk_image + ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + + initrd_start = 0; + + if (!boot_params.hdr.type_of_loader || + !ramdisk_image || !ramdisk_size) + return; /* No initrd provided by bootloader */ + + if (ramdisk_end < ramdisk_image) { + printk(KERN_ERR "initrd wraps around end of memory, " + "disabling initrd\n"); + return; + } + if (ramdisk_size >= end_of_lowmem/2) { + printk(KERN_ERR "initrd too large to handle, " + "disabling initrd\n"); + return; + } + if (ramdisk_end <= end_of_lowmem) { + /* All in lowmem, easy case */ + reserve_bootmem(ramdisk_image, ramdisk_size); + initrd_start = ramdisk_image + PAGE_OFFSET; + initrd_end = initrd_start+ramdisk_size; + return; + } + + /* We need to move the initrd down into lowmem */ + ramdisk_here = (end_of_lowmem - ramdisk_size) & PAGE_MASK; + + /* Note: this includes all the lowmem currently occupied by + the initrd, we rely on that fact to keep the data intact. */ + reserve_bootmem(ramdisk_here, ramdisk_size); + initrd_start = ramdisk_here + PAGE_OFFSET; + initrd_end = initrd_start + ramdisk_size; + + do_relocate_initrd = true; +} + +#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) + +static void __init relocate_initrd(void) +{ + unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; + unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; + unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; + unsigned long ramdisk_here; + unsigned long slop, clen, mapaddr; + char *p, *q; + + if (!do_relocate_initrd) + return; + + ramdisk_here = initrd_start - PAGE_OFFSET; + + q = (char *)initrd_start; + + /* Copy any lowmem portion of the initrd */ + if (ramdisk_image < end_of_lowmem) { + clen = end_of_lowmem - ramdisk_image; + p = (char *)__va(ramdisk_image); + memcpy(q, p, clen); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } + + /* Copy the highmem portion of the initrd */ + while (ramdisk_size) { + slop = ramdisk_image & ~PAGE_MASK; + clen = ramdisk_size; + if (clen > MAX_MAP_CHUNK-slop) + clen = MAX_MAP_CHUNK-slop; + mapaddr = ramdisk_image & PAGE_MASK; + p = early_ioremap(mapaddr, clen+slop); + memcpy(q, p+slop, clen); + early_iounmap(p, clen+slop); + q += clen; + ramdisk_image += clen; + ramdisk_size -= clen; + } +} + +#endif /* CONFIG_BLK_DEV_INITRD */ + void __init setup_bootmem_allocator(void) { unsigned long bootmap_size; @@ -475,26 +644,10 @@ void __init setup_bootmem_allocator(void) */ find_smp_config(); #endif - numa_kva_reserve(); #ifdef CONFIG_BLK_DEV_INITRD - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { - unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; - unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; - unsigned long ramdisk_end = ramdisk_image + ramdisk_size; - unsigned long end_of_lowmem = max_low_pfn << PAGE_SHIFT; - - if (ramdisk_end <= end_of_lowmem) { - reserve_bootmem(ramdisk_image, ramdisk_size); - initrd_start = ramdisk_image + PAGE_OFFSET; - initrd_end = initrd_start+ramdisk_size; - } else { - printk(KERN_ERR "initrd extends beyond end of memory " - "(0x%08lx > 0x%08lx)\ndisabling initrd\n", - ramdisk_end, end_of_lowmem); - initrd_start = 0; - } - } + reserve_initrd(); #endif + numa_kva_reserve(); reserve_crashkernel(); } @@ -545,17 +698,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); pre_setup_arch_hook(); early_cpu_init(); + early_ioremap_init(); - /* - * FIXME: This isn't an official loader_type right - * now but does currently work with elilo. - * If we were configured as an EFI kernel, check to make - * sure that we were loaded correctly from elilo and that - * the system table is valid. If not, then initialize normally. - */ #ifdef CONFIG_EFI - if ((boot_params.hdr.type_of_loader == 0x50) && - boot_params.efi_info.efi_systab) + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL32", 4)) efi_enabled = 1; #endif @@ -579,12 +726,9 @@ void __init setup_arch(char **cmdline_p) rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif ARCH_SETUP - if (efi_enabled) - efi_init(); - else { - printk(KERN_INFO "BIOS-provided physical RAM map:\n"); - print_memory_map(memory_setup()); - } + + printk(KERN_INFO "BIOS-provided physical RAM map:\n"); + print_memory_map(memory_setup()); copy_edd(); @@ -612,8 +756,16 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + if (efi_enabled) + efi_init(); + max_low_pfn = setup_memory(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(max_pfn)) + max_low_pfn = setup_memory(); + #ifdef CONFIG_VMI /* * Must be after max_low_pfn is determined, and before kernel @@ -636,6 +788,16 @@ void __init setup_arch(char **cmdline_p) smp_alloc_memory(); /* AP processor realmode stacks in low memory*/ #endif paging_init(); + + /* + * NOTE: On x86-32, only from this point on, fixmaps are ready for use. + */ + +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + remapped_pgdat_init(); sparse_init(); zone_sizes_init(); @@ -644,15 +806,19 @@ void __init setup_arch(char **cmdline_p) * NOTE: at this point the bootmem allocator is fully available. */ +#ifdef CONFIG_BLK_DEV_INITRD + relocate_initrd(); +#endif + paravirt_post_allocator_init(); dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_X86_GENERICARCH generic_apic_probe(); -#endif - if (efi_enabled) - efi_map_memmap(); +#endif #ifdef CONFIG_ACPI /* @@ -661,9 +827,7 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); #endif -#ifdef CONFIG_PCI early_quirks(); -#endif #ifdef CONFIG_ACPI acpi_boot_init(); @@ -692,3 +856,26 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + printk(KERN_INFO "Setting up standard PCI resources\n"); + init_iomem_resources(&code_resource, &data_resource, &bss_resource); + + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 30d94d1d5f5..77fb87bf6e5 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -30,6 +30,7 @@ #include <linux/crash_dump.h> #include <linux/root_dev.h> #include <linux/pci.h> +#include <linux/efi.h> #include <linux/acpi.h> #include <linux/kallsyms.h> #include <linux/edd.h> @@ -39,10 +40,13 @@ #include <linux/dmi.h> #include <linux/dma-mapping.h> #include <linux/ctype.h> +#include <linux/uaccess.h> +#include <linux/init_ohci1394_dma.h> #include <asm/mtrr.h> #include <asm/uaccess.h> #include <asm/system.h> +#include <asm/vsyscall.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/msr.h> @@ -50,6 +54,7 @@ #include <video/edid.h> #include <asm/e820.h> #include <asm/dma.h> +#include <asm/gart.h> #include <asm/mpspec.h> #include <asm/mmu_context.h> #include <asm/proto.h> @@ -59,6 +64,15 @@ #include <asm/sections.h> #include <asm/dmi.h> #include <asm/cacheflush.h> +#include <asm/mce.h> +#include <asm/ds.h> +#include <asm/topology.h> + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else +#define ARCH_SETUP +#endif /* * Machine setup.. @@ -67,6 +81,8 @@ struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + unsigned long mmu_cr4_features; /* Boot loader ID as an integer, for the benefit of proc_dointvec */ @@ -76,7 +92,7 @@ unsigned long saved_video_mode; int force_mwait __cpuinitdata; -/* +/* * Early DMI memory */ int dmi_alloc_index; @@ -122,25 +138,27 @@ struct resource standard_io_resources[] = { #define IORESOURCE_RAM (IORESOURCE_BUSY | IORESOURCE_MEM) -struct resource data_resource = { +static struct resource data_resource = { .name = "Kernel data", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource code_resource = { +static struct resource code_resource = { .name = "Kernel code", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; -struct resource bss_resource = { +static struct resource bss_resource = { .name = "Kernel bss", .start = 0, .end = 0, .flags = IORESOURCE_RAM, }; +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); + #ifdef CONFIG_PROC_VMCORE /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed @@ -166,12 +184,12 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size); if (bootmap == -1L) - panic("Cannot find bootmem map of size %ld\n",bootmap_size); + panic("Cannot find bootmem map of size %ld\n", bootmap_size); bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); reserve_bootmem(bootmap, bootmap_size); -} +} #endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) @@ -205,7 +223,8 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; + free_mem = + ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; ret = parse_crashkernel(boot_command_line, free_mem, &crash_size, &crash_base); @@ -229,33 +248,21 @@ static inline void __init reserve_crashkernel(void) {} #endif -#define EBDA_ADDR_POINTER 0x40E - -unsigned __initdata ebda_addr; -unsigned __initdata ebda_size; - -static void discover_ebda(void) +/* Overridden in paravirt.c if CONFIG_PARAVIRT */ +void __attribute__((weak)) __init memory_setup(void) { - /* - * there is a real-mode segmented pointer pointing to the - * 4K EBDA area at 0x40E - */ - ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); - ebda_addr <<= 4; - - ebda_size = *(unsigned short *)__va(ebda_addr); - - /* Round EBDA up to pages */ - if (ebda_size == 0) - ebda_size = 1; - ebda_size <<= 10; - ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE); - if (ebda_size > 64*1024) - ebda_size = 64*1024; + machine_specific_memory_setup(); } +/* + * setup_arch - architecture-specific boot-time initializations + * + * Note: On x86_64, fixmaps are ready for use even before this is called. + */ void __init setup_arch(char **cmdline_p) { + unsigned i; + printk(KERN_INFO "Command line: %s\n", boot_command_line); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -269,7 +276,15 @@ void __init setup_arch(char **cmdline_p) rd_prompt = ((boot_params.hdr.ram_size & RAMDISK_PROMPT_FLAG) != 0); rd_doload = ((boot_params.hdr.ram_size & RAMDISK_LOAD_FLAG) != 0); #endif - setup_memory_region(); +#ifdef CONFIG_EFI + if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, + "EL64", 4)) + efi_enabled = 1; +#endif + + ARCH_SETUP + + memory_setup(); copy_edd(); if (!boot_params.hdr.root_flags) @@ -293,27 +308,47 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + if (init_ohci1394_dma_early) + init_ohci1394_dma_on_all_controllers(); +#endif + finish_e820_parsing(); + early_gart_iommu_check(); + e820_register_active_regions(0, 0, -1UL); /* * partially used pages are not usable - thus * we are rounding upwards: */ end_pfn = e820_end_of_ram(); + /* update e820 for memory not covered by WB MTRRs */ + mtrr_bp_init(); + if (mtrr_trim_uncached_memory(end_pfn)) { + e820_register_active_regions(0, 0, -1UL); + end_pfn = e820_end_of_ram(); + } + num_physpages = end_pfn; check_efer(); - discover_ebda(); - init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT)); + if (efi_enabled) + efi_init(); dmi_scan_machine(); + io_delay_init(); + #ifdef CONFIG_SMP - /* setup to use the static apicid table during kernel startup */ - x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init; + /* setup to use the early static init tables during kernel startup */ + x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; + x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init; +#ifdef CONFIG_NUMA + x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init; +#endif #endif #ifdef CONFIG_ACPI @@ -340,48 +375,26 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_NUMA - numa_initmem_init(0, end_pfn); + numa_initmem_init(0, end_pfn); #else contig_initmem_init(0, end_pfn); #endif - /* Reserve direct mapping */ - reserve_bootmem_generic(table_start << PAGE_SHIFT, - (table_end - table_start) << PAGE_SHIFT); - - /* reserve kernel */ - reserve_bootmem_generic(__pa_symbol(&_text), - __pa_symbol(&_end) - __pa_symbol(&_text)); + early_res_to_bootmem(); +#ifdef CONFIG_ACPI_SLEEP /* - * reserve physical page 0 - it's a special BIOS page on many boxes, - * enabling clean reboots, SMP operation, laptop functions. + * Reserve low memory region for sleep support. */ - reserve_bootmem_generic(0, PAGE_SIZE); - - /* reserve ebda region */ - if (ebda_addr) - reserve_bootmem_generic(ebda_addr, ebda_size); -#ifdef CONFIG_NUMA - /* reserve nodemap region */ - if (nodemap_addr) - reserve_bootmem_generic(nodemap_addr, nodemap_size); + acpi_reserve_bootmem(); #endif -#ifdef CONFIG_SMP - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); -#endif + if (efi_enabled) + efi_reserve_bootmem(); -#ifdef CONFIG_ACPI_SLEEP /* - * Reserve low memory region for sleep support. - */ - acpi_reserve_bootmem(); -#endif - /* - * Find and reserve possible boot-time SMP configuration: - */ + * Find and reserve possible boot-time SMP configuration: + */ find_smp_config(); #ifdef CONFIG_BLK_DEV_INITRD if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { @@ -395,6 +408,8 @@ void __init setup_arch(char **cmdline_p) initrd_start = ramdisk_image + PAGE_OFFSET; initrd_end = initrd_start+ramdisk_size; } else { + /* Assumes everything on node 0 */ + free_bootmem(ramdisk_image, ramdisk_size); printk(KERN_ERR "initrd extends beyond end of memory " "(0x%08lx > 0x%08lx)\ndisabling initrd\n", ramdisk_end, end_of_mem); @@ -404,17 +419,10 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); paging_init(); + map_vsyscall(); -#ifdef CONFIG_PCI early_quirks(); -#endif - /* - * set this early, so we dont allocate cpu0 - * if MADT list doesnt list BSP first - * mpparse.c/MP_processor_info() allocates logical cpu numbers. - */ - cpu_set(0, cpu_present_map); #ifdef CONFIG_ACPI /* * Read APIC and some other early information from ACPI tables. @@ -430,25 +438,24 @@ void __init setup_arch(char **cmdline_p) if (smp_found_config) get_smp_config(); init_apic_mappings(); + ioapic_init_mappings(); /* * We trust e820 completely. No explicit ROM probing in memory. - */ - e820_reserve_resources(); + */ + e820_reserve_resources(&code_resource, &data_resource, &bss_resource); e820_mark_nosave_regions(); - { - unsigned i; /* request I/O space for devices used on all i[345]86 PCs */ for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) request_resource(&ioport_resource, &standard_io_resources[i]); - } e820_setup_gap(); #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) - conswitchp = &vga_con; + if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY)) + conswitchp = &vga_con; #elif defined(CONFIG_DUMMY_CONSOLE) conswitchp = &dummy_con; #endif @@ -479,9 +486,10 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size=(ecx>>24)+(edx>>24); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " + "D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); /* On K8 L1 TLB is inclusive, so don't count it */ c->x86_tlbsize = 0; } @@ -495,11 +503,8 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - - if (n >= 0x80000007) - cpuid(0x80000007, &dummy, &dummy, &dummy, &c->x86_power); if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); + cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } @@ -508,14 +513,15 @@ static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA static int nearby_node(int apicid) { - int i; + int i, node; + for (i = apicid - 1; i >= 0; i--) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - int node = apicid_to_node[i]; + node = apicid_to_node[i]; if (node != NUMA_NO_NODE && node_online(node)) return node; } @@ -527,7 +533,7 @@ static int nearby_node(int apicid) * On a AMD dual core setup the lower bits of the APIC id distingush the cores. * Assumes number of cores is a power of two. */ -static void __init amd_detect_cmp(struct cpuinfo_x86 *c) +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned bits; @@ -536,7 +542,54 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) int node = 0; unsigned apicid = hard_smp_processor_id(); #endif - unsigned ecx = cpuid_ecx(0x80000008); + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); + /* Convert the APIC ID into the socket ID */ + c->phys_proc_id = phys_pkg_id(bits); + +#ifdef CONFIG_NUMA + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_SMP + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); c->x86_max_cores = (ecx & 0xff) + 1; @@ -549,37 +602,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) bits++; } - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1); - /* Convert the APIC ID into the socket ID */ - c->phys_proc_id = phys_pkg_id(bits); - -#ifdef CONFIG_NUMA - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits); - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); + c->x86_coreid_bits = bits; - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif #endif } @@ -595,8 +619,8 @@ static void __init amd_detect_cmp(struct cpuinfo_x86 *c) /* AMD systems with C1E don't have a working lAPIC timer. Check for that. */ static __cpuinit int amd_apic_timer_broken(void) { - u32 lo, hi; - u32 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + u32 lo, hi, eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); + switch (eax & CPUID_XFAM) { case CPUID_XFAM_K8: if ((eax & CPUID_XMOD) < CPUID_XMOD_REV_F) @@ -614,6 +638,15 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) +{ + early_init_amd_mc(c); + + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +} + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { unsigned level; @@ -624,7 +657,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 * bit 6 of msr C001_0015 - * + * * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ @@ -637,35 +670,32 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ - clear_bit(0*32+31, &c->x86_capability); - + clear_bit(0*32+31, (unsigned long *)&c->x86_capability); + /* On C+ stepping K8 rep microcode works well for copy/memset */ level = cpuid_eax(1); - if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || + level >= 0x0f58)) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); if (c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) - set_bit(X86_FEATURE_FXSAVE_LEAK, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); level = get_model_name(c); if (!level) { - switch (c->x86) { + switch (c->x86) { case 15: /* Should distinguish Models here, but this is only a fallback anyways. */ strcpy(c->x86_model_id, "Hammer"); - break; - } - } + break; + } + } display_cacheinfo(c); - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); - /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); @@ -677,41 +707,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) - set_bit(X86_FEATURE_K8, &c->x86_capability); - - /* RDTSC can be speculated around */ - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_K8); - /* Family 10 doesn't support C states in MWAIT so don't use it */ - if (c->x86 == 0x10 && !force_mwait) - clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); if (amd_apic_timer_broken()) disable_apic_timer = 1; } -static void __cpuinit detect_ht(struct cpuinfo_x86 *c) +void __cpuinit detect_ht(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; cpuid(1, &eax, &ebx, &ecx, &edx); if (!cpu_has(c, X86_FEATURE_HT)) return; - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1 ) { + } else if (smp_num_siblings > 1) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of " + "siblings %d", smp_num_siblings); smp_num_siblings = 1; return; } @@ -721,7 +748,7 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) smp_num_siblings = smp_num_siblings / c->x86_max_cores; - index_msb = get_count_order(smp_num_siblings) ; + index_msb = get_count_order(smp_num_siblings); core_bits = get_count_order(c->x86_max_cores); @@ -730,8 +757,10 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c) } out: if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); } #endif @@ -773,28 +802,39 @@ static void srat_detect_node(void) #endif } +static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) +{ + if ((c->x86 == 0xf && c->x86_model >= 0x03) || + (c->x86 == 0x6 && c->x86_model >= 0x0e)) + set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { /* Cache sizes */ unsigned n; init_intel_cacheinfo(c); - if (c->cpuid_level > 9 ) { + if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } if (cpu_has_ds) { unsigned int l1, l2; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) - set_bit(X86_FEATURE_BTS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) - set_bit(X86_FEATURE_PEBS, c->x86_capability); + set_cpu_cap(c, X86_FEATURE_PEBS); } + + if (cpu_has_bts) + ds_init_intel(c); + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -811,14 +851,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size * 2; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); if (c->x86 == 6) - set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - if (c->x86 == 15) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - c->x86_max_cores = intel_num_cpu_cores(c); + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } @@ -835,18 +872,12 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) c->x86_vendor = X86_VENDOR_UNKNOWN; } -struct cpu_model_info { - int vendor; - int family; - char *model_names[16]; -}; - /* Do some early cpuid on the boot CPU to get some parameter that are needed before check_bugs. Everything advanced is in identify_cpu below. */ -void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) +static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) { - u32 tfms; + u32 tfms, xlvl; c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; @@ -857,6 +888,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_clflush_size = 64; c->x86_cache_alignment = c->x86_clflush_size; c->x86_max_cores = 1; + c->x86_coreid_bits = 0; c->extended_cpuid_level = 0; memset(&c->x86_capability, 0, sizeof c->x86_capability); @@ -865,7 +897,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); - + get_cpu_vendor(c); /* Initialize the standard set of capabilities */ @@ -883,7 +915,7 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (c->x86_capability[0] & (1<<19)) + if (c->x86_capability[0] & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ @@ -893,18 +925,6 @@ void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff; #endif -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - u32 xlvl; - - early_identify_cpu(c); - /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -925,6 +945,30 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + c->extended_cpuid_level = cpuid_eax(0x80000000); + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + early_init_amd(c); + break; + case X86_VENDOR_INTEL: + early_init_intel(c); + break; + } + +} + +/* + * This does the hard work of actually picking apart the CPU stuff... + */ +void __cpuinit identify_cpu(struct cpuinfo_x86 *c) +{ + int i; + + early_identify_cpu(c); + init_scattered_cpuid_features(c); c->apicid = phys_pkg_id(0); @@ -954,8 +998,7 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) break; } - select_idle_routine(c); - detect_ht(c); + detect_ht(c); /* * On SMP, boot_cpu_data holds the common feature set between @@ -965,32 +1008,56 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] ^= cleared_cpu_caps[i]; + #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + select_idle_routine(c); + if (c != &boot_cpu_data) mtrr_ap_init(); #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif + +} + +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; } - +__setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) - printk("%s", c->x86_model_id); + printk(KERN_INFO "%s", c->x86_model_id); - if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); } +static __init int setup_disablecpuid(char *arg) +{ + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; +} +__setup("clearcpuid=", setup_disablecpuid); + /* * Get CPU information for use by the procfs. */ @@ -998,9 +1065,9 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - int cpu = 0; + int cpu = 0, i; - /* + /* * These flag bits must match the definitions in <asm/cpufeature.h>. * NULL means this bit is undefined or reserved; either way it doesn't * have meaning as far as Linux is concerned. Note that it's important @@ -1010,10 +1077,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) */ static const char *const x86_cap_flags[] = { /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", /* AMD-defined */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -1080,34 +1147,35 @@ static int show_cpuinfo(struct seq_file *m, void *v) cpu = c->cpu_index; #endif - seq_printf(m,"processor\t: %u\n" - "vendor_id\t: %s\n" - "cpu family\t: %d\n" - "model\t\t: %d\n" - "model name\t: %s\n", - (unsigned)cpu, - c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", - c->x86, - (int)c->x86_model, - c->x86_model_id[0] ? c->x86_model_id : "unknown"); - + seq_printf(m, "processor\t: %u\n" + "vendor_id\t: %s\n" + "cpu family\t: %d\n" + "model\t\t: %d\n" + "model name\t: %s\n", + (unsigned)cpu, + c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown", + c->x86, + (int)c->x86_model, + c->x86_model_id[0] ? c->x86_model_id : "unknown"); + if (c->x86_mask || c->cpuid_level >= 0) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); - - if (cpu_has(c,X86_FEATURE_TSC)) { + + if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get((unsigned)cpu); + if (!freq) freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - freq / 1000, (freq % 1000)); + freq / 1000, (freq % 1000)); } /* Cache size */ - if (c->x86_cache_size >= 0) + if (c->x86_cache_size >= 0) seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); - + #ifdef CONFIG_SMP if (smp_num_siblings * c->x86_max_cores > 1) { seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); @@ -1116,48 +1184,43 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); } -#endif +#endif seq_printf(m, - "fpu\t\t: yes\n" - "fpu_exception\t: yes\n" - "cpuid level\t: %d\n" - "wp\t\t: yes\n" - "flags\t\t:", + "fpu\t\t: yes\n" + "fpu_exception\t: yes\n" + "cpuid level\t: %d\n" + "wp\t\t: yes\n" + "flags\t\t:", c->cpuid_level); - { - int i; - for ( i = 0 ; i < 32*NCAPINTS ; i++ ) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) - seq_printf(m, " %s", x86_cap_flags[i]); - } - + for (i = 0; i < 32*NCAPINTS; i++) + if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + seq_printf(m, " %s", x86_cap_flags[i]); + seq_printf(m, "\nbogomips\t: %lu.%02lu\n", c->loops_per_jiffy/(500000/HZ), (c->loops_per_jiffy/(5000/HZ)) % 100); - if (c->x86_tlbsize > 0) + if (c->x86_tlbsize > 0) seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize); seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size); seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment); - seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", + seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n", c->x86_phys_bits, c->x86_virt_bits); seq_printf(m, "power management:"); - { - unsigned i; - for (i = 0; i < 32; i++) - if (c->x86_power & (1 << i)) { - if (i < ARRAY_SIZE(x86_power_flags) && - x86_power_flags[i]) - seq_printf(m, "%s%s", - x86_power_flags[i][0]?" ":"", - x86_power_flags[i]); - else - seq_printf(m, " [%d]", i); - } + for (i = 0; i < 32; i++) { + if (c->x86_power & (1 << i)) { + if (i < ARRAY_SIZE(x86_power_flags) && + x86_power_flags[i]) + seq_printf(m, "%s%s", + x86_power_flags[i][0]?" ":"", + x86_power_flags[i]); + else + seq_printf(m, " [%d]", i); + } } seq_printf(m, "\n\n"); @@ -1184,8 +1247,8 @@ static void c_stop(struct seq_file *m, void *v) { } -struct seq_operations cpuinfo_op = { - .start =c_start, +const struct seq_operations cpuinfo_op = { + .start = c_start, .next = c_next, .stop = c_stop, .show = show_cpuinfo, diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 20f29e4c1d3..caee1f002fe 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -23,6 +23,7 @@ #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> +#include <asm/vdso.h> #include "sigframe_32.h" #define DEBUG_SIG 0 @@ -81,14 +82,14 @@ sys_sigaction(int sig, const struct old_sigaction __user *act, } asmlinkage int -sys_sigaltstack(unsigned long ebx) +sys_sigaltstack(unsigned long bx) { /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */ - struct pt_regs *regs = (struct pt_regs *)&ebx; - const stack_t __user *uss = (const stack_t __user *)ebx; - stack_t __user *uoss = (stack_t __user *)regs->ecx; + struct pt_regs *regs = (struct pt_regs *)&bx; + const stack_t __user *uss = (const stack_t __user *)bx; + stack_t __user *uoss = (stack_t __user *)regs->cx; - return do_sigaltstack(uss, uoss, regs->esp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -109,12 +110,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax #define COPY_SEG(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp; } + regs->seg = tmp; } #define COPY_SEG_STRICT(seg) \ { unsigned short tmp; \ err |= __get_user(tmp, &sc->seg); \ - regs->x##seg = tmp|3; } + regs->seg = tmp|3; } #define GET_SEG(seg) \ { unsigned short tmp; \ @@ -130,22 +131,22 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax COPY_SEG(fs); COPY_SEG(es); COPY_SEG(ds); - COPY(edi); - COPY(esi); - COPY(ebp); - COPY(esp); - COPY(ebx); - COPY(edx); - COPY(ecx); - COPY(eip); + COPY(di); + COPY(si); + COPY(bp); + COPY(sp); + COPY(bx); + COPY(dx); + COPY(cx); + COPY(ip); COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); { unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_eax = -1; /* disable syscall checks */ + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -164,7 +165,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax } } - err |= __get_user(*peax, &sc->eax); + err |= __get_user(*peax, &sc->ax); return err; badframe: @@ -174,9 +175,9 @@ badframe: asmlinkage int sys_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8); + struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -192,17 +193,20 @@ asmlinkage int sys_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->sc, &eax)) + if (restore_sigcontext(regs, &frame->sc, &ax)) goto badframe; - return eax; + return ax; badframe: - if (show_unhandled_signals && printk_ratelimit()) - printk("%s%s[%d] bad frame in sigreturn frame:%p eip:%lx" - " esp:%lx oeax:%lx\n", + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx" + " sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, - current->comm, task_pid_nr(current), frame, regs->eip, - regs->esp, regs->orig_eax); + current->comm, task_pid_nr(current), frame, regs->ip, + regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return 0; @@ -211,9 +215,9 @@ badframe: asmlinkage int sys_rt_sigreturn(unsigned long __unused) { struct pt_regs *regs = (struct pt_regs *) &__unused; - struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4); + struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4); sigset_t set; - int eax; + int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -226,13 +230,13 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: force_sig(SIGSEGV, current); @@ -249,27 +253,27 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, { int tmp, err = 0; - err |= __put_user(regs->xfs, (unsigned int __user *)&sc->fs); + err |= __put_user(regs->fs, (unsigned int __user *)&sc->fs); savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - err |= __put_user(regs->xes, (unsigned int __user *)&sc->es); - err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds); - err |= __put_user(regs->edi, &sc->edi); - err |= __put_user(regs->esi, &sc->esi); - err |= __put_user(regs->ebp, &sc->ebp); - err |= __put_user(regs->esp, &sc->esp); - err |= __put_user(regs->ebx, &sc->ebx); - err |= __put_user(regs->edx, &sc->edx); - err |= __put_user(regs->ecx, &sc->ecx); - err |= __put_user(regs->eax, &sc->eax); + err |= __put_user(regs->es, (unsigned int __user *)&sc->es); + err |= __put_user(regs->ds, (unsigned int __user *)&sc->ds); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user(regs->eip, &sc->eip); - err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs); - err |= __put_user(regs->eflags, &sc->eflags); - err |= __put_user(regs->esp, &sc->esp_at_signal); - err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->cs, (unsigned int __user *)&sc->cs); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); tmp = save_i387(fpstate); if (tmp < 0) @@ -290,29 +294,36 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, static inline void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) { - unsigned long esp; + unsigned long sp; /* Default to using normal stack */ - esp = regs->esp; + sp = regs->sp; + + /* + * If we are on the alternate signal stack and would overflow it, don't. + * Return an always-bogus address instead so we will die with SIGSEGV. + */ + if (on_sig_stack(sp) && !likely(on_sig_stack(sp - frame_size))) + return (void __user *) -1L; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(esp) == 0) - esp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } /* This is the legacy signal stack switching. */ - else if ((regs->xss & 0xffff) != __USER_DS && + else if ((regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) { - esp = (unsigned long) ka->sa.sa_restorer; + sp = (unsigned long) ka->sa.sa_restorer; } - esp -= frame_size; + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ - esp = ((esp + 4) & -16ul) - 4; - return (void __user *) esp; + sp = ((sp + 4) & -16ul) - 4; + return (void __user *) sp; } /* These symbols are defined with the addresses in the vsyscall page. @@ -355,9 +366,9 @@ static int setup_frame(int sig, struct k_sigaction *ka, } if (current->binfmt->hasvdso) - restorer = (void *)VDSO_SYM(&__kernel_sigreturn); + restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); else - restorer = (void *)&frame->retcode; + restorer = &frame->retcode; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -379,16 +390,16 @@ static int setup_frame(int sig, struct k_sigaction *ka, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) sig; - regs->edx = (unsigned long) 0; - regs->ecx = (unsigned long) 0; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) sig; + regs->dx = (unsigned long) 0; + regs->cx = (unsigned long) 0; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -396,13 +407,13 @@ static int setup_frame(int sig, struct k_sigaction *ka, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -442,7 +453,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->esp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, @@ -452,13 +463,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up to return from userspace. */ - restorer = (void *)VDSO_SYM(&__kernel_rt_sigreturn); + restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; err |= __put_user(restorer, &frame->pretcode); /* - * This is movl $,%eax ; int $0x80 + * This is movl $,%ax ; int $0x80 * * WE DO NOT USE IT ANY MORE! It's only left here for historical * reasons and because gdb uses it as a signature to notice @@ -472,16 +483,16 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Set up registers for signal handler */ - regs->esp = (unsigned long) frame; - regs->eip = (unsigned long) ka->sa.sa_handler; - regs->eax = (unsigned long) usig; - regs->edx = (unsigned long) &frame->info; - regs->ecx = (unsigned long) &frame->uc; + regs->sp = (unsigned long) frame; + regs->ip = (unsigned long) ka->sa.sa_handler; + regs->ax = (unsigned long) usig; + regs->dx = (unsigned long) &frame->info; + regs->cx = (unsigned long) &frame->uc; - regs->xds = __USER_DS; - regs->xes = __USER_DS; - regs->xss = __USER_DS; - regs->xcs = __USER_CS; + regs->ds = __USER_DS; + regs->es = __USER_DS; + regs->ss = __USER_DS; + regs->cs = __USER_CS; /* * Clear TF when entering the signal handler, but @@ -489,13 +500,13 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, * The tracer may want to single-step inside the * handler too. */ - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #if DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", - current->comm, current->pid, frame, regs->eip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -516,35 +527,33 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->eax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->eax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; } } /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so - * that register information in the sigcontext is correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & TF_MASK) - && likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) @@ -569,7 +578,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -static void fastcall do_signal(struct pt_regs *regs) +static void do_signal(struct pt_regs *regs) { siginfo_t info; int signr; @@ -599,8 +608,8 @@ static void fastcall do_signal(struct pt_regs *regs) * have been cleared if the watchpoint triggered * inside the kernel. */ - if (unlikely(current->thread.debugreg[7])) - set_debugreg(current->thread.debugreg[7], 7); + if (unlikely(current->thread.debugreg7)) + set_debugreg(current->thread.debugreg7, 7); /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { @@ -616,19 +625,19 @@ static void fastcall do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (regs->orig_eax >= 0) { + if (regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - switch (regs->eax) { + switch (regs->ax) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->eax = regs->orig_eax; - regs->eip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->eax = __NR_restart_syscall; - regs->eip -= 2; + regs->ax = __NR_restart_syscall; + regs->ip -= 2; break; } } @@ -651,7 +660,7 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, { /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= TF_MASK; clear_thread_flag(TIF_SINGLESTEP); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 38d806467c0..7347bb14e30 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -39,7 +39,7 @@ asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) { - return do_sigaltstack(uss, uoss, regs->rsp); + return do_sigaltstack(uss, uoss, regs->sp); } @@ -64,8 +64,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned #define COPY(x) err |= __get_user(regs->x, &sc->x) - COPY(rdi); COPY(rsi); COPY(rbp); COPY(rsp); COPY(rbx); - COPY(rdx); COPY(rcx); COPY(rip); + COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); + COPY(dx); COPY(cx); COPY(ip); COPY(r8); COPY(r9); COPY(r10); @@ -86,9 +86,9 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned { unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->eflags); - regs->eflags = (regs->eflags & ~0x40DD5) | (tmpflags & 0x40DD5); - regs->orig_rax = -1; /* disable syscall checks */ + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->orig_ax = -1; /* disable syscall checks */ } { @@ -108,7 +108,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned } } - err |= __get_user(*prax, &sc->rax); + err |= __get_user(*prax, &sc->ax); return err; badframe: @@ -119,9 +119,9 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; sigset_t set; - unsigned long eax; + unsigned long ax; - frame = (struct rt_sigframe __user *)(regs->rsp - 8); + frame = (struct rt_sigframe __user *)(regs->sp - 8); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) { goto badframe; } @@ -135,17 +135,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); + printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax); #endif - if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) + if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT) goto badframe; - return eax; + return ax; badframe: signal_fault(regs,frame,"sigreturn"); @@ -165,14 +165,14 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(0, &sc->gs); err |= __put_user(0, &sc->fs); - err |= __put_user(regs->rdi, &sc->rdi); - err |= __put_user(regs->rsi, &sc->rsi); - err |= __put_user(regs->rbp, &sc->rbp); - err |= __put_user(regs->rsp, &sc->rsp); - err |= __put_user(regs->rbx, &sc->rbx); - err |= __put_user(regs->rdx, &sc->rdx); - err |= __put_user(regs->rcx, &sc->rcx); - err |= __put_user(regs->rax, &sc->rax); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); err |= __put_user(regs->r8, &sc->r8); err |= __put_user(regs->r9, &sc->r9); err |= __put_user(regs->r10, &sc->r10); @@ -183,8 +183,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo err |= __put_user(regs->r15, &sc->r15); err |= __put_user(me->thread.trap_no, &sc->trapno); err |= __put_user(me->thread.error_code, &sc->err); - err |= __put_user(regs->rip, &sc->rip); - err |= __put_user(regs->eflags, &sc->eflags); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); @@ -198,18 +198,18 @@ setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned lo static void __user * get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) { - unsigned long rsp; + unsigned long sp; /* Default to using normal stack - redzone*/ - rsp = regs->rsp - 128; + sp = regs->sp - 128; /* This is the X/Open sanctioned signal stack switching. */ if (ka->sa.sa_flags & SA_ONSTACK) { - if (sas_ss_flags(rsp) == 0) - rsp = current->sas_ss_sp + current->sas_ss_size; + if (sas_ss_flags(sp) == 0) + sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(rsp - size, 16); + return (void __user *)round_down(sp - size, 16); } static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -246,7 +246,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); - err |= __put_user(sas_ss_flags(regs->rsp), + err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); @@ -271,21 +271,21 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; #ifdef DEBUG_SIG - printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); + printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax); #endif /* Set up registers for signal handler */ - regs->rdi = sig; + regs->di = sig; /* In case the signal handler was declared without prototypes */ - regs->rax = 0; + regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the next argument after the signal number on the stack. */ - regs->rsi = (unsigned long)&frame->info; - regs->rdx = (unsigned long)&frame->uc; - regs->rip = (unsigned long) ka->sa.sa_handler; + regs->si = (unsigned long)&frame->info; + regs->dx = (unsigned long)&frame->uc; + regs->ip = (unsigned long) ka->sa.sa_handler; - regs->rsp = (unsigned long)frame; + regs->sp = (unsigned long)frame; /* Set up the CS register to run signal handlers in 64-bit mode, even if the handler happens to be interrupting 32-bit code. */ @@ -295,12 +295,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, see include/asm-x86_64/uaccess.h for details. */ set_fs(USER_DS); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", - current->comm, current->pid, frame, regs->rip, frame->pretcode); + current->comm, current->pid, frame, regs->ip, frame->pretcode); #endif return 0; @@ -321,44 +321,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; #ifdef DEBUG_SIG - printk("handle_signal pid:%d sig:%lu rip:%lx rsp:%lx regs=%p\n", + printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n", current->pid, sig, - regs->rip, regs->rsp, regs); + regs->ip, regs->sp, regs); #endif /* Are we from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* If so, check system call restarting.. */ - switch (regs->rax) { + switch (regs->ax) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: - regs->rax = -EINTR; + regs->ax = -EINTR; break; case -ERESTARTSYS: if (!(ka->sa.sa_flags & SA_RESTART)) { - regs->rax = -EINTR; + regs->ax = -EINTR; break; } /* fallthrough */ case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; } } /* - * If TF is set due to a debugger (PT_DTRACE), clear the TF - * flag so that register information in the sigcontext is - * correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF + * flag so that register information in the sigcontext is correct. */ - if (unlikely(regs->eflags & TF_MASK)) { - if (likely(current->ptrace & PT_DTRACE)) { - current->ptrace &= ~PT_DTRACE; - regs->eflags &= ~TF_MASK; - } - } + if (unlikely(regs->flags & X86_EFLAGS_TF) && + likely(test_and_clear_thread_flag(TIF_FORCED_TF))) + regs->flags &= ~X86_EFLAGS_TF; #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { @@ -430,21 +426,21 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_rax >= 0) { + if ((long)regs->orig_ax >= 0) { /* Restart the system call - no handlers present */ - long res = regs->rax; + long res = regs->ax; switch (res) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - regs->rax = regs->orig_rax; - regs->rip -= 2; + regs->ax = regs->orig_ax; + regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->rax = test_thread_flag(TIF_IA32) ? + regs->ax = test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall; - regs->rip -= 2; + regs->ip -= 2; break; } } @@ -461,13 +457,13 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", - thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); + printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n", + thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current)); #endif /* Pending single-step? */ if (thread_info_flags & _TIF_SINGLESTEP) { - regs->eflags |= TF_MASK; + regs->flags |= X86_EFLAGS_TF; clear_thread_flag(TIF_SINGLESTEP); } @@ -488,9 +484,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (show_unhandled_signals && printk_ratelimit()) - printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", - me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); + if (show_unhandled_signals && printk_ratelimit()) { + printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, me); } diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index fcaa026eb80..dc0cde9d16f 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -159,7 +159,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) apic_write_around(APIC_ICR, cfg); } -void fastcall send_IPI_self(int vector) +void send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector); } @@ -223,7 +223,7 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) */ local_irq_save(flags); - for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) { + for_each_possible_cpu(query_cpu) { if (cpu_isset(query_cpu, mask)) { __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu), vector); @@ -256,13 +256,14 @@ static DEFINE_SPINLOCK(tlbstate_lock); * We need to reload %cr3 since the page tables may be going * away from under us.. */ -void leave_mm(unsigned long cpu) +void leave_mm(int cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -310,7 +311,7 @@ void leave_mm(unsigned long cpu) * 2) Leave the mm if we are in the lazy tlb mode. */ -fastcall void smp_invalidate_interrupt(struct pt_regs *regs) +void smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -638,13 +639,13 @@ static void native_smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); __get_cpu_var(irq_stat).irq_resched_count++; } -fastcall void smp_call_function_interrupt(struct pt_regs *regs) +void smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -675,7 +676,7 @@ static int convert_apicid_to_cpu(int apic_id) { int i; - for (i = 0; i < NR_CPUS; i++) { + for_each_possible_cpu(i) { if (per_cpu(x86_cpu_to_apicid, i) == apic_id) return i; } diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/smp_64.c index 03fa6ed559c..2fd74b06db6 100644 --- a/arch/x86/kernel/smp_64.c +++ b/arch/x86/kernel/smp_64.c @@ -29,7 +29,7 @@ #include <asm/idle.h> /* - * Smarter SMP flushing macros. + * Smarter SMP flushing macros. * c/o Linus Torvalds. * * These mean you can really definitely utterly forget about @@ -37,15 +37,15 @@ * * Optimizations Manfred Spraul <manfred@colorfullife.com> * - * More scalable flush, from Andi Kleen + * More scalable flush, from Andi Kleen * - * To avoid global state use 8 different call vectors. - * Each CPU uses a specific vector to trigger flushes on other - * CPUs. Depending on the received vector the target CPUs look into + * To avoid global state use 8 different call vectors. + * Each CPU uses a specific vector to trigger flushes on other + * CPUs. Depending on the received vector the target CPUs look into * the right per cpu variable for the flush data. * - * With more than 8 CPUs they are hashed to the 8 available - * vectors. The limited global vector space forces us to this right now. + * With more than 8 CPUs they are hashed to the 8 available + * vectors. The limited global vector space forces us to this right now. * In future when interrupts are split into per CPU domains this could be * fixed, at the cost of triggering multiple IPIs in some cases. */ @@ -55,7 +55,6 @@ union smp_flush_state { cpumask_t flush_cpumask; struct mm_struct *flush_mm; unsigned long flush_va; -#define FLUSH_ALL -1ULL spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; @@ -67,16 +66,17 @@ union smp_flush_state { static DEFINE_PER_CPU(union smp_flush_state, flush_state); /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. */ -static inline void leave_mm(int cpu) +void leave_mm(int cpu) { if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); load_cr3(swapper_pg_dir); } +EXPORT_SYMBOL_GPL(leave_mm); /* * @@ -85,25 +85,25 @@ static inline void leave_mm(int cpu) * 1) switch_mm() either 1a) or 1b) * 1a) thread switch to a different mm * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superfluous - * tlb flush. + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but smp_invalidate_interrupt ignore flush ipis + * for the wrong mm, and in the worst case we perform a superfluous + * tlb flush. * 1a2) set cpu mmu_state to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 + * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 * was in lazy tlb mode. * 1a3) update cpu active_mm - * Now cpu0 accepts tlb flushes for the new mm. + * Now cpu0 accepts tlb flushes for the new mm. * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. + * Now the other cpus will send tlb flush ipis. * 1a4) change cr3. * 1b) thread switch without mm change * cpu active_mm is correct, cpu0 already handles * flush ipis. * 1b1) set cpu mmu_state to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. * 1b3) if the bit was 0: leave_mm was called, flush the tlb. * 2) switch %%esp, ie current * @@ -137,12 +137,12 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. */ - sender = ~regs->orig_rax - INVALIDATE_TLB_VECTOR_START; + sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START; f = &per_cpu(flush_state, sender); if (!cpu_isset(cpu, f->flush_cpumask)) goto out; - /* + /* * This was a BUG() but until someone can quote me the * line from the intel manual that guarantees an IPI to * multiple CPUs is retried _only_ on the erroring CPUs @@ -150,10 +150,10 @@ asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs) * * BUG(); */ - + if (f->flush_mm == read_pda(active_mm)) { if (read_pda(mmu_state) == TLBSTATE_OK) { - if (f->flush_va == FLUSH_ALL) + if (f->flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(f->flush_va); @@ -166,19 +166,22 @@ out: add_pda(irq_tlb_count, 1); } -static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, - unsigned long va) +void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, + unsigned long va) { int sender; union smp_flush_state *f; + cpumask_t cpumask = *cpumaskp; /* Caller has disabled preemption */ sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; f = &per_cpu(flush_state, sender); - /* Could avoid this lock when - num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is - probably not worth checking this for a cache-hot lock. */ + /* + * Could avoid this lock when + * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is + * probably not worth checking this for a cache-hot lock. + */ spin_lock(&f->tlbstate_lock); f->flush_mm = mm; @@ -202,14 +205,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, int __cpuinit init_smp_flush(void) { int i; + for_each_cpu_mask(i, cpu_possible_map) { spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); } return 0; } - core_initcall(init_smp_flush); - + void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; @@ -221,10 +224,9 @@ void flush_tlb_current_task(void) local_flush_tlb(); if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_current_task); void flush_tlb_mm (struct mm_struct * mm) { @@ -241,11 +243,10 @@ void flush_tlb_mm (struct mm_struct * mm) leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { @@ -259,8 +260,8 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) if (current->active_mm == mm) { if(current->mm) __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); + else + leave_mm(smp_processor_id()); } if (!cpus_empty(cpu_mask)) @@ -268,7 +269,6 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } -EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -325,11 +325,9 @@ void unlock_ipi_call_lock(void) * this function sends a 'generic call function' IPI to all other CPU * of the system defined in the mask. */ - -static int -__smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +static int __smp_call_function_mask(cpumask_t mask, + void (*func)(void *), void *info, + int wait) { struct call_data_struct data; cpumask_t allbutself; @@ -417,11 +415,10 @@ EXPORT_SYMBOL(smp_call_function_mask); */ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + int nonatomic, int wait) { /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); + int ret, me = get_cpu(); /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -471,9 +468,9 @@ static void stop_this_cpu(void *dummy) */ cpu_clear(smp_processor_id(), cpu_online_map); disable_local_APIC(); - for (;;) + for (;;) halt(); -} +} void smp_send_stop(void) { diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c index 4ea80cbe52e..5787a0c3e29 100644 --- a/arch/x86/kernel/smpboot_32.c +++ b/arch/x86/kernel/smpboot_32.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); static cpumask_t smp_commenced_mask; @@ -92,15 +91,10 @@ static cpumask_t smp_commenced_mask; DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); -/* - * The following static array is used during kernel startup - * and the x86_cpu_to_apicid_ptr contains the address of the - * array during this time. Is it zeroed when the per_cpu - * data area is removed. - */ +/* which logical CPU number maps to which CPU (physical APIC ID) */ u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata = { [0 ... NR_CPUS-1] = BAD_APICID }; -void *x86_cpu_to_apicid_ptr; +void *x86_cpu_to_apicid_early_ptr; DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); @@ -113,7 +107,6 @@ u8 apicid_2_node[MAX_APICID]; extern const unsigned char trampoline_data []; extern const unsigned char trampoline_end []; static unsigned char *trampoline_base; -static int trampoline_exec; static void map_cpu_to_logical_apicid(void); @@ -138,17 +131,13 @@ static unsigned long __cpuinit setup_trampoline(void) */ void __init smp_alloc_memory(void) { - trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE); /* * Has to be in very low memory so we can execute * real-mode AP code. */ if (__pa(trampoline_base) >= 0x9F000) BUG(); - /* - * Make the SMP trampoline executable: - */ - trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); } /* @@ -405,7 +394,7 @@ static void __cpuinit start_secondary(void *unused) setup_secondary_clock(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); enable_8259A_irq(0); } /* @@ -448,38 +437,38 @@ void __devinit initialize_secondary(void) { /* * We don't actually need to load the full TSS, - * basically just the stack pointer and the eip. + * basically just the stack pointer and the ip. */ asm volatile( "movl %0,%%esp\n\t" "jmp *%1" : - :"m" (current->thread.esp),"m" (current->thread.eip)); + :"m" (current->thread.sp),"m" (current->thread.ip)); } /* Static state in head.S used to set up a CPU */ extern struct { - void * esp; + void * sp; unsigned short ss; } stack_start; #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; -EXPORT_SYMBOL(node_2_cpu_mask); +EXPORT_SYMBOL(node_to_cpumask_map); /* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; -EXPORT_SYMBOL(cpu_2_node); +int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; +EXPORT_SYMBOL(cpu_to_node_map); /* set up a mapping between cpu and node. */ static inline void map_cpu_to_node(int cpu, int node) { printk("Mapping cpu %d to node %d\n", cpu, node); - cpu_set(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = node; + cpu_set(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = node; } /* undo a mapping between cpu and node. */ @@ -489,8 +478,8 @@ static inline void unmap_cpu_to_node(int cpu) printk("Unmapping cpu %d from all nodes\n", cpu); for (node = 0; node < MAX_NUMNODES; node ++) - cpu_clear(cpu, node_2_cpu_mask[node]); - cpu_2_node[cpu] = 0; + cpu_clear(cpu, node_to_cpumask_map[node]); + cpu_to_node_map[cpu] = 0; } #else /* !CONFIG_NUMA */ @@ -668,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * target processor state. */ startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, - (unsigned long) stack_start.esp); + (unsigned long) stack_start.sp); /* * Run STARTUP IPI loop. @@ -754,7 +743,7 @@ static inline struct task_struct * __cpuinit alloc_idle_task(int cpu) /* initialize thread_struct. we really want to avoid destroy * idle tread */ - idle->thread.esp = (unsigned long)task_pt_regs(idle); + idle->thread.sp = (unsigned long)task_pt_regs(idle); init_idle(idle, cpu); return idle; } @@ -799,7 +788,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); - idle->thread.eip = (unsigned long) start_secondary; + idle->thread.ip = (unsigned long) start_secondary; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -807,9 +796,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) alternatives_smp_switch(1); /* So we see what's up */ - printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); + printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip); /* Stack for startup_32 can be just as for start_secondary onwards */ - stack_start.esp = (void *) idle->thread.esp; + stack_start.sp = (void *) idle->thread.sp; irq_ctx_init(cpu); @@ -1091,7 +1080,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * Allow the user to impress friends. */ Dprintk("Before bogomips.\n"); - for (cpu = 0; cpu < NR_CPUS; cpu++) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; printk(KERN_INFO @@ -1122,7 +1111,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) * construct cpu_sibling_map, so that we can tell sibling CPUs * efficiently. */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); } @@ -1296,12 +1285,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif zap_low_mappings(); -#ifndef CONFIG_HOTPLUG_CPU - /* - * Disable executability of the SMP trampoline: - */ - set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); -#endif } void __init smp_intr_init(void) diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c index aaf4e129121..cc64b8085c2 100644 --- a/arch/x86/kernel/smpboot_64.c +++ b/arch/x86/kernel/smpboot_64.c @@ -65,7 +65,7 @@ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ -DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID; +DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID; /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; @@ -78,8 +78,6 @@ EXPORT_SYMBOL(cpu_online_map); */ cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; -EXPORT_SYMBOL(cpu_callout_map); - cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); @@ -113,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 }; * a new thread. Also avoids complicated thread destroy functionality * for idle threads. */ +#ifdef CONFIG_HOTPLUG_CPU +/* + * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is + * removed after init for !CONFIG_HOTPLUG_CPU. + */ +static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); +#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) +#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p)) +#else struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; - #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p)) +#endif + /* * Currently trivial. Write the real->protected mode @@ -212,6 +220,7 @@ void __cpuinit smp_callin(void) Dprintk("CALLIN, before setup_local_APIC().\n"); setup_local_APIC(); + end_local_APIC_setup(); /* * Get our bogomips. @@ -338,7 +347,7 @@ void __cpuinit start_secondary(void) if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); - enable_NMI_through_LVT0(NULL); + enable_NMI_through_LVT0(); enable_8259A_irq(0); } @@ -370,7 +379,7 @@ void __cpuinit start_secondary(void) unlock_ipi_call_lock(); - setup_secondary_APIC_clock(); + setup_secondary_clock(); cpu_idle(); } @@ -384,19 +393,20 @@ static void inquire_remote_apic(int apicid) unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; int timeout; - unsigned int status; + u32 status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk("a previous APIC delivery may have failed\n"); + printk(KERN_CONT + "a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); @@ -410,10 +420,10 @@ static void inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk("%08x\n", status); + printk(KERN_CONT "%08x\n", status); break; default: - printk("failed\n"); + printk(KERN_CONT "failed\n"); } } } @@ -466,7 +476,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); @@ -577,7 +587,7 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) c_idle.idle = get_idle_for_cpu(cpu); if (c_idle.idle) { - c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *) + c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1); init_idle(c_idle.idle, cpu); goto do_rest; @@ -613,8 +623,8 @@ do_rest: start_rip = setup_trampoline(); - init_rsp = c_idle.idle->thread.rsp; - per_cpu(init_tss,cpu).rsp0 = init_rsp; + init_rsp = c_idle.idle->thread.sp; + load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread); initial_code = start_secondary; clear_tsk_thread_flag(c_idle.idle, TIF_FORK); @@ -691,7 +701,7 @@ do_rest: } if (boot_error) { cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */ clear_node_cpumask(cpu); /* was set by numa_add_cpu */ cpu_clear(cpu, cpu_present_map); cpu_clear(cpu, cpu_possible_map); @@ -841,24 +851,16 @@ static int __init smp_sanity_check(unsigned max_cpus) return 0; } -/* - * Copy apicid's found by MP_processor_info from initial array to the per cpu - * data area. The x86_cpu_to_apicid_init array is then expendable and the - * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no - * longer available. - */ -void __init smp_set_apicids(void) +static void __init smp_cpu_index_default(void) { - int cpu; + int i; + struct cpuinfo_x86 *c; - for_each_cpu_mask(cpu, cpu_possible_map) { - if (per_cpu_offset(cpu)) - per_cpu(x86_cpu_to_apicid, cpu) = - x86_cpu_to_apicid_init[cpu]; + for_each_cpu_mask(i, cpu_possible_map) { + c = &cpu_data(i); + /* mark all to hotplug */ + c->cpu_index = NR_CPUS; } - - /* indicate the static array will be going away soon */ - x86_cpu_to_apicid_ptr = NULL; } /* @@ -868,9 +870,9 @@ void __init smp_set_apicids(void) void __init smp_prepare_cpus(unsigned int max_cpus) { nmi_watchdog_default(); + smp_cpu_index_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ - smp_set_apicids(); set_cpu_sibling_map(0); if (smp_sanity_check(max_cpus) < 0) { @@ -885,6 +887,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ setup_local_APIC(); + /* + * Enable IO APIC before setting up error vector + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); + end_local_APIC_setup(); + if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id); @@ -903,7 +912,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. */ - setup_boot_APIC_clock(); + setup_boot_clock(); } /* @@ -912,7 +921,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) void __init smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - cpu_set(me, cpu_online_map); + /* already set me in cpu_online_map in boot_cpu_init() */ cpu_set(me, cpu_callout_map); per_cpu(cpu_state, me) = CPU_ONLINE; } @@ -1016,7 +1025,7 @@ void remove_cpu_from_maps(void) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ + clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */ clear_node_cpumask(cpu); } diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c index bbfe85a0f69..8bc38af29ae 100644 --- a/arch/x86/kernel/smpcommon_32.c +++ b/arch/x86/kernel/smpcommon_32.c @@ -14,10 +14,11 @@ __cpuinit void init_gdt(int cpu) { struct desc_struct *gdt = get_cpu_gdt_table(cpu); - pack_descriptor((u32 *)&gdt[GDT_ENTRY_PERCPU].a, - (u32 *)&gdt[GDT_ENTRY_PERCPU].b, + pack_descriptor(&gdt[GDT_ENTRY_PERCPU], __per_cpu_offset[cpu], 0xFFFFF, - 0x80 | DESCTYPE_S | 0x2, 0x8); + 0x2 | DESCTYPE_S, 0x8); + + gdt[GDT_ENTRY_PERCPU].s = 1; per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(cpu_number, cpu) = cpu; diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c index 2a8713ec0f9..2bf6903cb44 100644 --- a/arch/x86/kernel/srat_32.c +++ b/arch/x86/kernel/srat_32.c @@ -57,8 +57,6 @@ static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; static int num_memory_chunks; /* total number of memory chunks */ static u8 __initdata apicid_to_pxm[MAX_APICID]; -extern void * boot_ioremap(unsigned long, unsigned long); - /* Identify CPU proximity domains */ static void __init parse_cpu_affinity_structure(char *p) { @@ -299,7 +297,7 @@ int __init get_memcfg_from_srat(void) } rsdt = (struct acpi_table_rsdt *) - boot_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); + early_ioremap(rsdp->rsdt_physical_address, sizeof(struct acpi_table_rsdt)); if (!rsdt) { printk(KERN_WARNING @@ -339,11 +337,11 @@ int __init get_memcfg_from_srat(void) for (i = 0; i < tables; i++) { /* Map in header, then map in full table length. */ header = (struct acpi_table_header *) - boot_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); + early_ioremap(saved_rsdt.table.table_offset_entry[i], sizeof(struct acpi_table_header)); if (!header) break; header = (struct acpi_table_header *) - boot_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); + early_ioremap(saved_rsdt.table.table_offset_entry[i], header->length); if (!header) break; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 55771fd7e54..02f0f61f5b1 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -22,9 +22,9 @@ static int save_stack_stack(void *data, char *name) return -1; } -static void save_stack_address(void *data, unsigned long addr) +static void save_stack_address(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; + struct stack_trace *trace = data; if (trace->skip > 0) { trace->skip--; return; @@ -33,7 +33,8 @@ static void save_stack_address(void *data, unsigned long addr) trace->entries[trace->nr_entries++] = addr; } -static void save_stack_address_nosched(void *data, unsigned long addr) +static void +save_stack_address_nosched(void *data, unsigned long addr, int reliable) { struct stack_trace *trace = (struct stack_trace *)data; if (in_sched_functions(addr)) @@ -65,15 +66,14 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } -EXPORT_SYMBOL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c new file mode 100644 index 00000000000..2ef1a5f8d67 --- /dev/null +++ b/arch/x86/kernel/step.c @@ -0,0 +1,203 @@ +/* + * x86 single-step support code, common to 32-bit and 64-bit. + */ +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/ptrace.h> + +unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->ip; + seg = regs->cs & 0xffff; + if (v8086_mode(regs)) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if ((seg & SEGMENT_TI_MASK) == SEGMENT_LDT) { + u32 *desc; + unsigned long base; + + seg &= ~7UL; + + mutex_lock(&child->mm->context.lock); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } + mutex_unlock(&child->mm->context.lock); + } + + return addr; +} + +static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_ip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + + /* CHECKME: 64 65 */ + + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + +#ifdef CONFIG_X86_64 + case 0x40 ... 0x4f: + if (regs->cs != __USER_CS) + /* 32-bit mode: register increment */ + return 0; + /* 64-bit mode: REX prefix */ + continue; +#endif + + /* CHECKME: f2, f3 */ + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +/* + * Enable single-stepping. Return nonzero if user mode is not using TF itself. + */ +static int enable_single_step(struct task_struct *child) +{ + struct pt_regs *regs = task_pt_regs(child); + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * If TF was already set, don't do anything else + */ + if (regs->flags & X86_EFLAGS_TF) + return 0; + + /* Set TF on the kernel stack.. */ + regs->flags |= X86_EFLAGS_TF; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + */ + if (is_setting_trap_flag(child, regs)) + return 0; + + set_tsk_thread_flag(child, TIF_FORCED_TF); + + return 1; +} + +/* + * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. + */ +static void write_debugctlmsr(struct task_struct *child, unsigned long val) +{ + child->thread.debugctlmsr = val; + + if (child != current) + return; + + wrmsrl(MSR_IA32_DEBUGCTLMSR, val); +} + +/* + * Enable single or block step. + */ +static void enable_step(struct task_struct *child, bool block) +{ + /* + * Make sure block stepping (BTF) is not enabled unless it should be. + * Note that we don't try to worry about any is_setting_trap_flag() + * instructions after the first when using block stepping. + * So noone should try to use debugger block stepping in a program + * that uses user-mode single stepping itself. + */ + if (enable_single_step(child) && block) { + set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + write_debugctlmsr(child, + child->thread.debugctlmsr | DEBUGCTLMSR_BTF); + } else { + write_debugctlmsr(child, + child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + } +} + +void user_enable_single_step(struct task_struct *child) +{ + enable_step(child, 0); +} + +void user_enable_block_step(struct task_struct *child) +{ + enable_step(child, 1); +} + +void user_disable_single_step(struct task_struct *child) +{ + /* + * Make sure block stepping (BTF) is disabled. + */ + write_debugctlmsr(child, + child->thread.debugctlmsr & ~TIF_DEBUGCTLMSR); + + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) + task_pt_regs(child)->flags &= ~X86_EFLAGS_TF; +} diff --git a/arch/x86/kernel/suspend_64.c b/arch/x86/kernel/suspend_64.c index 2e5efaaf880..09199511c25 100644 --- a/arch/x86/kernel/suspend_64.c +++ b/arch/x86/kernel/suspend_64.c @@ -17,9 +17,26 @@ /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; +static void fix_processor_context(void); + struct saved_context saved_context; -void __save_processor_state(struct saved_context *ctxt) +/** + * __save_processor_state - save CPU registers before creating a + * hibernation image and before restoring the memory state from it + * @ctxt - structure to store the registers contents in + * + * NOTE: If there is a CPU register the modification of which by the + * boot kernel (ie. the kernel used for loading the hibernation image) + * might affect the operations of the restored target kernel (ie. the one + * saved in the hibernation image), then its contents must be saved by this + * function. In other words, if kernel A is hibernated and different + * kernel B is used for loading the hibernation image into memory, the + * kernel A's __save_processor_state() function must save all registers + * needed by kernel A, so that it can operate correctly after the resume + * regardless of what kernel B does in the meantime. + */ +static void __save_processor_state(struct saved_context *ctxt) { kernel_fpu_begin(); @@ -69,7 +86,12 @@ static void do_fpu_end(void) kernel_fpu_end(); } -void __restore_processor_state(struct saved_context *ctxt) +/** + * __restore_processor_state - restore the contents of CPU registers saved + * by __save_processor_state() + * @ctxt - structure to load the registers contents from + */ +static void __restore_processor_state(struct saved_context *ctxt) { /* * control registers @@ -113,14 +135,14 @@ void restore_processor_state(void) __restore_processor_state(&saved_context); } -void fix_processor_context(void) +static void fix_processor_context(void) { int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */ - cpu_gdt(cpu)[GDT_ENTRY_TSS].type = 9; + get_cpu_gdt_table(cpu)[GDT_ENTRY_TSS].type = 9; syscall_init(); /* This sets MSR_*STAR and related */ load_TR_desc(); /* This does ltr */ diff --git a/arch/x86/kernel/suspend_asm_64.S b/arch/x86/kernel/suspend_asm_64.S index 72f952103e5..aeb9a4d7681 100644 --- a/arch/x86/kernel/suspend_asm_64.S +++ b/arch/x86/kernel/suspend_asm_64.S @@ -18,13 +18,13 @@ ENTRY(swsusp_arch_suspend) movq $saved_context, %rax - movq %rsp, pt_regs_rsp(%rax) - movq %rbp, pt_regs_rbp(%rax) - movq %rsi, pt_regs_rsi(%rax) - movq %rdi, pt_regs_rdi(%rax) - movq %rbx, pt_regs_rbx(%rax) - movq %rcx, pt_regs_rcx(%rax) - movq %rdx, pt_regs_rdx(%rax) + movq %rsp, pt_regs_sp(%rax) + movq %rbp, pt_regs_bp(%rax) + movq %rsi, pt_regs_si(%rax) + movq %rdi, pt_regs_di(%rax) + movq %rbx, pt_regs_bx(%rax) + movq %rcx, pt_regs_cx(%rax) + movq %rdx, pt_regs_dx(%rax) movq %r8, pt_regs_r8(%rax) movq %r9, pt_regs_r9(%rax) movq %r10, pt_regs_r10(%rax) @@ -34,7 +34,7 @@ ENTRY(swsusp_arch_suspend) movq %r14, pt_regs_r14(%rax) movq %r15, pt_regs_r15(%rax) pushfq - popq pt_regs_eflags(%rax) + popq pt_regs_flags(%rax) /* save the address of restore_registers */ movq $restore_registers, %rax @@ -115,13 +115,13 @@ ENTRY(restore_registers) /* We don't restore %rax, it must be 0 anyway */ movq $saved_context, %rax - movq pt_regs_rsp(%rax), %rsp - movq pt_regs_rbp(%rax), %rbp - movq pt_regs_rsi(%rax), %rsi - movq pt_regs_rdi(%rax), %rdi - movq pt_regs_rbx(%rax), %rbx - movq pt_regs_rcx(%rax), %rcx - movq pt_regs_rdx(%rax), %rdx + movq pt_regs_sp(%rax), %rsp + movq pt_regs_bp(%rax), %rbp + movq pt_regs_si(%rax), %rsi + movq pt_regs_di(%rax), %rdi + movq pt_regs_bx(%rax), %rbx + movq pt_regs_cx(%rax), %rcx + movq pt_regs_dx(%rax), %rdx movq pt_regs_r8(%rax), %r8 movq pt_regs_r9(%rax), %r9 movq pt_regs_r10(%rax), %r10 @@ -130,7 +130,7 @@ ENTRY(restore_registers) movq pt_regs_r13(%rax), %r13 movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 - pushq pt_regs_eflags(%rax) + pushq pt_regs_flags(%rax) popfq xorq %rax, %rax diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 907942ee6e7..bd802a5e1aa 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -12,6 +12,7 @@ #include <linux/file.h> #include <linux/utsname.h> #include <linux/personality.h> +#include <linux/random.h> #include <asm/uaccess.h> #include <asm/ia32.h> @@ -65,6 +66,7 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) { + unsigned long new_begin; /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -74,6 +76,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, of playground for now. -AK */ *begin = 0x40000000; *end = 0x80000000; + if (current->flags & PF_RANDOMIZE) { + new_begin = randomize_range(*begin, *begin + 0x02000000, 0); + if (new_begin) + *begin = new_begin; + } } else { *begin = TASK_UNMAPPED_BASE; *end = TASK_SIZE; @@ -143,6 +150,97 @@ full_search: } } + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + + /* requested length too big for entire address space */ + if (len > TASK_SIZE) + return -ENOMEM; + + if (flags & MAP_FIXED) + return addr; + + /* for MAP_32BIT mappings we force the legact mmap base */ + if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) + goto bottomup; + + /* requesting a specific address */ + if (addr) { + addr = PAGE_ALIGN(addr); + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + + /* make sure it can fit in the remaining address space */ + if (addr > len) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + + if (mm->mmap_base < len) + goto bottomup; + + addr = mm->mmap_base-len; + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (!vma || addr+len <= vma->vm_start) + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + } while (len < vma->vm_start); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + + return addr; +} + + asmlinkage long sys_uname(struct new_utsname __user * name) { int err; diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c new file mode 100644 index 00000000000..6d7ef11e797 --- /dev/null +++ b/arch/x86/kernel/test_nx.c @@ -0,0 +1,176 @@ +/* + * test_nx.c: functional test for NX functionality + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/module.h> +#include <linux/sort.h> +#include <asm/uaccess.h> + +extern int rodata_test_data; + +/* + * This file checks 4 things: + * 1) Check if the stack is not executable + * 2) Check if kmalloc memory is not executable + * 3) Check if the .rodata section is not executable + * 4) Check if the .data section of a module is not executable + * + * To do this, the test code tries to execute memory in stack/kmalloc/etc, + * and then checks if the expected trap happens. + * + * Sadly, this implies having a dynamic exception handling table entry. + * ... which can be done (and will make Rusty cry)... but it can only + * be done in a stand-alone module with only 1 entry total. + * (otherwise we'd have to sort and that's just too messy) + */ + + + +/* + * We want to set up an exception handling point on our stack, + * which means a variable value. This function is rather dirty + * and walks the exception table of the module, looking for a magic + * marker and replaces it with a specific function. + */ +static void fudze_exception_table(void *marker, void *new) +{ + struct module *mod = THIS_MODULE; + struct exception_table_entry *extable; + + /* + * Note: This module has only 1 exception table entry, + * so searching and sorting is not needed. If that changes, + * this would be the place to search and re-sort the exception + * table. + */ + if (mod->num_exentries > 1) { + printk(KERN_ERR "test_nx: too many exception table entries!\n"); + printk(KERN_ERR "test_nx: test results are not reliable.\n"); + return; + } + extable = (struct exception_table_entry *)mod->extable; + extable[0].insn = (unsigned long)new; +} + + +/* + * exception tables get their symbols translated so we need + * to use a fake function to put in there, which we can then + * replace at runtime. + */ +void foo_label(void); + +/* + * returns 0 for not-executable, negative for executable + * + * Note: we cannot allow this function to be inlined, because + * that would give us more than 1 exception table entry. + * This in turn would break the assumptions above. + */ +static noinline int test_address(void *address) +{ + unsigned long result; + + /* Set up an exception table entry for our address */ + fudze_exception_table(&foo_label, address); + result = 1; + asm volatile( + "foo_label:\n" + "0: call *%[fake_code]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: mov %[zero], %[rslt]\n" + " ret\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 8\n" + " .quad 0b\n" + " .quad 2b\n" + ".previous\n" + : [rslt] "=r" (result) + : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) + ); + /* change the exception table back for the next round */ + fudze_exception_table(address, &foo_label); + + if (result) + return -ENODEV; + return 0; +} + +static unsigned char test_data = 0xC3; /* 0xC3 is the opcode for "ret" */ + +static int test_NX(void) +{ + int ret = 0; + /* 0xC3 is the opcode for "ret" */ + char stackcode[] = {0xC3, 0x90, 0 }; + char *heap; + + test_data = 0xC3; + + printk(KERN_INFO "Testing NX protection\n"); + + /* Test 1: check if the stack is not executable */ + if (test_address(&stackcode)) { + printk(KERN_ERR "test_nx: stack was executable\n"); + ret = -ENODEV; + } + + + /* Test 2: Check if the heap is executable */ + heap = kmalloc(64, GFP_KERNEL); + if (!heap) + return -ENOMEM; + heap[0] = 0xC3; /* opcode for "ret" */ + + if (test_address(heap)) { + printk(KERN_ERR "test_nx: heap was executable\n"); + ret = -ENODEV; + } + kfree(heap); + + /* + * The following 2 tests currently fail, this needs to get fixed + * Until then, don't run them to avoid too many people getting scared + * by the error message + */ +#if 0 + +#ifdef CONFIG_DEBUG_RODATA + /* Test 3: Check if the .rodata section is executable */ + if (rodata_test_data != 0xC3) { + printk(KERN_ERR "test_nx: .rodata marker has invalid value\n"); + ret = -ENODEV; + } else if (test_address(&rodata_test_data)) { + printk(KERN_ERR "test_nx: .rodata section is executable\n"); + ret = -ENODEV; + } +#endif + + /* Test 4: Check if the .data section of a module is executable */ + if (test_address(&test_data)) { + printk(KERN_ERR "test_nx: .data section is executable\n"); + ret = -ENODEV; + } + +#endif + return 0; +} + +static void test_exit(void) +{ +} + +module_init(test_NX); +module_exit(test_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the NX infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c new file mode 100644 index 00000000000..4c163772000 --- /dev/null +++ b/arch/x86/kernel/test_rodata.c @@ -0,0 +1,86 @@ +/* + * test_rodata.c: functional test for mark_rodata_ro function + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven <arjan@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/module.h> +#include <asm/sections.h> +extern int rodata_test_data; + +int rodata_test(void) +{ + unsigned long result; + unsigned long start, end; + + /* test 1: read the value */ + /* If this test fails, some previous testrun has clobbered the state */ + if (!rodata_test_data) { + printk(KERN_ERR "rodata_test: test 1 fails (start data)\n"); + return -ENODEV; + } + + /* test 2: write to the variable; this should fault */ + /* + * If this test fails, we managed to overwrite the data + * + * This is written in assembly to be able to catch the + * exception that is supposed to happen in the correct + * case + */ + + result = 1; + asm volatile( + "0: mov %[zero],(%[rodata_test])\n" + " mov %[zero], %[rslt]\n" + "1:\n" + ".section .fixup,\"ax\"\n" + "2: jmp 1b\n" + ".previous\n" + ".section __ex_table,\"a\"\n" + " .align 16\n" +#ifdef CONFIG_X86_32 + " .long 0b,2b\n" +#else + " .quad 0b,2b\n" +#endif + ".previous" + : [rslt] "=r" (result) + : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) + ); + + + if (!result) { + printk(KERN_ERR "rodata_test: test data was not read only\n"); + return -ENODEV; + } + + /* test 3: check the value hasn't changed */ + /* If this test fails, we managed to overwrite the data */ + if (!rodata_test_data) { + printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); + return -ENODEV; + } + /* test 4: check if the rodata section is 4Kb aligned */ + start = (unsigned long)__start_rodata; + end = (unsigned long)__end_rodata; + if (start & (PAGE_SIZE - 1)) { + printk(KERN_ERR "rodata_test: .rodata is not 4k aligned\n"); + return -ENODEV; + } + if (end & (PAGE_SIZE - 1)) { + printk(KERN_ERR "rodata_test: .rodata end is not 4k aligned\n"); + return -ENODEV; + } + + return 0; +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Testcase for the DEBUG_RODATA infrastructure"); +MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 8a322c96bc2..1a89e93f3f1 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -28,98 +28,20 @@ * serialize accesses to xtime/lost_ticks). */ -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/param.h> -#include <linux/string.h> -#include <linux/mm.h> +#include <linux/init.h> #include <linux/interrupt.h> #include <linux/time.h> -#include <linux/delay.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/efi.h> #include <linux/mca.h> -#include <asm/io.h> -#include <asm/smp.h> -#include <asm/irq.h> -#include <asm/msr.h> -#include <asm/delay.h> -#include <asm/mpspec.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/timer.h> -#include <asm/time.h> - -#include "mach_time.h" - -#include <linux/timex.h> - -#include <asm/hpet.h> - #include <asm/arch_hooks.h> - -#include "io_ports.h" - -#include <asm/i8259.h> +#include <asm/hpet.h> +#include <asm/time.h> #include "do_timer.h" unsigned int cpu_khz; /* Detected as we calibrate the TSC */ EXPORT_SYMBOL(cpu_khz); -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); - -/* - * This is a special lock that is owned by the CPU and holds the index - * register we are working with. It is required for NMI access to the - * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details. - */ -volatile unsigned long cmos_lock = 0; -EXPORT_SYMBOL(cmos_lock); - -/* Routines for accessing the CMOS RAM/RTC. */ -unsigned char rtc_cmos_read(unsigned char addr) -{ - unsigned char val; - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - val = inb_p(RTC_PORT(1)); - lock_cmos_suffix(addr); - return val; -} -EXPORT_SYMBOL(rtc_cmos_read); - -void rtc_cmos_write(unsigned char val, unsigned char addr) -{ - lock_cmos_prefix(addr); - outb_p(addr, RTC_PORT(0)); - outb_p(val, RTC_PORT(1)); - lock_cmos_suffix(addr); -} -EXPORT_SYMBOL(rtc_cmos_write); - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval; - unsigned long flags; - - /* gets recalled with irq locally disabled */ - /* XXX - does irqsave resolve this? -johnstul */ - spin_lock_irqsave(&rtc_lock, flags); - retval = set_wallclock(nowtime); - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - - int timer_ack; unsigned long profile_pc(struct pt_regs *regs) @@ -127,17 +49,17 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->xcs) && + if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->ebp + 4); + return *(unsigned long *)(regs->bp + 4); #else - unsigned long *sp = (unsigned long *)®s->esp; + unsigned long *sp = (unsigned long *)®s->sp; /* Return address is either directly at stack pointer - or above a saved eflags. Eflags has bits 22-31 zero, + or above a saved flags. Eflags has bits 22-31 zero, kernel addresses don't. */ - if (sp[0] >> 22) + if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; @@ -193,26 +115,6 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -/* not static: needed by APM */ -unsigned long read_persistent_clock(void) -{ - unsigned long retval; - unsigned long flags; - - spin_lock_irqsave(&rtc_lock, flags); - - retval = get_wallclock(); - - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ void __init hpet_time_init(void) diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 368b1942b39..0380795121a 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -11,43 +11,18 @@ * RTC support code taken from arch/i386/kernel/timers/time_hpet.c */ -#include <linux/kernel.h> -#include <linux/sched.h> -#include <linux/interrupt.h> +#include <linux/clockchips.h> #include <linux/init.h> -#include <linux/mc146818rtc.h> -#include <linux/time.h> -#include <linux/ioport.h> +#include <linux/interrupt.h> #include <linux/module.h> -#include <linux/device.h> -#include <linux/sysdev.h> -#include <linux/bcd.h> -#include <linux/notifier.h> -#include <linux/cpu.h> -#include <linux/kallsyms.h> -#include <linux/acpi.h> -#include <linux/clockchips.h> +#include <linux/time.h> -#ifdef CONFIG_ACPI -#include <acpi/achware.h> /* for PM timer frequency */ -#include <acpi/acpi_bus.h> -#endif #include <asm/i8253.h> -#include <asm/pgtable.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/proto.h> -#include <asm/hpet.h> -#include <asm/sections.h> -#include <linux/hpet.h> -#include <asm/apic.h> #include <asm/hpet.h> -#include <asm/mpspec.h> #include <asm/nmi.h> #include <asm/vgtod.h> - -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); +#include <asm/time.h> +#include <asm/timer.h> volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -56,10 +31,10 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); /* Assume the lock function has either no stack frame or a copy - of eflags from PUSHF + of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { - unsigned long *sp = (unsigned long *)regs->rsp; + unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) @@ -69,82 +44,6 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -/* - * In order to set the CMOS clock precisely, set_rtc_mmss has to be called 500 - * ms after the second nowtime has started, because when nowtime is written - * into the registers of the CMOS clock, it will jump to the next second - * precisely 500 ms later. Check the Motorola MC146818A or Dallas DS12887 data - * sheet for details. - */ - -static int set_rtc_mmss(unsigned long nowtime) -{ - int retval = 0; - int real_seconds, real_minutes, cmos_minutes; - unsigned char control, freq_select; - unsigned long flags; - -/* - * set_rtc_mmss is called when irqs are enabled, so disable irqs here - */ - spin_lock_irqsave(&rtc_lock, flags); -/* - * Tell the clock it's being set and stop it. - */ - control = CMOS_READ(RTC_CONTROL); - CMOS_WRITE(control | RTC_SET, RTC_CONTROL); - - freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE(freq_select | RTC_DIV_RESET2, RTC_FREQ_SELECT); - - cmos_minutes = CMOS_READ(RTC_MINUTES); - BCD_TO_BIN(cmos_minutes); - -/* - * since we're only adjusting minutes and seconds, don't interfere with hour - * overflow. This avoids messing with unknown time zones but requires your RTC - * not to be off by more than 15 minutes. Since we're calling it only when - * our clock is externally synchronized using NTP, this shouldn't be a problem. - */ - - real_seconds = nowtime % 60; - real_minutes = nowtime / 60; - if (((abs(real_minutes - cmos_minutes) + 15) / 30) & 1) - real_minutes += 30; /* correct for half hour time zone */ - real_minutes %= 60; - - if (abs(real_minutes - cmos_minutes) >= 30) { - printk(KERN_WARNING "time.c: can't update CMOS clock " - "from %d to %d\n", cmos_minutes, real_minutes); - retval = -1; - } else { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); - CMOS_WRITE(real_seconds, RTC_SECONDS); - CMOS_WRITE(real_minutes, RTC_MINUTES); - } - -/* - * The following flags have to be released exactly in this order, otherwise the - * DS12887 (popular MC146818A clone with integrated battery and quartz) will - * not reset the oscillator and will not update precisely 500 ms later. You - * won't find this mentioned in the Dallas Semiconductor data sheets, but who - * believes data sheets anyway ... -- Markus Kuhn - */ - - CMOS_WRITE(control, RTC_CONTROL); - CMOS_WRITE(freq_select, RTC_FREQ_SELECT); - - spin_unlock_irqrestore(&rtc_lock, flags); - - return retval; -} - -int update_persistent_clock(struct timespec now) -{ - return set_rtc_mmss(now.tv_sec); -} - static irqreturn_t timer_event_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); @@ -154,67 +53,10 @@ static irqreturn_t timer_event_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -unsigned long read_persistent_clock(void) -{ - unsigned int year, mon, day, hour, min, sec; - unsigned long flags; - unsigned century = 0; - - spin_lock_irqsave(&rtc_lock, flags); - /* - * if UIP is clear, then we have >= 244 microseconds before RTC - * registers will be updated. Spec sheet says that this is the - * reliable way to read RTC - registers invalid (off bus) during update - */ - while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP)) - cpu_relax(); - - - /* now read all RTC registers while stable with interrupts disabled */ - sec = CMOS_READ(RTC_SECONDS); - min = CMOS_READ(RTC_MINUTES); - hour = CMOS_READ(RTC_HOURS); - day = CMOS_READ(RTC_DAY_OF_MONTH); - mon = CMOS_READ(RTC_MONTH); - year = CMOS_READ(RTC_YEAR); -#ifdef CONFIG_ACPI - if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID && - acpi_gbl_FADT.century) - century = CMOS_READ(acpi_gbl_FADT.century); -#endif - spin_unlock_irqrestore(&rtc_lock, flags); - - /* - * We know that x86-64 always uses BCD format, no need to check the - * config register. - */ - - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); - - if (century) { - BCD_TO_BIN(century); - year += century * 100; - printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { - /* - * x86-64 systems only exists since 2002. - * This will work up to Dec 31, 2100 - */ - year += 2000; - } - - return mktime(year, mon, day, hour, min, sec); -} - /* calibrate_cpu is used on systems with fixed rate TSCs to determine * processor frequency */ #define TICK_COUNT 100000000 -static unsigned int __init tsc_calibrate_cpu_khz(void) +unsigned long __init native_calculate_cpu_khz(void) { int tsc_start, tsc_now; int i, no_ctr_free; @@ -241,7 +83,7 @@ static unsigned int __init tsc_calibrate_cpu_khz(void) rdtscl(tsc_start); do { rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); + tsc_now = get_cycles(); } while ((tsc_now - tsc_start) < TICK_COUNT); local_irq_restore(flags); @@ -264,20 +106,22 @@ static struct irqaction irq0 = { .name = "timer" }; -void __init time_init(void) +void __init hpet_time_init(void) { if (!hpet_enable()) setup_pit_timer(); setup_irq(0, &irq0); +} +void __init time_init(void) +{ tsc_calibrate(); cpu_khz = tsc_khz; if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 16) - cpu_khz = tsc_calibrate_cpu_khz(); + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) + cpu_khz = calculate_cpu_khz(); if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); @@ -290,4 +134,5 @@ void __init time_init(void) printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); + late_time_init = choose_time_init(); } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c new file mode 100644 index 00000000000..6dfd4e76661 --- /dev/null +++ b/arch/x86/kernel/tls.c @@ -0,0 +1,213 @@ +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/user.h> +#include <linux/regset.h> + +#include <asm/uaccess.h> +#include <asm/desc.h> +#include <asm/system.h> +#include <asm/ldt.h> +#include <asm/processor.h> +#include <asm/proto.h> + +#include "tls.h" + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(&t->tls_array[idx])) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static void set_tls_desc(struct task_struct *p, int idx, + const struct user_desc *info, int n) +{ + struct thread_struct *t = &p->thread; + struct desc_struct *desc = &t->tls_array[idx - GDT_ENTRY_TLS_MIN]; + int cpu; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + while (n-- > 0) { + if (LDT_empty(info)) + desc->a = desc->b = 0; + else + fill_ldt(desc, info); + ++info; + ++desc; + } + + if (t == ¤t->thread) + load_TLS(t, cpu); + + put_cpu(); +} + +/* + * Set a given TLS descriptor: + */ +int do_set_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info, + int can_allocate) +{ + struct user_desc info; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + + if (idx == -1) + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1 && can_allocate) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + set_tls_desc(p, idx, &info, 1); + + return 0; +} + +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + return do_set_thread_area(current, -1, u_info, 1); +} + + +/* + * Get the current Thread-Local Storage area: + */ + +static void fill_user_desc(struct user_desc *info, int idx, + const struct desc_struct *desc) + +{ + memset(info, 0, sizeof(*info)); + info->entry_number = idx; + info->base_addr = get_desc_base(desc); + info->limit = get_desc_limit(desc); + info->seg_32bit = desc->d; + info->contents = desc->type >> 2; + info->read_exec_only = !(desc->type & 2); + info->limit_in_pages = desc->g; + info->seg_not_present = !desc->p; + info->useable = desc->avl; +#ifdef CONFIG_X86_64 + info->lm = desc->l; +#endif +} + +int do_get_thread_area(struct task_struct *p, int idx, + struct user_desc __user *u_info) +{ + struct user_desc info; + + if (idx == -1 && get_user(idx, &u_info->entry_number)) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + fill_user_desc(&info, idx, + &p->thread.tls_array[idx - GDT_ENTRY_TLS_MIN]); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + return do_get_thread_area(current, -1, u_info); +} + +int regset_tls_active(struct task_struct *target, + const struct user_regset *regset) +{ + struct thread_struct *t = &target->thread; + int n = GDT_ENTRY_TLS_ENTRIES; + while (n > 0 && desc_empty(&t->tls_array[n - 1])) + --n; + return n; +} + +int regset_tls_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + const struct desc_struct *tls; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + pos /= sizeof(struct user_desc); + count /= sizeof(struct user_desc); + + tls = &target->thread.tls_array[pos]; + + if (kbuf) { + struct user_desc *info = kbuf; + while (count-- > 0) + fill_user_desc(info++, GDT_ENTRY_TLS_MIN + pos++, + tls++); + } else { + struct user_desc __user *u_info = ubuf; + while (count-- > 0) { + struct user_desc info; + fill_user_desc(&info, GDT_ENTRY_TLS_MIN + pos++, tls++); + if (__copy_to_user(u_info++, &info, sizeof(info))) + return -EFAULT; + } + } + + return 0; +} + +int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct user_desc *info; + + if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || + (count % sizeof(struct user_desc)) != 0) + return -EINVAL; + + if (kbuf) + info = kbuf; + else if (__copy_from_user(infobuf, ubuf, count)) + return -EFAULT; + else + info = infobuf; + + set_tls_desc(target, + GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), + info, count / sizeof(struct user_desc)); + + return 0; +} diff --git a/arch/x86/kernel/tls.h b/arch/x86/kernel/tls.h new file mode 100644 index 00000000000..2f083a2fe21 --- /dev/null +++ b/arch/x86/kernel/tls.h @@ -0,0 +1,21 @@ +/* + * Internal declarations for x86 TLS implementation functions. + * + * Copyright (C) 2007 Red Hat, Inc. All rights reserved. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v.2. + * + * Red Hat Author: Roland McGrath. + */ + +#ifndef _ARCH_X86_KERNEL_TLS_H + +#include <linux/regset.h> + +extern user_regset_active_fn regset_tls_active; +extern user_regset_get_fn regset_tls_get; +extern user_regset_set_fn regset_tls_set; + +#endif /* _ARCH_X86_KERNEL_TLS_H */ diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 7e16d675eb8..78cbb655aa7 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -31,9 +31,10 @@ #include <linux/mmzone.h> #include <asm/cpu.h> -static struct i386_cpu cpu_devices[NR_CPUS]; +static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); -int __cpuinit arch_register_cpu(int num) +#ifdef CONFIG_HOTPLUG_CPU +int arch_register_cpu(int num) { /* * CPU0 cannot be offlined due to several @@ -44,21 +45,23 @@ int __cpuinit arch_register_cpu(int num) * Also certain PCI quirks require not to enable hotplug control * for all CPU's. */ -#ifdef CONFIG_HOTPLUG_CPU if (num) - cpu_devices[num].cpu.hotpluggable = 1; -#endif - - return register_cpu(&cpu_devices[num].cpu, num); + per_cpu(cpu_devices, num).cpu.hotpluggable = 1; + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); } +EXPORT_SYMBOL(arch_register_cpu); -#ifdef CONFIG_HOTPLUG_CPU void arch_unregister_cpu(int num) { - return unregister_cpu(&cpu_devices[num].cpu); + return unregister_cpu(&per_cpu(cpu_devices, num).cpu); } -EXPORT_SYMBOL(arch_register_cpu); EXPORT_SYMBOL(arch_unregister_cpu); +#else +int arch_register_cpu(int num) +{ + return register_cpu(&per_cpu(cpu_devices, num).cpu, num); +} +EXPORT_SYMBOL(arch_register_cpu); #endif /*CONFIG_HOTPLUG_CPU*/ static int __init topology_init(void) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 02d1e1e58e8..3cf72977d01 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -76,7 +76,8 @@ char ignore_fpu_irq = 0; * F0 0F bug workaround.. We have a special link segment * for this. */ -struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, }; +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; asmlinkage void divide_error(void); asmlinkage void debug(void); @@ -101,6 +102,34 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; +void printk_address(unsigned long address, int reliable) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0, symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[128]; + char reliab[4] = ""; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%08lx>]\n", address); + return; + } + if (!reliable) + strcpy(reliab, "? "); + + if (!modname) + modname = delim = ""; + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); +#else + printk(" [<%08lx>]\n", address); +#endif +} + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned size) { return p > (void *)tinfo && @@ -114,48 +143,35 @@ struct stack_frame { }; static inline unsigned long print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long ebp, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { -#ifdef CONFIG_FRAME_POINTER - struct stack_frame *frame = (struct stack_frame *)ebp; - while (valid_stack_ptr(tinfo, frame, sizeof(*frame))) { - struct stack_frame *next; - unsigned long addr; + struct stack_frame *frame = (struct stack_frame *)bp; - addr = frame->return_address; - ops->address(data, addr); - /* - * break out of recursive entries (such as - * end_of_stack_stop_unwind_function). Also, - * we can never allow a frame pointer to - * move downwards! - */ - next = frame->next_frame; - if (next <= frame) - break; - frame = next; - } -#else while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { unsigned long addr; - addr = *stack++; - if (__kernel_text_address(addr)) - ops->address(data, addr); + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 4) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; } -#endif - return ebp; + return bp; } #define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - unsigned long ebp = 0; - if (!task) task = current; @@ -163,17 +179,17 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long dummy; stack = &dummy; if (task != current) - stack = (unsigned long *)task->thread.esp; + stack = (unsigned long *)task->thread.sp; } #ifdef CONFIG_FRAME_POINTER - if (!ebp) { + if (!bp) { if (task == current) { - /* Grab ebp right from our regs */ - asm ("movl %%ebp, %0" : "=r" (ebp) : ); + /* Grab bp right from our regs */ + asm ("movl %%ebp, %0" : "=r" (bp) : ); } else { - /* ebp is the last reg pushed by switch_to */ - ebp = *(unsigned long *) task->thread.esp; + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; } } #endif @@ -182,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - ebp = print_context_stack(context, stack, ebp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data); /* Should be after the line below, but somewhere in early boot context comes out corrupted and we can't reference it -AK */ @@ -217,9 +233,11 @@ static int print_trace_stack(void *data, char *name) /* * Print one address/symbol entries per line. */ -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { printk("%s [<%08lx>] ", (char *)data, addr); + if (!reliable) + printk("? "); print_symbol("%s\n", addr); touch_nmi_watchdog(); } @@ -233,32 +251,32 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { - dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long * stack) + unsigned long *stack, unsigned long bp) { - show_trace_log_lvl(task, regs, stack, ""); + show_trace_log_lvl(task, regs, stack, bp, ""); } static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *esp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; - if (esp == NULL) { + if (sp == NULL) { if (task) - esp = (unsigned long*)task->thread.esp; + sp = (unsigned long*)task->thread.sp; else - esp = (unsigned long *)&esp; + sp = (unsigned long *)&sp; } - stack = esp; + stack = sp; for(i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; @@ -267,13 +285,13 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%08lx ", *stack++); } printk("\n%sCall Trace:\n", log_lvl); - show_trace_log_lvl(task, regs, esp, log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } -void show_stack(struct task_struct *task, unsigned long *esp) +void show_stack(struct task_struct *task, unsigned long *sp) { printk(" "); - show_stack_log_lvl(task, NULL, esp, ""); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -282,13 +300,19 @@ void show_stack(struct task_struct *task, unsigned long *esp) void dump_stack(void) { unsigned long stack; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movl %%ebp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(current, NULL, &stack); + show_trace(current, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -307,30 +331,30 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (!user_mode_vm(regs)) { - u8 *eip; + u8 *ip; unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->esp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); printk(KERN_EMERG "Code: "); - eip = (u8 *)regs->eip - code_prologue; - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { /* try starting at EIP */ - eip = (u8 *)regs->eip; + ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } - for (i = 0; i < code_len; i++, eip++) { - if (eip < (u8 *)PAGE_OFFSET || - probe_kernel_address(eip, c)) { + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (ip == (u8 *)regs->ip) printk("<%02x> ", c); else printk("%02x ", c); @@ -339,18 +363,57 @@ void show_registers(struct pt_regs *regs) printk("\n"); } -int is_valid_bugaddr(unsigned long eip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (eip < PAGE_OFFSET) + if (ip < PAGE_OFFSET) return 0; - if (probe_kernel_address((unsigned short *)eip, ud2)) + if (probe_kernel_address((unsigned short *)ip, ud2)) return 0; return ud2 == 0x0b0f; } +static int die_counter; + +int __kprobes __die(const char * str, struct pt_regs * regs, long err) +{ + unsigned long sp; + unsigned short ss; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) != + NOTIFY_STOP) { + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; + } else { + return 1; + } +} + /* * This is gone through when something in the kernel has done something bad and * is about to be terminated. @@ -366,7 +429,6 @@ void die(const char * str, struct pt_regs * regs, long err) .lock_owner = -1, .lock_owner_depth = 0 }; - static int die_counter; unsigned long flags; oops_enter(); @@ -382,43 +444,13 @@ void die(const char * str, struct pt_regs * regs, long err) raw_local_irq_save(flags); if (++die.lock_owner_depth < 3) { - unsigned long esp; - unsigned short ss; + report_bug(regs->ip, regs); - report_bug(regs->eip, regs); - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, - ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) != - NOTIFY_STOP) { - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - esp = (unsigned long) (®s->esp); - savesegment(ss, ss); - if (user_mode(regs)) { - esp = regs->esp; - ss = regs->xss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->eip); - print_symbol("%s", regs->eip); - printk(" SS:ESP %04x:%08lx\n", ss, esp); - } - else + if (__die(str, regs, err)) regs = NULL; - } else + } else { printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } bust_spinlocks(0); die.lock_owner = -1; @@ -454,7 +486,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, { struct task_struct *tsk = current; - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; @@ -500,7 +532,7 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } #define DO_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -509,7 +541,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ if (irq) \ @@ -525,7 +557,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_VM86_ERROR(trapnr, signr, str, name) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -534,7 +566,7 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -fastcall void do_##name(struct pt_regs * regs, long error_code) \ +void do_##name(struct pt_regs * regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -548,13 +580,13 @@ fastcall void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->eip) +DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_ERROR( 3, SIGTRAP, "int3", int3) #endif DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->eip, 0) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) @@ -562,7 +594,7 @@ DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) -fastcall void __kprobes do_general_protection(struct pt_regs * regs, +void __kprobes do_general_protection(struct pt_regs * regs, long error_code) { int cpu = get_cpu(); @@ -596,7 +628,7 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, } put_cpu(); - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto gp_in_vm86; if (!user_mode(regs)) @@ -605,11 +637,14 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, current->thread.error_code = error_code; current->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection eip:%lx esp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", current->comm, task_pid_nr(current), - regs->eip, regs->esp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, current); return; @@ -705,8 +740,8 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg) */ bust_spinlocks(1); printk(KERN_EMERG "%s", msg); - printk(" on CPU%d, eip %08lx, registers:\n", - smp_processor_id(), regs->eip); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); show_registers(regs); console_silent(); spin_unlock(&nmi_print_lock); @@ -763,7 +798,7 @@ static __kprobes void default_do_nmi(struct pt_regs * regs) static int ignore_nmis; -fastcall __kprobes void do_nmi(struct pt_regs * regs, long error_code) +__kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; @@ -792,7 +827,7 @@ void restart_nmi(void) } #ifdef CONFIG_KPROBES -fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) +void __kprobes do_int3(struct pt_regs *regs, long error_code) { trace_hardirqs_fixup(); @@ -828,7 +863,7 @@ fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code) * find every occurrence of the TF bit that could be saved away even * by user code) */ -fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) +void __kprobes do_debug(struct pt_regs * regs, long error_code) { unsigned int condition; struct task_struct *tsk = current; @@ -837,24 +872,30 @@ fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code) get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; /* It's safe to allow irq's after DR6 has been saved */ - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg[7]) + if (!tsk->thread.debugreg7) goto clear_dr7; } - if (regs->eflags & VM_MASK) + if (regs->flags & VM_MASK) goto debug_vm86; /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg[6] = condition; + tsk->thread.debugreg6 = condition; /* * Single-stepping through TF: make sure we ignore any events in @@ -886,7 +927,7 @@ debug_vm86: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~TF_MASK; return; } @@ -895,7 +936,7 @@ clear_TF_reenable: * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void __user *eip) +void math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -911,7 +952,7 @@ void math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -954,13 +995,13 @@ void math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code) +void do_coprocessor_error(struct pt_regs * regs, long error_code) { ignore_fpu_irq = 1; - math_error((void __user *)regs->eip); + math_error((void __user *)regs->ip); } -static void simd_math_error(void __user *eip) +static void simd_math_error(void __user *ip) { struct task_struct * task; siginfo_t info; @@ -976,7 +1017,7 @@ static void simd_math_error(void __user *eip) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = eip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1008,19 +1049,19 @@ static void simd_math_error(void __user *eip) force_sig_info(SIGFPE, &info, task); } -fastcall void do_simd_coprocessor_error(struct pt_regs * regs, +void do_simd_coprocessor_error(struct pt_regs * regs, long error_code) { if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->eip); + simd_math_error((void __user *)regs->ip); } else { /* * Handle strange cache flush from user space exception * in all other cases. This is undocumented behaviour. */ - if (regs->eflags & VM_MASK) { + if (regs->flags & VM_MASK) { handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); return; @@ -1032,7 +1073,7 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, } } -fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, +void do_spurious_interrupt_bug(struct pt_regs * regs, long error_code) { #if 0 @@ -1041,7 +1082,7 @@ fastcall void do_spurious_interrupt_bug(struct pt_regs * regs, #endif } -fastcall unsigned long patch_espfix_desc(unsigned long uesp, +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; @@ -1095,51 +1136,17 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -/* - * This needs to use 'idt_table' rather than 'idt', and - * thus use the _nonmapped_ version of the IDT, as the - * Pentium F0 0F bugfix can have resulted in the mapped - * IDT being write-protected. - */ -void set_intr_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS); -} - -/* - * This routine sets up an interrupt gate at directory privilege level 3. - */ -static inline void set_system_intr_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS); -} - -static void __init set_trap_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS); -} - -static void __init set_system_gate(unsigned int n, void *addr) -{ - _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS); -} - -static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) -{ - _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3)); -} - void __init trap_init(void) { int i; #ifdef CONFIG_EISA - void __iomem *p = ioremap(0x0FFFD9, 4); + void __iomem *p = early_ioremap(0x0FFFD9, 4); if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) { EISA_bus = 1; } - iounmap(p); + early_iounmap(p, 4); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index cc68b92316c..efc66df728b 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -74,22 +74,24 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); +static unsigned int code_bytes = 64; + static inline void conditional_sti(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_sti(struct pt_regs *regs) { preempt_disable(); - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); } static inline void preempt_conditional_cli(struct pt_regs *regs) { - if (regs->eflags & X86_EFLAGS_IF) + if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); /* Make sure to not schedule here because we could be running on an exception stack. */ @@ -98,14 +100,15 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) int kstack_depth_to_print = 12; -#ifdef CONFIG_KALLSYMS -void printk_address(unsigned long address) +void printk_address(unsigned long address, int reliable) { +#ifdef CONFIG_KALLSYMS unsigned long offset = 0, symsize; const char *symname; char *modname; char *delim = ":"; - char namebuf[128]; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); @@ -113,17 +116,17 @@ void printk_address(unsigned long address) printk(" [<%016lx>]\n", address); return; } + if (!reliable) + strcpy(reliab, "? "); + if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s+0x%lx/0x%lx\n", - address, delim, modname, delim, symname, offset, symsize); -} + modname = delim = ""; + printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); #else -void printk_address(unsigned long address) -{ printk(" [<%016lx>]\n", address); -} #endif +} static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, char **idp) @@ -208,14 +211,53 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + + +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { - void *t = (void *)tinfo; - return p > t && p < t + THREAD_SIZE - 3; + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 8) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; } void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -225,36 +267,28 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (!tsk) tsk = current; + tinfo = task_thread_info(tsk); if (!stack) { unsigned long dummy; stack = &dummy; if (tsk && tsk != current) - stack = (unsigned long *)tsk->thread.rsp; + stack = (unsigned long *)tsk->thread.sp; } - /* - * Print function call entries within a stack. 'cond' is the - * "end of stackframe" condition, that the 'stack++' - * iteration will eventually trigger. - */ -#define HANDLE_STACK(cond) \ - do while (cond) { \ - unsigned long addr = *stack++; \ - /* Use unlocked access here because except for NMIs \ - we should be already protected against module unloads */ \ - if (__kernel_text_address(addr)) { \ - /* \ - * If the address is either in the text segment of the \ - * kernel, or in the region which contains vmalloc'ed \ - * memory, it *may* be the address of a calling \ - * routine; if so, print it so that someone tracing \ - * down the cause of the crash will be able to figure \ - * out the call path that was taken. \ - */ \ - ops->address(data, addr); \ - } \ - } while (0) +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (tsk == current) { + /* Grab bp right from our regs */ + asm("movq %%rbp, %0" : "=r" (bp):); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) tsk->thread.sp; + } + } +#endif + + /* * Print function call entries in all stacks, starting at the @@ -270,7 +304,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (estack_end) { if (ops->stack(data, id) < 0) break; - HANDLE_STACK (stack < estack_end); + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); ops->stack(data, "<EOE>"); /* * We link to the next stack via the @@ -288,7 +324,8 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, if (stack >= irqstack && stack < irqstack_end) { if (ops->stack(data, "IRQ") < 0) break; - HANDLE_STACK (stack < irqstack_end); + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); /* * We link to the next stack (which would be * the process stack normally) the last @@ -306,9 +343,7 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, /* * This handles the process stack: */ - tinfo = task_thread_info(tsk); - HANDLE_STACK (valid_stack_ptr(tinfo, stack)); -#undef HANDLE_STACK + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -331,10 +366,10 @@ static int print_trace_stack(void *data, char *name) return 0; } -static void print_trace_address(void *data, unsigned long addr) +static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); - printk_address(addr); + printk_address(addr, reliable); } static const struct stacktrace_ops print_trace_ops = { @@ -345,15 +380,17 @@ static const struct stacktrace_ops print_trace_ops = { }; void -show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) +show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack, + unsigned long bp) { printk("\nCall Trace:\n"); - dump_trace(tsk, regs, stack, &print_trace_ops, NULL); + dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL); printk("\n"); } static void -_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) +_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp, + unsigned long bp) { unsigned long *stack; int i; @@ -364,14 +401,14 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) // debugging aid: "show_stack(NULL, NULL);" prints the // back trace for this cpu. - if (rsp == NULL) { + if (sp == NULL) { if (tsk) - rsp = (unsigned long *)tsk->thread.rsp; + sp = (unsigned long *)tsk->thread.sp; else - rsp = (unsigned long *)&rsp; + sp = (unsigned long *)&sp; } - stack = rsp; + stack = sp; for(i=0; i < kstack_depth_to_print; i++) { if (stack >= irqstack && stack <= irqstack_end) { if (stack == irqstack_end) { @@ -387,12 +424,12 @@ _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *rsp) printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(tsk, regs, rsp); + show_trace(tsk, regs, sp, bp); } -void show_stack(struct task_struct *tsk, unsigned long * rsp) +void show_stack(struct task_struct *tsk, unsigned long * sp) { - _show_stack(tsk, NULL, rsp); + _show_stack(tsk, NULL, sp, 0); } /* @@ -401,13 +438,19 @@ void show_stack(struct task_struct *tsk, unsigned long * rsp) void dump_stack(void) { unsigned long dummy; + unsigned long bp = 0; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movq %%rbp, %0" : "=r" (bp):); +#endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &dummy); + show_trace(NULL, NULL, &dummy, bp); } EXPORT_SYMBOL(dump_stack); @@ -415,12 +458,15 @@ EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) { int i; - int in_kernel = !user_mode(regs); - unsigned long rsp; + unsigned long sp; const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; + u8 *ip; + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; - rsp = regs->rsp; + sp = regs->sp; + ip = (u8 *) regs->ip - code_prologue; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -430,45 +476,43 @@ void show_registers(struct pt_regs *regs) * When in-kernel, we also print out the stack and code at the * time of the fault.. */ - if (in_kernel) { + if (!user_mode(regs)) { + unsigned char c; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long*)rsp); - - printk("\nCode: "); - if (regs->rip < PAGE_OFFSET) - goto bad; - - for (i=0; i<20; i++) { - unsigned char c; - if (__get_user(c, &((unsigned char*)regs->rip)[i])) { -bad: + _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + printk("\n"); + + printk(KERN_EMERG "Code: "); + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at RIP */ + ip = (u8 *) regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { printk(" Bad RIP value."); break; } - printk("%02x ", c); + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); } } printk("\n"); } -int is_valid_bugaddr(unsigned long rip) +int is_valid_bugaddr(unsigned long ip) { unsigned short ud2; - if (__copy_from_user(&ud2, (const void __user *) rip, sizeof(ud2))) + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) return 0; return ud2 == 0x0b0f; } -#ifdef CONFIG_BUG -void out_of_line_bug(void) -{ - BUG(); -} -EXPORT_SYMBOL(out_of_line_bug); -#endif - static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -496,7 +540,7 @@ unsigned __kprobes long oops_begin(void) return flags; } -void __kprobes oops_end(unsigned long flags) +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { die_owner = -1; bust_spinlocks(0); @@ -505,12 +549,17 @@ void __kprobes oops_end(unsigned long flags) /* Nest count reaches zero, release the lock. */ __raw_spin_unlock(&die_lock); raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } if (panic_on_oops) panic("Fatal exception"); oops_exit(); + do_exit(signr); } -void __kprobes __die(const char * str, struct pt_regs * regs, long err) +int __kprobes __die(const char * str, struct pt_regs * regs, long err) { static int die_counter; printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff,++die_counter); @@ -524,15 +573,17 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); - notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV); + if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; show_registers(regs); add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); - printk_address(regs->rip); - printk(" RSP <%016lx>\n", regs->rsp); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); if (kexec_should_crash(current)) crash_kexec(regs); + return 0; } void die(const char * str, struct pt_regs * regs, long err) @@ -540,11 +591,11 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip, regs); + report_bug(regs->ip, regs); - __die(str, regs, err); - oops_end(flags); - do_exit(SIGSEGV); + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); } void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) @@ -561,10 +612,10 @@ void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); - oops_end(flags); + oops_end(flags, NULL, SIGBUS); nmi_exit(); local_irq_enable(); - do_exit(SIGSEGV); + do_exit(SIGBUS); } static void __kprobes do_trap(int trapnr, int signr, char *str, @@ -588,11 +639,14 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.trap_no = trapnr; if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, str, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } if (info) force_sig_info(signr, info, tsk); @@ -602,19 +656,12 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, } - /* kernel trap */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) - regs->rip = fixup->fixup; - else { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); } + return; } #define DO_ERROR(trapnr, signr, str, name) \ @@ -643,10 +690,10 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->rip) +DO_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_ERROR( 4, SIGSEGV, "overflow", overflow) DO_ERROR( 5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->rip) +DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR( 7, SIGSEGV, "device not available", device_not_available) DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -694,32 +741,28 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.trap_no = 13; if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) + printk_ratelimit()) { printk(KERN_INFO - "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", + "%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, tsk->pid, - regs->rip, regs->rsp, error_code); + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } force_sig(SIGSEGV, tsk); return; } - /* kernel gp */ - { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } + if (fixup_exception(regs)) + return; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); - } + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); } static __kprobes void @@ -832,15 +875,15 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->rsp) + if (eregs == (struct pt_regs *)eregs->sp) ; /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); /* Exception from kernel and interrupts are enabled. Move to kernel process stack. */ - else if (eregs->eflags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->rsp -= sizeof(struct pt_regs)); + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) *regs = *eregs; return regs; @@ -858,6 +901,12 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, get_debugreg(condition, 6); + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; @@ -873,27 +922,14 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, tsk->thread.debugreg6 = condition; - /* Mask out spurious TF errors due to lazy TF clearing */ + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ if (condition & DR_STEP) { - /* - * The TF error should be masked out only if the current - * process is not traced and if the TRAP flag has been set - * previously by a tracing process (condition detected by - * the PT_DTRACE flag); remember that the i386 TRAP flag - * can be modified by the process itself in user mode, - * allowing programs to debug themselves without the ptrace() - * interface. - */ if (!user_mode(regs)) goto clear_TF_reenable; - /* - * Was the TF flag set by a debugger? If so, clear it now, - * so that register information is correct. - */ - if (tsk->ptrace & PT_DTRACE) { - regs->eflags &= ~TF_MASK; - tsk->ptrace &= ~PT_DTRACE; - } } /* Ok, finally something we can handle */ @@ -902,7 +938,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs * regs, info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = TRAP_BRKPT; - info.si_addr = user_mode(regs) ? (void __user *)regs->rip : NULL; + info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); clear_dr7: @@ -912,18 +948,15 @@ clear_dr7: clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->eflags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; preempt_conditional_cli(regs); } static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { - const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; + if (fixup_exception(regs)) return 1; - } + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); /* Illegal floating point operation in the kernel */ current->thread.trap_no = trapnr; @@ -938,7 +971,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) */ asmlinkage void do_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short cwd, swd; @@ -958,7 +991,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the @@ -1007,7 +1040,7 @@ asmlinkage void bad_intr(void) asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) { - void __user *rip = (void __user *)(regs->rip); + void __user *ip = (void __user *)(regs->ip); struct task_struct * task; siginfo_t info; unsigned short mxcsr; @@ -1027,7 +1060,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) info.si_signo = SIGFPE; info.si_errno = 0; info.si_code = __SI_FAULT; - info.si_addr = rip; + info.si_addr = ip; /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which @@ -1089,6 +1122,7 @@ asmlinkage void math_state_restore(void) task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } +EXPORT_SYMBOL_GPL(math_state_restore); void __init trap_init(void) { @@ -1144,3 +1178,14 @@ static int __init kstack_setup(char *s) return 0; } early_param("kstack", kstack_setup); + + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c index 9ebc0dab66b..43517e324be 100644 --- a/arch/x86/kernel/tsc_32.c +++ b/arch/x86/kernel/tsc_32.c @@ -5,6 +5,7 @@ #include <linux/jiffies.h> #include <linux/init.h> #include <linux/dmi.h> +#include <linux/percpu.h> #include <asm/delay.h> #include <asm/tsc.h> @@ -23,8 +24,6 @@ static int tsc_enabled; unsigned int tsc_khz; EXPORT_SYMBOL_GPL(tsc_khz); -int tsc_disable; - #ifdef CONFIG_X86_TSC static int __init tsc_setup(char *str) { @@ -39,8 +38,7 @@ static int __init tsc_setup(char *str) */ static int __init tsc_setup(char *str) { - tsc_disable = 1; - + setup_clear_cpu_cap(X86_FEATURE_TSC); return 1; } #endif @@ -80,13 +78,31 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -unsigned long cyc2ns_scale __read_mostly; -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long cpu_khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + /* + * Start smoothly with the new frequency: + */ + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } /* @@ -239,7 +255,9 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) { tsc_khz = cpu_khz; - set_cyc2ns_scale(cpu_khz); + preempt_disable(); + set_cyc2ns_scale(cpu_khz, smp_processor_id()); + preempt_enable(); /* * TSC based sched_clock turns * to junk w/ cpufreq @@ -333,6 +351,11 @@ __cpuinit int unsynchronized_tsc(void) { if (!cpu_has_tsc || tsc_unstable) return 1; + + /* Anything with constant TSC should be synchronized */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -367,7 +390,9 @@ static inline void check_geode_tsc_reliable(void) { } void __init tsc_init(void) { - if (!cpu_has_tsc || tsc_disable) + int cpu; + + if (!cpu_has_tsc) goto out_no_tsc; cpu_khz = calculate_cpu_khz(); @@ -380,7 +405,15 @@ void __init tsc_init(void) (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); - set_cyc2ns_scale(cpu_khz); + /* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ + for_each_possible_cpu(cpu) + set_cyc2ns_scale(cpu_khz, cpu); + use_tsc_delay(); /* Check and install the TSC clocksource */ @@ -403,10 +436,5 @@ void __init tsc_init(void) return; out_no_tsc: - /* - * Set the tsc_disable flag if there's no TSC support, this - * makes it a fast flag for the kernel to see whether it - * should be using the TSC. - */ - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c index 9c70af45b42..947554ddabb 100644 --- a/arch/x86/kernel/tsc_64.c +++ b/arch/x86/kernel/tsc_64.c @@ -10,6 +10,7 @@ #include <asm/hpet.h> #include <asm/timex.h> +#include <asm/timer.h> static int notsc __initdata = 0; @@ -18,19 +19,51 @@ EXPORT_SYMBOL(cpu_khz); unsigned int tsc_khz; EXPORT_SYMBOL(tsc_khz); -static unsigned int cyc2ns_scale __read_mostly; +/* Accelerators for sched_clock() + * convert from cycles(64bits) => nanoseconds (64bits) + * basic equation: + * ns = cycles / (freq / ns_per_sec) + * ns = cycles * (ns_per_sec / freq) + * ns = cycles * (10^9 / (cpu_khz * 10^3)) + * ns = cycles * (10^6 / cpu_khz) + * + * Then we use scaling math (suggested by george@mvista.com) to get: + * ns = cycles * (10^6 * SC / cpu_khz) / SC + * ns = cycles * cyc2ns_scale / SC + * + * And since SC is a constant power of two, we can convert the div + * into a shift. + * + * We can use khz divisor instead of mhz to keep a better precision, since + * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. + * (mathieu.desnoyers@polymtl.ca) + * + * -johnstul@us.ibm.com "math is hard, lets go shopping!" + */ +DEFINE_PER_CPU(unsigned long, cyc2ns); -static inline void set_cyc2ns_scale(unsigned long khz) +static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) { - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; -} + unsigned long flags, prev_scale, *scale; + unsigned long long tsc_now, ns_now; -static unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + scale = &per_cpu(cyc2ns, cpu); + + rdtscll(tsc_now); + ns_now = __cycles_2_ns(tsc_now); + + prev_scale = *scale; + if (cpu_khz) + *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; + + sched_clock_idle_wakeup_event(0); + local_irq_restore(flags); } -unsigned long long sched_clock(void) +unsigned long long native_sched_clock(void) { unsigned long a = 0; @@ -44,12 +77,27 @@ unsigned long long sched_clock(void) return cycles_2_ns(a); } +/* We need to define a real function for sched_clock, to override the + weak default version */ +#ifdef CONFIG_PARAVIRT +unsigned long long sched_clock(void) +{ + return paravirt_sched_clock(); +} +#else +unsigned long long +sched_clock(void) __attribute__((alias("native_sched_clock"))); +#endif + + static int tsc_unstable; -inline int check_tsc_unstable(void) +int check_tsc_unstable(void) { return tsc_unstable; } +EXPORT_SYMBOL_GPL(check_tsc_unstable); + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -100,7 +148,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref); + preempt_disable(); + set_cyc2ns_scale(tsc_khz_ref, smp_processor_id()); + preempt_enable(); return 0; } @@ -133,12 +183,12 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, int i; for (i = 0; i < MAX_RETRIES; i++) { - t1 = get_cycles_sync(); + t1 = get_cycles(); if (hpet) *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else *pm = acpi_pm_read_early(); - t2 = get_cycles_sync(); + t2 = get_cycles(); if ((t2 - t1) < SMI_TRESHOLD) return t2; } @@ -151,7 +201,7 @@ static unsigned long __init tsc_read_refs(unsigned long *pm, void __init tsc_calibrate(void) { unsigned long flags, tsc1, tsc2, tr1, tr2, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); + int hpet = is_hpet_enabled(), cpu; local_irq_save(flags); @@ -162,9 +212,9 @@ void __init tsc_calibrate(void) outb(0xb0, 0x43); outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles_sync(); + tr1 = get_cycles(); while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles_sync(); + tr2 = get_cycles(); tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); @@ -206,7 +256,9 @@ void __init tsc_calibrate(void) } tsc_khz = tsc2 / tsc1; - set_cyc2ns_scale(tsc_khz); + + for_each_possible_cpu(cpu) + set_cyc2ns_scale(tsc_khz, cpu); } /* @@ -222,17 +274,9 @@ __cpuinit int unsynchronized_tsc(void) if (apic_is_clustered_box()) return 1; #endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && - acpi_gbl_FADT.C3latency < 1000) - return 1; -#endif + + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; - } /* Assume multi socket systems are not synchronized */ return num_present_cpus() > 1; @@ -250,13 +294,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ static cycle_t read_tsc(void) { - cycle_t ret = (cycle_t)get_cycles_sync(); + cycle_t ret = (cycle_t)get_cycles(); return ret; } static cycle_t __vsyscall_fn vread_tsc(void) { - cycle_t ret = (cycle_t)get_cycles_sync(); + cycle_t ret = (cycle_t)vget_cycles(); return ret; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9125efe66a0..0577825cf89 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -46,7 +46,7 @@ static __cpuinit void check_tsc_warp(void) cycles_t start, now, prev, end; int i; - start = get_cycles_sync(); + start = get_cycles(); /* * The measurement runs for 20 msecs: */ @@ -61,18 +61,18 @@ static __cpuinit void check_tsc_warp(void) */ __raw_spin_lock(&sync_lock); prev = last_tsc; - now = get_cycles_sync(); + now = get_cycles(); last_tsc = now; __raw_spin_unlock(&sync_lock); /* * Be nice every now and then (and also check whether - * measurement is done [we also insert a 100 million + * measurement is done [we also insert a 10 million * loops safety exit, so we dont lock up in case the * TSC readout is totally broken]): */ if (unlikely(!(i & 7))) { - if (now > end || i > 100000000) + if (now > end || i > 10000000) break; cpu_relax(); touch_nmi_watchdog(); @@ -87,7 +87,11 @@ static __cpuinit void check_tsc_warp(void) nr_warps++; __raw_spin_unlock(&sync_lock); } - + } + if (!(now-start)) { + printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + now-start, end-start); + WARN_ON(1); } } @@ -129,24 +133,24 @@ void __cpuinit check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - /* - * Reset it - just in case we boot another CPU later: - */ - atomic_set(&start_count, 0); - if (nr_warps) { printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); - nr_warps = 0; - max_warp = 0; - last_tsc = 0; } else { printk(" passed.\n"); } /* + * Reset it - just in case we boot another CPU later: + */ + atomic_set(&start_count, 0); + nr_warps = 0; + max_warp = 0; + last_tsc = 0; + + /* * Let the target continue with the bootup: */ atomic_inc(&stop_count); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 157e4bedd3c..738c2104df3 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -70,10 +70,10 @@ /* * 8- and 16-bit register defines.. */ -#define AL(regs) (((unsigned char *)&((regs)->pt.eax))[0]) -#define AH(regs) (((unsigned char *)&((regs)->pt.eax))[1]) -#define IP(regs) (*(unsigned short *)&((regs)->pt.eip)) -#define SP(regs) (*(unsigned short *)&((regs)->pt.esp)) +#define AL(regs) (((unsigned char *)&((regs)->pt.ax))[0]) +#define AH(regs) (((unsigned char *)&((regs)->pt.ax))[1]) +#define IP(regs) (*(unsigned short *)&((regs)->pt.ip)) +#define SP(regs) (*(unsigned short *)&((regs)->pt.sp)) /* * virtual flags (16 and 32-bit versions) @@ -93,12 +93,12 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user, { int ret = 0; - /* kernel_vm86_regs is missing xgs, so copy everything up to + /* kernel_vm86_regs is missing gs, so copy everything up to (but not including) orig_eax, and then rest including orig_eax. */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - ret += copy_to_user(&user->orig_eax, ®s->pt.orig_eax, + ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax)); + offsetof(struct kernel_vm86_regs, pt.orig_ax)); return ret; } @@ -110,18 +110,17 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, { int ret = 0; - /* copy eax-xfs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_eax)); - /* copy orig_eax-__gsh+extra */ - ret += copy_from_user(®s->pt.orig_eax, &user->orig_eax, + /* copy ax-fs inclusive */ + ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); + /* copy orig_ax-__gsh+extra */ + ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_eax) + + offsetof(struct kernel_vm86_regs, pt.orig_ax) + extra); return ret; } -struct pt_regs * FASTCALL(save_v86_state(struct kernel_vm86_regs * regs)); -struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) +struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs) { struct tss_struct *tss; struct pt_regs *ret; @@ -138,7 +137,7 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.eflags, VEFLAGS, VIF_MASK | current->thread.v86mask); + set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs,regs); tmp += put_user(current->thread.screen_bitmap,¤t->thread.vm86_info->screen_bitmap); if (tmp) { @@ -147,15 +146,15 @@ struct pt_regs * fastcall save_v86_state(struct kernel_vm86_regs * regs) } tss = &per_cpu(init_tss, get_cpu()); - current->thread.esp0 = current->thread.saved_esp0; + current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; - load_esp0(tss, ¤t->thread); - current->thread.saved_esp0 = 0; + load_sp0(tss, ¤t->thread); + current->thread.saved_sp0 = 0; put_cpu(); ret = KVM86->regs32; - ret->xfs = current->thread.saved_fs; + ret->fs = current->thread.saved_fs; loadsegment(gs, current->thread.saved_gs); return ret; @@ -197,7 +196,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk asmlinkage int sys_vm86old(struct pt_regs regs) { - struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.ebx; + struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs.bx; struct kernel_vm86_struct info; /* declare this _on top_, * this avoids wasting of stack space. * This remains on the stack until we @@ -207,7 +206,7 @@ asmlinkage int sys_vm86old(struct pt_regs regs) int tmp, ret = -EPERM; tsk = current; - if (tsk->thread.saved_esp0) + if (tsk->thread.saved_sp0) goto out; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, vm86plus) - @@ -237,12 +236,12 @@ asmlinkage int sys_vm86(struct pt_regs regs) struct vm86plus_struct __user *v86; tsk = current; - switch (regs.ebx) { + switch (regs.bx) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: case VM86_GET_IRQ_BITS: case VM86_GET_AND_RESET_IRQ: - ret = do_vm86_irq_handling(regs.ebx, (int)regs.ecx); + ret = do_vm86_irq_handling(regs.bx, (int)regs.cx); goto out; case VM86_PLUS_INSTALL_CHECK: /* NOTE: on old vm86 stuff this will return the error @@ -256,9 +255,9 @@ asmlinkage int sys_vm86(struct pt_regs regs) /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ ret = -EPERM; - if (tsk->thread.saved_esp0) + if (tsk->thread.saved_sp0) goto out; - v86 = (struct vm86plus_struct __user *)regs.ecx; + v86 = (struct vm86plus_struct __user *)regs.cx; tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, offsetof(struct kernel_vm86_struct, regs32) - sizeof(info.regs)); @@ -281,23 +280,23 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk /* * make sure the vm86() system call doesn't try to do anything silly */ - info->regs.pt.xds = 0; - info->regs.pt.xes = 0; - info->regs.pt.xfs = 0; + info->regs.pt.ds = 0; + info->regs.pt.es = 0; + info->regs.pt.fs = 0; /* we are clearing gs later just before "jmp resume_userspace", * because it is not saved/restored. */ /* - * The eflags register is also special: we cannot trust that the user + * The flags register is also special: we cannot trust that the user * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.pt.eflags; - info->regs.pt.eflags &= SAFE_MASK; - info->regs.pt.eflags |= info->regs32->eflags & ~SAFE_MASK; - info->regs.pt.eflags |= VM_MASK; + VEFLAGS = info->regs.pt.flags; + info->regs.pt.flags &= SAFE_MASK; + info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; + info->regs.pt.flags |= VM_MASK; switch (info->cpu_type) { case CPU_286: @@ -315,18 +314,18 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%eax) to 0 + * Save old state, set default return value (%ax) to 0 */ - info->regs32->eax = 0; - tsk->thread.saved_esp0 = tsk->thread.esp0; - tsk->thread.saved_fs = info->regs32->xfs; + info->regs32->ax = 0; + tsk->thread.saved_sp0 = tsk->thread.sp0; + tsk->thread.saved_fs = info->regs32->fs; savesegment(gs, tsk->thread.saved_gs); tss = &per_cpu(init_tss, get_cpu()); - tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; - load_esp0(tss, &tsk->thread); + load_sp0(tss, &tsk->thread); put_cpu(); tsk->thread.screen_bitmap = info->screen_bitmap; @@ -352,7 +351,7 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval) struct pt_regs * regs32; regs32 = save_v86_state(regs16); - regs32->eax = retval; + regs32->ax = retval; __asm__ __volatile__("movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" "jmp resume_userspace" @@ -373,30 +372,30 @@ static inline void clear_IF(struct kernel_vm86_regs * regs) static inline void clear_TF(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~TF_MASK; + regs->pt.flags &= ~TF_MASK; } static inline void clear_AC(struct kernel_vm86_regs * regs) { - regs->pt.eflags &= ~AC_MASK; + regs->pt.flags &= ~AC_MASK; } /* It is correct to call set_IF(regs) from the set_vflags_* * functions. However someone forgot to call clear_IF(regs) * in the opposite case. * After the command sequence CLI PUSHF STI POPF you should - * end up with interrups disabled, but you ended up with + * end up with interrupts disabled, but you ended up with * interrupts enabled. * ( I was testing my own changes, but the only bug I * could find was in a function I had not changed. ) * [KD] */ -static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs * regs) +static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs) { - set_flags(VEFLAGS, eflags, current->thread.v86mask); - set_flags(regs->pt.eflags, eflags, SAFE_MASK); - if (eflags & IF_MASK) + set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(regs->pt.flags, flags, SAFE_MASK); + if (flags & IF_MASK) set_IF(regs); else clear_IF(regs); @@ -405,7 +404,7 @@ static inline void set_vflags_long(unsigned long eflags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs) { set_flags(VFLAGS, flags, current->thread.v86mask); - set_flags(regs->pt.eflags, flags, SAFE_MASK); + set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & IF_MASK) set_IF(regs); else @@ -414,7 +413,7 @@ static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_reg static inline unsigned long get_vflags(struct kernel_vm86_regs * regs) { - unsigned long flags = regs->pt.eflags & RETURN_MASK; + unsigned long flags = regs->pt.flags & RETURN_MASK; if (VEFLAGS & VIF_MASK) flags |= IF_MASK; @@ -518,7 +517,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, unsigned long __user *intr_ptr; unsigned long segoffs; - if (regs->pt.xcs == BIOSSEG) + if (regs->pt.cs == BIOSSEG) goto cannot_handle; if (is_revectored(i, &KVM86->int_revectored)) goto cannot_handle; @@ -530,9 +529,9 @@ static void do_int(struct kernel_vm86_regs *regs, int i, if ((segoffs >> 16) == BIOSSEG) goto cannot_handle; pushw(ssp, sp, get_vflags(regs), cannot_handle); - pushw(ssp, sp, regs->pt.xcs, cannot_handle); + pushw(ssp, sp, regs->pt.cs, cannot_handle); pushw(ssp, sp, IP(regs), cannot_handle); - regs->pt.xcs = segoffs >> 16; + regs->pt.cs = segoffs >> 16; SP(regs) -= 6; IP(regs) = segoffs & 0xffff; clear_TF(regs); @@ -549,7 +548,7 @@ int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno if (VMPI.is_vm86pus) { if ( (trapno==3) || (trapno==1) ) return_to_32bit(regs, VM86_TRAP + (trapno << 8)); - do_int(regs, trapno, (unsigned char __user *) (regs->pt.xss << 4), SP(regs)); + do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } if (trapno !=1) @@ -585,10 +584,10 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) handle_vm86_trap(regs, 0, 1); \ return; } while (0) - orig_flags = *(unsigned short *)®s->pt.eflags; + orig_flags = *(unsigned short *)®s->pt.flags; - csp = (unsigned char __user *) (regs->pt.xcs << 4); - ssp = (unsigned char __user *) (regs->pt.xss << 4); + csp = (unsigned char __user *) (regs->pt.cs << 4); + ssp = (unsigned char __user *) (regs->pt.ss << 4); sp = SP(regs); ip = IP(regs); @@ -675,7 +674,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code) SP(regs) += 6; } IP(regs) = newip; - regs->pt.xcs = newcs; + regs->pt.cs = newcs; CHECK_IF_IN_TRAP; if (data32) { set_vflags_long(newflags, regs); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index f02bad68aba..4525bc2c2e1 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -62,7 +62,10 @@ static struct { void (*cpuid)(void /* non-c */); void (*_set_ldt)(u32 selector); void (*set_tr)(u32 selector); - void (*set_kernel_stack)(u32 selector, u32 esp0); + void (*write_idt_entry)(struct desc_struct *, int, u32, u32); + void (*write_gdt_entry)(struct desc_struct *, int, u32, u32); + void (*write_ldt_entry)(struct desc_struct *, int, u32, u32); + void (*set_kernel_stack)(u32 selector, u32 sp0); void (*allocate_page)(u32, u32, u32, u32, u32); void (*release_page)(u32, u32); void (*set_pte)(pte_t, pte_t *, unsigned); @@ -88,13 +91,13 @@ struct vmi_timer_ops vmi_timer_ops; #define IRQ_PATCH_DISABLE 5 static inline void patch_offset(void *insnbuf, - unsigned long eip, unsigned long dest) + unsigned long ip, unsigned long dest) { - *(unsigned long *)(insnbuf+1) = dest-eip-5; + *(unsigned long *)(insnbuf+1) = dest-ip-5; } static unsigned patch_internal(int call, unsigned len, void *insnbuf, - unsigned long eip) + unsigned long ip) { u64 reloc; struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc; @@ -103,13 +106,13 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, case VMI_RELOCATION_CALL_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_CALL; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_JUMP_REL: BUG_ON(len < 5); *(char *)insnbuf = MNEM_JMP; - patch_offset(insnbuf, eip, (unsigned long)rel->eip); + patch_offset(insnbuf, ip, (unsigned long)rel->eip); return 5; case VMI_RELOCATION_NOP: @@ -131,25 +134,25 @@ static unsigned patch_internal(int call, unsigned len, void *insnbuf, * sequence. The callee does nop padding for us. */ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, - unsigned long eip, unsigned len) + unsigned long ip, unsigned len) { switch (type) { case PARAVIRT_PATCH(pv_irq_ops.irq_disable): return patch_internal(VMI_CALL_DisableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.irq_enable): return patch_internal(VMI_CALL_EnableInterrupts, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.restore_fl): return patch_internal(VMI_CALL_SetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_irq_ops.save_fl): return patch_internal(VMI_CALL_GetInterruptMask, len, - insns, eip); + insns, ip); case PARAVIRT_PATCH(pv_cpu_ops.iret): - return patch_internal(VMI_CALL_IRET, len, insns, eip); - case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit): - return patch_internal(VMI_CALL_SYSEXIT, len, insns, eip); + return patch_internal(VMI_CALL_IRET, len, insns, ip); + case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret): + return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip); default: break; } @@ -157,36 +160,36 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns, } /* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */ -static void vmi_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void vmi_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { int override = 0; - if (*eax == 1) + if (*ax == 1) override = 1; asm volatile ("call *%6" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx), "r" (vmi_ops.cpuid)); + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid)); if (override) { if (disable_pse) - *edx &= ~X86_FEATURE_PSE; + *dx &= ~X86_FEATURE_PSE; if (disable_pge) - *edx &= ~X86_FEATURE_PGE; + *dx &= ~X86_FEATURE_PGE; if (disable_sep) - *edx &= ~X86_FEATURE_SEP; + *dx &= ~X86_FEATURE_SEP; if (disable_tsc) - *edx &= ~X86_FEATURE_TSC; + *dx &= ~X86_FEATURE_TSC; if (disable_mtrr) - *edx &= ~X86_FEATURE_MTRR; + *dx &= ~X86_FEATURE_MTRR; } } static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new) { if (gdt[nr].a != new->a || gdt[nr].b != new->b) - write_gdt_entry(gdt, nr, new->a, new->b); + write_gdt_entry(gdt, nr, new, 0); } static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) @@ -200,12 +203,12 @@ static void vmi_load_tls(struct thread_struct *t, unsigned int cpu) static void vmi_set_ldt(const void *addr, unsigned entries) { unsigned cpu = smp_processor_id(); - u32 low, high; + struct desc_struct desc; - pack_descriptor(&low, &high, (unsigned long)addr, + pack_descriptor(&desc, (unsigned long)addr, entries * sizeof(struct desc_struct) - 1, - DESCTYPE_LDT, 0); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high); + DESC_LDT, 0); + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT); vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0); } @@ -214,17 +217,37 @@ static void vmi_set_tr(void) vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct)); } -static void vmi_load_esp0(struct tss_struct *tss, +static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) +{ + u32 *idt_entry = (u32 *)g; + vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]); +} + +static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, + const void *desc, int type) +{ + u32 *gdt_entry = (u32 *)desc; + vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]); +} + +static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, + const void *desc) +{ + u32 *ldt_entry = (u32 *)desc; + vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]); +} + +static void vmi_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { - tss->x86_tss.esp0 = thread->esp0; + tss->x86_tss.sp0 = thread->sp0; /* This can only happen when SEP is enabled, no need to test "SEP"arately */ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { tss->x86_tss.ss1 = thread->sysenter_cs; wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); } - vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.esp0); + vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0); } static void vmi_flush_tlb_user(void) @@ -375,7 +398,7 @@ static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(u32 pfn) +static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. @@ -452,7 +475,7 @@ static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval) { #ifdef CONFIG_X86_PAE - const pte_t pte = { pmdval.pmd, pmdval.pmd >> 32 }; + const pte_t pte = { .pte = pmdval.pmd }; vmi_check_page_type(__pa(pmdp) >> PAGE_SHIFT, VMI_PAGE_PMD); #else const pte_t pte = { pmdval.pud.pgd.pgd }; @@ -485,21 +508,21 @@ static void vmi_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t static void vmi_set_pud(pud_t *pudp, pud_t pudval) { /* Um, eww */ - const pte_t pte = { pudval.pgd.pgd, pudval.pgd.pgd >> 32 }; + const pte_t pte = { .pte = pudval.pgd.pgd }; vmi_check_page_type(__pa(pudp) >> PAGE_SHIFT, VMI_PAGE_PGD); vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP); } static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - const pte_t pte = { 0 }; + const pte_t pte = { .pte = 0 }; vmi_check_page_type(__pa(ptep) >> PAGE_SHIFT, VMI_PAGE_PTE); vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)); } static void vmi_pmd_clear(pmd_t *pmd) { - const pte_t pte = { 0 }; + const pte_t pte = { .pte = 0 }; vmi_check_page_type(__pa(pmd) >> PAGE_SHIFT, VMI_PAGE_PMD); vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD); } @@ -790,10 +813,13 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.store_idt, GetIDT); para_fill(pv_cpu_ops.store_tr, GetTR); pv_cpu_ops.load_tls = vmi_load_tls; - para_fill(pv_cpu_ops.write_ldt_entry, WriteLDTEntry); - para_fill(pv_cpu_ops.write_gdt_entry, WriteGDTEntry); - para_fill(pv_cpu_ops.write_idt_entry, WriteIDTEntry); - para_wrap(pv_cpu_ops.load_esp0, vmi_load_esp0, set_kernel_stack, UpdateKernelStack); + para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry, + write_ldt_entry, WriteLDTEntry); + para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry, + write_gdt_entry, WriteGDTEntry); + para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry, + write_idt_entry, WriteIDTEntry); + para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack); para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); @@ -870,7 +896,7 @@ static inline int __init activate_vmi(void) * the backend. They are performance critical anyway, so requiring * a patch is not a big problem. */ - pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0; + pv_cpu_ops.irq_enable_syscall_ret = (void *)0xfeedbab0; pv_cpu_ops.iret = (void *)0xbadbab0; #ifdef CONFIG_SMP @@ -963,19 +989,19 @@ static int __init parse_vmi(char *arg) return -EINVAL; if (!strcmp(arg, "disable_pge")) { - clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); disable_pge = 1; } else if (!strcmp(arg, "disable_pse")) { - clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE); disable_pse = 1; } else if (!strcmp(arg, "disable_sep")) { - clear_bit(X86_FEATURE_SEP, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); disable_sep = 1; } else if (!strcmp(arg, "disable_tsc")) { - clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC); disable_tsc = 1; } else if (!strcmp(arg, "disable_mtrr")) { - clear_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR); disable_mtrr = 1; } else if (!strcmp(arg, "disable_timer")) { disable_vmi_timer = 1; diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index b1b5ab08b26..a2b030780aa 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -35,7 +35,6 @@ #include <asm/i8253.h> #include <irq_vectors.h> -#include "io_ports.h" #define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) #define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring()) @@ -238,7 +237,7 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ - outb_p(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ + outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 84c913f38f9..f1148ac8abe 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -8,12 +8,6 @@ * put it inside the section definition. */ -/* Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ #define LOAD_OFFSET __PAGE_OFFSET #include <asm-generic/vmlinux.lds.h> @@ -44,6 +38,8 @@ SECTIONS /* read-only */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + . = ALIGN(4096); /* not really needed, already page aligned */ + *(.text.page_aligned) TEXT_TEXT SCHED_TEXT LOCK_TEXT diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index ea5386944e6..0992b9946c6 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -37,16 +37,15 @@ SECTIONS KPROBES_TEXT *(.fixup) *(.gnu.warning) - } :text = 0x9090 - /* out-of-line lock text */ - .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) } - - _etext = .; /* End of text section */ + _etext = .; /* End of text section */ + } :text = 0x9090 . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } - __stop___ex_table = .; + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } NOTES :text :note @@ -179,6 +178,14 @@ SECTIONS } __con_initcall_end = .; SECURITY_INIT + + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + . = ALIGN(8); __alt_instructions = .; .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 414caf0c5f9..d971210a6d3 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -25,21 +25,24 @@ static int __init vsmp_init(void) return 0; /* Check if we are running on a ScaleMP vSMP box */ - if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != PCI_VENDOR_ID_SCALEMP) || - (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) + if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) != + PCI_VENDOR_ID_SCALEMP) || + (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) != + PCI_DEVICE_ID_SCALEMP_VSMP_CTL)) return 0; /* set vSMP magic bits to indicate vSMP capable kernel */ address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8); cap = readl(address); ctl = readl(address + 4); - printk("vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", + cap, ctl); if (cap & ctl & (1 << 4)) { /* Turn on vSMP IRQ fastpath handling (see system.h) */ ctl &= ~(1 << 4); writel(ctl, address + 4); ctl = readl(address + 4); - printk("vSMP CTL: control set to:0x%08x\n", ctl); + printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } iounmap(address); diff --git a/arch/x86/kernel/vsyscall_32.S b/arch/x86/kernel/vsyscall_32.S deleted file mode 100644 index a5ab3dc4fd2..00000000000 --- a/arch/x86/kernel/vsyscall_32.S +++ /dev/null @@ -1,15 +0,0 @@ -#include <linux/init.h> - -__INITDATA - - .globl vsyscall_int80_start, vsyscall_int80_end -vsyscall_int80_start: - .incbin "arch/x86/kernel/vsyscall-int80_32.so" -vsyscall_int80_end: - - .globl vsyscall_sysenter_start, vsyscall_sysenter_end -vsyscall_sysenter_start: - .incbin "arch/x86/kernel/vsyscall-sysenter_32.so" -vsyscall_sysenter_end: - -__FINIT diff --git a/arch/x86/kernel/vsyscall_32.lds.S b/arch/x86/kernel/vsyscall_32.lds.S deleted file mode 100644 index 4a8b0ed9b8f..00000000000 --- a/arch/x86/kernel/vsyscall_32.lds.S +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. - */ -#include <asm/asm-offsets.h> - -SECTIONS -{ - . = VDSO_PRELINK_asm + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK_asm + 0x400; - - .text : { *(.text) } :text =0x90909090 - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.data .data.* .gnu.linkonce.d.*) - *(.dynbss) - *(.bss .bss.* .gnu.linkonce.b.*) - } :text -} - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ -} - -/* - * This controls what symbols we export from the DSO. - */ -VERSION -{ - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - - local: *; - }; -} - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index ad4005c6d4a..3f824277458 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -43,7 +43,7 @@ #include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define __syscall_clobber "r11","rcx","memory" +#define __syscall_clobber "r11","cx","memory" #define __pa_vsymbol(x) \ ({unsigned long v; \ extern char __vsyscall_0; \ @@ -190,7 +190,7 @@ time_t __vsyscall(1) vtime(time_t *t) long __vsyscall(2) vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) { - unsigned int dummy, p; + unsigned int p; unsigned long j = 0; /* Fast cache - only recompute value once per jiffies and avoid @@ -205,7 +205,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) p = tcache->blob[1]; } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + native_read_tscp(&p); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); @@ -297,7 +297,7 @@ static void __cpuinit vsyscall_set_cpu(int cpu) /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. 12 bits for the CPU and 8 bits for the node. */ - d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU); + d = (unsigned long *)(get_cpu_gdt_table(cpu) + GDT_ENTRY_PER_CPU); *d = 0x0f40000000000ULL; *d |= cpu; *d |= (node & 0xf) << 12; @@ -319,7 +319,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) return NOTIFY_DONE; } -static void __init map_vsyscall(void) +void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); @@ -335,7 +335,6 @@ static int __init vsyscall_init(void) BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)); BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE))); BUG_ON((unsigned long) &vgetcpu != VSYSCALL_ADDR(__NR_vgetcpu)); - map_vsyscall(); #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 77c25b30763..a66e9c1a053 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -8,6 +8,7 @@ #include <asm/processor.h> #include <asm/uaccess.h> #include <asm/pgtable.h> +#include <asm/desc.h> EXPORT_SYMBOL(kernel_thread); @@ -34,13 +35,6 @@ EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); -#ifdef CONFIG_SMP -extern void __write_lock_failed(rwlock_t *rw); -extern void __read_lock_failed(rwlock_t *rw); -EXPORT_SYMBOL(__write_lock_failed); -EXPORT_SYMBOL(__read_lock_failed); -#endif - /* Export string functions. We normally rely on gcc builtin for most of these, but gcc sometimes decides not to inline them. */ #undef memcpy @@ -60,3 +54,8 @@ EXPORT_SYMBOL(init_level4_pgt); EXPORT_SYMBOL(load_gs_index); EXPORT_SYMBOL(_proxy_pda); + +#ifdef CONFIG_PARAVIRT +/* Virtualized guests may want to use it */ +EXPORT_SYMBOL_GPL(cpu_gdt_descr); +#endif diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 19626ace0f5..964dfa36d36 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig @@ -1,6 +1,7 @@ config LGUEST_GUEST bool "Lguest guest support" select PARAVIRT + depends on X86_32 depends on !X86_PAE depends on !(X86_VISWS || X86_VOYAGER) select VIRTIO diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 92c56117eae..a63373759f0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -175,8 +175,8 @@ static void lguest_leave_lazy_mode(void) * check there when it wants to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "eflags"). The - * eflags word contains all kind of stuff, but in practice Linux only cares +/* save_flags() is expected to return the processor state (ie. "flags"). The + * flags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */ static unsigned long save_fl(void) { @@ -217,19 +217,20 @@ static void irq_enable(void) * address of the handler, and... well, who cares? The Guest just asks the * Host to make the change anyway, because the Host controls the real IDT. */ -static void lguest_write_idt_entry(struct desc_struct *dt, - int entrynum, u32 low, u32 high) +static void lguest_write_idt_entry(gate_desc *dt, + int entrynum, const gate_desc *g) { + u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ - write_dt_entry(dt, entrynum, low, high); + native_write_idt_entry(dt, entrynum, g); /* Tell Host about this new entry. */ - hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high); + hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } /* Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the * Host about them. */ -static void lguest_load_idt(const struct Xgt_desc_struct *desc) +static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; struct desc_struct *idt = (void *)desc->address; @@ -252,7 +253,7 @@ static void lguest_load_idt(const struct Xgt_desc_struct *desc) * hypercall and use that repeatedly to load a new IDT. I don't think it * really matters, but wouldn't it be nice if they were the same? */ -static void lguest_load_gdt(const struct Xgt_desc_struct *desc) +static void lguest_load_gdt(const struct desc_ptr *desc) { BUG_ON((desc->size+1)/8 != GDT_ENTRIES); hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0); @@ -261,10 +262,10 @@ static void lguest_load_gdt(const struct Xgt_desc_struct *desc) /* For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare * that this naive implementation is reasonable. */ -static void lguest_write_gdt_entry(struct desc_struct *dt, - int entrynum, u32 low, u32 high) +static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, + const void *desc, int type) { - write_dt_entry(dt, entrynum, low, high); + native_write_gdt_entry(dt, entrynum, desc, type); hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0); } @@ -323,30 +324,30 @@ static void lguest_load_tr_desc(void) * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get * too worked up about it. */ -static void lguest_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void lguest_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { - int function = *eax; + int function = *ax; - native_cpuid(eax, ebx, ecx, edx); + native_cpuid(ax, bx, cx, dx); switch (function) { case 1: /* Basic feature request. */ /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ - *ecx &= 0x00002201; + *cx &= 0x00002201; /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */ - *edx &= 0x07808101; + *dx &= 0x07808101; /* The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless * the Page Global Enable (PGE) feature bit is set. */ - *edx |= 0x00002000; + *dx |= 0x00002000; break; case 0x80000000: /* Futureproof this a little: if they ask how much extended * processor information there is, limit it to known fields. */ - if (*eax > 0x80000008) - *eax = 0x80000008; + if (*ax > 0x80000008) + *ax = 0x80000008; break; } } @@ -755,10 +756,10 @@ static void lguest_time_init(void) * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */ -static void lguest_load_esp0(struct tss_struct *tss, +static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { - lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, + lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->sp0, THREAD_SIZE/PAGE_SIZE); } @@ -788,11 +789,11 @@ static void lguest_wbinvd(void) * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. */ #ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(unsigned long reg, unsigned long v) +static void lguest_apic_write(unsigned long reg, u32 v) { } -static unsigned long lguest_apic_read(unsigned long reg) +static u32 lguest_apic_read(unsigned long reg) { return 0; } @@ -957,7 +958,7 @@ __init void lguest_init(void) pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; pv_cpu_ops.iret = lguest_iret; - pv_cpu_ops.load_esp0 = lguest_load_esp0; + pv_cpu_ops.load_sp0 = lguest_load_sp0; pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; pv_cpu_ops.load_tls = lguest_load_tls; diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 329da276c6f..4876182daf8 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -1,5 +1,27 @@ +# +# Makefile for x86 specific library files. +# + +obj-$(CONFIG_SMP) := msr-on-cpu.o + +lib-y := delay_$(BITS).o +lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o +lib-y += memcpy_$(BITS).o + ifeq ($(CONFIG_X86_32),y) -include ${srctree}/arch/x86/lib/Makefile_32 + lib-y += checksum_32.o + lib-y += strstr_32.o + lib-y += bitops_32.o semaphore_32.o string_32.o + + lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else -include ${srctree}/arch/x86/lib/Makefile_64 + obj-y += io_64.o iomap_copy_64.o + + CFLAGS_csum-partial_64.o := -funroll-loops + + lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o + lib-y += thunk_64.o clear_page_64.o copy_page_64.o + lib-y += bitstr_64.o bitops_64.o + lib-y += memmove_64.o memset_64.o + lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o endif diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32 deleted file mode 100644 index 98d1f1e2e2e..00000000000 --- a/arch/x86/lib/Makefile_32 +++ /dev/null @@ -1,11 +0,0 @@ -# -# Makefile for i386-specific library files.. -# - - -lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \ - bitops_32.o semaphore_32.o string_32.o - -lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o - -obj-$(CONFIG_SMP) += msr-on-cpu.o diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64 deleted file mode 100644 index bbabad3c933..00000000000 --- a/arch/x86/lib/Makefile_64 +++ /dev/null @@ -1,13 +0,0 @@ -# -# Makefile for x86_64-specific library files. -# - -CFLAGS_csum-partial_64.o := -funroll-loops - -obj-y := io_64.o iomap_copy_64.o -obj-$(CONFIG_SMP) += msr-on-cpu.o - -lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \ - usercopy_64.o getuser_64.o putuser_64.o \ - thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o -lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c index 8ac51b82a63..37756b6fb32 100644 --- a/arch/x86/lib/memcpy_32.c +++ b/arch/x86/lib/memcpy_32.c @@ -34,8 +34,8 @@ void *memmove(void *dest, const void *src, size_t n) "cld" : "=&c" (d0), "=&S" (d1), "=&D" (d2) :"0" (n), - "1" (n-1+(const char *)src), - "2" (n-1+(char *)dest) + "1" (n-1+src), + "2" (n-1+dest) :"memory"); } return dest; diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c index 751ebae8ec4..80175e47b19 100644 --- a/arch/x86/lib/memmove_64.c +++ b/arch/x86/lib/memmove_64.c @@ -11,8 +11,8 @@ void *memmove(void * dest,const void *src,size_t count) if (dest < src) { return memcpy(dest,src,count); } else { - char *p = (char *) dest + count; - char *s = (char *) src + count; + char *p = dest + count; + const char *s = src + count; while (count--) *--p = *--s; } diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S index 444fba40098..3899bd37fdf 100644 --- a/arch/x86/lib/semaphore_32.S +++ b/arch/x86/lib/semaphore_32.S @@ -29,7 +29,7 @@ * registers (%eax, %edx and %ecx) except %eax whish is either a return * value or just clobbered.. */ - .section .sched.text + .section .sched.text, "ax" ENTRY(__down_failed) CFI_STARTPROC FRAME @@ -49,7 +49,7 @@ ENTRY(__down_failed) ENDFRAME ret CFI_ENDPROC - END(__down_failed) + ENDPROC(__down_failed) ENTRY(__down_failed_interruptible) CFI_STARTPROC @@ -70,7 +70,7 @@ ENTRY(__down_failed_interruptible) ENDFRAME ret CFI_ENDPROC - END(__down_failed_interruptible) + ENDPROC(__down_failed_interruptible) ENTRY(__down_failed_trylock) CFI_STARTPROC @@ -91,7 +91,7 @@ ENTRY(__down_failed_trylock) ENDFRAME ret CFI_ENDPROC - END(__down_failed_trylock) + ENDPROC(__down_failed_trylock) ENTRY(__up_wakeup) CFI_STARTPROC @@ -112,7 +112,7 @@ ENTRY(__up_wakeup) ENDFRAME ret CFI_ENDPROC - END(__up_wakeup) + ENDPROC(__up_wakeup) /* * rw spinlock fallbacks @@ -132,7 +132,7 @@ ENTRY(__write_lock_failed) ENDFRAME ret CFI_ENDPROC - END(__write_lock_failed) + ENDPROC(__write_lock_failed) ENTRY(__read_lock_failed) CFI_STARTPROC @@ -148,7 +148,7 @@ ENTRY(__read_lock_failed) ENDFRAME ret CFI_ENDPROC - END(__read_lock_failed) + ENDPROC(__read_lock_failed) #endif @@ -170,7 +170,7 @@ ENTRY(call_rwsem_down_read_failed) CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC - END(call_rwsem_down_read_failed) + ENDPROC(call_rwsem_down_read_failed) ENTRY(call_rwsem_down_write_failed) CFI_STARTPROC @@ -182,7 +182,7 @@ ENTRY(call_rwsem_down_write_failed) CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC - END(call_rwsem_down_write_failed) + ENDPROC(call_rwsem_down_write_failed) ENTRY(call_rwsem_wake) CFI_STARTPROC @@ -196,7 +196,7 @@ ENTRY(call_rwsem_wake) CFI_ADJUST_CFA_OFFSET -4 1: ret CFI_ENDPROC - END(call_rwsem_wake) + ENDPROC(call_rwsem_wake) /* Fix up special calling conventions */ ENTRY(call_rwsem_downgrade_wake) @@ -214,6 +214,6 @@ ENTRY(call_rwsem_downgrade_wake) CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC - END(call_rwsem_downgrade_wake) + ENDPROC(call_rwsem_downgrade_wake) #endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 6ea73f3de56..8b92d428ab0 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -33,7 +33,7 @@ .endm - .section .sched.text + .section .sched.text, "ax" #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile new file mode 100644 index 00000000000..1faac8125e3 --- /dev/null +++ b/arch/x86/mach-rdc321x/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the RDC321x specific parts of the kernel +# +obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o wdt.o + diff --git a/arch/x86/mach-rdc321x/gpio.c b/arch/x86/mach-rdc321x/gpio.c new file mode 100644 index 00000000000..031269163bd --- /dev/null +++ b/arch/x86/mach-rdc321x/gpio.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2007, OpenWrt.org, Florian Fainelli <florian@openwrt.org> + * RDC321x architecture specific GPIO support + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include <linux/autoconf.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/delay.h> + +#include <asm/mach-rdc321x/rdc321x_defs.h> + +static inline int rdc_gpio_is_valid(unsigned gpio) +{ + return (gpio <= RDC_MAX_GPIO); +} + +static unsigned int rdc_gpio_read(unsigned gpio) +{ + unsigned int val; + + val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x84:0x48)); + outl(val, RDC3210_CFGREG_ADDR); + udelay(10); + val = inl(RDC3210_CFGREG_DATA); + val |= (0x1 << (gpio & 0x1F)); + outl(val, RDC3210_CFGREG_DATA); + udelay(10); + val = 0x80000000 | (7 << 11) | ((gpio&0x20?0x88:0x4C)); + outl(val, RDC3210_CFGREG_ADDR); + udelay(10); + val = inl(RDC3210_CFGREG_DATA); + + return val; +} + +static void rdc_gpio_write(unsigned int val) +{ + if (val) { + outl(val, RDC3210_CFGREG_DATA); + udelay(10); + } +} + +int rdc_gpio_get_value(unsigned gpio) +{ + if (rdc_gpio_is_valid(gpio)) + return (int)rdc_gpio_read(gpio); + else + return -EINVAL; +} +EXPORT_SYMBOL(rdc_gpio_get_value); + +void rdc_gpio_set_value(unsigned gpio, int value) +{ + unsigned int val; + + if (!rdc_gpio_is_valid(gpio)) + return; + + val = rdc_gpio_read(gpio); + + if (value) + val &= ~(0x1 << (gpio & 0x1F)); + else + val |= (0x1 << (gpio & 0x1F)); + + rdc_gpio_write(val); +} +EXPORT_SYMBOL(rdc_gpio_set_value); + +int rdc_gpio_direction_input(unsigned gpio) +{ + return 0; +} +EXPORT_SYMBOL(rdc_gpio_direction_input); + +int rdc_gpio_direction_output(unsigned gpio, int value) +{ + return 0; +} +EXPORT_SYMBOL(rdc_gpio_direction_output); + + diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c new file mode 100644 index 00000000000..dda6024a586 --- /dev/null +++ b/arch/x86/mach-rdc321x/platform.c @@ -0,0 +1,68 @@ +/* + * Generic RDC321x platform devices + * + * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the + * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, + * Boston, MA 02110-1301, USA. + * + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/version.h> +#include <linux/leds.h> + +#include <asm/gpio.h> + +/* LEDS */ +static struct gpio_led default_leds[] = { + { .name = "rdc:dmz", .gpio = 1, }, +}; + +static struct gpio_led_platform_data rdc321x_led_data = { + .num_leds = ARRAY_SIZE(default_leds), + .leds = default_leds, +}; + +static struct platform_device rdc321x_leds = { + .name = "leds-gpio", + .id = -1, + .dev = { + .platform_data = &rdc321x_led_data, + } +}; + +/* Watchdog */ +static struct platform_device rdc321x_wdt = { + .name = "rdc321x-wdt", + .id = -1, + .num_resources = 0, +}; + +static struct platform_device *rdc321x_devs[] = { + &rdc321x_leds, + &rdc321x_wdt +}; + +static int __init rdc_board_setup(void) +{ + return platform_add_devices(rdc321x_devs, ARRAY_SIZE(rdc321x_devs)); +} + +arch_initcall(rdc_board_setup); diff --git a/arch/x86/mach-rdc321x/wdt.c b/arch/x86/mach-rdc321x/wdt.c new file mode 100644 index 00000000000..ec5625ae706 --- /dev/null +++ b/arch/x86/mach-rdc321x/wdt.c @@ -0,0 +1,275 @@ +/* + * RDC321x watchdog driver + * + * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org> + * + * This driver is highly inspired from the cpu5_wdt driver + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/timer.h> +#include <linux/completion.h> +#include <linux/jiffies.h> +#include <linux/platform_device.h> +#include <linux/watchdog.h> +#include <linux/io.h> +#include <linux/uaccess.h> + +#include <asm/mach-rdc321x/rdc321x_defs.h> + +#define RDC_WDT_MASK 0x80000000 /* Mask */ +#define RDC_WDT_EN 0x00800000 /* Enable bit */ +#define RDC_WDT_WTI 0x00200000 /* Generate CPU reset/NMI/WDT on timeout */ +#define RDC_WDT_RST 0x00100000 /* Reset bit */ +#define RDC_WDT_WIF 0x00040000 /* WDT IRQ Flag */ +#define RDC_WDT_IRT 0x00000100 /* IRQ Routing table */ +#define RDC_WDT_CNT 0x00000001 /* WDT count */ + +#define RDC_CLS_TMR 0x80003844 /* Clear timer */ + +#define RDC_WDT_INTERVAL (HZ/10+1) + +int nowayout = WATCHDOG_NOWAYOUT; +module_param(nowayout, int, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +static int ticks = 1000; + +/* some device data */ + +static struct { + struct completion stop; + volatile int running; + struct timer_list timer; + volatile int queue; + int default_ticks; + unsigned long inuse; +} rdc321x_wdt_device; + +/* generic helper functions */ + +static void rdc321x_wdt_trigger(unsigned long unused) +{ + if (rdc321x_wdt_device.running) + ticks--; + + /* keep watchdog alive */ + outl(RDC_WDT_EN|inl(RDC3210_CFGREG_DATA), RDC3210_CFGREG_DATA); + + /* requeue?? */ + if (rdc321x_wdt_device.queue && ticks) + mod_timer(&rdc321x_wdt_device.timer, + jiffies + RDC_WDT_INTERVAL); + else { + /* ticks doesn't matter anyway */ + complete(&rdc321x_wdt_device.stop); + } + +} + +static void rdc321x_wdt_reset(void) +{ + ticks = rdc321x_wdt_device.default_ticks; +} + +static void rdc321x_wdt_start(void) +{ + if (!rdc321x_wdt_device.queue) { + rdc321x_wdt_device.queue = 1; + + /* Clear the timer */ + outl(RDC_CLS_TMR, RDC3210_CFGREG_ADDR); + + /* Enable watchdog and set the timeout to 81.92 us */ + outl(RDC_WDT_EN|RDC_WDT_CNT, RDC3210_CFGREG_DATA); + + mod_timer(&rdc321x_wdt_device.timer, + jiffies + RDC_WDT_INTERVAL); + } + + /* if process dies, counter is not decremented */ + rdc321x_wdt_device.running++; +} + +static int rdc321x_wdt_stop(void) +{ + if (rdc321x_wdt_device.running) + rdc321x_wdt_device.running = 0; + + ticks = rdc321x_wdt_device.default_ticks; + + return -EIO; +} + +/* filesystem operations */ + +static int rdc321x_wdt_open(struct inode *inode, struct file *file) +{ + if (test_and_set_bit(0, &rdc321x_wdt_device.inuse)) + return -EBUSY; + + return nonseekable_open(inode, file); +} + +static int rdc321x_wdt_release(struct inode *inode, struct file *file) +{ + clear_bit(0, &rdc321x_wdt_device.inuse); + return 0; +} + +static int rdc321x_wdt_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + unsigned int value; + static struct watchdog_info ident = { + .options = WDIOF_CARDRESET, + .identity = "RDC321x WDT", + }; + + switch (cmd) { + case WDIOC_KEEPALIVE: + rdc321x_wdt_reset(); + break; + case WDIOC_GETSTATUS: + /* Read the value from the DATA register */ + value = inl(RDC3210_CFGREG_DATA); + if (copy_to_user(argp, &value, sizeof(int))) + return -EFAULT; + break; + case WDIOC_GETSUPPORT: + if (copy_to_user(argp, &ident, sizeof(ident))) + return -EFAULT; + break; + case WDIOC_SETOPTIONS: + if (copy_from_user(&value, argp, sizeof(int))) + return -EFAULT; + switch (value) { + case WDIOS_ENABLECARD: + rdc321x_wdt_start(); + break; + case WDIOS_DISABLECARD: + return rdc321x_wdt_stop(); + default: + return -EINVAL; + } + break; + default: + return -ENOTTY; + } + return 0; +} + +static ssize_t rdc321x_wdt_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (!count) + return -EIO; + + rdc321x_wdt_reset(); + + return count; +} + +static const struct file_operations rdc321x_wdt_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .ioctl = rdc321x_wdt_ioctl, + .open = rdc321x_wdt_open, + .write = rdc321x_wdt_write, + .release = rdc321x_wdt_release, +}; + +static struct miscdevice rdc321x_wdt_misc = { + .minor = WATCHDOG_MINOR, + .name = "watchdog", + .fops = &rdc321x_wdt_fops, +}; + +static int __devinit rdc321x_wdt_probe(struct platform_device *pdev) +{ + int err; + + err = misc_register(&rdc321x_wdt_misc); + if (err < 0) { + printk(KERN_ERR PFX "watchdog misc_register failed\n"); + return err; + } + + /* Reset the watchdog */ + outl(RDC_WDT_RST, RDC3210_CFGREG_DATA); + + init_completion(&rdc321x_wdt_device.stop); + rdc321x_wdt_device.queue = 0; + + clear_bit(0, &rdc321x_wdt_device.inuse); + + setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0); + + rdc321x_wdt_device.default_ticks = ticks; + + printk(KERN_INFO PFX "watchdog init success\n"); + + return 0; +} + +static int rdc321x_wdt_remove(struct platform_device *pdev) +{ + if (rdc321x_wdt_device.queue) { + rdc321x_wdt_device.queue = 0; + wait_for_completion(&rdc321x_wdt_device.stop); + } + + misc_deregister(&rdc321x_wdt_misc); + + return 0; +} + +static struct platform_driver rdc321x_wdt_driver = { + .probe = rdc321x_wdt_probe, + .remove = rdc321x_wdt_remove, + .driver = { + .owner = THIS_MODULE, + .name = "rdc321x-wdt", + }, +}; + +static int __init rdc321x_wdt_init(void) +{ + return platform_driver_register(&rdc321x_wdt_driver); +} + +static void __exit rdc321x_wdt_exit(void) +{ + platform_driver_unregister(&rdc321x_wdt_driver); +} + +module_init(rdc321x_wdt_init); +module_exit(rdc321x_wdt_exit); + +MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>"); +MODULE_DESCRIPTION("RDC321x watchdog driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR); diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index f3c74fab8b9..2a8456a1f44 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -36,19 +36,19 @@ unsigned int __initdata maxcpus = NR_CPUS; static void __init MP_processor_info (struct mpc_config_processor *m) { - int ver, logical_apicid; + int ver, logical_apicid; physid_mask_t apic_cpus; - + if (!(m->mpc_cpuflag & CPU_ENABLED)) return; logical_apicid = m->mpc_apicid; - printk(KERN_INFO "%sCPU #%d %ld:%ld APIC version %d\n", - m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver); + printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n", + m->mpc_cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver); if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) boot_cpu_physical_apicid = m->mpc_apicid; diff --git a/arch/x86/mach-voyager/setup.c b/arch/x86/mach-voyager/setup.c index 3bef977cb29..5ae5466b9eb 100644 --- a/arch/x86/mach-voyager/setup.c +++ b/arch/x86/mach-voyager/setup.c @@ -37,14 +37,14 @@ void __init pre_setup_arch_hook(void) { /* Voyagers run their CPUs from independent clocks, so disable * the TSC code because we can't sync them */ - tsc_disable = 1; + setup_clear_cpu_cap(X86_FEATURE_TSC); } void __init trap_init_hook(void) { } -static struct irqaction irq0 = { +static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL, .mask = CPU_MASK_NONE, @@ -59,44 +59,47 @@ void __init time_init_hook(void) /* Hook for machine specific memory setup. */ -char * __init machine_specific_memory_setup(void) +char *__init machine_specific_memory_setup(void) { char *who; who = "NOT VOYAGER"; - if(voyager_level == 5) { + if (voyager_level == 5) { __u32 addr, length; int i; who = "Voyager-SUS"; e820.nr_map = 0; - for(i=0; voyager_memory_detect(i, &addr, &length); i++) { + for (i = 0; voyager_memory_detect(i, &addr, &length); i++) { add_memory_region(addr, length, E820_RAM); } return who; - } else if(voyager_level == 4) { + } else if (voyager_level == 4) { __u32 tom; - __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; + __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8; /* select the DINO config space */ outb(VOYAGER_DINO, VOYAGER_CAT_CONFIG_PORT); /* Read DINO top of memory register */ tom = ((inb(catbase + 0x4) & 0xf0) << 16) - + ((inb(catbase + 0x5) & 0x7f) << 24); + + ((inb(catbase + 0x5) & 0x7f) << 24); - if(inb(catbase) != VOYAGER_DINO) { - printk(KERN_ERR "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); - tom = (boot_params.screen_info.ext_mem_k)<<10; + if (inb(catbase) != VOYAGER_DINO) { + printk(KERN_ERR + "Voyager: Failed to get DINO for L4, setting tom to EXT_MEM_K\n"); + tom = (boot_params.screen_info.ext_mem_k) << 10; } who = "Voyager-TOM"; add_memory_region(0, 0x9f000, E820_RAM); /* map from 1M to top of memory */ - add_memory_region(1*1024*1024, tom - 1*1024*1024, E820_RAM); + add_memory_region(1 * 1024 * 1024, tom - 1 * 1024 * 1024, + E820_RAM); /* FIXME: Should check the ASICs to see if I need to * take out the 8M window. Just do it at the moment * */ - add_memory_region(8*1024*1024, 8*1024*1024, E820_RESERVED); + add_memory_region(8 * 1024 * 1024, 8 * 1024 * 1024, + E820_RESERVED); return who; } @@ -114,8 +117,7 @@ char * __init machine_specific_memory_setup(void) unsigned long mem_size; /* compare results from other methods and take the greater */ - if (boot_params.alt_mem_k - < boot_params.screen_info.ext_mem_k) { + if (boot_params.alt_mem_k < boot_params.screen_info.ext_mem_k) { mem_size = boot_params.screen_info.ext_mem_k; who = "BIOS-88"; } else { @@ -126,6 +128,6 @@ char * __init machine_specific_memory_setup(void) e820.nr_map = 0; add_memory_region(0, LOWMEMSIZE(), E820_RAM); add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); - } + } return who; } diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c index 9b77b39b71a..6a949e4edde 100644 --- a/arch/x86/mach-voyager/voyager_basic.c +++ b/arch/x86/mach-voyager/voyager_basic.c @@ -35,7 +35,7 @@ /* * Power off function, if any */ -void (*pm_power_off)(void); +void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); int voyager_level = 0; @@ -43,39 +43,38 @@ int voyager_level = 0; struct voyager_SUS *voyager_SUS = NULL; #ifdef CONFIG_SMP -static void -voyager_dump(int dummy1, struct tty_struct *dummy3) +static void voyager_dump(int dummy1, struct tty_struct *dummy3) { /* get here via a sysrq */ voyager_smp_dump(); } static struct sysrq_key_op sysrq_voyager_dump_op = { - .handler = voyager_dump, - .help_msg = "Voyager", - .action_msg = "Dump Voyager Status", + .handler = voyager_dump, + .help_msg = "Voyager", + .action_msg = "Dump Voyager Status", }; #endif -void -voyager_detect(struct voyager_bios_info *bios) +void voyager_detect(struct voyager_bios_info *bios) { - if(bios->len != 0xff) { - int class = (bios->class_1 << 8) - | (bios->class_2 & 0xff); + if (bios->len != 0xff) { + int class = (bios->class_1 << 8) + | (bios->class_2 & 0xff); printk("Voyager System detected.\n" " Class %x, Revision %d.%d\n", class, bios->major, bios->minor); - if(class == VOYAGER_LEVEL4) + if (class == VOYAGER_LEVEL4) voyager_level = 4; - else if(class < VOYAGER_LEVEL5_AND_ABOVE) + else if (class < VOYAGER_LEVEL5_AND_ABOVE) voyager_level = 3; else voyager_level = 5; printk(" Architecture Level %d\n", voyager_level); - if(voyager_level < 4) - printk("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); + if (voyager_level < 4) + printk + ("\n**WARNING**: Voyager HAL only supports Levels 4 and 5 Architectures at the moment\n\n"); /* install the power off handler */ pm_power_off = voyager_power_off; #ifdef CONFIG_SMP @@ -86,15 +85,13 @@ voyager_detect(struct voyager_bios_info *bios) } } -void -voyager_system_interrupt(int cpl, void *dev_id) +void voyager_system_interrupt(int cpl, void *dev_id) { printk("Voyager: detected system interrupt\n"); } /* Routine to read information from the extended CMOS area */ -__u8 -voyager_extended_cmos_read(__u16 addr) +__u8 voyager_extended_cmos_read(__u16 addr) { outb(addr & 0xff, 0x74); outb((addr >> 8) & 0xff, 0x75); @@ -108,12 +105,11 @@ voyager_extended_cmos_read(__u16 addr) typedef struct ClickMap { struct Entry { - __u32 Address; - __u32 Length; + __u32 Address; + __u32 Length; } Entry[CLICK_ENTRIES]; } ClickMap_t; - /* This routine is pretty much an awful hack to read the bios clickmap by * mapping it into page 0. There are usually three regions in the map: * Base Memory @@ -122,8 +118,7 @@ typedef struct ClickMap { * * Returns are 0 for failure and 1 for success on extracting region. */ -int __init -voyager_memory_detect(int region, __u32 *start, __u32 *length) +int __init voyager_memory_detect(int region, __u32 * start, __u32 * length) { int i; int retval = 0; @@ -132,13 +127,14 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length) unsigned long map_addr; unsigned long old; - if(region >= CLICK_ENTRIES) { + if (region >= CLICK_ENTRIES) { printk("Voyager: Illegal ClickMap region %d\n", region); return 0; } - for(i = 0; i < sizeof(cmos); i++) - cmos[i] = voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); + for (i = 0; i < sizeof(cmos); i++) + cmos[i] = + voyager_extended_cmos_read(VOYAGER_MEMORY_CLICKMAP + i); map_addr = *(unsigned long *)cmos; @@ -147,10 +143,10 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length) pg0[0] = ((map_addr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT); local_flush_tlb(); /* now clear everything out but page 0 */ - map = (ClickMap_t *)(map_addr & (~PAGE_MASK)); + map = (ClickMap_t *) (map_addr & (~PAGE_MASK)); /* zero length is the end of the clickmap */ - if(map->Entry[region].Length != 0) { + if (map->Entry[region].Length != 0) { *length = map->Entry[region].Length * CLICK_SIZE; *start = map->Entry[region].Address; retval = 1; @@ -165,10 +161,9 @@ voyager_memory_detect(int region, __u32 *start, __u32 *length) /* voyager specific handling code for timer interrupts. Used to hand * off the timer tick to the SMP code, since the VIC doesn't have an * internal timer (The QIC does, but that's another story). */ -void -voyager_timer_interrupt(void) +void voyager_timer_interrupt(void) { - if((jiffies & 0x3ff) == 0) { + if ((jiffies & 0x3ff) == 0) { /* There seems to be something flaky in either * hardware or software that is resetting the timer 0 @@ -186,18 +181,20 @@ voyager_timer_interrupt(void) __u16 val; spin_lock(&i8253_lock); - + outb_p(0x00, 0x43); val = inb_p(0x40); val |= inb(0x40) << 8; spin_unlock(&i8253_lock); - if(val > LATCH) { - printk("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", val); + if (val > LATCH) { + printk + ("\nVOYAGER: countdown timer value too high (%d), resetting\n\n", + val); spin_lock(&i8253_lock); - outb(0x34,0x43); - outb_p(LATCH & 0xff , 0x40); /* LSB */ - outb(LATCH >> 8 , 0x40); /* MSB */ + outb(0x34, 0x43); + outb_p(LATCH & 0xff, 0x40); /* LSB */ + outb(LATCH >> 8, 0x40); /* MSB */ spin_unlock(&i8253_lock); } } @@ -206,14 +203,13 @@ voyager_timer_interrupt(void) #endif } -void -voyager_power_off(void) +void voyager_power_off(void) { printk("VOYAGER Power Off\n"); - if(voyager_level == 5) { + if (voyager_level == 5) { voyager_cat_power_off(); - } else if(voyager_level == 4) { + } else if (voyager_level == 4) { /* This doesn't apparently work on most L4 machines, * but the specs say to do this to get automatic power * off. Unfortunately, if it doesn't power off the @@ -222,10 +218,8 @@ voyager_power_off(void) #if 0 int port; - /* enable the voyager Configuration Space */ - outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, - VOYAGER_MC_SETUP); + outb((inb(VOYAGER_MC_SETUP) & 0xf0) | 0x8, VOYAGER_MC_SETUP); /* the port for the power off flag is an offset from the floating base */ port = (inb(VOYAGER_SSPB_RELOCATION_PORT) << 8) + 0x21; @@ -235,62 +229,57 @@ voyager_power_off(void) } /* and wait for it to happen */ local_irq_disable(); - for(;;) + for (;;) halt(); } /* copied from process.c */ -static inline void -kb_wait(void) +static inline void kb_wait(void) { int i; - for (i=0; i<0x10000; i++) + for (i = 0; i < 0x10000; i++) if ((inb_p(0x64) & 0x02) == 0) break; } -void -machine_shutdown(void) +void machine_shutdown(void) { /* Architecture specific shutdown needed before a kexec */ } -void -machine_restart(char *cmd) +void machine_restart(char *cmd) { printk("Voyager Warm Restart\n"); kb_wait(); - if(voyager_level == 5) { + if (voyager_level == 5) { /* write magic values to the RTC to inform system that * shutdown is beginning */ outb(0x8f, 0x70); - outb(0x5 , 0x71); - + outb(0x5, 0x71); + udelay(50); - outb(0xfe,0x64); /* pull reset low */ - } else if(voyager_level == 4) { - __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT)<<8; + outb(0xfe, 0x64); /* pull reset low */ + } else if (voyager_level == 4) { + __u16 catbase = inb(VOYAGER_SSPB_RELOCATION_PORT) << 8; __u8 basebd = inb(VOYAGER_MC_SETUP); - + outb(basebd | 0x08, VOYAGER_MC_SETUP); outb(0x02, catbase + 0x21); } local_irq_disable(); - for(;;) + for (;;) halt(); } -void -machine_emergency_restart(void) +void machine_emergency_restart(void) { /*for now, just hook this to a warm restart */ machine_restart(NULL); } -void -mca_nmi_hook(void) +void mca_nmi_hook(void) { __u8 dumpval __maybe_unused = inb(0xf823); __u8 swnmi __maybe_unused = inb(0xf813); @@ -301,8 +290,8 @@ mca_nmi_hook(void) /* clear swnmi */ outb(0xff, 0xf813); /* tell SUS to ignore dump */ - if(voyager_level == 5 && voyager_SUS != NULL) { - if(voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { + if (voyager_level == 5 && voyager_SUS != NULL) { + if (voyager_SUS->SUS_mbox == VOYAGER_DUMP_BUTTON_NMI) { voyager_SUS->kernel_mbox = VOYAGER_NO_COMMAND; voyager_SUS->kernel_flags |= VOYAGER_OS_IN_PROGRESS; udelay(1000); @@ -310,15 +299,14 @@ mca_nmi_hook(void) voyager_SUS->kernel_flags &= ~VOYAGER_OS_IN_PROGRESS; } } - printk(KERN_ERR "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", smp_processor_id()); + printk(KERN_ERR + "VOYAGER: Dump switch pressed, printing CPU%d tracebacks\n", + smp_processor_id()); show_stack(NULL, NULL); show_state(); } - - -void -machine_halt(void) +void machine_halt(void) { /* treat a halt like a power off */ machine_power_off(); diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c index 2132ca652df..17a7904f75b 100644 --- a/arch/x86/mach-voyager/voyager_cat.c +++ b/arch/x86/mach-voyager/voyager_cat.c @@ -39,34 +39,32 @@ #define CAT_DATA (sspb + 0xd) /* the internal cat functions */ -static void cat_pack(__u8 *msg, __u16 start_bit, __u8 *data, - __u16 num_bits); -static void cat_unpack(__u8 *msg, __u16 start_bit, __u8 *data, +static void cat_pack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits); +static void cat_unpack(__u8 * msg, __u16 start_bit, __u8 * data, __u16 num_bits); -static void cat_build_header(__u8 *header, const __u16 len, +static void cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits, const __u16 longest_reg_bits); -static int cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, +static int cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op); -static int cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, - __u8 reg, __u8 *value); -static int cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, +static int cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, + __u8 reg, __u8 * value); +static int cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits); -static int cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, +static int cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value); -static int cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 *value); -static int cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, +static int cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, + __u8 * value); +static int cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, __u16 len, void *buf); -static int cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, +static int cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value); -static int cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp); -static int cat_connect(voyager_module_t *modp, voyager_asic_t *asicp); +static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp); +static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp); -static inline const char * -cat_module_name(int module_id) +static inline const char *cat_module_name(int module_id) { - switch(module_id) { + switch (module_id) { case 0x10: return "Processor Slot 0"; case 0x11: @@ -105,14 +103,14 @@ voyager_module_t *voyager_cat_list; /* the I/O port assignments for the VIC and QIC */ static struct resource vic_res = { - .name = "Voyager Interrupt Controller", - .start = 0xFC00, - .end = 0xFC6F + .name = "Voyager Interrupt Controller", + .start = 0xFC00, + .end = 0xFC6F }; static struct resource qic_res = { - .name = "Quad Interrupt Controller", - .start = 0xFC70, - .end = 0xFCFF + .name = "Quad Interrupt Controller", + .start = 0xFC70, + .end = 0xFCFF }; /* This function is used to pack a data bit stream inside a message. @@ -120,7 +118,7 @@ static struct resource qic_res = { * Note: This function assumes that any unused bit in the data stream * is set to zero so that the ors will work correctly */ static void -cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) +cat_pack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits) { /* compute initial shift needed */ const __u16 offset = start_bit % BITS_PER_BYTE; @@ -130,7 +128,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) int i; /* adjust if we have more than a byte of residue */ - if(residue >= BITS_PER_BYTE) { + if (residue >= BITS_PER_BYTE) { residue -= BITS_PER_BYTE; len++; } @@ -138,24 +136,25 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) /* clear out the bits. We assume here that if len==0 then * residue >= offset. This is always true for the catbus * operations */ - msg[byte] &= 0xff << (BITS_PER_BYTE - offset); + msg[byte] &= 0xff << (BITS_PER_BYTE - offset); msg[byte++] |= data[0] >> offset; - if(len == 0) + if (len == 0) return; - for(i = 1; i < len; i++) - msg[byte++] = (data[i-1] << (BITS_PER_BYTE - offset)) - | (data[i] >> offset); - if(residue != 0) { + for (i = 1; i < len; i++) + msg[byte++] = (data[i - 1] << (BITS_PER_BYTE - offset)) + | (data[i] >> offset); + if (residue != 0) { __u8 mask = 0xff >> residue; - __u8 last_byte = data[i-1] << (BITS_PER_BYTE - offset) - | (data[i] >> offset); - + __u8 last_byte = data[i - 1] << (BITS_PER_BYTE - offset) + | (data[i] >> offset); + last_byte &= ~mask; msg[byte] &= mask; msg[byte] |= last_byte; } return; } + /* unpack the data again (same arguments as cat_pack()). data buffer * must be zero populated. * @@ -163,7 +162,7 @@ cat_pack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) * data (starting at bit 0 in data). */ static void -cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) +cat_unpack(__u8 * msg, const __u16 start_bit, __u8 * data, const __u16 num_bits) { /* compute initial shift needed */ const __u16 offset = start_bit % BITS_PER_BYTE; @@ -172,97 +171,97 @@ cat_unpack(__u8 *msg, const __u16 start_bit, __u8 *data, const __u16 num_bits) __u16 byte = start_bit / BITS_PER_BYTE; int i; - if(last_bits != 0) + if (last_bits != 0) len++; /* special case: want < 8 bits from msg and we can get it from * a single byte of the msg */ - if(len == 0 && BITS_PER_BYTE - offset >= num_bits) { + if (len == 0 && BITS_PER_BYTE - offset >= num_bits) { data[0] = msg[byte] << offset; data[0] &= 0xff >> (BITS_PER_BYTE - num_bits); return; } - for(i = 0; i < len; i++) { + for (i = 0; i < len; i++) { /* this annoying if has to be done just in case a read of * msg one beyond the array causes a panic */ - if(offset != 0) { + if (offset != 0) { data[i] = msg[byte++] << offset; data[i] |= msg[byte] >> (BITS_PER_BYTE - offset); - } - else { + } else { data[i] = msg[byte++]; } } /* do we need to truncate the final byte */ - if(last_bits != 0) { - data[i-1] &= 0xff << (BITS_PER_BYTE - last_bits); + if (last_bits != 0) { + data[i - 1] &= 0xff << (BITS_PER_BYTE - last_bits); } return; } static void -cat_build_header(__u8 *header, const __u16 len, const __u16 smallest_reg_bits, +cat_build_header(__u8 * header, const __u16 len, const __u16 smallest_reg_bits, const __u16 longest_reg_bits) { int i; __u16 start_bit = (smallest_reg_bits - 1) % BITS_PER_BYTE; __u8 *last_byte = &header[len - 1]; - if(start_bit == 0) + if (start_bit == 0) start_bit = 1; /* must have at least one bit in the hdr */ - - for(i=0; i < len; i++) + + for (i = 0; i < len; i++) header[i] = 0; - for(i = start_bit; i > 0; i--) + for (i = start_bit; i > 0; i--) *last_byte = ((*last_byte) << 1) + 1; } static int -cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) +cat_sendinst(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 op) { __u8 parity, inst, inst_buf[4] = { 0 }; __u8 iseq[VOYAGER_MAX_SCAN_PATH], hseq[VOYAGER_MAX_REG_SIZE]; __u16 ibytes, hbytes, padbits; int i; - + /* * Parity is the parity of the register number + 1 (READ_REGISTER * and WRITE_REGISTER always add '1' to the number of bits == 1) */ - parity = (__u8)(1 + (reg & 0x01) + - ((__u8)(reg & 0x02) >> 1) + - ((__u8)(reg & 0x04) >> 2) + - ((__u8)(reg & 0x08) >> 3)) % 2; + parity = (__u8) (1 + (reg & 0x01) + + ((__u8) (reg & 0x02) >> 1) + + ((__u8) (reg & 0x04) >> 2) + + ((__u8) (reg & 0x08) >> 3)) % 2; inst = ((parity << 7) | (reg << 2) | op); outb(VOYAGER_CAT_IRCYC, CAT_CMD); - if(!modp->scan_path_connected) { - if(asicp->asic_id != VOYAGER_CAT_ID) { - printk("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); + if (!modp->scan_path_connected) { + if (asicp->asic_id != VOYAGER_CAT_ID) { + printk + ("**WARNING***: cat_sendinst has disconnected scan path not to CAT asic\n"); return 1; } outb(VOYAGER_CAT_HEADER, CAT_DATA); outb(inst, CAT_DATA); - if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { CDEBUG(("VOYAGER CAT: cat_sendinst failed to get CAT_HEADER\n")); return 1; } return 0; } ibytes = modp->inst_bits / BITS_PER_BYTE; - if((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { + if ((padbits = modp->inst_bits % BITS_PER_BYTE) != 0) { padbits = BITS_PER_BYTE - padbits; ibytes++; } hbytes = modp->largest_reg / BITS_PER_BYTE; - if(modp->largest_reg % BITS_PER_BYTE) + if (modp->largest_reg % BITS_PER_BYTE) hbytes++; CDEBUG(("cat_sendinst: ibytes=%d, hbytes=%d\n", ibytes, hbytes)); /* initialise the instruction sequence to 0xff */ - for(i=0; i < ibytes + hbytes; i++) + for (i = 0; i < ibytes + hbytes; i++) iseq[i] = 0xff; cat_build_header(hseq, hbytes, modp->smallest_reg, modp->largest_reg); cat_pack(iseq, modp->inst_bits, hseq, hbytes * BITS_PER_BYTE); @@ -271,11 +270,11 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) cat_pack(iseq, asicp->bit_location, inst_buf, asicp->ireg_length); #ifdef VOYAGER_CAT_DEBUG printk("ins = 0x%x, iseq: ", inst); - for(i=0; i< ibytes + hbytes; i++) + for (i = 0; i < ibytes + hbytes; i++) printk("0x%x ", iseq[i]); printk("\n"); #endif - if(cat_shiftout(iseq, ibytes, hbytes, padbits)) { + if (cat_shiftout(iseq, ibytes, hbytes, padbits)) { CDEBUG(("VOYAGER CAT: cat_sendinst: cat_shiftout failed\n")); return 1; } @@ -284,72 +283,74 @@ cat_sendinst(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, __u8 op) } static int -cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 *value) +cat_getdata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, + __u8 * value) { - if(!modp->scan_path_connected) { - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (!modp->scan_path_connected) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("VOYAGER CAT: ERROR: cat_getdata to CAT asic with scan path connected\n")); return 1; } - if(reg > VOYAGER_SUBADDRHI) + if (reg > VOYAGER_SUBADDRHI) outb(VOYAGER_CAT_RUN, CAT_CMD); outb(VOYAGER_CAT_DRCYC, CAT_CMD); outb(VOYAGER_CAT_HEADER, CAT_DATA); *value = inb(CAT_DATA); outb(0xAA, CAT_DATA); - if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { CDEBUG(("cat_getdata: failed to get VOYAGER_CAT_HEADER\n")); return 1; } return 0; - } - else { - __u16 sbits = modp->num_asics -1 + asicp->ireg_length; + } else { + __u16 sbits = modp->num_asics - 1 + asicp->ireg_length; __u16 sbytes = sbits / BITS_PER_BYTE; __u16 tbytes; - __u8 string[VOYAGER_MAX_SCAN_PATH], trailer[VOYAGER_MAX_REG_SIZE]; + __u8 string[VOYAGER_MAX_SCAN_PATH], + trailer[VOYAGER_MAX_REG_SIZE]; __u8 padbits; int i; - + outb(VOYAGER_CAT_DRCYC, CAT_CMD); - if((padbits = sbits % BITS_PER_BYTE) != 0) { + if ((padbits = sbits % BITS_PER_BYTE) != 0) { padbits = BITS_PER_BYTE - padbits; sbytes++; } tbytes = asicp->ireg_length / BITS_PER_BYTE; - if(asicp->ireg_length % BITS_PER_BYTE) + if (asicp->ireg_length % BITS_PER_BYTE) tbytes++; CDEBUG(("cat_getdata: tbytes = %d, sbytes = %d, padbits = %d\n", - tbytes, sbytes, padbits)); + tbytes, sbytes, padbits)); cat_build_header(trailer, tbytes, 1, asicp->ireg_length); - - for(i = tbytes - 1; i >= 0; i--) { + for (i = tbytes - 1; i >= 0; i--) { outb(trailer[i], CAT_DATA); string[sbytes + i] = inb(CAT_DATA); } - for(i = sbytes - 1; i >= 0; i--) { + for (i = sbytes - 1; i >= 0; i--) { outb(0xaa, CAT_DATA); string[i] = inb(CAT_DATA); } *value = 0; - cat_unpack(string, padbits + (tbytes * BITS_PER_BYTE) + asicp->asic_location, value, asicp->ireg_length); + cat_unpack(string, + padbits + (tbytes * BITS_PER_BYTE) + + asicp->asic_location, value, asicp->ireg_length); #ifdef VOYAGER_CAT_DEBUG printk("value=0x%x, string: ", *value); - for(i=0; i< tbytes+sbytes; i++) + for (i = 0; i < tbytes + sbytes; i++) printk("0x%x ", string[i]); printk("\n"); #endif - + /* sanity check the rest of the return */ - for(i=0; i < tbytes; i++) { + for (i = 0; i < tbytes; i++) { __u8 input = 0; - cat_unpack(string, padbits + (i * BITS_PER_BYTE), &input, BITS_PER_BYTE); - if(trailer[i] != input) { + cat_unpack(string, padbits + (i * BITS_PER_BYTE), + &input, BITS_PER_BYTE); + if (trailer[i] != input) { CDEBUG(("cat_getdata: failed to sanity check rest of ret(%d) 0x%x != 0x%x\n", i, input, trailer[i])); return 1; } @@ -360,14 +361,14 @@ cat_getdata(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, } static int -cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) +cat_shiftout(__u8 * data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) { int i; - - for(i = data_bytes + header_bytes - 1; i >= header_bytes; i--) + + for (i = data_bytes + header_bytes - 1; i >= header_bytes; i--) outb(data[i], CAT_DATA); - for(i = header_bytes - 1; i >= 0; i--) { + for (i = header_bytes - 1; i >= 0; i--) { __u8 header = 0; __u8 input; @@ -376,7 +377,7 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) CDEBUG(("cat_shiftout: returned 0x%x\n", input)); cat_unpack(data, ((data_bytes + i) * BITS_PER_BYTE) - pad_bits, &header, BITS_PER_BYTE); - if(input != header) { + if (input != header) { CDEBUG(("VOYAGER CAT: cat_shiftout failed to return header 0x%x != 0x%x\n", input, header)); return 1; } @@ -385,57 +386,57 @@ cat_shiftout(__u8 *data, __u16 data_bytes, __u16 header_bytes, __u8 pad_bits) } static int -cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, +cat_senddata(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value) { outb(VOYAGER_CAT_DRCYC, CAT_CMD); - if(!modp->scan_path_connected) { - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (!modp->scan_path_connected) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("VOYAGER CAT: ERROR: scan path disconnected when asic != CAT\n")); return 1; } outb(VOYAGER_CAT_HEADER, CAT_DATA); outb(value, CAT_DATA); - if(inb(CAT_DATA) != VOYAGER_CAT_HEADER) { + if (inb(CAT_DATA) != VOYAGER_CAT_HEADER) { CDEBUG(("cat_senddata: failed to get correct header response to sent data\n")); return 1; } - if(reg > VOYAGER_SUBADDRHI) { + if (reg > VOYAGER_SUBADDRHI) { outb(VOYAGER_CAT_RUN, CAT_CMD); outb(VOYAGER_CAT_END, CAT_CMD); outb(VOYAGER_CAT_RUN, CAT_CMD); } - + return 0; - } - else { + } else { __u16 hbytes = asicp->ireg_length / BITS_PER_BYTE; - __u16 dbytes = (modp->num_asics - 1 + asicp->ireg_length)/BITS_PER_BYTE; - __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], - hseq[VOYAGER_MAX_REG_SIZE]; + __u16 dbytes = + (modp->num_asics - 1 + asicp->ireg_length) / BITS_PER_BYTE; + __u8 padbits, dseq[VOYAGER_MAX_SCAN_PATH], + hseq[VOYAGER_MAX_REG_SIZE]; int i; - if((padbits = (modp->num_asics - 1 - + asicp->ireg_length) % BITS_PER_BYTE) != 0) { + if ((padbits = (modp->num_asics - 1 + + asicp->ireg_length) % BITS_PER_BYTE) != 0) { padbits = BITS_PER_BYTE - padbits; dbytes++; } - if(asicp->ireg_length % BITS_PER_BYTE) + if (asicp->ireg_length % BITS_PER_BYTE) hbytes++; - + cat_build_header(hseq, hbytes, 1, asicp->ireg_length); - - for(i = 0; i < dbytes + hbytes; i++) + + for (i = 0; i < dbytes + hbytes; i++) dseq[i] = 0xff; CDEBUG(("cat_senddata: dbytes=%d, hbytes=%d, padbits=%d\n", dbytes, hbytes, padbits)); cat_pack(dseq, modp->num_asics - 1 + asicp->ireg_length, hseq, hbytes * BITS_PER_BYTE); - cat_pack(dseq, asicp->asic_location, &value, + cat_pack(dseq, asicp->asic_location, &value, asicp->ireg_length); #ifdef VOYAGER_CAT_DEBUG printk("dseq "); - for(i=0; i<hbytes+dbytes; i++) { + for (i = 0; i < hbytes + dbytes; i++) { printk("0x%x ", dseq[i]); } printk("\n"); @@ -445,121 +446,125 @@ cat_senddata(voyager_module_t *modp, voyager_asic_t *asicp, } static int -cat_write(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 value) +cat_write(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, __u8 value) { - if(cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) + if (cat_sendinst(modp, asicp, reg, VOYAGER_WRITE_CONFIG)) return 1; return cat_senddata(modp, asicp, reg, value); } static int -cat_read(voyager_module_t *modp, voyager_asic_t *asicp, __u8 reg, - __u8 *value) +cat_read(voyager_module_t * modp, voyager_asic_t * asicp, __u8 reg, + __u8 * value) { - if(cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) + if (cat_sendinst(modp, asicp, reg, VOYAGER_READ_CONFIG)) return 1; return cat_getdata(modp, asicp, reg, value); } static int -cat_subaddrsetup(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, +cat_subaddrsetup(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, __u16 len) { __u8 val; - if(len > 1) { + if (len > 1) { /* set auto increment */ __u8 newval; - - if(cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { + + if (cat_read(modp, asicp, VOYAGER_AUTO_INC_REG, &val)) { CDEBUG(("cat_subaddrsetup: read of VOYAGER_AUTO_INC_REG failed\n")); return 1; } - CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", val)); + CDEBUG(("cat_subaddrsetup: VOYAGER_AUTO_INC_REG = 0x%x\n", + val)); newval = val | VOYAGER_AUTO_INC; - if(newval != val) { - if(cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { + if (newval != val) { + if (cat_write(modp, asicp, VOYAGER_AUTO_INC_REG, val)) { CDEBUG(("cat_subaddrsetup: write to VOYAGER_AUTO_INC_REG failed\n")); return 1; } } } - if(cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8)(offset &0xff))) { + if (cat_write(modp, asicp, VOYAGER_SUBADDRLO, (__u8) (offset & 0xff))) { CDEBUG(("cat_subaddrsetup: write to SUBADDRLO failed\n")); return 1; } - if(asicp->subaddr > VOYAGER_SUBADDR_LO) { - if(cat_write(modp, asicp, VOYAGER_SUBADDRHI, (__u8)(offset >> 8))) { + if (asicp->subaddr > VOYAGER_SUBADDR_LO) { + if (cat_write + (modp, asicp, VOYAGER_SUBADDRHI, (__u8) (offset >> 8))) { CDEBUG(("cat_subaddrsetup: write to SUBADDRHI failed\n")); return 1; } cat_read(modp, asicp, VOYAGER_SUBADDRHI, &val); - CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, val)); + CDEBUG(("cat_subaddrsetup: offset = %d, hi = %d\n", offset, + val)); } cat_read(modp, asicp, VOYAGER_SUBADDRLO, &val); CDEBUG(("cat_subaddrsetup: offset = %d, lo = %d\n", offset, val)); return 0; } - + static int -cat_subwrite(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, - __u16 len, void *buf) +cat_subwrite(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, + __u16 len, void *buf) { int i, retval; /* FIXME: need special actions for VOYAGER_CAT_ID here */ - if(asicp->asic_id == VOYAGER_CAT_ID) { + if (asicp->asic_id == VOYAGER_CAT_ID) { CDEBUG(("cat_subwrite: ATTEMPT TO WRITE TO CAT ASIC\n")); /* FIXME -- This is supposed to be handled better * There is a problem writing to the cat asic in the * PSI. The 30us delay seems to work, though */ udelay(30); } - - if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { + + if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { printk("cat_subwrite: cat_subaddrsetup FAILED\n"); return retval; } - - if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { + + if (cat_sendinst + (modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_WRITE_CONFIG)) { printk("cat_subwrite: cat_sendinst FAILED\n"); return 1; } - for(i = 0; i < len; i++) { - if(cat_senddata(modp, asicp, 0xFF, ((__u8 *)buf)[i])) { - printk("cat_subwrite: cat_sendata element at %d FAILED\n", i); + for (i = 0; i < len; i++) { + if (cat_senddata(modp, asicp, 0xFF, ((__u8 *) buf)[i])) { + printk + ("cat_subwrite: cat_sendata element at %d FAILED\n", + i); return 1; } } return 0; } static int -cat_subread(voyager_module_t *modp, voyager_asic_t *asicp, __u16 offset, +cat_subread(voyager_module_t * modp, voyager_asic_t * asicp, __u16 offset, __u16 len, void *buf) { int i, retval; - if((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { + if ((retval = cat_subaddrsetup(modp, asicp, offset, len)) != 0) { CDEBUG(("cat_subread: cat_subaddrsetup FAILED\n")); return retval; } - if(cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { + if (cat_sendinst(modp, asicp, VOYAGER_SUBADDRDATA, VOYAGER_READ_CONFIG)) { CDEBUG(("cat_subread: cat_sendinst failed\n")); return 1; } - for(i = 0; i < len; i++) { - if(cat_getdata(modp, asicp, 0xFF, - &((__u8 *)buf)[i])) { - CDEBUG(("cat_subread: cat_getdata element %d failed\n", i)); + for (i = 0; i < len; i++) { + if (cat_getdata(modp, asicp, 0xFF, &((__u8 *) buf)[i])) { + CDEBUG(("cat_subread: cat_getdata element %d failed\n", + i)); return 1; } } return 0; } - /* buffer for storing EPROM data read in during initialisation */ static __initdata __u8 eprom_buf[0xFFFF]; static voyager_module_t *voyager_initial_module; @@ -568,8 +573,7 @@ static voyager_module_t *voyager_initial_module; * boot cpu *after* all memory initialisation has been done (so we can * use kmalloc) but before smp initialisation, so we can probe the SMP * configuration and pick up necessary information. */ -void __init -voyager_cat_init(void) +void __init voyager_cat_init(void) { voyager_module_t **modpp = &voyager_initial_module; voyager_asic_t **asicpp; @@ -578,27 +582,29 @@ voyager_cat_init(void) unsigned long qic_addr = 0; __u8 qabc_data[0x20]; __u8 num_submodules, val; - voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *)&eprom_buf[0]; - + voyager_eprom_hdr_t *eprom_hdr = (voyager_eprom_hdr_t *) & eprom_buf[0]; + __u8 cmos[4]; unsigned long addr; - + /* initiallise the SUS mailbox */ - for(i=0; i<sizeof(cmos); i++) + for (i = 0; i < sizeof(cmos); i++) cmos[i] = voyager_extended_cmos_read(VOYAGER_DUMP_LOCATION + i); addr = *(unsigned long *)cmos; - if((addr & 0xff000000) != 0xff000000) { - printk(KERN_ERR "Voyager failed to get SUS mailbox (addr = 0x%lx\n", addr); + if ((addr & 0xff000000) != 0xff000000) { + printk(KERN_ERR + "Voyager failed to get SUS mailbox (addr = 0x%lx\n", + addr); } else { static struct resource res; - + res.name = "voyager SUS"; res.start = addr; - res.end = addr+0x3ff; - + res.end = addr + 0x3ff; + request_resource(&iomem_resource, &res); voyager_SUS = (struct voyager_SUS *) - ioremap(addr, 0x400); + ioremap(addr, 0x400); printk(KERN_NOTICE "Voyager SUS mailbox version 0x%x\n", voyager_SUS->SUS_version); voyager_SUS->kernel_version = VOYAGER_MAILBOX_VERSION; @@ -609,8 +615,6 @@ voyager_cat_init(void) voyager_extended_vic_processors = 0; voyager_quad_processors = 0; - - printk("VOYAGER: beginning CAT bus probe\n"); /* set up the SuperSet Port Block which tells us where the * CAT communication port is */ @@ -618,14 +622,14 @@ voyager_cat_init(void) VDEBUG(("VOYAGER DEBUG: sspb = 0x%x\n", sspb)); /* now find out if were 8 slot or normal */ - if((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) - == EIGHT_SLOT_IDENTIFIER) { + if ((inb(VIC_PROC_WHO_AM_I) & EIGHT_SLOT_IDENTIFIER) + == EIGHT_SLOT_IDENTIFIER) { voyager_8slot = 1; - printk(KERN_NOTICE "Voyager: Eight slot 51xx configuration detected\n"); + printk(KERN_NOTICE + "Voyager: Eight slot 51xx configuration detected\n"); } - for(i = VOYAGER_MIN_MODULE; - i <= VOYAGER_MAX_MODULE; i++) { + for (i = VOYAGER_MIN_MODULE; i <= VOYAGER_MAX_MODULE; i++) { __u8 input; int asic; __u16 eprom_size; @@ -643,21 +647,21 @@ voyager_cat_init(void) outb(0xAA, CAT_DATA); input = inb(CAT_DATA); outb(VOYAGER_CAT_END, CAT_CMD); - if(input != VOYAGER_CAT_HEADER) { + if (input != VOYAGER_CAT_HEADER) { continue; } CDEBUG(("VOYAGER DEBUG: found module id 0x%x, %s\n", i, cat_module_name(i))); - *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++];*/ - if(*modpp == NULL) { + *modpp = kmalloc(sizeof(voyager_module_t), GFP_KERNEL); /*&voyager_module_storage[cat_count++]; */ + if (*modpp == NULL) { printk("**WARNING** kmalloc failure in cat_init\n"); continue; } memset(*modpp, 0, sizeof(voyager_module_t)); /* need temporary asic for cat_subread. It will be * filled in correctly later */ - (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count];*/ - if((*modpp)->asic == NULL) { + (*modpp)->asic = kmalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count]; */ + if ((*modpp)->asic == NULL) { printk("**WARNING** kmalloc failure in cat_init\n"); continue; } @@ -666,47 +670,52 @@ voyager_cat_init(void) (*modpp)->asic->subaddr = VOYAGER_SUBADDR_HI; (*modpp)->module_addr = i; (*modpp)->scan_path_connected = 0; - if(i == VOYAGER_PSI) { + if (i == VOYAGER_PSI) { /* Exception leg for modules with no EEPROM */ printk("Module \"%s\"\n", cat_module_name(i)); continue; } - + CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); outb(VOYAGER_CAT_RUN, CAT_CMD); cat_disconnect(*modpp, (*modpp)->asic); - if(cat_subread(*modpp, (*modpp)->asic, - VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), - &eprom_size)) { - printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); + if (cat_subread(*modpp, (*modpp)->asic, + VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), + &eprom_size)) { + printk + ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", + i); outb(VOYAGER_CAT_END, CAT_CMD); continue; } - if(eprom_size > sizeof(eprom_buf)) { - printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); + if (eprom_size > sizeof(eprom_buf)) { + printk + ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", + i, eprom_size); outb(VOYAGER_CAT_END, CAT_CMD); continue; } outb(VOYAGER_CAT_END, CAT_CMD); outb(VOYAGER_CAT_RUN, CAT_CMD); - CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); - if(cat_subread(*modpp, (*modpp)->asic, 0, - eprom_size, eprom_buf)) { + CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, + eprom_size)); + if (cat_subread + (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) { outb(VOYAGER_CAT_END, CAT_CMD); continue; } outb(VOYAGER_CAT_END, CAT_CMD); printk("Module \"%s\", version 0x%x, tracer 0x%x, asics %d\n", cat_module_name(i), eprom_hdr->version_id, - *((__u32 *)eprom_hdr->tracer), eprom_hdr->num_asics); + *((__u32 *) eprom_hdr->tracer), eprom_hdr->num_asics); (*modpp)->ee_size = eprom_hdr->ee_size; (*modpp)->num_asics = eprom_hdr->num_asics; asicpp = &((*modpp)->asic); sp_offset = eprom_hdr->scan_path_offset; /* All we really care about are the Quad cards. We - * identify them because they are in a processor slot - * and have only four asics */ - if((i < 0x10 || (i>=0x14 && i < 0x1c) || i>0x1f)) { + * identify them because they are in a processor slot + * and have only four asics */ + if ((i < 0x10 || (i >= 0x14 && i < 0x1c) || i > 0x1f)) { modpp = &((*modpp)->next); continue; } @@ -717,16 +726,17 @@ voyager_cat_init(void) &num_submodules); /* lowest two bits, active low */ num_submodules = ~(0xfc | num_submodules); - CDEBUG(("VOYAGER CAT: %d submodules present\n", num_submodules)); - if(num_submodules == 0) { + CDEBUG(("VOYAGER CAT: %d submodules present\n", + num_submodules)); + if (num_submodules == 0) { /* fill in the dyadic extended processors */ __u8 cpu = i & 0x07; printk("Module \"%s\": Dyadic Processor Card\n", cat_module_name(i)); - voyager_extended_vic_processors |= (1<<cpu); + voyager_extended_vic_processors |= (1 << cpu); cpu += 4; - voyager_extended_vic_processors |= (1<<cpu); + voyager_extended_vic_processors |= (1 << cpu); outb(VOYAGER_CAT_END, CAT_CMD); continue; } @@ -740,28 +750,32 @@ voyager_cat_init(void) cat_write(*modpp, (*modpp)->asic, VOYAGER_SUBMODSELECT, val); outb(VOYAGER_CAT_END, CAT_CMD); - CDEBUG(("cat_init: Reading eeprom for module 0x%x at offset %d\n", i, VOYAGER_XSUM_END_OFFSET)); outb(VOYAGER_CAT_RUN, CAT_CMD); cat_disconnect(*modpp, (*modpp)->asic); - if(cat_subread(*modpp, (*modpp)->asic, - VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), - &eprom_size)) { - printk("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", i); + if (cat_subread(*modpp, (*modpp)->asic, + VOYAGER_XSUM_END_OFFSET, sizeof(eprom_size), + &eprom_size)) { + printk + ("**WARNING**: Voyager couldn't read EPROM size for module 0x%x\n", + i); outb(VOYAGER_CAT_END, CAT_CMD); continue; } - if(eprom_size > sizeof(eprom_buf)) { - printk("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", i, eprom_size); + if (eprom_size > sizeof(eprom_buf)) { + printk + ("**WARNING**: Voyager insufficient size to read EPROM data, module 0x%x. Need %d\n", + i, eprom_size); outb(VOYAGER_CAT_END, CAT_CMD); continue; } outb(VOYAGER_CAT_END, CAT_CMD); outb(VOYAGER_CAT_RUN, CAT_CMD); - CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, eprom_size)); - if(cat_subread(*modpp, (*modpp)->asic, 0, - eprom_size, eprom_buf)) { + CDEBUG(("cat_init: module 0x%x, eeprom_size %d\n", i, + eprom_size)); + if (cat_subread + (*modpp, (*modpp)->asic, 0, eprom_size, eprom_buf)) { outb(VOYAGER_CAT_END, CAT_CMD); continue; } @@ -773,30 +787,35 @@ voyager_cat_init(void) sp_offset = eprom_hdr->scan_path_offset; /* get rid of the dummy CAT asic and read the real one */ kfree((*modpp)->asic); - for(asic=0; asic < (*modpp)->num_asics; asic++) { + for (asic = 0; asic < (*modpp)->num_asics; asic++) { int j; - voyager_asic_t *asicp = *asicpp - = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++];*/ + voyager_asic_t *asicp = *asicpp = kzalloc(sizeof(voyager_asic_t), GFP_KERNEL); /*&voyager_asic_storage[asic_count++]; */ voyager_sp_table_t *sp_table; voyager_at_t *asic_table; voyager_jtt_t *jtag_table; - if(asicp == NULL) { - printk("**WARNING** kmalloc failure in cat_init\n"); + if (asicp == NULL) { + printk + ("**WARNING** kmalloc failure in cat_init\n"); continue; } asicpp = &(asicp->next); asicp->asic_location = asic; - sp_table = (voyager_sp_table_t *)(eprom_buf + sp_offset); + sp_table = + (voyager_sp_table_t *) (eprom_buf + sp_offset); asicp->asic_id = sp_table->asic_id; - asic_table = (voyager_at_t *)(eprom_buf + sp_table->asic_data_offset); - for(j=0; j<4; j++) + asic_table = + (voyager_at_t *) (eprom_buf + + sp_table->asic_data_offset); + for (j = 0; j < 4; j++) asicp->jtag_id[j] = asic_table->jtag_id[j]; - jtag_table = (voyager_jtt_t *)(eprom_buf + asic_table->jtag_offset); + jtag_table = + (voyager_jtt_t *) (eprom_buf + + asic_table->jtag_offset); asicp->ireg_length = jtag_table->ireg_len; asicp->bit_location = (*modpp)->inst_bits; (*modpp)->inst_bits += asicp->ireg_length; - if(asicp->ireg_length > (*modpp)->largest_reg) + if (asicp->ireg_length > (*modpp)->largest_reg) (*modpp)->largest_reg = asicp->ireg_length; if (asicp->ireg_length < (*modpp)->smallest_reg || (*modpp)->smallest_reg == 0) @@ -804,15 +823,13 @@ voyager_cat_init(void) CDEBUG(("asic 0x%x, ireg_length=%d, bit_location=%d\n", asicp->asic_id, asicp->ireg_length, asicp->bit_location)); - if(asicp->asic_id == VOYAGER_QUAD_QABC) { + if (asicp->asic_id == VOYAGER_QUAD_QABC) { CDEBUG(("VOYAGER CAT: QABC ASIC found\n")); qabc_asic = asicp; } sp_offset += sizeof(voyager_sp_table_t); } - CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", - (*modpp)->inst_bits, (*modpp)->largest_reg, - (*modpp)->smallest_reg)); + CDEBUG(("Module inst_bits = %d, largest_reg = %d, smallest_reg=%d\n", (*modpp)->inst_bits, (*modpp)->largest_reg, (*modpp)->smallest_reg)); /* OK, now we have the QUAD ASICs set up, use them. * we need to: * @@ -828,10 +845,11 @@ voyager_cat_init(void) qic_addr = qabc_data[5] << 8; qic_addr = (qic_addr | qabc_data[6]) << 8; qic_addr = (qic_addr | qabc_data[7]) << 8; - printk("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", - cat_module_name(i), qic_addr, qabc_data[8]); + printk + ("Module \"%s\": Quad Processor Card; CPI 0x%lx, SET=0x%x\n", + cat_module_name(i), qic_addr, qabc_data[8]); #if 0 /* plumbing fails---FIXME */ - if((qabc_data[8] & 0xf0) == 0) { + if ((qabc_data[8] & 0xf0) == 0) { /* FIXME: 32 way 8 CPU slot monster cannot be * plumbed this way---need to check for it */ @@ -842,94 +860,97 @@ voyager_cat_init(void) #ifdef VOYAGER_CAT_DEBUG /* verify plumbing */ cat_subread(*modpp, qabc_asic, 8, 1, &qabc_data[8]); - if((qabc_data[8] & 0xf0) == 0) { - CDEBUG(("PLUMBING FAILED: 0x%x\n", qabc_data[8])); + if ((qabc_data[8] & 0xf0) == 0) { + CDEBUG(("PLUMBING FAILED: 0x%x\n", + qabc_data[8])); } #endif } #endif { - struct resource *res = kzalloc(sizeof(struct resource),GFP_KERNEL); + struct resource *res = + kzalloc(sizeof(struct resource), GFP_KERNEL); res->name = kmalloc(128, GFP_KERNEL); - sprintf((char *)res->name, "Voyager %s Quad CPI", cat_module_name(i)); + sprintf((char *)res->name, "Voyager %s Quad CPI", + cat_module_name(i)); res->start = qic_addr; res->end = qic_addr + 0x3ff; request_resource(&iomem_resource, res); } qic_addr = (unsigned long)ioremap(qic_addr, 0x400); - - for(j = 0; j < 4; j++) { + + for (j = 0; j < 4; j++) { __u8 cpu; - if(voyager_8slot) { + if (voyager_8slot) { /* 8 slot has a different mapping, * each slot has only one vic line, so * 1 cpu in each slot must be < 8 */ - cpu = (i & 0x07) + j*8; + cpu = (i & 0x07) + j * 8; } else { - cpu = (i & 0x03) + j*4; + cpu = (i & 0x03) + j * 4; } - if( (qabc_data[8] & (1<<j))) { - voyager_extended_vic_processors |= (1<<cpu); + if ((qabc_data[8] & (1 << j))) { + voyager_extended_vic_processors |= (1 << cpu); } - if(qabc_data[8] & (1<<(j+4)) ) { + if (qabc_data[8] & (1 << (j + 4))) { /* Second SET register plumbed: Quad * card has two VIC connected CPUs. * Secondary cannot be booted as a VIC * CPU */ - voyager_extended_vic_processors |= (1<<cpu); - voyager_allowed_boot_processors &= (~(1<<cpu)); + voyager_extended_vic_processors |= (1 << cpu); + voyager_allowed_boot_processors &= + (~(1 << cpu)); } - voyager_quad_processors |= (1<<cpu); + voyager_quad_processors |= (1 << cpu); voyager_quad_cpi_addr[cpu] = (struct voyager_qic_cpi *) - (qic_addr+(j<<8)); + (qic_addr + (j << 8)); CDEBUG(("CPU%d: CPI address 0x%lx\n", cpu, (unsigned long)voyager_quad_cpi_addr[cpu])); } outb(VOYAGER_CAT_END, CAT_CMD); - - *asicpp = NULL; modpp = &((*modpp)->next); } *modpp = NULL; - printk("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", voyager_extended_vic_processors, voyager_quad_processors, voyager_allowed_boot_processors); + printk + ("CAT Bus Initialisation finished: extended procs 0x%x, quad procs 0x%x, allowed vic boot = 0x%x\n", + voyager_extended_vic_processors, voyager_quad_processors, + voyager_allowed_boot_processors); request_resource(&ioport_resource, &vic_res); - if(voyager_quad_processors) + if (voyager_quad_processors) request_resource(&ioport_resource, &qic_res); /* set up the front power switch */ } -int -voyager_cat_readb(__u8 module, __u8 asic, int reg) +int voyager_cat_readb(__u8 module, __u8 asic, int reg) { return 0; } -static int -cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp) +static int cat_disconnect(voyager_module_t * modp, voyager_asic_t * asicp) { __u8 val; int err = 0; - if(!modp->scan_path_connected) + if (!modp->scan_path_connected) return 0; - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("cat_disconnect: ASIC is not CAT\n")); return 1; } err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); - if(err) { + if (err) { CDEBUG(("cat_disconnect: failed to read SCANPATH\n")); return err; } val &= VOYAGER_DISCONNECT_ASIC; err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); - if(err) { + if (err) { CDEBUG(("cat_disconnect: failed to write SCANPATH\n")); return err; } @@ -940,27 +961,26 @@ cat_disconnect(voyager_module_t *modp, voyager_asic_t *asicp) return 0; } -static int -cat_connect(voyager_module_t *modp, voyager_asic_t *asicp) +static int cat_connect(voyager_module_t * modp, voyager_asic_t * asicp) { __u8 val; int err = 0; - if(modp->scan_path_connected) + if (modp->scan_path_connected) return 0; - if(asicp->asic_id != VOYAGER_CAT_ID) { + if (asicp->asic_id != VOYAGER_CAT_ID) { CDEBUG(("cat_connect: ASIC is not CAT\n")); return 1; } err = cat_read(modp, asicp, VOYAGER_SCANPATH, &val); - if(err) { + if (err) { CDEBUG(("cat_connect: failed to read SCANPATH\n")); return err; } val |= VOYAGER_CONNECT_ASIC; err = cat_write(modp, asicp, VOYAGER_SCANPATH, val); - if(err) { + if (err) { CDEBUG(("cat_connect: failed to write SCANPATH\n")); return err; } @@ -971,11 +991,10 @@ cat_connect(voyager_module_t *modp, voyager_asic_t *asicp) return 0; } -void -voyager_cat_power_off(void) +void voyager_cat_power_off(void) { /* Power the machine off by writing to the PSI over the CAT - * bus */ + * bus */ __u8 data; voyager_module_t psi = { 0 }; voyager_asic_t psi_asic = { 0 }; @@ -1009,8 +1028,7 @@ voyager_cat_power_off(void) struct voyager_status voyager_status = { 0 }; -void -voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) +void voyager_cat_psi(__u8 cmd, __u16 reg, __u8 * data) { voyager_module_t psi = { 0 }; voyager_asic_t psi_asic = { 0 }; @@ -1027,7 +1045,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) outb(VOYAGER_PSI, VOYAGER_CAT_CONFIG_PORT); outb(VOYAGER_CAT_RUN, CAT_CMD); cat_disconnect(&psi, &psi_asic); - switch(cmd) { + switch (cmd) { case VOYAGER_PSI_READ: cat_read(&psi, &psi_asic, reg, data); break; @@ -1047,8 +1065,7 @@ voyager_cat_psi(__u8 cmd, __u16 reg, __u8 *data) outb(VOYAGER_CAT_END, CAT_CMD); } -void -voyager_cat_do_common_interrupt(void) +void voyager_cat_do_common_interrupt(void) { /* This is caused either by a memory parity error or something * in the PSI */ @@ -1057,7 +1074,7 @@ voyager_cat_do_common_interrupt(void) voyager_asic_t psi_asic = { 0 }; struct voyager_psi psi_reg; int i; - re_read: + re_read: psi.asic = &psi_asic; psi.asic->asic_id = VOYAGER_CAT_ID; psi.asic->subaddr = VOYAGER_SUBADDR_HI; @@ -1072,43 +1089,45 @@ voyager_cat_do_common_interrupt(void) cat_disconnect(&psi, &psi_asic); /* Read the status. NOTE: Need to read *all* the PSI regs here * otherwise the cmn int will be reasserted */ - for(i = 0; i < sizeof(psi_reg.regs); i++) { - cat_read(&psi, &psi_asic, i, &((__u8 *)&psi_reg.regs)[i]); + for (i = 0; i < sizeof(psi_reg.regs); i++) { + cat_read(&psi, &psi_asic, i, &((__u8 *) & psi_reg.regs)[i]); } outb(VOYAGER_CAT_END, CAT_CMD); - if((psi_reg.regs.checkbit & 0x02) == 0) { + if ((psi_reg.regs.checkbit & 0x02) == 0) { psi_reg.regs.checkbit |= 0x02; cat_write(&psi, &psi_asic, 5, psi_reg.regs.checkbit); printk("VOYAGER RE-READ PSI\n"); goto re_read; } outb(VOYAGER_CAT_RUN, CAT_CMD); - for(i = 0; i < sizeof(psi_reg.subregs); i++) { + for (i = 0; i < sizeof(psi_reg.subregs); i++) { /* This looks strange, but the PSI doesn't do auto increment * correctly */ - cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, - 1, &((__u8 *)&psi_reg.subregs)[i]); + cat_subread(&psi, &psi_asic, VOYAGER_PSI_SUPPLY_REG + i, + 1, &((__u8 *) & psi_reg.subregs)[i]); } outb(VOYAGER_CAT_END, CAT_CMD); #ifdef VOYAGER_CAT_DEBUG printk("VOYAGER PSI: "); - for(i=0; i<sizeof(psi_reg.regs); i++) - printk("%02x ", ((__u8 *)&psi_reg.regs)[i]); + for (i = 0; i < sizeof(psi_reg.regs); i++) + printk("%02x ", ((__u8 *) & psi_reg.regs)[i]); printk("\n "); - for(i=0; i<sizeof(psi_reg.subregs); i++) - printk("%02x ", ((__u8 *)&psi_reg.subregs)[i]); + for (i = 0; i < sizeof(psi_reg.subregs); i++) + printk("%02x ", ((__u8 *) & psi_reg.subregs)[i]); printk("\n"); #endif - if(psi_reg.regs.intstatus & PSI_MON) { + if (psi_reg.regs.intstatus & PSI_MON) { /* switch off or power fail */ - if(psi_reg.subregs.supply & PSI_SWITCH_OFF) { - if(voyager_status.switch_off) { - printk(KERN_ERR "Voyager front panel switch turned off again---Immediate power off!\n"); + if (psi_reg.subregs.supply & PSI_SWITCH_OFF) { + if (voyager_status.switch_off) { + printk(KERN_ERR + "Voyager front panel switch turned off again---Immediate power off!\n"); voyager_cat_power_off(); /* not reached */ } else { - printk(KERN_ERR "Voyager front panel switch turned off\n"); + printk(KERN_ERR + "Voyager front panel switch turned off\n"); voyager_status.switch_off = 1; voyager_status.request_from_kernel = 1; wake_up_process(voyager_thread); @@ -1127,7 +1146,7 @@ voyager_cat_do_common_interrupt(void) VDEBUG(("Voyager ac fail reg 0x%x\n", psi_reg.subregs.ACfail)); - if((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { + if ((psi_reg.subregs.ACfail & AC_FAIL_STAT_CHANGE) == 0) { /* No further update */ return; } @@ -1135,20 +1154,20 @@ voyager_cat_do_common_interrupt(void) /* Don't bother trying to find out who failed. * FIXME: This probably makes the code incorrect on * anything other than a 345x */ - for(i=0; i< 5; i++) { - if( psi_reg.subregs.ACfail &(1<<i)) { + for (i = 0; i < 5; i++) { + if (psi_reg.subregs.ACfail & (1 << i)) { break; } } printk(KERN_NOTICE "AC FAIL IN SUPPLY %d\n", i); #endif /* DON'T do this: it shuts down the AC PSI - outb(VOYAGER_CAT_RUN, CAT_CMD); - data = PSI_MASK_MASK | i; - cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, - 1, &data); - outb(VOYAGER_CAT_END, CAT_CMD); - */ + outb(VOYAGER_CAT_RUN, CAT_CMD); + data = PSI_MASK_MASK | i; + cat_subwrite(&psi, &psi_asic, VOYAGER_PSI_MASK, + 1, &data); + outb(VOYAGER_CAT_END, CAT_CMD); + */ printk(KERN_ERR "Voyager AC power failure\n"); outb(VOYAGER_CAT_RUN, CAT_CMD); data = PSI_COLD_START; @@ -1159,16 +1178,16 @@ voyager_cat_do_common_interrupt(void) voyager_status.request_from_kernel = 1; wake_up_process(voyager_thread); } - - - } else if(psi_reg.regs.intstatus & PSI_FAULT) { + + } else if (psi_reg.regs.intstatus & PSI_FAULT) { /* Major fault! */ - printk(KERN_ERR "Voyager PSI Detected major fault, immediate power off!\n"); + printk(KERN_ERR + "Voyager PSI Detected major fault, immediate power off!\n"); voyager_cat_power_off(); /* not reached */ - } else if(psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM - | PSI_CURRENT | PSI_DVM - | PSI_PSCFAULT | PSI_STAT_CHG)) { + } else if (psi_reg.regs.intstatus & (PSI_DC_FAIL | PSI_ALARM + | PSI_CURRENT | PSI_DVM + | PSI_PSCFAULT | PSI_STAT_CHG)) { /* other psi fault */ printk(KERN_WARNING "Voyager PSI status 0x%x\n", data); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 88124dd3540..dffa786f61f 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -32,7 +32,8 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 }; /* CPU IRQ affinity -- set to all ones initially */ -static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = ~0UL }; +static unsigned long cpu_irq_affinity[NR_CPUS] __cacheline_aligned = + {[0 ... NR_CPUS-1] = ~0UL }; /* per CPU data structure (for /proc/cpuinfo et al), visible externally * indexed physically */ @@ -76,7 +77,6 @@ EXPORT_SYMBOL(cpu_online_map); * by scheduler but indexed physically */ cpumask_t phys_cpu_present_map = CPU_MASK_NONE; - /* The internal functions */ static void send_CPI(__u32 cpuset, __u8 cpi); static void ack_CPI(__u8 cpi); @@ -101,94 +101,86 @@ int hard_smp_processor_id(void); int safe_smp_processor_id(void); /* Inline functions */ -static inline void -send_one_QIC_CPI(__u8 cpu, __u8 cpi) +static inline void send_one_QIC_CPI(__u8 cpu, __u8 cpi) { voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi = - (smp_processor_id() << 16) + cpi; + (smp_processor_id() << 16) + cpi; } -static inline void -send_QIC_CPI(__u32 cpuset, __u8 cpi) +static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi) { int cpu; for_each_online_cpu(cpu) { - if(cpuset & (1<<cpu)) { + if (cpuset & (1 << cpu)) { #ifdef VOYAGER_DEBUG - if(!cpu_isset(cpu, cpu_online_map)) - VDEBUG(("CPU%d sending cpi %d to CPU%d not in cpu_online_map\n", hard_smp_processor_id(), cpi, cpu)); + if (!cpu_isset(cpu, cpu_online_map)) + VDEBUG(("CPU%d sending cpi %d to CPU%d not in " + "cpu_online_map\n", + hard_smp_processor_id(), cpi, cpu)); #endif send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); } } } -static inline void -wrapper_smp_local_timer_interrupt(void) +static inline void wrapper_smp_local_timer_interrupt(void) { irq_enter(); smp_local_timer_interrupt(); irq_exit(); } -static inline void -send_one_CPI(__u8 cpu, __u8 cpi) +static inline void send_one_CPI(__u8 cpu, __u8 cpi) { - if(voyager_quad_processors & (1<<cpu)) + if (voyager_quad_processors & (1 << cpu)) send_one_QIC_CPI(cpu, cpi - QIC_CPI_OFFSET); else - send_CPI(1<<cpu, cpi); + send_CPI(1 << cpu, cpi); } -static inline void -send_CPI_allbutself(__u8 cpi) +static inline void send_CPI_allbutself(__u8 cpi) { __u8 cpu = smp_processor_id(); __u32 mask = cpus_addr(cpu_online_map)[0] & ~(1 << cpu); send_CPI(mask, cpi); } -static inline int -is_cpu_quad(void) +static inline int is_cpu_quad(void) { __u8 cpumask = inb(VIC_PROC_WHO_AM_I); return ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER); } -static inline int -is_cpu_extended(void) +static inline int is_cpu_extended(void) { __u8 cpu = hard_smp_processor_id(); - return(voyager_extended_vic_processors & (1<<cpu)); + return (voyager_extended_vic_processors & (1 << cpu)); } -static inline int -is_cpu_vic_boot(void) +static inline int is_cpu_vic_boot(void) { __u8 cpu = hard_smp_processor_id(); - return(voyager_extended_vic_processors - & voyager_allowed_boot_processors & (1<<cpu)); + return (voyager_extended_vic_processors + & voyager_allowed_boot_processors & (1 << cpu)); } - -static inline void -ack_CPI(__u8 cpi) +static inline void ack_CPI(__u8 cpi) { - switch(cpi) { + switch (cpi) { case VIC_CPU_BOOT_CPI: - if(is_cpu_quad() && !is_cpu_vic_boot()) + if (is_cpu_quad() && !is_cpu_vic_boot()) ack_QIC_CPI(cpi); else ack_VIC_CPI(cpi); break; case VIC_SYS_INT: - case VIC_CMN_INT: + case VIC_CMN_INT: /* These are slightly strange. Even on the Quad card, * They are vectored as VIC CPIs */ - if(is_cpu_quad()) + if (is_cpu_quad()) ack_special_QIC_CPI(cpi); else ack_VIC_CPI(cpi); @@ -205,11 +197,11 @@ ack_CPI(__u8 cpi) * 8259 IRQs except that masks and things must be kept per processor */ static struct irq_chip vic_chip = { - .name = "VIC", - .startup = startup_vic_irq, - .mask = mask_vic_irq, - .unmask = unmask_vic_irq, - .set_affinity = set_vic_irq_affinity, + .name = "VIC", + .startup = startup_vic_irq, + .mask = mask_vic_irq, + .unmask = unmask_vic_irq, + .set_affinity = set_vic_irq_affinity, }; /* used to count up as CPUs are brought on line (starts at 0) */ @@ -223,7 +215,7 @@ static __u32 trampoline_base; /* The per cpu profile stuff - used in smp_local_timer_interrupt */ static DEFINE_PER_CPU(int, prof_multiplier) = 1; static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; -static DEFINE_PER_CPU(int, prof_counter) = 1; +static DEFINE_PER_CPU(int, prof_counter) = 1; /* the map used to check if a CPU has booted */ static __u32 cpu_booted_map; @@ -235,7 +227,6 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE; /* This is for the new dynamic CPU boot code */ cpumask_t cpu_callin_map = CPU_MASK_NONE; cpumask_t cpu_callout_map = CPU_MASK_NONE; -EXPORT_SYMBOL(cpu_callout_map); cpumask_t cpu_possible_map = CPU_MASK_NONE; EXPORT_SYMBOL(cpu_possible_map); @@ -246,9 +237,9 @@ static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned; static __u16 vic_irq_enable_mask[NR_CPUS] __cacheline_aligned = { 0 }; /* Lock for enable/disable of VIC interrupts */ -static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); +static __cacheline_aligned DEFINE_SPINLOCK(vic_irq_lock); -/* The boot processor is correctly set up in PC mode when it +/* The boot processor is correctly set up in PC mode when it * comes up, but the secondaries need their master/slave 8259 * pairs initializing correctly */ @@ -262,8 +253,7 @@ static unsigned long vic_tick[NR_CPUS] __cacheline_aligned = { 0 }; static unsigned long vic_cpi_mailbox[NR_CPUS] __cacheline_aligned; /* debugging routine to read the isr of the cpu's pic */ -static inline __u16 -vic_read_isr(void) +static inline __u16 vic_read_isr(void) { __u16 isr; @@ -275,17 +265,16 @@ vic_read_isr(void) return isr; } -static __init void -qic_setup(void) +static __init void qic_setup(void) { - if(!is_cpu_quad()) { + if (!is_cpu_quad()) { /* not a quad, no setup */ return; } outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); - - if(is_cpu_extended()) { + + if (is_cpu_extended()) { /* the QIC duplicate of the VIC base register */ outb(VIC_DEFAULT_CPI_BASE, QIC_VIC_CPI_BASE_REGISTER); outb(QIC_DEFAULT_CPI_BASE, QIC_CPI_BASE_REGISTER); @@ -295,8 +284,7 @@ qic_setup(void) } } -static __init void -vic_setup_pic(void) +static __init void vic_setup_pic(void) { outb(1, VIC_REDIRECT_REGISTER_1); /* clear the claim registers for dynamic routing */ @@ -333,7 +321,7 @@ vic_setup_pic(void) /* ICW2: slave vector base */ outb(FIRST_EXTERNAL_VECTOR + 8, 0xA1); - + /* ICW3: slave ID */ outb(0x02, 0xA1); @@ -341,19 +329,18 @@ vic_setup_pic(void) outb(0x01, 0xA1); } -static void -do_quad_bootstrap(void) +static void do_quad_bootstrap(void) { - if(is_cpu_quad() && is_cpu_vic_boot()) { + if (is_cpu_quad() && is_cpu_vic_boot()) { int i; unsigned long flags; __u8 cpuid = hard_smp_processor_id(); local_irq_save(flags); - for(i = 0; i<4; i++) { + for (i = 0; i < 4; i++) { /* FIXME: this would be >>3 &0x7 on the 32 way */ - if(((cpuid >> 2) & 0x03) == i) + if (((cpuid >> 2) & 0x03) == i) /* don't lower our own mask! */ continue; @@ -368,12 +355,10 @@ do_quad_bootstrap(void) } } - /* Set up all the basic stuff: read the SMP config and make all the * SMP information reflect only the boot cpu. All others will be * brought on-line later. */ -void __init -find_smp_config(void) +void __init find_smp_config(void) { int i; @@ -382,24 +367,31 @@ find_smp_config(void) printk("VOYAGER SMP: Boot cpu is %d\n", boot_cpu_id); /* initialize the CPU structures (moved from smp_boot_cpus) */ - for(i=0; i<NR_CPUS; i++) { + for (i = 0; i < NR_CPUS; i++) { cpu_irq_affinity[i] = ~0; } cpu_online_map = cpumask_of_cpu(boot_cpu_id); /* The boot CPU must be extended */ - voyager_extended_vic_processors = 1<<boot_cpu_id; + voyager_extended_vic_processors = 1 << boot_cpu_id; /* initially, all of the first 8 CPUs can boot */ voyager_allowed_boot_processors = 0xff; /* set up everything for just this CPU, we can alter * this as we start the other CPUs later */ /* now get the CPU disposition from the extended CMOS */ - cpus_addr(phys_cpu_present_map)[0] = voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); - cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; - cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 2) << 16; - cpus_addr(phys_cpu_present_map)[0] |= voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 3) << 24; + cpus_addr(phys_cpu_present_map)[0] = + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK); + cpus_addr(phys_cpu_present_map)[0] |= + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + 1) << 8; + cpus_addr(phys_cpu_present_map)[0] |= + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + + 2) << 16; + cpus_addr(phys_cpu_present_map)[0] |= + voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK + + 3) << 24; cpu_possible_map = phys_cpu_present_map; - printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", cpus_addr(phys_cpu_present_map)[0]); + printk("VOYAGER SMP: phys_cpu_present_map = 0x%lx\n", + cpus_addr(phys_cpu_present_map)[0]); /* Here we set up the VIC to enable SMP */ /* enable the CPIs by writing the base vector to their register */ outb(VIC_DEFAULT_CPI_BASE, VIC_CPI_BASE_REGISTER); @@ -427,8 +419,7 @@ find_smp_config(void) /* * The bootstrap kernel entry code has set these up. Save them * for a given CPU, id is physical */ -void __init -smp_store_cpu_info(int id) +void __init smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); @@ -438,21 +429,19 @@ smp_store_cpu_info(int id) } /* set up the trampoline and return the physical address of the code */ -static __u32 __init -setup_trampoline(void) +static __u32 __init setup_trampoline(void) { /* these two are global symbols in trampoline.S */ extern const __u8 trampoline_end[]; extern const __u8 trampoline_data[]; - memcpy((__u8 *)trampoline_base, trampoline_data, + memcpy((__u8 *) trampoline_base, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys((__u8 *)trampoline_base); + return virt_to_phys((__u8 *) trampoline_base); } /* Routine initially called when a non-boot CPU is brought online */ -static void __init -start_secondary(void *unused) +static void __init start_secondary(void *unused) { __u8 cpuid = hard_smp_processor_id(); /* external functions not defined in the headers */ @@ -464,17 +453,18 @@ start_secondary(void *unused) ack_CPI(VIC_CPU_BOOT_CPI); /* setup the 8259 master slave pair belonging to this CPU --- - * we won't actually receive any until the boot CPU - * relinquishes it's static routing mask */ + * we won't actually receive any until the boot CPU + * relinquishes it's static routing mask */ vic_setup_pic(); qic_setup(); - if(is_cpu_quad() && !is_cpu_vic_boot()) { + if (is_cpu_quad() && !is_cpu_vic_boot()) { /* clear the boot CPI */ __u8 dummy; - dummy = voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; + dummy = + voyager_quad_cpi_addr[cpuid]->qic_cpi[VIC_CPU_BOOT_CPI].cpi; printk("read dummy %d\n", dummy); } @@ -516,7 +506,6 @@ start_secondary(void *unused) cpu_idle(); } - /* Routine to kick start the given CPU and wait for it to report ready * (or timeout in startup). When this routine returns, the requested * CPU is either fully running and configured or known to be dead. @@ -524,29 +513,28 @@ start_secondary(void *unused) * We call this routine sequentially 1 CPU at a time, so no need for * locking */ -static void __init -do_boot_cpu(__u8 cpu) +static void __init do_boot_cpu(__u8 cpu) { struct task_struct *idle; int timeout; unsigned long flags; - int quad_boot = (1<<cpu) & voyager_quad_processors - & ~( voyager_extended_vic_processors - & voyager_allowed_boot_processors); + int quad_boot = (1 << cpu) & voyager_quad_processors + & ~(voyager_extended_vic_processors + & voyager_allowed_boot_processors); /* This is an area in head.S which was used to set up the * initial kernel stack. We need to alter this to give the * booting CPU a new stack (taken from its idle process) */ extern struct { - __u8 *esp; + __u8 *sp; unsigned short ss; } stack_start; /* This is the format of the CPI IDT gate (in real mode) which * we're hijacking to boot the CPU */ - union IDTFormat { + union IDTFormat { struct seg { - __u16 Offset; - __u16 Segment; + __u16 Offset; + __u16 Segment; } idt; __u32 val; } hijack_source; @@ -565,37 +553,44 @@ do_boot_cpu(__u8 cpu) alternatives_smp_switch(1); idle = fork_idle(cpu); - if(IS_ERR(idle)) + if (IS_ERR(idle)) panic("failed fork for CPU%d", cpu); - idle->thread.eip = (unsigned long) start_secondary; + idle->thread.ip = (unsigned long)start_secondary; /* init_tasks (in sched.c) is indexed logically */ - stack_start.esp = (void *) idle->thread.esp; + stack_start.sp = (void *)idle->thread.sp; init_gdt(cpu); - per_cpu(current_task, cpu) = idle; + per_cpu(current_task, cpu) = idle; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); irq_ctx_init(cpu); /* Note: Don't modify initial ss override */ - VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, + VDEBUG(("VOYAGER SMP: Booting CPU%d at 0x%lx[%x:%x], stack %p\n", cpu, (unsigned long)hijack_source.val, hijack_source.idt.Segment, - hijack_source.idt.Offset, stack_start.esp)); + hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); flush_tlb_all(); - if(quad_boot) { + if (quad_boot) { printk("CPU %d: non extended Quad boot\n", cpu); - hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE)*4); + hijack_vector = + (__u32 *) + phys_to_virt((VIC_CPU_BOOT_CPI + QIC_DEFAULT_CPI_BASE) * 4); *hijack_vector = hijack_source.val; } else { printk("CPU%d: extended VIC boot\n", cpu); - hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE)*4); + hijack_vector = + (__u32 *) + phys_to_virt((VIC_CPU_BOOT_CPI + VIC_DEFAULT_CPI_BASE) * 4); *hijack_vector = hijack_source.val; /* VIC errata, may also receive interrupt at this address */ - hijack_vector = (__u32 *)phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + VIC_DEFAULT_CPI_BASE)*4); + hijack_vector = + (__u32 *) + phys_to_virt((VIC_CPU_BOOT_ERRATA_CPI + + VIC_DEFAULT_CPI_BASE) * 4); *hijack_vector = hijack_source.val; } /* All non-boot CPUs start with interrupts fully masked. Need @@ -603,73 +598,76 @@ do_boot_cpu(__u8 cpu) * this in the VIC by masquerading as the processor we're * about to boot and lowering its interrupt mask */ local_irq_save(flags); - if(quad_boot) { + if (quad_boot) { send_one_QIC_CPI(cpu, VIC_CPU_BOOT_CPI); } else { outb(VIC_CPU_MASQUERADE_ENABLE | cpu, VIC_PROCESSOR_ID); /* here we're altering registers belonging to `cpu' */ - + outb(VIC_BOOT_INTERRUPT_MASK, 0x21); /* now go back to our original identity */ outb(boot_cpu_id, VIC_PROCESSOR_ID); /* and boot the CPU */ - send_CPI((1<<cpu), VIC_CPU_BOOT_CPI); + send_CPI((1 << cpu), VIC_CPU_BOOT_CPI); } cpu_booted_map = 0; local_irq_restore(flags); /* now wait for it to become ready (or timeout) */ - for(timeout = 0; timeout < 50000; timeout++) { - if(cpu_booted_map) + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_booted_map) break; udelay(100); } /* reset the page table */ zap_low_mappings(); - + if (cpu_booted_map) { VDEBUG(("CPU%d: Booted successfully, back in CPU %d\n", cpu, smp_processor_id())); - + printk("CPU%d: ", cpu); print_cpu_info(&cpu_data(cpu)); wmb(); cpu_set(cpu, cpu_callout_map); cpu_set(cpu, cpu_present_map); - } - else { + } else { printk("CPU%d FAILED TO BOOT: ", cpu); - if (*((volatile unsigned char *)phys_to_virt(start_phys_address))==0xA5) + if (* + ((volatile unsigned char *)phys_to_virt(start_phys_address)) + == 0xA5) printk("Stuck.\n"); else printk("Not responding.\n"); - + cpucount--; } } -void __init -smp_boot_cpus(void) +void __init smp_boot_cpus(void) { int i; /* CAT BUS initialisation must be done after the memory */ /* FIXME: The L4 has a catbus too, it just needs to be * accessed in a totally different way */ - if(voyager_level == 5) { + if (voyager_level == 5) { voyager_cat_init(); /* now that the cat has probed the Voyager System Bus, sanity * check the cpu map */ - if( ((voyager_quad_processors | voyager_extended_vic_processors) - & cpus_addr(phys_cpu_present_map)[0]) != cpus_addr(phys_cpu_present_map)[0]) { + if (((voyager_quad_processors | voyager_extended_vic_processors) + & cpus_addr(phys_cpu_present_map)[0]) != + cpus_addr(phys_cpu_present_map)[0]) { /* should panic */ - printk("\n\n***WARNING*** Sanity check of CPU present map FAILED\n"); + printk("\n\n***WARNING*** " + "Sanity check of CPU present map FAILED\n"); } - } else if(voyager_level == 4) - voyager_extended_vic_processors = cpus_addr(phys_cpu_present_map)[0]; + } else if (voyager_level == 4) + voyager_extended_vic_processors = + cpus_addr(phys_cpu_present_map)[0]; /* this sets up the idle task to run on the current cpu */ voyager_extended_cpus = 1; @@ -678,14 +676,14 @@ smp_boot_cpus(void) //global_irq_holder = boot_cpu_id; /* FIXME: Need to do something about this but currently only works - * on CPUs with a tsc which none of mine have. - smp_tune_scheduling(); + * on CPUs with a tsc which none of mine have. + smp_tune_scheduling(); */ smp_store_cpu_info(boot_cpu_id); printk("CPU%d: ", boot_cpu_id); print_cpu_info(&cpu_data(boot_cpu_id)); - if(is_cpu_quad()) { + if (is_cpu_quad()) { /* booting on a Quad CPU */ printk("VOYAGER SMP: Boot CPU is Quad\n"); qic_setup(); @@ -697,11 +695,11 @@ smp_boot_cpus(void) cpu_set(boot_cpu_id, cpu_online_map); cpu_set(boot_cpu_id, cpu_callout_map); - - /* loop over all the extended VIC CPUs and boot them. The + + /* loop over all the extended VIC CPUs and boot them. The * Quad CPUs must be bootstrapped by their extended VIC cpu */ - for(i = 0; i < NR_CPUS; i++) { - if(i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) + for (i = 0; i < NR_CPUS; i++) { + if (i == boot_cpu_id || !cpu_isset(i, phys_cpu_present_map)) continue; do_boot_cpu(i); /* This udelay seems to be needed for the Quad boots @@ -715,25 +713,26 @@ smp_boot_cpus(void) for (i = 0; i < NR_CPUS; i++) if (cpu_isset(i, cpu_online_map)) bogosum += cpu_data(i).loops_per_jiffy; - printk(KERN_INFO "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", - cpucount+1, - bogosum/(500000/HZ), - (bogosum/(5000/HZ))%100); + printk(KERN_INFO "Total of %d processors activated " + "(%lu.%02lu BogoMIPS).\n", + cpucount + 1, bogosum / (500000 / HZ), + (bogosum / (5000 / HZ)) % 100); } voyager_extended_cpus = hweight32(voyager_extended_vic_processors); - printk("VOYAGER: Extended (interrupt handling CPUs): %d, non-extended: %d\n", voyager_extended_cpus, num_booting_cpus() - voyager_extended_cpus); + printk("VOYAGER: Extended (interrupt handling CPUs): " + "%d, non-extended: %d\n", voyager_extended_cpus, + num_booting_cpus() - voyager_extended_cpus); /* that's it, switch to symmetric mode */ outb(0, VIC_PRIORITY_REGISTER); outb(0, VIC_CLAIM_REGISTER_0); outb(0, VIC_CLAIM_REGISTER_1); - + VDEBUG(("VOYAGER SMP: Booted with %d CPUs\n", num_booting_cpus())); } /* Reload the secondary CPUs task structure (this function does not * return ) */ -void __init -initialize_secondary(void) +void __init initialize_secondary(void) { #if 0 // AC kernels only @@ -745,11 +744,9 @@ initialize_secondary(void) * basically just the stack pointer and the eip. */ - asm volatile( - "movl %0,%%esp\n\t" - "jmp *%1" - : - :"r" (current->thread.esp),"r" (current->thread.eip)); + asm volatile ("movl %0,%%esp\n\t" + "jmp *%1"::"r" (current->thread.sp), + "r"(current->thread.ip)); } /* handle a Voyager SYS_INT -- If we don't, the base board will @@ -758,25 +755,23 @@ initialize_secondary(void) * System interrupts occur because some problem was detected on the * various busses. To find out what you have to probe all the * hardware via the CAT bus. FIXME: At the moment we do nothing. */ -fastcall void -smp_vic_sys_interrupt(struct pt_regs *regs) +void smp_vic_sys_interrupt(struct pt_regs *regs) { ack_CPI(VIC_SYS_INT); - printk("Voyager SYSTEM INTERRUPT\n"); + printk("Voyager SYSTEM INTERRUPT\n"); } /* Handle a voyager CMN_INT; These interrupts occur either because of * a system status change or because a single bit memory error * occurred. FIXME: At the moment, ignore all this. */ -fastcall void -smp_vic_cmn_interrupt(struct pt_regs *regs) +void smp_vic_cmn_interrupt(struct pt_regs *regs) { static __u8 in_cmn_int = 0; static DEFINE_SPINLOCK(cmn_int_lock); /* common ints are broadcast, so make sure we only do this once */ _raw_spin_lock(&cmn_int_lock); - if(in_cmn_int) + if (in_cmn_int) goto unlock_end; in_cmn_int++; @@ -784,12 +779,12 @@ smp_vic_cmn_interrupt(struct pt_regs *regs) VDEBUG(("Voyager COMMON INTERRUPT\n")); - if(voyager_level == 5) + if (voyager_level == 5) voyager_cat_do_common_interrupt(); _raw_spin_lock(&cmn_int_lock); in_cmn_int = 0; - unlock_end: + unlock_end: _raw_spin_unlock(&cmn_int_lock); ack_CPI(VIC_CMN_INT); } @@ -797,26 +792,23 @@ smp_vic_cmn_interrupt(struct pt_regs *regs) /* * Reschedule call back. Nothing to do, all the work is done * automatically when we return from the interrupt. */ -static void -smp_reschedule_interrupt(void) +static void smp_reschedule_interrupt(void) { /* do nothing */ } -static struct mm_struct * flush_mm; +static struct mm_struct *flush_mm; static unsigned long flush_va; static DEFINE_SPINLOCK(tlbstate_lock); -#define FLUSH_ALL 0xffffffff /* - * We cannot call mmdrop() because we are in interrupt context, + * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. * * We need to reload %cr3 since the page tables may be going * away from under us.. */ -static inline void -leave_mm (unsigned long cpu) +static inline void voyager_leave_mm(unsigned long cpu) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) BUG(); @@ -824,12 +816,10 @@ leave_mm (unsigned long cpu) load_cr3(swapper_pg_dir); } - /* * Invalidate call-back */ -static void -smp_invalidate_interrupt(void) +static void smp_invalidate_interrupt(void) { __u8 cpu = smp_processor_id(); @@ -837,18 +827,18 @@ smp_invalidate_interrupt(void) return; /* This will flood messages. Don't uncomment unless you see * Problems with cross cpu invalidation - VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", - smp_processor_id())); - */ + VDEBUG(("VOYAGER SMP: CPU%d received INVALIDATE_CPI\n", + smp_processor_id())); + */ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { - if (flush_va == FLUSH_ALL) + if (flush_va == TLB_FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); } else - leave_mm(cpu); + voyager_leave_mm(cpu); } smp_mb__before_clear_bit(); clear_bit(cpu, &smp_invalidate_needed); @@ -857,11 +847,10 @@ smp_invalidate_interrupt(void) /* All the new flush operations for 2.4 */ - /* This routine is called with a physical cpu mask */ static void -voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, - unsigned long va) +voyager_flush_tlb_others(unsigned long cpumask, struct mm_struct *mm, + unsigned long va) { int stuck = 50000; @@ -875,7 +864,7 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, BUG(); spin_lock(&tlbstate_lock); - + flush_mm = mm; flush_va = va; atomic_set_mask(cpumask, &smp_invalidate_needed); @@ -887,23 +876,23 @@ voyager_flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, while (smp_invalidate_needed) { mb(); - if(--stuck == 0) { - printk("***WARNING*** Stuck doing invalidate CPI (CPU%d)\n", smp_processor_id()); + if (--stuck == 0) { + printk("***WARNING*** Stuck doing invalidate CPI " + "(CPU%d)\n", smp_processor_id()); break; } } /* Uncomment only to debug invalidation problems - VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); - */ + VDEBUG(("VOYAGER SMP: Completed invalidate CPI (CPU%d)\n", cpu)); + */ flush_mm = NULL; flush_va = 0; spin_unlock(&tlbstate_lock); } -void -flush_tlb_current_task(void) +void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; unsigned long cpu_mask; @@ -913,14 +902,12 @@ flush_tlb_current_task(void) cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); local_flush_tlb(); if (cpu_mask) - voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } - -void -flush_tlb_mm (struct mm_struct * mm) +void flush_tlb_mm(struct mm_struct *mm) { unsigned long cpu_mask; @@ -932,15 +919,15 @@ flush_tlb_mm (struct mm_struct * mm) if (current->mm) local_flush_tlb(); else - leave_mm(smp_processor_id()); + voyager_leave_mm(smp_processor_id()); } if (cpu_mask) - voyager_flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + voyager_flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); preempt_enable(); } -void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) +void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; unsigned long cpu_mask; @@ -949,10 +936,10 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) cpu_mask = cpus_addr(mm->cpu_vm_mask)[0] & ~(1 << smp_processor_id()); if (current->active_mm == mm) { - if(current->mm) + if (current->mm) __flush_tlb_one(va); - else - leave_mm(smp_processor_id()); + else + voyager_leave_mm(smp_processor_id()); } if (cpu_mask) @@ -960,21 +947,21 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } + EXPORT_SYMBOL(flush_tlb_page); /* enable the requested IRQs */ -static void -smp_enable_irq_interrupt(void) +static void smp_enable_irq_interrupt(void) { __u8 irq; __u8 cpu = get_cpu(); VDEBUG(("VOYAGER SMP: CPU%d enabling irq mask 0x%x\n", cpu, - vic_irq_enable_mask[cpu])); + vic_irq_enable_mask[cpu])); spin_lock(&vic_irq_lock); - for(irq = 0; irq < 16; irq++) { - if(vic_irq_enable_mask[cpu] & (1<<irq)) + for (irq = 0; irq < 16; irq++) { + if (vic_irq_enable_mask[cpu] & (1 << irq)) enable_local_vic_irq(irq); } vic_irq_enable_mask[cpu] = 0; @@ -982,17 +969,16 @@ smp_enable_irq_interrupt(void) put_cpu_no_resched(); } - + /* * CPU halt call-back */ -static void -smp_stop_cpu_function(void *dummy) +static void smp_stop_cpu_function(void *dummy) { VDEBUG(("VOYAGER SMP: CPU%d is STOPPING\n", smp_processor_id())); cpu_clear(smp_processor_id(), cpu_online_map); local_irq_disable(); - for(;;) + for (;;) halt(); } @@ -1006,14 +992,13 @@ struct call_data_struct { int wait; }; -static struct call_data_struct * call_data; +static struct call_data_struct *call_data; /* execute a thread on a new CPU. The function to be called must be * previously set up. This is used to schedule a function for * execution on all CPUs - set up the function then broadcast a * function_interrupt CPI to come here on each CPU */ -static void -smp_call_function_interrupt(void) +static void smp_call_function_interrupt(void) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -1027,16 +1012,17 @@ smp_call_function_interrupt(void) * about to execute the function */ mb(); - if(!test_and_clear_bit(cpu, &call_data->started)) { + if (!test_and_clear_bit(cpu, &call_data->started)) { /* If the bit wasn't set, this could be a replay */ - printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion with no call pending\n", cpu); + printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion" + " with no call pending\n", cpu); return; } /* * At this point the info structure may be out of scope unless wait==1 */ irq_enter(); - (*func)(info); + (*func) (info); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); if (wait) { @@ -1046,14 +1032,13 @@ smp_call_function_interrupt(void) } static int -voyager_smp_call_function_mask (cpumask_t cpumask, - void (*func) (void *info), void *info, - int wait) +voyager_smp_call_function_mask(cpumask_t cpumask, + void (*func) (void *info), void *info, int wait) { struct call_data_struct data; u32 mask = cpus_addr(cpumask)[0]; - mask &= ~(1<<smp_processor_id()); + mask &= ~(1 << smp_processor_id()); if (!mask) return 0; @@ -1093,7 +1078,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask, * so we use the system clock to interrupt one processor, which in * turn, broadcasts a timer CPI to all the others --- we receive that * CPI here. We don't use this actually for counting so losing - * ticks doesn't matter + * ticks doesn't matter * * FIXME: For those CPUs which actually have a local APIC, we could * try to use it to trigger this interrupt instead of having to @@ -1101,8 +1086,7 @@ voyager_smp_call_function_mask (cpumask_t cpumask, * no local APIC, so I can't do this * * This function is currently a placeholder and is unused in the code */ -fastcall void -smp_apic_timer_interrupt(struct pt_regs *regs) +void smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); wrapper_smp_local_timer_interrupt(); @@ -1110,8 +1094,7 @@ smp_apic_timer_interrupt(struct pt_regs *regs) } /* All of the QUAD interrupt GATES */ -fastcall void -smp_qic_timer_interrupt(struct pt_regs *regs) +void smp_qic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); ack_QIC_CPI(QIC_TIMER_CPI); @@ -1119,127 +1102,112 @@ smp_qic_timer_interrupt(struct pt_regs *regs) set_irq_regs(old_regs); } -fastcall void -smp_qic_invalidate_interrupt(struct pt_regs *regs) +void smp_qic_invalidate_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_INVALIDATE_CPI); smp_invalidate_interrupt(); } -fastcall void -smp_qic_reschedule_interrupt(struct pt_regs *regs) +void smp_qic_reschedule_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_RESCHEDULE_CPI); smp_reschedule_interrupt(); } -fastcall void -smp_qic_enable_irq_interrupt(struct pt_regs *regs) +void smp_qic_enable_irq_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_ENABLE_IRQ_CPI); smp_enable_irq_interrupt(); } -fastcall void -smp_qic_call_function_interrupt(struct pt_regs *regs) +void smp_qic_call_function_interrupt(struct pt_regs *regs) { ack_QIC_CPI(QIC_CALL_FUNCTION_CPI); smp_call_function_interrupt(); } -fastcall void -smp_vic_cpi_interrupt(struct pt_regs *regs) +void smp_vic_cpi_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); __u8 cpu = smp_processor_id(); - if(is_cpu_quad()) + if (is_cpu_quad()) ack_QIC_CPI(VIC_CPI_LEVEL0); else ack_VIC_CPI(VIC_CPI_LEVEL0); - if(test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_TIMER_CPI, &vic_cpi_mailbox[cpu])) wrapper_smp_local_timer_interrupt(); - if(test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_INVALIDATE_CPI, &vic_cpi_mailbox[cpu])) smp_invalidate_interrupt(); - if(test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_RESCHEDULE_CPI, &vic_cpi_mailbox[cpu])) smp_reschedule_interrupt(); - if(test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_ENABLE_IRQ_CPI, &vic_cpi_mailbox[cpu])) smp_enable_irq_interrupt(); - if(test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) + if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) smp_call_function_interrupt(); set_irq_regs(old_regs); } -static void -do_flush_tlb_all(void* info) +static void do_flush_tlb_all(void *info) { unsigned long cpu = smp_processor_id(); __flush_tlb_all(); if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY) - leave_mm(cpu); + voyager_leave_mm(cpu); } - /* flush the TLB of every active CPU in the system */ -void -flush_tlb_all(void) +void flush_tlb_all(void) { on_each_cpu(do_flush_tlb_all, 0, 1, 1); } /* used to set up the trampoline for other CPUs when the memory manager * is sorted out */ -void __init -smp_alloc_memory(void) +void __init smp_alloc_memory(void) { - trampoline_base = (__u32)alloc_bootmem_low_pages(PAGE_SIZE); - if(__pa(trampoline_base) >= 0x93000) + trampoline_base = (__u32) alloc_bootmem_low_pages(PAGE_SIZE); + if (__pa(trampoline_base) >= 0x93000) BUG(); } /* send a reschedule CPI to one CPU by physical CPU number*/ -static void -voyager_smp_send_reschedule(int cpu) +static void voyager_smp_send_reschedule(int cpu) { send_one_CPI(cpu, VIC_RESCHEDULE_CPI); } - -int -hard_smp_processor_id(void) +int hard_smp_processor_id(void) { __u8 i; __u8 cpumask = inb(VIC_PROC_WHO_AM_I); - if((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) + if ((cpumask & QUAD_IDENTIFIER) == QUAD_IDENTIFIER) return cpumask & 0x1F; - for(i = 0; i < 8; i++) { - if(cpumask & (1<<i)) + for (i = 0; i < 8; i++) { + if (cpumask & (1 << i)) return i; } printk("** WARNING ** Illegal cpuid returned by VIC: %d", cpumask); return 0; } -int -safe_smp_processor_id(void) +int safe_smp_processor_id(void) { return hard_smp_processor_id(); } /* broadcast a halt to all other CPUs */ -static void -voyager_smp_send_stop(void) +static void voyager_smp_send_stop(void) { smp_call_function(smp_stop_cpu_function, NULL, 1, 1); } /* this function is triggered in time.c when a clock tick fires * we need to re-broadcast the tick to all CPUs */ -void -smp_vic_timer_interrupt(void) +void smp_vic_timer_interrupt(void) { send_CPI_allbutself(VIC_TIMER_CPI); smp_local_timer_interrupt(); @@ -1253,8 +1221,7 @@ smp_vic_timer_interrupt(void) * multiplier is 1 and it can be changed by writing the new multiplier * value into /proc/profile. */ -void -smp_local_timer_interrupt(void) +void smp_local_timer_interrupt(void) { int cpu = smp_processor_id(); long weight; @@ -1269,18 +1236,18 @@ smp_local_timer_interrupt(void) * * Interrupts are already masked off at this point. */ - per_cpu(prof_counter,cpu) = per_cpu(prof_multiplier, cpu); + per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu); if (per_cpu(prof_counter, cpu) != - per_cpu(prof_old_multiplier, cpu)) { + per_cpu(prof_old_multiplier, cpu)) { /* FIXME: need to update the vic timer tick here */ per_cpu(prof_old_multiplier, cpu) = - per_cpu(prof_counter, cpu); + per_cpu(prof_counter, cpu); } update_process_times(user_mode_vm(get_irq_regs())); } - if( ((1<<cpu) & voyager_extended_vic_processors) == 0) + if (((1 << cpu) & voyager_extended_vic_processors) == 0) /* only extended VIC processors participate in * interrupt distribution */ return; @@ -1296,12 +1263,12 @@ smp_local_timer_interrupt(void) * we can take more than 100K local irqs per second on a 100 MHz P5. */ - if((++vic_tick[cpu] & 0x7) != 0) + if ((++vic_tick[cpu] & 0x7) != 0) return; /* get here every 16 ticks (about every 1/6 of a second) */ /* Change our priority to give someone else a chance at getting - * the IRQ. The algorithm goes like this: + * the IRQ. The algorithm goes like this: * * In the VIC, the dynamically routed interrupt is always * handled by the lowest priority eligible (i.e. receiving @@ -1325,18 +1292,18 @@ smp_local_timer_interrupt(void) * affinity code since we now try to even up the interrupt * counts when an affinity binding is keeping them on a * particular CPU*/ - weight = (vic_intr_count[cpu]*voyager_extended_cpus + weight = (vic_intr_count[cpu] * voyager_extended_cpus - vic_intr_total) >> 4; weight += 4; - if(weight > 7) + if (weight > 7) weight = 7; - if(weight < 0) + if (weight < 0) weight = 0; - - outb((__u8)weight, VIC_PRIORITY_REGISTER); + + outb((__u8) weight, VIC_PRIORITY_REGISTER); #ifdef VOYAGER_DEBUG - if((vic_tick[cpu] & 0xFFF) == 0) { + if ((vic_tick[cpu] & 0xFFF) == 0) { /* print this message roughly every 25 secs */ printk("VOYAGER SMP: vic_tick[%d] = %lu, weight = %ld\n", cpu, vic_tick[cpu], weight); @@ -1345,15 +1312,14 @@ smp_local_timer_interrupt(void) } /* setup the profiling timer */ -int -setup_profiling_timer(unsigned int multiplier) +int setup_profiling_timer(unsigned int multiplier) { int i; - if ( (!multiplier)) + if ((!multiplier)) return -EINVAL; - /* + /* * Set the new multiplier for each CPU. CPUs don't start using the * new values until the next timer interrupt in which they do process * accounting. @@ -1367,15 +1333,13 @@ setup_profiling_timer(unsigned int multiplier) /* This is a bit of a mess, but forced on us by the genirq changes * there's no genirq handler that really does what voyager wants * so hack it up with the simple IRQ handler */ -static void fastcall -handle_vic_irq(unsigned int irq, struct irq_desc *desc) +static void handle_vic_irq(unsigned int irq, struct irq_desc *desc) { before_handle_vic_irq(irq); handle_simple_irq(irq, desc); after_handle_vic_irq(irq); } - /* The CPIs are handled in the per cpu 8259s, so they must be * enabled to be received: FIX: enabling the CPIs in the early * boot sequence interferes with bug checking; enable them later @@ -1385,13 +1349,12 @@ handle_vic_irq(unsigned int irq, struct irq_desc *desc) #define QIC_SET_GATE(cpi, vector) \ set_intr_gate((cpi) + QIC_DEFAULT_CPI_BASE, (vector)) -void __init -smp_intr_init(void) +void __init smp_intr_init(void) { int i; /* initialize the per cpu irq mask to all disabled */ - for(i = 0; i < NR_CPUS; i++) + for (i = 0; i < NR_CPUS; i++) vic_irq_mask[i] = 0xFFFF; VIC_SET_GATE(VIC_CPI_LEVEL0, vic_cpi_interrupt); @@ -1404,42 +1367,40 @@ smp_intr_init(void) QIC_SET_GATE(QIC_RESCHEDULE_CPI, qic_reschedule_interrupt); QIC_SET_GATE(QIC_ENABLE_IRQ_CPI, qic_enable_irq_interrupt); QIC_SET_GATE(QIC_CALL_FUNCTION_CPI, qic_call_function_interrupt); - - /* now put the VIC descriptor into the first 48 IRQs + /* now put the VIC descriptor into the first 48 IRQs * * This is for later: first 16 correspond to PC IRQs; next 16 * are Primary MC IRQs and final 16 are Secondary MC IRQs */ - for(i = 0; i < 48; i++) + for (i = 0; i < 48; i++) set_irq_chip_and_handler(i, &vic_chip, handle_vic_irq); } /* send a CPI at level cpi to a set of cpus in cpuset (set 1 bit per * processor to receive CPI */ -static void -send_CPI(__u32 cpuset, __u8 cpi) +static void send_CPI(__u32 cpuset, __u8 cpi) { int cpu; __u32 quad_cpuset = (cpuset & voyager_quad_processors); - if(cpi < VIC_START_FAKE_CPI) { - /* fake CPI are only used for booting, so send to the + if (cpi < VIC_START_FAKE_CPI) { + /* fake CPI are only used for booting, so send to the * extended quads as well---Quads must be VIC booted */ - outb((__u8)(cpuset), VIC_CPI_Registers[cpi]); + outb((__u8) (cpuset), VIC_CPI_Registers[cpi]); return; } - if(quad_cpuset) + if (quad_cpuset) send_QIC_CPI(quad_cpuset, cpi); cpuset &= ~quad_cpuset; cpuset &= 0xff; /* only first 8 CPUs vaild for VIC CPI */ - if(cpuset == 0) + if (cpuset == 0) return; for_each_online_cpu(cpu) { - if(cpuset & (1<<cpu)) + if (cpuset & (1 << cpu)) set_bit(cpi, &vic_cpi_mailbox[cpu]); } - if(cpuset) - outb((__u8)cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); + if (cpuset) + outb((__u8) cpuset, VIC_CPI_Registers[VIC_CPI_LEVEL0]); } /* Acknowledge receipt of CPI in the QIC, clear in QIC hardware and @@ -1448,20 +1409,19 @@ send_CPI(__u32 cpuset, __u8 cpi) * DON'T make this inline otherwise the cache line read will be * optimised away * */ -static int -ack_QIC_CPI(__u8 cpi) { +static int ack_QIC_CPI(__u8 cpi) +{ __u8 cpu = hard_smp_processor_id(); cpi &= 7; - outb(1<<cpi, QIC_INTERRUPT_CLEAR1); + outb(1 << cpi, QIC_INTERRUPT_CLEAR1); return voyager_quad_cpi_addr[cpu]->qic_cpi[cpi].cpi; } -static void -ack_special_QIC_CPI(__u8 cpi) +static void ack_special_QIC_CPI(__u8 cpi) { - switch(cpi) { + switch (cpi) { case VIC_CMN_INT: outb(QIC_CMN_INT, QIC_INTERRUPT_CLEAR0); break; @@ -1474,8 +1434,7 @@ ack_special_QIC_CPI(__u8 cpi) } /* Acknowledge receipt of CPI in the VIC (essentially an EOI) */ -static void -ack_VIC_CPI(__u8 cpi) +static void ack_VIC_CPI(__u8 cpi) { #ifdef VOYAGER_DEBUG unsigned long flags; @@ -1484,17 +1443,17 @@ ack_VIC_CPI(__u8 cpi) local_irq_save(flags); isr = vic_read_isr(); - if((isr & (1<<(cpi &7))) == 0) { + if ((isr & (1 << (cpi & 7))) == 0) { printk("VOYAGER SMP: CPU%d lost CPI%d\n", cpu, cpi); } #endif /* send specific EOI; the two system interrupts have * bit 4 set for a separate vector but behave as the * corresponding 3 bit intr */ - outb_p(0x60|(cpi & 7),0x20); + outb_p(0x60 | (cpi & 7), 0x20); #ifdef VOYAGER_DEBUG - if((vic_read_isr() & (1<<(cpi &7))) != 0) { + if ((vic_read_isr() & (1 << (cpi & 7))) != 0) { printk("VOYAGER SMP: CPU%d still asserting CPI%d\n", cpu, cpi); } local_irq_restore(flags); @@ -1502,12 +1461,11 @@ ack_VIC_CPI(__u8 cpi) } /* cribbed with thanks from irq.c */ -#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define __byte(x,y) (((unsigned char *)&(y))[x]) #define cached_21(cpu) (__byte(0,vic_irq_mask[cpu])) #define cached_A1(cpu) (__byte(1,vic_irq_mask[cpu])) -static unsigned int -startup_vic_irq(unsigned int irq) +static unsigned int startup_vic_irq(unsigned int irq) { unmask_vic_irq(irq); @@ -1535,13 +1493,12 @@ startup_vic_irq(unsigned int irq) * broadcast an Interrupt enable CPI which causes all other CPUs to * adjust their masks accordingly. */ -static void -unmask_vic_irq(unsigned int irq) +static void unmask_vic_irq(unsigned int irq) { /* linux doesn't to processor-irq affinity, so enable on * all CPUs we know about */ int cpu = smp_processor_id(), real_cpu; - __u16 mask = (1<<irq); + __u16 mask = (1 << irq); __u32 processorList = 0; unsigned long flags; @@ -1549,78 +1506,72 @@ unmask_vic_irq(unsigned int irq) irq, cpu, cpu_irq_affinity[cpu])); spin_lock_irqsave(&vic_irq_lock, flags); for_each_online_cpu(real_cpu) { - if(!(voyager_extended_vic_processors & (1<<real_cpu))) + if (!(voyager_extended_vic_processors & (1 << real_cpu))) continue; - if(!(cpu_irq_affinity[real_cpu] & mask)) { + if (!(cpu_irq_affinity[real_cpu] & mask)) { /* irq has no affinity for this CPU, ignore */ continue; } - if(real_cpu == cpu) { + if (real_cpu == cpu) { enable_local_vic_irq(irq); - } - else if(vic_irq_mask[real_cpu] & mask) { + } else if (vic_irq_mask[real_cpu] & mask) { vic_irq_enable_mask[real_cpu] |= mask; - processorList |= (1<<real_cpu); + processorList |= (1 << real_cpu); } } spin_unlock_irqrestore(&vic_irq_lock, flags); - if(processorList) + if (processorList) send_CPI(processorList, VIC_ENABLE_IRQ_CPI); } -static void -mask_vic_irq(unsigned int irq) +static void mask_vic_irq(unsigned int irq) { /* lazy disable, do nothing */ } -static void -enable_local_vic_irq(unsigned int irq) +static void enable_local_vic_irq(unsigned int irq) { __u8 cpu = smp_processor_id(); __u16 mask = ~(1 << irq); __u16 old_mask = vic_irq_mask[cpu]; vic_irq_mask[cpu] &= mask; - if(vic_irq_mask[cpu] == old_mask) + if (vic_irq_mask[cpu] == old_mask) return; VDEBUG(("VOYAGER DEBUG: Enabling irq %d in hardware on CPU %d\n", irq, cpu)); if (irq & 8) { - outb_p(cached_A1(cpu),0xA1); + outb_p(cached_A1(cpu), 0xA1); (void)inb_p(0xA1); - } - else { - outb_p(cached_21(cpu),0x21); + } else { + outb_p(cached_21(cpu), 0x21); (void)inb_p(0x21); } } -static void -disable_local_vic_irq(unsigned int irq) +static void disable_local_vic_irq(unsigned int irq) { __u8 cpu = smp_processor_id(); __u16 mask = (1 << irq); __u16 old_mask = vic_irq_mask[cpu]; - if(irq == 7) + if (irq == 7) return; vic_irq_mask[cpu] |= mask; - if(old_mask == vic_irq_mask[cpu]) + if (old_mask == vic_irq_mask[cpu]) return; VDEBUG(("VOYAGER DEBUG: Disabling irq %d in hardware on CPU %d\n", irq, cpu)); if (irq & 8) { - outb_p(cached_A1(cpu),0xA1); + outb_p(cached_A1(cpu), 0xA1); (void)inb_p(0xA1); - } - else { - outb_p(cached_21(cpu),0x21); + } else { + outb_p(cached_21(cpu), 0x21); (void)inb_p(0x21); } } @@ -1631,8 +1582,7 @@ disable_local_vic_irq(unsigned int irq) * interrupt in the vic, so we merely set a flag (IRQ_DISABLED). If * this interrupt actually comes in, then we mask and ack here to push * the interrupt off to another CPU */ -static void -before_handle_vic_irq(unsigned int irq) +static void before_handle_vic_irq(unsigned int irq) { irq_desc_t *desc = irq_desc + irq; __u8 cpu = smp_processor_id(); @@ -1641,16 +1591,16 @@ before_handle_vic_irq(unsigned int irq) vic_intr_total++; vic_intr_count[cpu]++; - if(!(cpu_irq_affinity[cpu] & (1<<irq))) { + if (!(cpu_irq_affinity[cpu] & (1 << irq))) { /* The irq is not in our affinity mask, push it off * onto another CPU */ - VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d on cpu %d\n", - irq, cpu)); + VDEBUG(("VOYAGER DEBUG: affinity triggered disable of irq %d " + "on cpu %d\n", irq, cpu)); disable_local_vic_irq(irq); /* set IRQ_INPROGRESS to prevent the handler in irq.c from * actually calling the interrupt routine */ desc->status |= IRQ_REPLAY | IRQ_INPROGRESS; - } else if(desc->status & IRQ_DISABLED) { + } else if (desc->status & IRQ_DISABLED) { /* Damn, the interrupt actually arrived, do the lazy * disable thing. The interrupt routine in irq.c will * not handle a IRQ_DISABLED interrupt, so nothing more @@ -1667,8 +1617,7 @@ before_handle_vic_irq(unsigned int irq) } /* Finish the VIC interrupt: basically mask */ -static void -after_handle_vic_irq(unsigned int irq) +static void after_handle_vic_irq(unsigned int irq) { irq_desc_t *desc = irq_desc + irq; @@ -1685,11 +1634,11 @@ after_handle_vic_irq(unsigned int irq) #ifdef VOYAGER_DEBUG /* DEBUG: before we ack, check what's in progress */ isr = vic_read_isr(); - if((isr & (1<<irq) && !(status & IRQ_REPLAY)) == 0) { + if ((isr & (1 << irq) && !(status & IRQ_REPLAY)) == 0) { int i; __u8 cpu = smp_processor_id(); __u8 real_cpu; - int mask; /* Um... initialize me??? --RR */ + int mask; /* Um... initialize me??? --RR */ printk("VOYAGER SMP: CPU%d lost interrupt %d\n", cpu, irq); @@ -1698,9 +1647,10 @@ after_handle_vic_irq(unsigned int irq) outb(VIC_CPU_MASQUERADE_ENABLE | real_cpu, VIC_PROCESSOR_ID); isr = vic_read_isr(); - if(isr & (1<<irq)) { - printk("VOYAGER SMP: CPU%d ack irq %d\n", - real_cpu, irq); + if (isr & (1 << irq)) { + printk + ("VOYAGER SMP: CPU%d ack irq %d\n", + real_cpu, irq); ack_vic_irq(irq); } outb(cpu, VIC_PROCESSOR_ID); @@ -1711,7 +1661,7 @@ after_handle_vic_irq(unsigned int irq) * receipt by another CPU so everything must be in * order here */ ack_vic_irq(irq); - if(status & IRQ_REPLAY) { + if (status & IRQ_REPLAY) { /* replay is set if we disable the interrupt * in the before_handle_vic_irq() routine, so * clear the in progress bit here to allow the @@ -1720,9 +1670,9 @@ after_handle_vic_irq(unsigned int irq) } #ifdef VOYAGER_DEBUG isr = vic_read_isr(); - if((isr & (1<<irq)) != 0) - printk("VOYAGER SMP: after_handle_vic_irq() after ack irq=%d, isr=0x%x\n", - irq, isr); + if ((isr & (1 << irq)) != 0) + printk("VOYAGER SMP: after_handle_vic_irq() after " + "ack irq=%d, isr=0x%x\n", irq, isr); #endif /* VOYAGER_DEBUG */ } _raw_spin_unlock(&vic_irq_lock); @@ -1731,7 +1681,6 @@ after_handle_vic_irq(unsigned int irq) * may be intercepted by another CPU if reasserted */ } - /* Linux processor - interrupt affinity manipulations. * * For each processor, we maintain a 32 bit irq affinity mask. @@ -1748,8 +1697,7 @@ after_handle_vic_irq(unsigned int irq) * change the mask and then do an interrupt enable CPI to re-enable on * the selected processors */ -void -set_vic_irq_affinity(unsigned int irq, cpumask_t mask) +void set_vic_irq_affinity(unsigned int irq, cpumask_t mask) { /* Only extended processors handle interrupts */ unsigned long real_mask; @@ -1757,13 +1705,13 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) int cpu; real_mask = cpus_addr(mask)[0] & voyager_extended_vic_processors; - - if(cpus_addr(mask)[0] == 0) + + if (cpus_addr(mask)[0] == 0) /* can't have no CPUs to accept the interrupt -- extremely * bad things will happen */ return; - if(irq == 0) + if (irq == 0) /* can't change the affinity of the timer IRQ. This * is due to the constraint in the voyager * architecture that the CPI also comes in on and IRQ @@ -1772,7 +1720,7 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) * will no-longer be able to accept VIC CPIs */ return; - if(irq >= 32) + if (irq >= 32) /* You can only have 32 interrupts in a voyager system * (and 32 only if you have a secondary microchannel * bus) */ @@ -1780,8 +1728,8 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) for_each_online_cpu(cpu) { unsigned long cpu_mask = 1 << cpu; - - if(cpu_mask & real_mask) { + + if (cpu_mask & real_mask) { /* enable the interrupt for this cpu */ cpu_irq_affinity[cpu] |= irq_mask; } else { @@ -1800,25 +1748,23 @@ set_vic_irq_affinity(unsigned int irq, cpumask_t mask) unmask_vic_irq(irq); } -static void -ack_vic_irq(unsigned int irq) +static void ack_vic_irq(unsigned int irq) { if (irq & 8) { - outb(0x62,0x20); /* Specific EOI to cascade */ - outb(0x60|(irq & 7),0xA0); + outb(0x62, 0x20); /* Specific EOI to cascade */ + outb(0x60 | (irq & 7), 0xA0); } else { - outb(0x60 | (irq & 7),0x20); + outb(0x60 | (irq & 7), 0x20); } } /* enable the CPIs. In the VIC, the CPIs are delivered by the 8259 * but are not vectored by it. This means that the 8259 mask must be * lowered to receive them */ -static __init void -vic_enable_cpi(void) +static __init void vic_enable_cpi(void) { __u8 cpu = smp_processor_id(); - + /* just take a copy of the current mask (nop for boot cpu) */ vic_irq_mask[cpu] = vic_irq_mask[boot_cpu_id]; @@ -1827,7 +1773,7 @@ vic_enable_cpi(void) /* for sys int and cmn int */ enable_local_vic_irq(7); - if(is_cpu_quad()) { + if (is_cpu_quad()) { outb(QIC_DEFAULT_MASK0, QIC_MASK_REGISTER0); outb(QIC_CPI_ENABLE, QIC_MASK_REGISTER1); VDEBUG(("VOYAGER SMP: QIC ENABLE CPI: CPU%d: MASK 0x%x\n", @@ -1838,8 +1784,7 @@ vic_enable_cpi(void) cpu, vic_irq_mask[cpu])); } -void -voyager_smp_dump() +void voyager_smp_dump() { int old_cpu = smp_processor_id(), cpu; @@ -1865,10 +1810,10 @@ voyager_smp_dump() cpu, vic_irq_mask[cpu], imr, irr, isr); #if 0 /* These lines are put in to try to unstick an un ack'd irq */ - if(isr != 0) { + if (isr != 0) { int irq; - for(irq=0; irq<16; irq++) { - if(isr & (1<<irq)) { + for (irq = 0; irq < 16; irq++) { + if (isr & (1 << irq)) { printk("\tCPU%d: ack irq %d\n", cpu, irq); local_irq_save(flags); @@ -1884,17 +1829,15 @@ voyager_smp_dump() } } -void -smp_voyager_power_off(void *dummy) +void smp_voyager_power_off(void *dummy) { - if(smp_processor_id() == boot_cpu_id) + if (smp_processor_id() == boot_cpu_id) voyager_power_off(); else smp_stop_cpu_function(NULL); } -static void __init -voyager_smp_prepare_cpus(unsigned int max_cpus) +static void __init voyager_smp_prepare_cpus(unsigned int max_cpus) { /* FIXME: ignore max_cpus for now */ smp_boot_cpus(); @@ -1911,8 +1854,7 @@ static void __cpuinit voyager_smp_prepare_boot_cpu(void) cpu_set(smp_processor_id(), cpu_present_map); } -static int __cpuinit -voyager_cpu_up(unsigned int cpu) +static int __cpuinit voyager_cpu_up(unsigned int cpu) { /* This only works at boot for x86. See "rewrite" above. */ if (cpu_isset(cpu, smp_commenced_mask)) @@ -1928,14 +1870,12 @@ voyager_cpu_up(unsigned int cpu) return 0; } -static void __init -voyager_smp_cpus_done(unsigned int max_cpus) +static void __init voyager_smp_cpus_done(unsigned int max_cpus) { zap_low_mappings(); } -void __init -smp_setup_processor_id(void) +void __init smp_setup_processor_id(void) { current_thread_info()->cpu = hard_smp_processor_id(); x86_write_percpu(cpu_number, hard_smp_processor_id()); diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c index 50f9366c411..c69c931818e 100644 --- a/arch/x86/mach-voyager/voyager_thread.c +++ b/arch/x86/mach-voyager/voyager_thread.c @@ -30,12 +30,10 @@ #include <asm/mtrr.h> #include <asm/msr.h> - struct task_struct *voyager_thread; static __u8 set_timeout; -static int -execute(const char *string) +static int execute(const char *string) { int ret; @@ -52,48 +50,48 @@ execute(const char *string) NULL, }; - if ((ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { - printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", - string, ret); + if ((ret = + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC)) != 0) { + printk(KERN_ERR "Voyager failed to run \"%s\": %i\n", string, + ret); } return ret; } -static void -check_from_kernel(void) +static void check_from_kernel(void) { - if(voyager_status.switch_off) { - + if (voyager_status.switch_off) { + /* FIXME: This should be configurable via proc */ execute("umask 600; echo 0 > /etc/initrunlvl; kill -HUP 1"); - } else if(voyager_status.power_fail) { + } else if (voyager_status.power_fail) { VDEBUG(("Voyager daemon detected AC power failure\n")); - + /* FIXME: This should be configureable via proc */ execute("umask 600; echo F > /etc/powerstatus; kill -PWR 1"); set_timeout = 1; } } -static void -check_continuing_condition(void) +static void check_continuing_condition(void) { - if(voyager_status.power_fail) { + if (voyager_status.power_fail) { __u8 data; - voyager_cat_psi(VOYAGER_PSI_SUBREAD, + voyager_cat_psi(VOYAGER_PSI_SUBREAD, VOYAGER_PSI_AC_FAIL_REG, &data); - if((data & 0x1f) == 0) { + if ((data & 0x1f) == 0) { /* all power restored */ - printk(KERN_NOTICE "VOYAGER AC power restored, cancelling shutdown\n"); + printk(KERN_NOTICE + "VOYAGER AC power restored, cancelling shutdown\n"); /* FIXME: should be user configureable */ - execute("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); + execute + ("umask 600; echo O > /etc/powerstatus; kill -PWR 1"); set_timeout = 0; } } } -static int -thread(void *unused) +static int thread(void *unused) { printk(KERN_NOTICE "Voyager starting monitor thread\n"); @@ -102,7 +100,7 @@ thread(void *unused) schedule_timeout(set_timeout ? HZ : MAX_SCHEDULE_TIMEOUT); VDEBUG(("Voyager Daemon awoken\n")); - if(voyager_status.request_from_kernel == 0) { + if (voyager_status.request_from_kernel == 0) { /* probably awoken from timeout */ check_continuing_condition(); } else { @@ -112,20 +110,18 @@ thread(void *unused) } } -static int __init -voyager_thread_start(void) +static int __init voyager_thread_start(void) { voyager_thread = kthread_run(thread, NULL, "kvoyagerd"); if (IS_ERR(voyager_thread)) { - printk(KERN_ERR "Voyager: Failed to create system monitor thread.\n"); + printk(KERN_ERR + "Voyager: Failed to create system monitor thread.\n"); return PTR_ERR(voyager_thread); } return 0; } - -static void __exit -voyager_thread_stop(void) +static void __exit voyager_thread_stop(void) { kthread_stop(voyager_thread); } diff --git a/arch/x86/math-emu/errors.c b/arch/x86/math-emu/errors.c index a1b0d22f697..59d353d2c59 100644 --- a/arch/x86/math-emu/errors.c +++ b/arch/x86/math-emu/errors.c @@ -33,45 +33,41 @@ #undef PRINT_MESSAGES /* */ - #if 0 void Un_impl(void) { - u_char byte1, FPU_modrm; - unsigned long address = FPU_ORIG_EIP; - - RE_ENTRANT_CHECK_OFF; - /* No need to check access_ok(), we have previously fetched these bytes. */ - printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *) address); - if ( FPU_CS == __USER_CS ) - { - while ( 1 ) - { - FPU_get_user(byte1, (u_char __user *) address); - if ( (byte1 & 0xf8) == 0xd8 ) break; - printk("[%02x]", byte1); - address++; + u_char byte1, FPU_modrm; + unsigned long address = FPU_ORIG_EIP; + + RE_ENTRANT_CHECK_OFF; + /* No need to check access_ok(), we have previously fetched these bytes. */ + printk("Unimplemented FPU Opcode at eip=%p : ", (void __user *)address); + if (FPU_CS == __USER_CS) { + while (1) { + FPU_get_user(byte1, (u_char __user *) address); + if ((byte1 & 0xf8) == 0xd8) + break; + printk("[%02x]", byte1); + address++; + } + printk("%02x ", byte1); + FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); + + if (FPU_modrm >= 0300) + printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, + FPU_modrm & 7); + else + printk("/%d\n", (FPU_modrm >> 3) & 7); + } else { + printk("cs selector = %04x\n", FPU_CS); } - printk("%02x ", byte1); - FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); - - if (FPU_modrm >= 0300) - printk("%02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7); - else - printk("/%d\n", (FPU_modrm >> 3) & 7); - } - else - { - printk("cs selector = %04x\n", FPU_CS); - } - - RE_ENTRANT_CHECK_ON; - - EXCEPTION(EX_Invalid); -} -#endif /* 0 */ + RE_ENTRANT_CHECK_ON; + EXCEPTION(EX_Invalid); + +} +#endif /* 0 */ /* Called for opcodes which are illegal and which are known to result in a @@ -79,139 +75,152 @@ void Un_impl(void) */ void FPU_illegal(void) { - math_abort(FPU_info,SIGILL); + math_abort(FPU_info, SIGILL); } - - void FPU_printall(void) { - int i; - static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", - "DeNorm", "Inf", "NaN" }; - u_char byte1, FPU_modrm; - unsigned long address = FPU_ORIG_EIP; - - RE_ENTRANT_CHECK_OFF; - /* No need to check access_ok(), we have previously fetched these bytes. */ - printk("At %p:", (void *) address); - if ( FPU_CS == __USER_CS ) - { + int i; + static const char *tag_desc[] = { "Valid", "Zero", "ERROR", "Empty", + "DeNorm", "Inf", "NaN" + }; + u_char byte1, FPU_modrm; + unsigned long address = FPU_ORIG_EIP; + + RE_ENTRANT_CHECK_OFF; + /* No need to check access_ok(), we have previously fetched these bytes. */ + printk("At %p:", (void *)address); + if (FPU_CS == __USER_CS) { #define MAX_PRINTED_BYTES 20 - for ( i = 0; i < MAX_PRINTED_BYTES; i++ ) - { - FPU_get_user(byte1, (u_char __user *) address); - if ( (byte1 & 0xf8) == 0xd8 ) - { - printk(" %02x", byte1); - break; - } - printk(" [%02x]", byte1); - address++; - } - if ( i == MAX_PRINTED_BYTES ) - printk(" [more..]\n"); - else - { - FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); - - if (FPU_modrm >= 0300) - printk(" %02x (%02x+%d)\n", FPU_modrm, FPU_modrm & 0xf8, FPU_modrm & 7); - else - printk(" /%d, mod=%d rm=%d\n", - (FPU_modrm >> 3) & 7, (FPU_modrm >> 6) & 3, FPU_modrm & 7); + for (i = 0; i < MAX_PRINTED_BYTES; i++) { + FPU_get_user(byte1, (u_char __user *) address); + if ((byte1 & 0xf8) == 0xd8) { + printk(" %02x", byte1); + break; + } + printk(" [%02x]", byte1); + address++; + } + if (i == MAX_PRINTED_BYTES) + printk(" [more..]\n"); + else { + FPU_get_user(FPU_modrm, 1 + (u_char __user *) address); + + if (FPU_modrm >= 0300) + printk(" %02x (%02x+%d)\n", FPU_modrm, + FPU_modrm & 0xf8, FPU_modrm & 7); + else + printk(" /%d, mod=%d rm=%d\n", + (FPU_modrm >> 3) & 7, + (FPU_modrm >> 6) & 3, FPU_modrm & 7); + } + } else { + printk("%04x\n", FPU_CS); } - } - else - { - printk("%04x\n", FPU_CS); - } - partial_status = status_word(); + partial_status = status_word(); #ifdef DEBUGGING -if ( partial_status & SW_Backward ) printk("SW: backward compatibility\n"); -if ( partial_status & SW_C3 ) printk("SW: condition bit 3\n"); -if ( partial_status & SW_C2 ) printk("SW: condition bit 2\n"); -if ( partial_status & SW_C1 ) printk("SW: condition bit 1\n"); -if ( partial_status & SW_C0 ) printk("SW: condition bit 0\n"); -if ( partial_status & SW_Summary ) printk("SW: exception summary\n"); -if ( partial_status & SW_Stack_Fault ) printk("SW: stack fault\n"); -if ( partial_status & SW_Precision ) printk("SW: loss of precision\n"); -if ( partial_status & SW_Underflow ) printk("SW: underflow\n"); -if ( partial_status & SW_Overflow ) printk("SW: overflow\n"); -if ( partial_status & SW_Zero_Div ) printk("SW: divide by zero\n"); -if ( partial_status & SW_Denorm_Op ) printk("SW: denormalized operand\n"); -if ( partial_status & SW_Invalid ) printk("SW: invalid operation\n"); + if (partial_status & SW_Backward) + printk("SW: backward compatibility\n"); + if (partial_status & SW_C3) + printk("SW: condition bit 3\n"); + if (partial_status & SW_C2) + printk("SW: condition bit 2\n"); + if (partial_status & SW_C1) + printk("SW: condition bit 1\n"); + if (partial_status & SW_C0) + printk("SW: condition bit 0\n"); + if (partial_status & SW_Summary) + printk("SW: exception summary\n"); + if (partial_status & SW_Stack_Fault) + printk("SW: stack fault\n"); + if (partial_status & SW_Precision) + printk("SW: loss of precision\n"); + if (partial_status & SW_Underflow) + printk("SW: underflow\n"); + if (partial_status & SW_Overflow) + printk("SW: overflow\n"); + if (partial_status & SW_Zero_Div) + printk("SW: divide by zero\n"); + if (partial_status & SW_Denorm_Op) + printk("SW: denormalized operand\n"); + if (partial_status & SW_Invalid) + printk("SW: invalid operation\n"); #endif /* DEBUGGING */ - printk(" SW: b=%d st=%ld es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", - partial_status & 0x8000 ? 1 : 0, /* busy */ - (partial_status & 0x3800) >> 11, /* stack top pointer */ - partial_status & 0x80 ? 1 : 0, /* Error summary status */ - partial_status & 0x40 ? 1 : 0, /* Stack flag */ - partial_status & SW_C3?1:0, partial_status & SW_C2?1:0, /* cc */ - partial_status & SW_C1?1:0, partial_status & SW_C0?1:0, /* cc */ - partial_status & SW_Precision?1:0, partial_status & SW_Underflow?1:0, - partial_status & SW_Overflow?1:0, partial_status & SW_Zero_Div?1:0, - partial_status & SW_Denorm_Op?1:0, partial_status & SW_Invalid?1:0); - -printk(" CW: ic=%d rc=%ld%ld pc=%ld%ld iem=%d ef=%d%d%d%d%d%d\n", - control_word & 0x1000 ? 1 : 0, - (control_word & 0x800) >> 11, (control_word & 0x400) >> 10, - (control_word & 0x200) >> 9, (control_word & 0x100) >> 8, - control_word & 0x80 ? 1 : 0, - control_word & SW_Precision?1:0, control_word & SW_Underflow?1:0, - control_word & SW_Overflow?1:0, control_word & SW_Zero_Div?1:0, - control_word & SW_Denorm_Op?1:0, control_word & SW_Invalid?1:0); - - for ( i = 0; i < 8; i++ ) - { - FPU_REG *r = &st(i); - u_char tagi = FPU_gettagi(i); - switch (tagi) - { - case TAG_Empty: - continue; - break; - case TAG_Zero: - case TAG_Special: - tagi = FPU_Special(r); - case TAG_Valid: - printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, - getsign(r) ? '-' : '+', - (long)(r->sigh >> 16), - (long)(r->sigh & 0xFFFF), - (long)(r->sigl >> 16), - (long)(r->sigl & 0xFFFF), - exponent(r) - EXP_BIAS + 1); - break; - default: - printk("Whoops! Error in errors.c: tag%d is %d ", i, tagi); - continue; - break; + printk(" SW: b=%d st=%d es=%d sf=%d cc=%d%d%d%d ef=%d%d%d%d%d%d\n", partial_status & 0x8000 ? 1 : 0, /* busy */ + (partial_status & 0x3800) >> 11, /* stack top pointer */ + partial_status & 0x80 ? 1 : 0, /* Error summary status */ + partial_status & 0x40 ? 1 : 0, /* Stack flag */ + partial_status & SW_C3 ? 1 : 0, partial_status & SW_C2 ? 1 : 0, /* cc */ + partial_status & SW_C1 ? 1 : 0, partial_status & SW_C0 ? 1 : 0, /* cc */ + partial_status & SW_Precision ? 1 : 0, + partial_status & SW_Underflow ? 1 : 0, + partial_status & SW_Overflow ? 1 : 0, + partial_status & SW_Zero_Div ? 1 : 0, + partial_status & SW_Denorm_Op ? 1 : 0, + partial_status & SW_Invalid ? 1 : 0); + + printk(" CW: ic=%d rc=%d%d pc=%d%d iem=%d ef=%d%d%d%d%d%d\n", + control_word & 0x1000 ? 1 : 0, + (control_word & 0x800) >> 11, (control_word & 0x400) >> 10, + (control_word & 0x200) >> 9, (control_word & 0x100) >> 8, + control_word & 0x80 ? 1 : 0, + control_word & SW_Precision ? 1 : 0, + control_word & SW_Underflow ? 1 : 0, + control_word & SW_Overflow ? 1 : 0, + control_word & SW_Zero_Div ? 1 : 0, + control_word & SW_Denorm_Op ? 1 : 0, + control_word & SW_Invalid ? 1 : 0); + + for (i = 0; i < 8; i++) { + FPU_REG *r = &st(i); + u_char tagi = FPU_gettagi(i); + switch (tagi) { + case TAG_Empty: + continue; + break; + case TAG_Zero: + case TAG_Special: + tagi = FPU_Special(r); + case TAG_Valid: + printk("st(%d) %c .%04lx %04lx %04lx %04lx e%+-6d ", i, + getsign(r) ? '-' : '+', + (long)(r->sigh >> 16), + (long)(r->sigh & 0xFFFF), + (long)(r->sigl >> 16), + (long)(r->sigl & 0xFFFF), + exponent(r) - EXP_BIAS + 1); + break; + default: + printk("Whoops! Error in errors.c: tag%d is %d ", i, + tagi); + continue; + break; + } + printk("%s\n", tag_desc[(int)(unsigned)tagi]); } - printk("%s\n", tag_desc[(int) (unsigned) tagi]); - } - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_ON; } static struct { - int type; - const char *name; + int type; + const char *name; } exception_names[] = { - { EX_StackOver, "stack overflow" }, - { EX_StackUnder, "stack underflow" }, - { EX_Precision, "loss of precision" }, - { EX_Underflow, "underflow" }, - { EX_Overflow, "overflow" }, - { EX_ZeroDiv, "divide by zero" }, - { EX_Denormal, "denormalized operand" }, - { EX_Invalid, "invalid operation" }, - { EX_INTERNAL, "INTERNAL BUG in "FPU_VERSION }, - { 0, NULL } + { + EX_StackOver, "stack overflow"}, { + EX_StackUnder, "stack underflow"}, { + EX_Precision, "loss of precision"}, { + EX_Underflow, "underflow"}, { + EX_Overflow, "overflow"}, { + EX_ZeroDiv, "divide by zero"}, { + EX_Denormal, "denormalized operand"}, { + EX_Invalid, "invalid operation"}, { + EX_INTERNAL, "INTERNAL BUG in " FPU_VERSION}, { + 0, NULL} }; /* @@ -295,445 +304,386 @@ static struct { asmlinkage void FPU_exception(int n) { - int i, int_type; - - int_type = 0; /* Needed only to stop compiler warnings */ - if ( n & EX_INTERNAL ) - { - int_type = n - EX_INTERNAL; - n = EX_INTERNAL; - /* Set lots of exception bits! */ - partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward); - } - else - { - /* Extract only the bits which we use to set the status word */ - n &= (SW_Exc_Mask); - /* Set the corresponding exception bit */ - partial_status |= n; - /* Set summary bits iff exception isn't masked */ - if ( partial_status & ~control_word & CW_Exceptions ) - partial_status |= (SW_Summary | SW_Backward); - if ( n & (SW_Stack_Fault | EX_Precision) ) - { - if ( !(n & SW_C1) ) - /* This bit distinguishes over- from underflow for a stack fault, - and roundup from round-down for precision loss. */ - partial_status &= ~SW_C1; + int i, int_type; + + int_type = 0; /* Needed only to stop compiler warnings */ + if (n & EX_INTERNAL) { + int_type = n - EX_INTERNAL; + n = EX_INTERNAL; + /* Set lots of exception bits! */ + partial_status |= (SW_Exc_Mask | SW_Summary | SW_Backward); + } else { + /* Extract only the bits which we use to set the status word */ + n &= (SW_Exc_Mask); + /* Set the corresponding exception bit */ + partial_status |= n; + /* Set summary bits iff exception isn't masked */ + if (partial_status & ~control_word & CW_Exceptions) + partial_status |= (SW_Summary | SW_Backward); + if (n & (SW_Stack_Fault | EX_Precision)) { + if (!(n & SW_C1)) + /* This bit distinguishes over- from underflow for a stack fault, + and roundup from round-down for precision loss. */ + partial_status &= ~SW_C1; + } } - } - RE_ENTRANT_CHECK_OFF; - if ( (~control_word & n & CW_Exceptions) || (n == EX_INTERNAL) ) - { + RE_ENTRANT_CHECK_OFF; + if ((~control_word & n & CW_Exceptions) || (n == EX_INTERNAL)) { #ifdef PRINT_MESSAGES - /* My message from the sponsor */ - printk(FPU_VERSION" "__DATE__" (C) W. Metzenthen.\n"); + /* My message from the sponsor */ + printk(FPU_VERSION " " __DATE__ " (C) W. Metzenthen.\n"); #endif /* PRINT_MESSAGES */ - - /* Get a name string for error reporting */ - for (i=0; exception_names[i].type; i++) - if ( (exception_names[i].type & n) == exception_names[i].type ) - break; - - if (exception_names[i].type) - { + + /* Get a name string for error reporting */ + for (i = 0; exception_names[i].type; i++) + if ((exception_names[i].type & n) == + exception_names[i].type) + break; + + if (exception_names[i].type) { #ifdef PRINT_MESSAGES - printk("FP Exception: %s!\n", exception_names[i].name); + printk("FP Exception: %s!\n", exception_names[i].name); #endif /* PRINT_MESSAGES */ - } - else - printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); - - if ( n == EX_INTERNAL ) - { - printk("FPU emulator: Internal error type 0x%04x\n", int_type); - FPU_printall(); - } + } else + printk("FPU emulator: Unknown Exception: 0x%04x!\n", n); + + if (n == EX_INTERNAL) { + printk("FPU emulator: Internal error type 0x%04x\n", + int_type); + FPU_printall(); + } #ifdef PRINT_MESSAGES - else - FPU_printall(); + else + FPU_printall(); #endif /* PRINT_MESSAGES */ - /* - * The 80486 generates an interrupt on the next non-control FPU - * instruction. So we need some means of flagging it. - * We use the ES (Error Summary) bit for this. - */ - } - RE_ENTRANT_CHECK_ON; + /* + * The 80486 generates an interrupt on the next non-control FPU + * instruction. So we need some means of flagging it. + * We use the ES (Error Summary) bit for this. + */ + } + RE_ENTRANT_CHECK_ON; #ifdef __DEBUG__ - math_abort(FPU_info,SIGFPE); + math_abort(FPU_info, SIGFPE); #endif /* __DEBUG__ */ } - /* Real operation attempted on a NaN. */ /* Returns < 0 if the exception is unmasked */ int real_1op_NaN(FPU_REG *a) { - int signalling, isNaN; - - isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000); - - /* The default result for the case of two "equal" NaNs (signs may - differ) is chosen to reproduce 80486 behaviour */ - signalling = isNaN && !(a->sigh & 0x40000000); - - if ( !signalling ) - { - if ( !isNaN ) /* pseudo-NaN, or other unsupported? */ - { - if ( control_word & CW_Invalid ) - { - /* Masked response */ - reg_copy(&CONST_QNaN, a); - } - EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; + int signalling, isNaN; + + isNaN = (exponent(a) == EXP_OVER) && (a->sigh & 0x80000000); + + /* The default result for the case of two "equal" NaNs (signs may + differ) is chosen to reproduce 80486 behaviour */ + signalling = isNaN && !(a->sigh & 0x40000000); + + if (!signalling) { + if (!isNaN) { /* pseudo-NaN, or other unsupported? */ + if (control_word & CW_Invalid) { + /* Masked response */ + reg_copy(&CONST_QNaN, a); + } + EXCEPTION(EX_Invalid); + return (!(control_word & CW_Invalid) ? FPU_Exception : + 0) | TAG_Special; + } + return TAG_Special; } - return TAG_Special; - } - if ( control_word & CW_Invalid ) - { - /* The masked response */ - if ( !(a->sigh & 0x80000000) ) /* pseudo-NaN ? */ - { - reg_copy(&CONST_QNaN, a); + if (control_word & CW_Invalid) { + /* The masked response */ + if (!(a->sigh & 0x80000000)) { /* pseudo-NaN ? */ + reg_copy(&CONST_QNaN, a); + } + /* ensure a Quiet NaN */ + a->sigh |= 0x40000000; } - /* ensure a Quiet NaN */ - a->sigh |= 0x40000000; - } - EXCEPTION(EX_Invalid); + EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; } - /* Real operation attempted on two operands, one a NaN. */ /* Returns < 0 if the exception is unmasked */ int real_2op_NaN(FPU_REG const *b, u_char tagb, - int deststnr, - FPU_REG const *defaultNaN) + int deststnr, FPU_REG const *defaultNaN) { - FPU_REG *dest = &st(deststnr); - FPU_REG const *a = dest; - u_char taga = FPU_gettagi(deststnr); - FPU_REG const *x; - int signalling, unsupported; - - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); - - /* TW_NaN is also used for unsupported data types. */ - unsupported = ((taga == TW_NaN) - && !((exponent(a) == EXP_OVER) && (a->sigh & 0x80000000))) - || ((tagb == TW_NaN) - && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000))); - if ( unsupported ) - { - if ( control_word & CW_Invalid ) - { - /* Masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); - } - EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; - } - - if (taga == TW_NaN) - { - x = a; - if (tagb == TW_NaN) - { - signalling = !(a->sigh & b->sigh & 0x40000000); - if ( significand(b) > significand(a) ) - x = b; - else if ( significand(b) == significand(a) ) - { - /* The default result for the case of two "equal" NaNs (signs may - differ) is chosen to reproduce 80486 behaviour */ - x = defaultNaN; - } - } - else - { - /* return the quiet version of the NaN in a */ - signalling = !(a->sigh & 0x40000000); + FPU_REG *dest = &st(deststnr); + FPU_REG const *a = dest; + u_char taga = FPU_gettagi(deststnr); + FPU_REG const *x; + int signalling, unsupported; + + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); + + /* TW_NaN is also used for unsupported data types. */ + unsupported = ((taga == TW_NaN) + && !((exponent(a) == EXP_OVER) + && (a->sigh & 0x80000000))) + || ((tagb == TW_NaN) + && !((exponent(b) == EXP_OVER) && (b->sigh & 0x80000000))); + if (unsupported) { + if (control_word & CW_Invalid) { + /* Masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); + } + EXCEPTION(EX_Invalid); + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | + TAG_Special; } - } - else + + if (taga == TW_NaN) { + x = a; + if (tagb == TW_NaN) { + signalling = !(a->sigh & b->sigh & 0x40000000); + if (significand(b) > significand(a)) + x = b; + else if (significand(b) == significand(a)) { + /* The default result for the case of two "equal" NaNs (signs may + differ) is chosen to reproduce 80486 behaviour */ + x = defaultNaN; + } + } else { + /* return the quiet version of the NaN in a */ + signalling = !(a->sigh & 0x40000000); + } + } else #ifdef PARANOID - if (tagb == TW_NaN) + if (tagb == TW_NaN) #endif /* PARANOID */ - { - signalling = !(b->sigh & 0x40000000); - x = b; - } + { + signalling = !(b->sigh & 0x40000000); + x = b; + } #ifdef PARANOID - else - { - signalling = 0; - EXCEPTION(EX_INTERNAL|0x113); - x = &CONST_QNaN; - } + else { + signalling = 0; + EXCEPTION(EX_INTERNAL | 0x113); + x = &CONST_QNaN; + } #endif /* PARANOID */ - if ( (!signalling) || (control_word & CW_Invalid) ) - { - if ( ! x ) - x = b; + if ((!signalling) || (control_word & CW_Invalid)) { + if (!x) + x = b; - if ( !(x->sigh & 0x80000000) ) /* pseudo-NaN ? */ - x = &CONST_QNaN; + if (!(x->sigh & 0x80000000)) /* pseudo-NaN ? */ + x = &CONST_QNaN; - FPU_copy_to_regi(x, TAG_Special, deststnr); + FPU_copy_to_regi(x, TAG_Special, deststnr); - if ( !signalling ) - return TAG_Special; + if (!signalling) + return TAG_Special; - /* ensure a Quiet NaN */ - dest->sigh |= 0x40000000; - } + /* ensure a Quiet NaN */ + dest->sigh |= 0x40000000; + } - EXCEPTION(EX_Invalid); + EXCEPTION(EX_Invalid); - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Special; } - /* Invalid arith operation on Valid registers */ /* Returns < 0 if the exception is unmasked */ asmlinkage int arith_invalid(int deststnr) { - EXCEPTION(EX_Invalid); - - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); - } - - return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid; + EXCEPTION(EX_Invalid); -} + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, deststnr); + } + return (!(control_word & CW_Invalid) ? FPU_Exception : 0) | TAG_Valid; + +} /* Divide a finite number by zero */ asmlinkage int FPU_divide_by_zero(int deststnr, u_char sign) { - FPU_REG *dest = &st(deststnr); - int tag = TAG_Valid; + FPU_REG *dest = &st(deststnr); + int tag = TAG_Valid; + + if (control_word & CW_ZeroDiv) { + /* The masked response */ + FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr); + setsign(dest, sign); + tag = TAG_Special; + } - if ( control_word & CW_ZeroDiv ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_INF, TAG_Special, deststnr); - setsign(dest, sign); - tag = TAG_Special; - } - - EXCEPTION(EX_ZeroDiv); + EXCEPTION(EX_ZeroDiv); - return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag; + return (!(control_word & CW_ZeroDiv) ? FPU_Exception : 0) | tag; } - /* This may be called often, so keep it lean */ int set_precision_flag(int flags) { - if ( control_word & CW_Precision ) - { - partial_status &= ~(SW_C1 & flags); - partial_status |= flags; /* The masked response */ - return 0; - } - else - { - EXCEPTION(flags); - return 1; - } + if (control_word & CW_Precision) { + partial_status &= ~(SW_C1 & flags); + partial_status |= flags; /* The masked response */ + return 0; + } else { + EXCEPTION(flags); + return 1; + } } - /* This may be called often, so keep it lean */ asmlinkage void set_precision_flag_up(void) { - if ( control_word & CW_Precision ) - partial_status |= (SW_Precision | SW_C1); /* The masked response */ - else - EXCEPTION(EX_Precision | SW_C1); + if (control_word & CW_Precision) + partial_status |= (SW_Precision | SW_C1); /* The masked response */ + else + EXCEPTION(EX_Precision | SW_C1); } - /* This may be called often, so keep it lean */ asmlinkage void set_precision_flag_down(void) { - if ( control_word & CW_Precision ) - { /* The masked response */ - partial_status &= ~SW_C1; - partial_status |= SW_Precision; - } - else - EXCEPTION(EX_Precision); + if (control_word & CW_Precision) { /* The masked response */ + partial_status &= ~SW_C1; + partial_status |= SW_Precision; + } else + EXCEPTION(EX_Precision); } - asmlinkage int denormal_operand(void) { - if ( control_word & CW_Denormal ) - { /* The masked response */ - partial_status |= SW_Denorm_Op; - return TAG_Special; - } - else - { - EXCEPTION(EX_Denormal); - return TAG_Special | FPU_Exception; - } + if (control_word & CW_Denormal) { /* The masked response */ + partial_status |= SW_Denorm_Op; + return TAG_Special; + } else { + EXCEPTION(EX_Denormal); + return TAG_Special | FPU_Exception; + } } - asmlinkage int arith_overflow(FPU_REG *dest) { - int tag = TAG_Valid; + int tag = TAG_Valid; - if ( control_word & CW_Overflow ) - { - /* The masked response */ + if (control_word & CW_Overflow) { + /* The masked response */ /* ###### The response here depends upon the rounding mode */ - reg_copy(&CONST_INF, dest); - tag = TAG_Special; - } - else - { - /* Subtract the magic number from the exponent */ - addexponent(dest, (-3 * (1 << 13))); - } - - EXCEPTION(EX_Overflow); - if ( control_word & CW_Overflow ) - { - /* The overflow exception is masked. */ - /* By definition, precision is lost. - The roundup bit (C1) is also set because we have - "rounded" upwards to Infinity. */ - EXCEPTION(EX_Precision | SW_C1); - return tag; - } - - return tag; + reg_copy(&CONST_INF, dest); + tag = TAG_Special; + } else { + /* Subtract the magic number from the exponent */ + addexponent(dest, (-3 * (1 << 13))); + } -} + EXCEPTION(EX_Overflow); + if (control_word & CW_Overflow) { + /* The overflow exception is masked. */ + /* By definition, precision is lost. + The roundup bit (C1) is also set because we have + "rounded" upwards to Infinity. */ + EXCEPTION(EX_Precision | SW_C1); + return tag; + } + + return tag; +} asmlinkage int arith_underflow(FPU_REG *dest) { - int tag = TAG_Valid; - - if ( control_word & CW_Underflow ) - { - /* The masked response */ - if ( exponent16(dest) <= EXP_UNDER - 63 ) - { - reg_copy(&CONST_Z, dest); - partial_status &= ~SW_C1; /* Round down. */ - tag = TAG_Zero; + int tag = TAG_Valid; + + if (control_word & CW_Underflow) { + /* The masked response */ + if (exponent16(dest) <= EXP_UNDER - 63) { + reg_copy(&CONST_Z, dest); + partial_status &= ~SW_C1; /* Round down. */ + tag = TAG_Zero; + } else { + stdexp(dest); + } + } else { + /* Add the magic number to the exponent. */ + addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias); } - else - { - stdexp(dest); + + EXCEPTION(EX_Underflow); + if (control_word & CW_Underflow) { + /* The underflow exception is masked. */ + EXCEPTION(EX_Precision); + return tag; } - } - else - { - /* Add the magic number to the exponent. */ - addexponent(dest, (3 * (1 << 13)) + EXTENDED_Ebias); - } - - EXCEPTION(EX_Underflow); - if ( control_word & CW_Underflow ) - { - /* The underflow exception is masked. */ - EXCEPTION(EX_Precision); - return tag; - } - - return tag; -} + return tag; +} void FPU_stack_overflow(void) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - top--; - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - } + if (control_word & CW_Invalid) { + /* The masked response */ + top--; + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } - EXCEPTION(EX_StackOver); + EXCEPTION(EX_StackOver); - return; + return; } - void FPU_stack_underflow(void) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - } + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } - EXCEPTION(EX_StackUnder); + EXCEPTION(EX_StackUnder); - return; + return; } - void FPU_stack_underflow_i(int i) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); - } + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); + } - EXCEPTION(EX_StackUnder); + EXCEPTION(EX_StackUnder); - return; + return; } - void FPU_stack_underflow_pop(int i) { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); - FPU_pop(); - } + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_regi(&CONST_QNaN, TAG_Special, i); + FPU_pop(); + } - EXCEPTION(EX_StackUnder); + EXCEPTION(EX_StackUnder); - return; + return; } - diff --git a/arch/x86/math-emu/exception.h b/arch/x86/math-emu/exception.h index b463f21a811..67f43a4683d 100644 --- a/arch/x86/math-emu/exception.h +++ b/arch/x86/math-emu/exception.h @@ -9,7 +9,6 @@ #ifndef _EXCEPTION_H_ #define _EXCEPTION_H_ - #ifdef __ASSEMBLY__ #define Const_(x) $##x #else @@ -20,8 +19,8 @@ #include "fpu_emu.h" #endif /* SW_C1 */ -#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ -#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ +#define FPU_BUSY Const_(0x8000) /* FPU busy bit (8087 compatibility) */ +#define EX_ErrorSummary Const_(0x0080) /* Error summary status */ /* Special exceptions: */ #define EX_INTERNAL Const_(0x8000) /* Internal error in wm-FPU-emu */ #define EX_StackOver Const_(0x0041|SW_C1) /* stack overflow */ @@ -34,11 +33,9 @@ #define EX_Denormal Const_(0x0002) /* denormalized operand */ #define EX_Invalid Const_(0x0001) /* invalid operation */ - #define PRECISION_LOST_UP Const_((EX_Precision | SW_C1)) #define PRECISION_LOST_DOWN Const_(EX_Precision) - #ifndef __ASSEMBLY__ #ifdef DEBUG @@ -48,6 +45,6 @@ #define EXCEPTION(x) FPU_exception(x) #endif -#endif /* __ASSEMBLY__ */ +#endif /* __ASSEMBLY__ */ #endif /* _EXCEPTION_H_ */ diff --git a/arch/x86/math-emu/fpu_arith.c b/arch/x86/math-emu/fpu_arith.c index 6972dec01af..aeab24e083c 100644 --- a/arch/x86/math-emu/fpu_arith.c +++ b/arch/x86/math-emu/fpu_arith.c @@ -15,160 +15,138 @@ #include "control_w.h" #include "status_w.h" - void fadd__(void) { - /* fadd st,st(i) */ - int i = FPU_rm; - clear_C1(); - FPU_add(&st(i), FPU_gettagi(i), 0, control_word); + /* fadd st,st(i) */ + int i = FPU_rm; + clear_C1(); + FPU_add(&st(i), FPU_gettagi(i), 0, control_word); } - void fmul__(void) { - /* fmul st,st(i) */ - int i = FPU_rm; - clear_C1(); - FPU_mul(&st(i), FPU_gettagi(i), 0, control_word); + /* fmul st,st(i) */ + int i = FPU_rm; + clear_C1(); + FPU_mul(&st(i), FPU_gettagi(i), 0, control_word); } - - void fsub__(void) { - /* fsub st,st(i) */ - clear_C1(); - FPU_sub(0, FPU_rm, control_word); + /* fsub st,st(i) */ + clear_C1(); + FPU_sub(0, FPU_rm, control_word); } - void fsubr_(void) { - /* fsubr st,st(i) */ - clear_C1(); - FPU_sub(REV, FPU_rm, control_word); + /* fsubr st,st(i) */ + clear_C1(); + FPU_sub(REV, FPU_rm, control_word); } - void fdiv__(void) { - /* fdiv st,st(i) */ - clear_C1(); - FPU_div(0, FPU_rm, control_word); + /* fdiv st,st(i) */ + clear_C1(); + FPU_div(0, FPU_rm, control_word); } - void fdivr_(void) { - /* fdivr st,st(i) */ - clear_C1(); - FPU_div(REV, FPU_rm, control_word); + /* fdivr st,st(i) */ + clear_C1(); + FPU_div(REV, FPU_rm, control_word); } - - void fadd_i(void) { - /* fadd st(i),st */ - int i = FPU_rm; - clear_C1(); - FPU_add(&st(i), FPU_gettagi(i), i, control_word); + /* fadd st(i),st */ + int i = FPU_rm; + clear_C1(); + FPU_add(&st(i), FPU_gettagi(i), i, control_word); } - void fmul_i(void) { - /* fmul st(i),st */ - clear_C1(); - FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word); + /* fmul st(i),st */ + clear_C1(); + FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word); } - void fsubri(void) { - /* fsubr st(i),st */ - clear_C1(); - FPU_sub(DEST_RM, FPU_rm, control_word); + /* fsubr st(i),st */ + clear_C1(); + FPU_sub(DEST_RM, FPU_rm, control_word); } - void fsub_i(void) { - /* fsub st(i),st */ - clear_C1(); - FPU_sub(REV|DEST_RM, FPU_rm, control_word); + /* fsub st(i),st */ + clear_C1(); + FPU_sub(REV | DEST_RM, FPU_rm, control_word); } - void fdivri(void) { - /* fdivr st(i),st */ - clear_C1(); - FPU_div(DEST_RM, FPU_rm, control_word); + /* fdivr st(i),st */ + clear_C1(); + FPU_div(DEST_RM, FPU_rm, control_word); } - void fdiv_i(void) { - /* fdiv st(i),st */ - clear_C1(); - FPU_div(REV|DEST_RM, FPU_rm, control_word); + /* fdiv st(i),st */ + clear_C1(); + FPU_div(REV | DEST_RM, FPU_rm, control_word); } - - void faddp_(void) { - /* faddp st(i),st */ - int i = FPU_rm; - clear_C1(); - if ( FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0 ) - FPU_pop(); + /* faddp st(i),st */ + int i = FPU_rm; + clear_C1(); + if (FPU_add(&st(i), FPU_gettagi(i), i, control_word) >= 0) + FPU_pop(); } - void fmulp_(void) { - /* fmulp st(i),st */ - clear_C1(); - if ( FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fmulp st(i),st */ + clear_C1(); + if (FPU_mul(&st(0), FPU_gettag0(), FPU_rm, control_word) >= 0) + FPU_pop(); } - - void fsubrp(void) { - /* fsubrp st(i),st */ - clear_C1(); - if ( FPU_sub(DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fsubrp st(i),st */ + clear_C1(); + if (FPU_sub(DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } - void fsubp_(void) { - /* fsubp st(i),st */ - clear_C1(); - if ( FPU_sub(REV|DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fsubp st(i),st */ + clear_C1(); + if (FPU_sub(REV | DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } - void fdivrp(void) { - /* fdivrp st(i),st */ - clear_C1(); - if ( FPU_div(DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fdivrp st(i),st */ + clear_C1(); + if (FPU_div(DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } - void fdivp_(void) { - /* fdivp st(i),st */ - clear_C1(); - if ( FPU_div(REV|DEST_RM, FPU_rm, control_word) >= 0 ) - FPU_pop(); + /* fdivp st(i),st */ + clear_C1(); + if (FPU_div(REV | DEST_RM, FPU_rm, control_word) >= 0) + FPU_pop(); } diff --git a/arch/x86/math-emu/fpu_asm.h b/arch/x86/math-emu/fpu_asm.h index 9ba12416df1..955b932735a 100644 --- a/arch/x86/math-emu/fpu_asm.h +++ b/arch/x86/math-emu/fpu_asm.h @@ -14,7 +14,6 @@ #define EXCEPTION FPU_exception - #define PARAM1 8(%ebp) #define PARAM2 12(%ebp) #define PARAM3 16(%ebp) diff --git a/arch/x86/math-emu/fpu_aux.c b/arch/x86/math-emu/fpu_aux.c index 20886cfb9f7..491e737ce54 100644 --- a/arch/x86/math-emu/fpu_aux.c +++ b/arch/x86/math-emu/fpu_aux.c @@ -16,34 +16,34 @@ #include "status_w.h" #include "control_w.h" - static void fnop(void) { } static void fclex(void) { - partial_status &= ~(SW_Backward|SW_Summary|SW_Stack_Fault|SW_Precision| - SW_Underflow|SW_Overflow|SW_Zero_Div|SW_Denorm_Op| - SW_Invalid); - no_ip_update = 1; + partial_status &= + ~(SW_Backward | SW_Summary | SW_Stack_Fault | SW_Precision | + SW_Underflow | SW_Overflow | SW_Zero_Div | SW_Denorm_Op | + SW_Invalid); + no_ip_update = 1; } /* Needs to be externally visible */ void finit(void) { - control_word = 0x037f; - partial_status = 0; - top = 0; /* We don't keep top in the status word internally. */ - fpu_tag_word = 0xffff; - /* The behaviour is different from that detailed in - Section 15.1.6 of the Intel manual */ - operand_address.offset = 0; - operand_address.selector = 0; - instruction_address.offset = 0; - instruction_address.selector = 0; - instruction_address.opcode = 0; - no_ip_update = 1; + control_word = 0x037f; + partial_status = 0; + top = 0; /* We don't keep top in the status word internally. */ + fpu_tag_word = 0xffff; + /* The behaviour is different from that detailed in + Section 15.1.6 of the Intel manual */ + operand_address.offset = 0; + operand_address.selector = 0; + instruction_address.offset = 0; + instruction_address.selector = 0; + instruction_address.opcode = 0; + no_ip_update = 1; } /* @@ -54,151 +54,134 @@ void finit(void) #define fsetpm fnop static FUNC const finit_table[] = { - feni, fdisi, fclex, finit, - fsetpm, FPU_illegal, FPU_illegal, FPU_illegal + feni, fdisi, fclex, finit, + fsetpm, FPU_illegal, FPU_illegal, FPU_illegal }; void finit_(void) { - (finit_table[FPU_rm])(); + (finit_table[FPU_rm]) (); } - static void fstsw_ax(void) { - *(short *) &FPU_EAX = status_word(); - no_ip_update = 1; + *(short *)&FPU_EAX = status_word(); + no_ip_update = 1; } static FUNC const fstsw_table[] = { - fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal, - FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal + fstsw_ax, FPU_illegal, FPU_illegal, FPU_illegal, + FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal }; void fstsw_(void) { - (fstsw_table[FPU_rm])(); + (fstsw_table[FPU_rm]) (); } - static FUNC const fp_nop_table[] = { - fnop, FPU_illegal, FPU_illegal, FPU_illegal, - FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal + fnop, FPU_illegal, FPU_illegal, FPU_illegal, + FPU_illegal, FPU_illegal, FPU_illegal, FPU_illegal }; void fp_nop(void) { - (fp_nop_table[FPU_rm])(); + (fp_nop_table[FPU_rm]) (); } - void fld_i_(void) { - FPU_REG *st_new_ptr; - int i; - u_char tag; - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - /* fld st(i) */ - i = FPU_rm; - if ( NOT_EMPTY(i) ) - { - reg_copy(&st(i), st_new_ptr); - tag = FPU_gettagi(i); - push(); - FPU_settag0(tag); - } - else - { - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_stack_underflow(); + FPU_REG *st_new_ptr; + int i; + u_char tag; + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; } - else - EXCEPTION(EX_StackUnder); - } -} + /* fld st(i) */ + i = FPU_rm; + if (NOT_EMPTY(i)) { + reg_copy(&st(i), st_new_ptr); + tag = FPU_gettagi(i); + push(); + FPU_settag0(tag); + } else { + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_stack_underflow(); + } else + EXCEPTION(EX_StackUnder); + } +} void fxch_i(void) { - /* fxch st(i) */ - FPU_REG t; - int i = FPU_rm; - FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i); - long tag_word = fpu_tag_word; - int regnr = top & 7, regnri = ((regnr + i) & 7); - u_char st0_tag = (tag_word >> (regnr*2)) & 3; - u_char sti_tag = (tag_word >> (regnri*2)) & 3; - - if ( st0_tag == TAG_Empty ) - { - if ( sti_tag == TAG_Empty ) - { - FPU_stack_underflow(); - FPU_stack_underflow_i(i); - return; + /* fxch st(i) */ + FPU_REG t; + int i = FPU_rm; + FPU_REG *st0_ptr = &st(0), *sti_ptr = &st(i); + long tag_word = fpu_tag_word; + int regnr = top & 7, regnri = ((regnr + i) & 7); + u_char st0_tag = (tag_word >> (regnr * 2)) & 3; + u_char sti_tag = (tag_word >> (regnri * 2)) & 3; + + if (st0_tag == TAG_Empty) { + if (sti_tag == TAG_Empty) { + FPU_stack_underflow(); + FPU_stack_underflow_i(i); + return; + } + if (control_word & CW_Invalid) { + /* Masked response */ + FPU_copy_to_reg0(sti_ptr, sti_tag); + } + FPU_stack_underflow_i(i); + return; } - if ( control_word & CW_Invalid ) - { - /* Masked response */ - FPU_copy_to_reg0(sti_ptr, sti_tag); + if (sti_tag == TAG_Empty) { + if (control_word & CW_Invalid) { + /* Masked response */ + FPU_copy_to_regi(st0_ptr, st0_tag, i); + } + FPU_stack_underflow(); + return; } - FPU_stack_underflow_i(i); - return; - } - if ( sti_tag == TAG_Empty ) - { - if ( control_word & CW_Invalid ) - { - /* Masked response */ - FPU_copy_to_regi(st0_ptr, st0_tag, i); - } - FPU_stack_underflow(); - return; - } - clear_C1(); - - reg_copy(st0_ptr, &t); - reg_copy(sti_ptr, st0_ptr); - reg_copy(&t, sti_ptr); - - tag_word &= ~(3 << (regnr*2)) & ~(3 << (regnri*2)); - tag_word |= (sti_tag << (regnr*2)) | (st0_tag << (regnri*2)); - fpu_tag_word = tag_word; -} + clear_C1(); + reg_copy(st0_ptr, &t); + reg_copy(sti_ptr, st0_ptr); + reg_copy(&t, sti_ptr); + + tag_word &= ~(3 << (regnr * 2)) & ~(3 << (regnri * 2)); + tag_word |= (sti_tag << (regnr * 2)) | (st0_tag << (regnri * 2)); + fpu_tag_word = tag_word; +} void ffree_(void) { - /* ffree st(i) */ - FPU_settagi(FPU_rm, TAG_Empty); + /* ffree st(i) */ + FPU_settagi(FPU_rm, TAG_Empty); } - void ffreep(void) { - /* ffree st(i) + pop - unofficial code */ - FPU_settagi(FPU_rm, TAG_Empty); - FPU_pop(); + /* ffree st(i) + pop - unofficial code */ + FPU_settagi(FPU_rm, TAG_Empty); + FPU_pop(); } - void fst_i_(void) { - /* fst st(i) */ - FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); + /* fst st(i) */ + FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); } - void fstp_i(void) { - /* fstp st(i) */ - FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); - FPU_pop(); + /* fstp st(i) */ + FPU_copy_to_regi(&st(0), FPU_gettag0(), FPU_rm); + FPU_pop(); } - diff --git a/arch/x86/math-emu/fpu_emu.h b/arch/x86/math-emu/fpu_emu.h index 65120f52385..4dae511c85a 100644 --- a/arch/x86/math-emu/fpu_emu.h +++ b/arch/x86/math-emu/fpu_emu.h @@ -7,7 +7,6 @@ | | +---------------------------------------------------------------------------*/ - #ifndef _FPU_EMU_H_ #define _FPU_EMU_H_ @@ -28,15 +27,15 @@ #endif #define EXP_BIAS Const(0) -#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */ -#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */ -#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but - still a 16 bit nr. */ +#define EXP_OVER Const(0x4000) /* smallest invalid large exponent */ +#define EXP_UNDER Const(-0x3fff) /* largest invalid small exponent */ +#define EXP_WAY_UNDER Const(-0x6000) /* Below the smallest denormal, but + still a 16 bit nr. */ #define EXP_Infinity EXP_OVER #define EXP_NaN EXP_OVER #define EXTENDED_Ebias Const(0x3fff) -#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */ +#define EXTENDED_Emin (-0x3ffe) /* smallest valid exponent */ #define SIGN_POS Const(0) #define SIGN_NEG Const(0x80) @@ -44,10 +43,9 @@ #define SIGN_Positive Const(0) #define SIGN_Negative Const(0x8000) - /* Keep the order TAG_Valid, TAG_Zero, TW_Denormal */ /* The following fold to 2 (Special) in the Tag Word */ -#define TW_Denormal Const(4) /* De-normal */ +#define TW_Denormal Const(4) /* De-normal */ #define TW_Infinity Const(5) /* + or - infinity */ #define TW_NaN Const(6) /* Not a Number */ #define TW_Unsupported Const(7) /* Not supported by an 80486 */ @@ -67,14 +65,13 @@ #define DEST_RM 0x20 #define LOADED 0x40 -#define FPU_Exception Const(0x80000000) /* Added to tag returns. */ - +#define FPU_Exception Const(0x80000000) /* Added to tag returns. */ #ifndef __ASSEMBLY__ #include "fpu_system.h" -#include <asm/sigcontext.h> /* for struct _fpstate */ +#include <asm/sigcontext.h> /* for struct _fpstate */ #include <asm/math_emu.h> #include <linux/linkage.h> @@ -112,30 +109,33 @@ extern u_char emulating; #define PREFIX_DEFAULT 7 struct address { - unsigned int offset; - unsigned int selector:16; - unsigned int opcode:11; - unsigned int empty:5; + unsigned int offset; + unsigned int selector:16; + unsigned int opcode:11; + unsigned int empty:5; }; struct fpu__reg { - unsigned sigl; - unsigned sigh; - short exp; + unsigned sigl; + unsigned sigh; + short exp; }; -typedef void (*FUNC)(void); +typedef void (*FUNC) (void); typedef struct fpu__reg FPU_REG; -typedef void (*FUNC_ST0)(FPU_REG *st0_ptr, u_char st0_tag); -typedef struct { u_char address_size, operand_size, segment; } - overrides; +typedef void (*FUNC_ST0) (FPU_REG *st0_ptr, u_char st0_tag); +typedef struct { + u_char address_size, operand_size, segment; +} overrides; /* This structure is 32 bits: */ -typedef struct { overrides override; - u_char default_mode; } fpu_addr_modes; +typedef struct { + overrides override; + u_char default_mode; +} fpu_addr_modes; /* PROTECTED has a restricted meaning in the emulator; it is used to signal that the emulator needs to do special things to ensure that protection is respected in a segmented model. */ #define PROTECTED 4 -#define SIXTEEN 1 /* We rely upon this being 1 (true) */ +#define SIXTEEN 1 /* We rely upon this being 1 (true) */ #define VM86 SIXTEEN #define PM16 (SIXTEEN | PROTECTED) #define SEG32 PROTECTED @@ -168,8 +168,8 @@ extern u_char const data_sizes_16[32]; static inline void reg_copy(FPU_REG const *x, FPU_REG *y) { - *(short *)&(y->exp) = *(const short *)&(x->exp); - *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); + *(short *)&(y->exp) = *(const short *)&(x->exp); + *(long long *)&(y->sigl) = *(const long long *)&(x->sigl); } #define exponent(x) (((*(short *)&((x)->exp)) & 0x7fff) - EXTENDED_Ebias) @@ -184,27 +184,26 @@ static inline void reg_copy(FPU_REG const *x, FPU_REG *y) #define significand(x) ( ((unsigned long long *)&((x)->sigl))[0] ) - /*----- Prototypes for functions written in assembler -----*/ /* extern void reg_move(FPU_REG *a, FPU_REG *b); */ asmlinkage int FPU_normalize(FPU_REG *x); asmlinkage int FPU_normalize_nuo(FPU_REG *x); asmlinkage int FPU_u_sub(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign, + FPU_REG * answ, unsigned int control_w, u_char sign, int expa, int expb); asmlinkage int FPU_u_mul(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign, + FPU_REG * answ, unsigned int control_w, u_char sign, int expon); asmlinkage int FPU_u_div(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign); + FPU_REG * answ, unsigned int control_w, u_char sign); asmlinkage int FPU_u_add(FPU_REG const *arg1, FPU_REG const *arg2, - FPU_REG *answ, unsigned int control_w, u_char sign, + FPU_REG * answ, unsigned int control_w, u_char sign, int expa, int expb); asmlinkage int wm_sqrt(FPU_REG *n, int dummy1, int dummy2, unsigned int control_w, u_char sign); -asmlinkage unsigned FPU_shrx(void *l, unsigned x); -asmlinkage unsigned FPU_shrxs(void *v, unsigned x); +asmlinkage unsigned FPU_shrx(void *l, unsigned x); +asmlinkage unsigned FPU_shrxs(void *v, unsigned x); asmlinkage unsigned long FPU_div_small(unsigned long long *x, unsigned long y); asmlinkage int FPU_round(FPU_REG *arg, unsigned int extent, int dummy, unsigned int control_w, u_char sign); diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 1853524c8b5..760baeea5f0 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -25,10 +25,11 @@ +---------------------------------------------------------------------------*/ #include <linux/signal.h> -#include <linux/ptrace.h> +#include <linux/regset.h> #include <asm/uaccess.h> #include <asm/desc.h> +#include <asm/user.h> #include "fpu_system.h" #include "fpu_emu.h" @@ -36,726 +37,727 @@ #include "control_w.h" #include "status_w.h" -#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ +#define __BAD__ FPU_illegal /* Illegal on an 80486, causes SIGILL */ -#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ +#ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. */ /* WARNING: These codes are not documented by Intel in their 80486 manual and may not work on FPU clones or later Intel FPUs. */ /* Changes to support the un-doc codes provided by Linus Torvalds. */ -#define _d9_d8_ fstp_i /* unofficial code (19) */ -#define _dc_d0_ fcom_st /* unofficial code (14) */ -#define _dc_d8_ fcompst /* unofficial code (1c) */ -#define _dd_c8_ fxch_i /* unofficial code (0d) */ -#define _de_d0_ fcompst /* unofficial code (16) */ -#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ -#define _df_c8_ fxch_i /* unofficial code (0f) */ -#define _df_d0_ fstp_i /* unofficial code (17) */ -#define _df_d8_ fstp_i /* unofficial code (1f) */ +#define _d9_d8_ fstp_i /* unofficial code (19) */ +#define _dc_d0_ fcom_st /* unofficial code (14) */ +#define _dc_d8_ fcompst /* unofficial code (1c) */ +#define _dd_c8_ fxch_i /* unofficial code (0d) */ +#define _de_d0_ fcompst /* unofficial code (16) */ +#define _df_c0_ ffreep /* unofficial code (07) ffree + pop */ +#define _df_c8_ fxch_i /* unofficial code (0f) */ +#define _df_d0_ fstp_i /* unofficial code (17) */ +#define _df_d8_ fstp_i /* unofficial code (1f) */ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, - fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, - fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, + fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, _df_c0_, + fmul__, fxch_i, __BAD__, __BAD__, fmul_i, _dd_c8_, fmulp_, _df_c8_, + fcom_st, fp_nop, __BAD__, __BAD__, _dc_d0_, fst_i_, _de_d0_, _df_d0_, + fcompst, _d9_d8_, __BAD__, __BAD__, _dc_d8_, fstp_i, fcompp, _df_d8_, + fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, + fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, + fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, + fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, }; -#else /* Support only documented FPU op-codes */ +#else /* Support only documented FPU op-codes */ static FUNC const st_instr_table[64] = { - fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, - fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, - fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, - fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, - fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, - fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, - fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, - fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, + fadd__, fld_i_, __BAD__, __BAD__, fadd_i, ffree_, faddp_, __BAD__, + fmul__, fxch_i, __BAD__, __BAD__, fmul_i, __BAD__, fmulp_, __BAD__, + fcom_st, fp_nop, __BAD__, __BAD__, __BAD__, fst_i_, __BAD__, __BAD__, + fcompst, __BAD__, __BAD__, __BAD__, __BAD__, fstp_i, fcompp, __BAD__, + fsub__, FPU_etc, __BAD__, finit_, fsubri, fucom_, fsubrp, fstsw_, + fsubr_, fconst, fucompp, __BAD__, fsub_i, fucomp, fsubp_, __BAD__, + fdiv__, FPU_triga, __BAD__, __BAD__, fdivri, __BAD__, fdivrp, __BAD__, + fdivr_, FPU_trigb, __BAD__, __BAD__, fdiv_i, __BAD__, fdivp_, __BAD__, }; #endif /* NO_UNDOC_CODE */ - -#define _NONE_ 0 /* Take no special action */ -#define _REG0_ 1 /* Need to check for not empty st(0) */ -#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ -#define _REGi_ 0 /* Uses st(rm) */ -#define _PUSH_ 3 /* Need to check for space to push onto stack */ -#define _null_ 4 /* Function illegal or not implemented */ -#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ -#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ -#define _REGIc 0 /* Compare st(0) and st(rm) */ -#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ +#define _NONE_ 0 /* Take no special action */ +#define _REG0_ 1 /* Need to check for not empty st(0) */ +#define _REGI_ 2 /* Need to check for not empty st(0) and st(rm) */ +#define _REGi_ 0 /* Uses st(rm) */ +#define _PUSH_ 3 /* Need to check for space to push onto stack */ +#define _null_ 4 /* Function illegal or not implemented */ +#define _REGIi 5 /* Uses st(0) and st(rm), result to st(rm) */ +#define _REGIp 6 /* Uses st(0) and st(rm), result to st(rm) then pop */ +#define _REGIc 0 /* Compare st(0) and st(rm) */ +#define _REGIn 0 /* Uses st(0) and st(rm), but handle checks later */ #ifndef NO_UNDOC_CODE /* Un-documented FPU op-codes supported by default. (see above) */ static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, - _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ + _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _REGi_, + _REGI_, _REGIn, _null_, _null_, _REGIi, _REGI_, _REGIp, _REGI_, + _REGIc, _NONE_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, + _REGIc, _REG0_, _null_, _null_, _REGIc, _REG0_, _REGIc, _REG0_, + _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, + _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ }; -#else /* Support only documented FPU op-codes */ +#else /* Support only documented FPU op-codes */ static u_char const type_table[64] = { - _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, - _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, - _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, - _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, - _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, - _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ + _REGI_, _NONE_, _null_, _null_, _REGIi, _REGi_, _REGIp, _null_, + _REGI_, _REGIn, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGIc, _NONE_, _null_, _null_, _null_, _REG0_, _null_, _null_, + _REGIc, _null_, _null_, _null_, _null_, _REG0_, _REGIc, _null_, + _REGI_, _NONE_, _null_, _NONE_, _REGIi, _REGIc, _REGIp, _NONE_, + _REGI_, _NONE_, _REGIc, _null_, _REGIi, _REGIc, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_, + _REGI_, _NONE_, _null_, _null_, _REGIi, _null_, _REGIp, _null_ }; #endif /* NO_UNDOC_CODE */ - #ifdef RE_ENTRANT_CHECKING -u_char emulating=0; +u_char emulating = 0; #endif /* RE_ENTRANT_CHECKING */ -static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, - overrides *override); +static int valid_prefix(u_char *Byte, u_char __user ** fpu_eip, + overrides * override); asmlinkage void math_emulate(long arg) { - u_char FPU_modrm, byte1; - unsigned short code; - fpu_addr_modes addr_modes; - int unmasked; - FPU_REG loaded_data; - FPU_REG *st0_ptr; - u_char loaded_tag, st0_tag; - void __user *data_address; - struct address data_sel_off; - struct address entry_sel_off; - unsigned long code_base = 0; - unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ - struct desc_struct code_descriptor; + u_char FPU_modrm, byte1; + unsigned short code; + fpu_addr_modes addr_modes; + int unmasked; + FPU_REG loaded_data; + FPU_REG *st0_ptr; + u_char loaded_tag, st0_tag; + void __user *data_address; + struct address data_sel_off; + struct address entry_sel_off; + unsigned long code_base = 0; + unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ + struct desc_struct code_descriptor; #ifdef RE_ENTRANT_CHECKING - if ( emulating ) - { - printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); - } - RE_ENTRANT_CHECK_ON; + if (emulating) { + printk("ERROR: wm-FPU-emu is not RE-ENTRANT!\n"); + } + RE_ENTRANT_CHECK_ON; #endif /* RE_ENTRANT_CHECKING */ - if (!used_math()) - { - finit(); - set_used_math(); - } - - SETUP_DATA_AREA(arg); - - FPU_ORIG_EIP = FPU_EIP; - - if ( (FPU_EFLAGS & 0x00020000) != 0 ) - { - /* Virtual 8086 mode */ - addr_modes.default_mode = VM86; - FPU_EIP += code_base = FPU_CS << 4; - code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */ - } - else if ( FPU_CS == __USER_CS && FPU_DS == __USER_DS ) - { - addr_modes.default_mode = 0; - } - else if ( FPU_CS == __KERNEL_CS ) - { - printk("math_emulate: %04x:%08lx\n",FPU_CS,FPU_EIP); - panic("Math emulation needed in kernel"); - } - else - { - - if ( (FPU_CS & 4) != 4 ) /* Must be in the LDT */ - { - /* Can only handle segmented addressing via the LDT - for now, and it must be 16 bit */ - printk("FPU emulator: Unsupported addressing mode\n"); - math_abort(FPU_info, SIGILL); + if (!used_math()) { + finit(); + set_used_math(); } - code_descriptor = LDT_DESCRIPTOR(FPU_CS); - if ( SEG_D_SIZE(code_descriptor) ) - { - /* The above test may be wrong, the book is not clear */ - /* Segmented 32 bit protected mode */ - addr_modes.default_mode = SEG32; + SETUP_DATA_AREA(arg); + + FPU_ORIG_EIP = FPU_EIP; + + if ((FPU_EFLAGS & 0x00020000) != 0) { + /* Virtual 8086 mode */ + addr_modes.default_mode = VM86; + FPU_EIP += code_base = FPU_CS << 4; + code_limit = code_base + 0xffff; /* Assumes code_base <= 0xffff0000 */ + } else if (FPU_CS == __USER_CS && FPU_DS == __USER_DS) { + addr_modes.default_mode = 0; + } else if (FPU_CS == __KERNEL_CS) { + printk("math_emulate: %04x:%08lx\n", FPU_CS, FPU_EIP); + panic("Math emulation needed in kernel"); + } else { + + if ((FPU_CS & 4) != 4) { /* Must be in the LDT */ + /* Can only handle segmented addressing via the LDT + for now, and it must be 16 bit */ + printk("FPU emulator: Unsupported addressing mode\n"); + math_abort(FPU_info, SIGILL); + } + + code_descriptor = LDT_DESCRIPTOR(FPU_CS); + if (SEG_D_SIZE(code_descriptor)) { + /* The above test may be wrong, the book is not clear */ + /* Segmented 32 bit protected mode */ + addr_modes.default_mode = SEG32; + } else { + /* 16 bit protected mode */ + addr_modes.default_mode = PM16; + } + FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); + code_limit = code_base + + (SEG_LIMIT(code_descriptor) + + 1) * SEG_GRANULARITY(code_descriptor) + - 1; + if (code_limit < code_base) + code_limit = 0xffffffff; } - else - { - /* 16 bit protected mode */ - addr_modes.default_mode = PM16; + + FPU_lookahead = !(FPU_EFLAGS & X86_EFLAGS_TF); + + if (!valid_prefix(&byte1, (u_char __user **) & FPU_EIP, + &addr_modes.override)) { + RE_ENTRANT_CHECK_OFF; + printk + ("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n" + "FPU emulator: self-modifying code! (emulation impossible)\n", + byte1); + RE_ENTRANT_CHECK_ON; + EXCEPTION(EX_INTERNAL | 0x126); + math_abort(FPU_info, SIGILL); } - FPU_EIP += code_base = SEG_BASE_ADDR(code_descriptor); - code_limit = code_base - + (SEG_LIMIT(code_descriptor)+1) * SEG_GRANULARITY(code_descriptor) - - 1; - if ( code_limit < code_base ) code_limit = 0xffffffff; - } - - FPU_lookahead = 1; - if (current->ptrace & PT_PTRACED) - FPU_lookahead = 0; - - if ( !valid_prefix(&byte1, (u_char __user **)&FPU_EIP, - &addr_modes.override) ) - { - RE_ENTRANT_CHECK_OFF; - printk("FPU emulator: Unknown prefix byte 0x%02x, probably due to\n" - "FPU emulator: self-modifying code! (emulation impossible)\n", - byte1); - RE_ENTRANT_CHECK_ON; - EXCEPTION(EX_INTERNAL|0x126); - math_abort(FPU_info,SIGILL); - } - -do_another_FPU_instruction: - - no_ip_update = 0; - - FPU_EIP++; /* We have fetched the prefix and first code bytes. */ - - if ( addr_modes.default_mode ) - { - /* This checks for the minimum instruction bytes. - We also need to check any extra (address mode) code access. */ - if ( FPU_EIP > code_limit ) - math_abort(FPU_info,SIGSEGV); - } - - if ( (byte1 & 0xf8) != 0xd8 ) - { - if ( byte1 == FWAIT_OPCODE ) - { - if (partial_status & SW_Summary) - goto do_the_FPU_interrupt; - else - goto FPU_fwait_done; + + do_another_FPU_instruction: + + no_ip_update = 0; + + FPU_EIP++; /* We have fetched the prefix and first code bytes. */ + + if (addr_modes.default_mode) { + /* This checks for the minimum instruction bytes. + We also need to check any extra (address mode) code access. */ + if (FPU_EIP > code_limit) + math_abort(FPU_info, SIGSEGV); } + + if ((byte1 & 0xf8) != 0xd8) { + if (byte1 == FWAIT_OPCODE) { + if (partial_status & SW_Summary) + goto do_the_FPU_interrupt; + else + goto FPU_fwait_done; + } #ifdef PARANOID - EXCEPTION(EX_INTERNAL|0x128); - math_abort(FPU_info,SIGILL); + EXCEPTION(EX_INTERNAL | 0x128); + math_abort(FPU_info, SIGILL); #endif /* PARANOID */ - } - - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP); - RE_ENTRANT_CHECK_ON; - FPU_EIP++; - - if (partial_status & SW_Summary) - { - /* Ignore the error for now if the current instruction is a no-wait - control instruction */ - /* The 80486 manual contradicts itself on this topic, - but a real 80486 uses the following instructions: - fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex. - */ - code = (FPU_modrm << 8) | byte1; - if ( ! ( (((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */ - (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv, - fnstsw */ - ((code & 0xc000) != 0xc000))) ) ) - { - /* - * We need to simulate the action of the kernel to FPU - * interrupts here. - */ - do_the_FPU_interrupt: - - FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */ - - RE_ENTRANT_CHECK_OFF; - current->thread.trap_no = 16; - current->thread.error_code = 0; - send_sig(SIGFPE, current, 1); - return; - } - } - - entry_sel_off.offset = FPU_ORIG_EIP; - entry_sel_off.selector = FPU_CS; - entry_sel_off.opcode = (byte1 << 8) | FPU_modrm; - - FPU_rm = FPU_modrm & 7; - - if ( FPU_modrm < 0300 ) - { - /* All of these instructions use the mod/rm byte to get a data address */ - - if ( (addr_modes.default_mode & SIXTEEN) - ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX) ) - data_address = FPU_get_address_16(FPU_modrm, &FPU_EIP, &data_sel_off, - addr_modes); - else - data_address = FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off, - addr_modes); - - if ( addr_modes.default_mode ) - { - if ( FPU_EIP-1 > code_limit ) - math_abort(FPU_info,SIGSEGV); } - if ( !(byte1 & 1) ) - { - unsigned short status1 = partial_status; - - st0_ptr = &st(0); - st0_tag = FPU_gettag0(); - - /* Stack underflow has priority */ - if ( NOT_EMPTY_ST0 ) - { - if ( addr_modes.default_mode & PROTECTED ) - { - /* This table works for 16 and 32 bit protected mode */ - if ( access_limit < data_sizes_16[(byte1 >> 1) & 3] ) - math_abort(FPU_info,SIGSEGV); + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(FPU_modrm, (u_char __user *) FPU_EIP); + RE_ENTRANT_CHECK_ON; + FPU_EIP++; + + if (partial_status & SW_Summary) { + /* Ignore the error for now if the current instruction is a no-wait + control instruction */ + /* The 80486 manual contradicts itself on this topic, + but a real 80486 uses the following instructions: + fninit, fnstenv, fnsave, fnstsw, fnstenv, fnclex. + */ + code = (FPU_modrm << 8) | byte1; + if (!((((code & 0xf803) == 0xe003) || /* fnclex, fninit, fnstsw */ + (((code & 0x3003) == 0x3001) && /* fnsave, fnstcw, fnstenv, + fnstsw */ + ((code & 0xc000) != 0xc000))))) { + /* + * We need to simulate the action of the kernel to FPU + * interrupts here. + */ + do_the_FPU_interrupt: + + FPU_EIP = FPU_ORIG_EIP; /* Point to current FPU instruction. */ + + RE_ENTRANT_CHECK_OFF; + current->thread.trap_no = 16; + current->thread.error_code = 0; + send_sig(SIGFPE, current, 1); + return; } + } - unmasked = 0; /* Do this here to stop compiler warnings. */ - switch ( (byte1 >> 1) & 3 ) - { - case 0: - unmasked = FPU_load_single((float __user *)data_address, - &loaded_data); - loaded_tag = unmasked & 0xff; - unmasked &= ~0xff; - break; - case 1: - loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); - break; - case 2: - unmasked = FPU_load_double((double __user *)data_address, - &loaded_data); - loaded_tag = unmasked & 0xff; - unmasked &= ~0xff; - break; - case 3: - default: /* Used here to suppress gcc warnings. */ - loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); - break; - } + entry_sel_off.offset = FPU_ORIG_EIP; + entry_sel_off.selector = FPU_CS; + entry_sel_off.opcode = (byte1 << 8) | FPU_modrm; - /* No more access to user memory, it is safe - to use static data now */ - - /* NaN operands have the next priority. */ - /* We have to delay looking at st(0) until after - loading the data, because that data might contain an SNaN */ - if ( ((st0_tag == TAG_Special) && isNaN(st0_ptr)) || - ((loaded_tag == TAG_Special) && isNaN(&loaded_data)) ) - { - /* Restore the status word; we might have loaded a - denormal. */ - partial_status = status1; - if ( (FPU_modrm & 0x30) == 0x10 ) - { - /* fcom or fcomp */ - EXCEPTION(EX_Invalid); - setcc(SW_C3 | SW_C2 | SW_C0); - if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) - FPU_pop(); /* fcomp, masked, so we pop. */ - } - else - { - if ( loaded_tag == TAG_Special ) - loaded_tag = FPU_Special(&loaded_data); -#ifdef PECULIAR_486 - /* This is not really needed, but gives behaviour - identical to an 80486 */ - if ( (FPU_modrm & 0x28) == 0x20 ) - /* fdiv or fsub */ - real_2op_NaN(&loaded_data, loaded_tag, 0, &loaded_data); - else -#endif /* PECULIAR_486 */ - /* fadd, fdivr, fmul, or fsubr */ - real_2op_NaN(&loaded_data, loaded_tag, 0, st0_ptr); - } - goto reg_mem_instr_done; - } + FPU_rm = FPU_modrm & 7; - if ( unmasked && !((FPU_modrm & 0x30) == 0x10) ) - { - /* Is not a comparison instruction. */ - if ( (FPU_modrm & 0x38) == 0x38 ) - { - /* fdivr */ - if ( (st0_tag == TAG_Zero) && - ((loaded_tag == TAG_Valid) - || (loaded_tag == TAG_Special - && isdenormal(&loaded_data))) ) - { - if ( FPU_divide_by_zero(0, getsign(&loaded_data)) - < 0 ) - { - /* We use the fact here that the unmasked - exception in the loaded data was for a - denormal operand */ - /* Restore the state of the denormal op bit */ - partial_status &= ~SW_Denorm_Op; - partial_status |= status1 & SW_Denorm_Op; - } - else - setsign(st0_ptr, getsign(&loaded_data)); - } - } - goto reg_mem_instr_done; - } + if (FPU_modrm < 0300) { + /* All of these instructions use the mod/rm byte to get a data address */ - switch ( (FPU_modrm >> 3) & 7 ) - { - case 0: /* fadd */ - clear_C1(); - FPU_add(&loaded_data, loaded_tag, 0, control_word); - break; - case 1: /* fmul */ - clear_C1(); - FPU_mul(&loaded_data, loaded_tag, 0, control_word); - break; - case 2: /* fcom */ - FPU_compare_st_data(&loaded_data, loaded_tag); - break; - case 3: /* fcomp */ - if ( !FPU_compare_st_data(&loaded_data, loaded_tag) - && !unmasked ) - FPU_pop(); - break; - case 4: /* fsub */ - clear_C1(); - FPU_sub(LOADED|loaded_tag, (int)&loaded_data, control_word); - break; - case 5: /* fsubr */ - clear_C1(); - FPU_sub(REV|LOADED|loaded_tag, (int)&loaded_data, control_word); - break; - case 6: /* fdiv */ - clear_C1(); - FPU_div(LOADED|loaded_tag, (int)&loaded_data, control_word); - break; - case 7: /* fdivr */ - clear_C1(); - if ( st0_tag == TAG_Zero ) - partial_status = status1; /* Undo any denorm tag, - zero-divide has priority. */ - FPU_div(REV|LOADED|loaded_tag, (int)&loaded_data, control_word); - break; + if ((addr_modes.default_mode & SIXTEEN) + ^ (addr_modes.override.address_size == ADDR_SIZE_PREFIX)) + data_address = + FPU_get_address_16(FPU_modrm, &FPU_EIP, + &data_sel_off, addr_modes); + else + data_address = + FPU_get_address(FPU_modrm, &FPU_EIP, &data_sel_off, + addr_modes); + + if (addr_modes.default_mode) { + if (FPU_EIP - 1 > code_limit) + math_abort(FPU_info, SIGSEGV); } - } - else - { - if ( (FPU_modrm & 0x30) == 0x10 ) - { - /* The instruction is fcom or fcomp */ - EXCEPTION(EX_StackUnder); - setcc(SW_C3 | SW_C2 | SW_C0); - if ( (FPU_modrm & 0x08) && (control_word & CW_Invalid) ) - FPU_pop(); /* fcomp */ + + if (!(byte1 & 1)) { + unsigned short status1 = partial_status; + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + + /* Stack underflow has priority */ + if (NOT_EMPTY_ST0) { + if (addr_modes.default_mode & PROTECTED) { + /* This table works for 16 and 32 bit protected mode */ + if (access_limit < + data_sizes_16[(byte1 >> 1) & 3]) + math_abort(FPU_info, SIGSEGV); + } + + unmasked = 0; /* Do this here to stop compiler warnings. */ + switch ((byte1 >> 1) & 3) { + case 0: + unmasked = + FPU_load_single((float __user *) + data_address, + &loaded_data); + loaded_tag = unmasked & 0xff; + unmasked &= ~0xff; + break; + case 1: + loaded_tag = + FPU_load_int32((long __user *) + data_address, + &loaded_data); + break; + case 2: + unmasked = + FPU_load_double((double __user *) + data_address, + &loaded_data); + loaded_tag = unmasked & 0xff; + unmasked &= ~0xff; + break; + case 3: + default: /* Used here to suppress gcc warnings. */ + loaded_tag = + FPU_load_int16((short __user *) + data_address, + &loaded_data); + break; + } + + /* No more access to user memory, it is safe + to use static data now */ + + /* NaN operands have the next priority. */ + /* We have to delay looking at st(0) until after + loading the data, because that data might contain an SNaN */ + if (((st0_tag == TAG_Special) && isNaN(st0_ptr)) + || ((loaded_tag == TAG_Special) + && isNaN(&loaded_data))) { + /* Restore the status word; we might have loaded a + denormal. */ + partial_status = status1; + if ((FPU_modrm & 0x30) == 0x10) { + /* fcom or fcomp */ + EXCEPTION(EX_Invalid); + setcc(SW_C3 | SW_C2 | SW_C0); + if ((FPU_modrm & 0x08) + && (control_word & + CW_Invalid)) + FPU_pop(); /* fcomp, masked, so we pop. */ + } else { + if (loaded_tag == TAG_Special) + loaded_tag = + FPU_Special + (&loaded_data); +#ifdef PECULIAR_486 + /* This is not really needed, but gives behaviour + identical to an 80486 */ + if ((FPU_modrm & 0x28) == 0x20) + /* fdiv or fsub */ + real_2op_NaN + (&loaded_data, + loaded_tag, 0, + &loaded_data); + else +#endif /* PECULIAR_486 */ + /* fadd, fdivr, fmul, or fsubr */ + real_2op_NaN + (&loaded_data, + loaded_tag, 0, + st0_ptr); + } + goto reg_mem_instr_done; + } + + if (unmasked && !((FPU_modrm & 0x30) == 0x10)) { + /* Is not a comparison instruction. */ + if ((FPU_modrm & 0x38) == 0x38) { + /* fdivr */ + if ((st0_tag == TAG_Zero) && + ((loaded_tag == TAG_Valid) + || (loaded_tag == + TAG_Special + && + isdenormal + (&loaded_data)))) { + if (FPU_divide_by_zero + (0, + getsign + (&loaded_data)) + < 0) { + /* We use the fact here that the unmasked + exception in the loaded data was for a + denormal operand */ + /* Restore the state of the denormal op bit */ + partial_status + &= + ~SW_Denorm_Op; + partial_status + |= + status1 & + SW_Denorm_Op; + } else + setsign(st0_ptr, + getsign + (&loaded_data)); + } + } + goto reg_mem_instr_done; + } + + switch ((FPU_modrm >> 3) & 7) { + case 0: /* fadd */ + clear_C1(); + FPU_add(&loaded_data, loaded_tag, 0, + control_word); + break; + case 1: /* fmul */ + clear_C1(); + FPU_mul(&loaded_data, loaded_tag, 0, + control_word); + break; + case 2: /* fcom */ + FPU_compare_st_data(&loaded_data, + loaded_tag); + break; + case 3: /* fcomp */ + if (!FPU_compare_st_data + (&loaded_data, loaded_tag) + && !unmasked) + FPU_pop(); + break; + case 4: /* fsub */ + clear_C1(); + FPU_sub(LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + case 5: /* fsubr */ + clear_C1(); + FPU_sub(REV | LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + case 6: /* fdiv */ + clear_C1(); + FPU_div(LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + case 7: /* fdivr */ + clear_C1(); + if (st0_tag == TAG_Zero) + partial_status = status1; /* Undo any denorm tag, + zero-divide has priority. */ + FPU_div(REV | LOADED | loaded_tag, + (int)&loaded_data, + control_word); + break; + } + } else { + if ((FPU_modrm & 0x30) == 0x10) { + /* The instruction is fcom or fcomp */ + EXCEPTION(EX_StackUnder); + setcc(SW_C3 | SW_C2 | SW_C0); + if ((FPU_modrm & 0x08) + && (control_word & CW_Invalid)) + FPU_pop(); /* fcomp */ + } else + FPU_stack_underflow(); + } + reg_mem_instr_done: + operand_address = data_sel_off; + } else { + if (!(no_ip_update = + FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) + >> 1, addr_modes, data_address))) { + operand_address = data_sel_off; + } } - else - FPU_stack_underflow(); - } - reg_mem_instr_done: - operand_address = data_sel_off; - } - else - { - if ( !(no_ip_update = - FPU_load_store(((FPU_modrm & 0x38) | (byte1 & 6)) >> 1, - addr_modes, data_address)) ) - { - operand_address = data_sel_off; - } - } - } - else - { - /* None of these instructions access user memory */ - u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7); + } else { + /* None of these instructions access user memory */ + u_char instr_index = (FPU_modrm & 0x38) | (byte1 & 7); #ifdef PECULIAR_486 - /* This is supposed to be undefined, but a real 80486 seems - to do this: */ - operand_address.offset = 0; - operand_address.selector = FPU_DS; + /* This is supposed to be undefined, but a real 80486 seems + to do this: */ + operand_address.offset = 0; + operand_address.selector = FPU_DS; #endif /* PECULIAR_486 */ - st0_ptr = &st(0); - st0_tag = FPU_gettag0(); - switch ( type_table[(int) instr_index] ) - { - case _NONE_: /* also _REGIc: _REGIn */ - break; - case _REG0_: - if ( !NOT_EMPTY_ST0 ) - { - FPU_stack_underflow(); - goto FPU_instruction_done; - } - break; - case _REGIi: - if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) - { - FPU_stack_underflow_i(FPU_rm); - goto FPU_instruction_done; - } - break; - case _REGIp: - if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) - { - FPU_stack_underflow_pop(FPU_rm); - goto FPU_instruction_done; - } - break; - case _REGI_: - if ( !NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm) ) - { - FPU_stack_underflow(); - goto FPU_instruction_done; - } - break; - case _PUSH_: /* Only used by the fld st(i) instruction */ - break; - case _null_: - FPU_illegal(); - goto FPU_instruction_done; - default: - EXCEPTION(EX_INTERNAL|0x111); - goto FPU_instruction_done; - } - (*st_instr_table[(int) instr_index])(); + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + switch (type_table[(int)instr_index]) { + case _NONE_: /* also _REGIc: _REGIn */ + break; + case _REG0_: + if (!NOT_EMPTY_ST0) { + FPU_stack_underflow(); + goto FPU_instruction_done; + } + break; + case _REGIi: + if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) { + FPU_stack_underflow_i(FPU_rm); + goto FPU_instruction_done; + } + break; + case _REGIp: + if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) { + FPU_stack_underflow_pop(FPU_rm); + goto FPU_instruction_done; + } + break; + case _REGI_: + if (!NOT_EMPTY_ST0 || !NOT_EMPTY(FPU_rm)) { + FPU_stack_underflow(); + goto FPU_instruction_done; + } + break; + case _PUSH_: /* Only used by the fld st(i) instruction */ + break; + case _null_: + FPU_illegal(); + goto FPU_instruction_done; + default: + EXCEPTION(EX_INTERNAL | 0x111); + goto FPU_instruction_done; + } + (*st_instr_table[(int)instr_index]) (); -FPU_instruction_done: - ; - } + FPU_instruction_done: + ; + } - if ( ! no_ip_update ) - instruction_address = entry_sel_off; + if (!no_ip_update) + instruction_address = entry_sel_off; -FPU_fwait_done: + FPU_fwait_done: #ifdef DEBUG - RE_ENTRANT_CHECK_OFF; - FPU_printall(); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_printall(); + RE_ENTRANT_CHECK_ON; #endif /* DEBUG */ - if (FPU_lookahead && !need_resched()) - { - FPU_ORIG_EIP = FPU_EIP - code_base; - if ( valid_prefix(&byte1, (u_char __user **)&FPU_EIP, - &addr_modes.override) ) - goto do_another_FPU_instruction; - } + if (FPU_lookahead && !need_resched()) { + FPU_ORIG_EIP = FPU_EIP - code_base; + if (valid_prefix(&byte1, (u_char __user **) & FPU_EIP, + &addr_modes.override)) + goto do_another_FPU_instruction; + } - if ( addr_modes.default_mode ) - FPU_EIP -= code_base; + if (addr_modes.default_mode) + FPU_EIP -= code_base; - RE_ENTRANT_CHECK_OFF; + RE_ENTRANT_CHECK_OFF; } - /* Support for prefix bytes is not yet complete. To properly handle all prefix bytes, further changes are needed in the emulator code which accesses user address space. Access to separate segments is important for msdos emulation. */ static int valid_prefix(u_char *Byte, u_char __user **fpu_eip, - overrides *override) + overrides * override) { - u_char byte; - u_char __user *ip = *fpu_eip; - - *override = (overrides) { 0, 0, PREFIX_DEFAULT }; /* defaults */ - - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(byte, ip); - RE_ENTRANT_CHECK_ON; - - while ( 1 ) - { - switch ( byte ) - { - case ADDR_SIZE_PREFIX: - override->address_size = ADDR_SIZE_PREFIX; - goto do_next_byte; - - case OP_SIZE_PREFIX: - override->operand_size = OP_SIZE_PREFIX; - goto do_next_byte; - - case PREFIX_CS: - override->segment = PREFIX_CS_; - goto do_next_byte; - case PREFIX_ES: - override->segment = PREFIX_ES_; - goto do_next_byte; - case PREFIX_SS: - override->segment = PREFIX_SS_; - goto do_next_byte; - case PREFIX_FS: - override->segment = PREFIX_FS_; - goto do_next_byte; - case PREFIX_GS: - override->segment = PREFIX_GS_; - goto do_next_byte; - case PREFIX_DS: - override->segment = PREFIX_DS_; - goto do_next_byte; + u_char byte; + u_char __user *ip = *fpu_eip; + + *override = (overrides) { + 0, 0, PREFIX_DEFAULT}; /* defaults */ + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(byte, ip); + RE_ENTRANT_CHECK_ON; + + while (1) { + switch (byte) { + case ADDR_SIZE_PREFIX: + override->address_size = ADDR_SIZE_PREFIX; + goto do_next_byte; + + case OP_SIZE_PREFIX: + override->operand_size = OP_SIZE_PREFIX; + goto do_next_byte; + + case PREFIX_CS: + override->segment = PREFIX_CS_; + goto do_next_byte; + case PREFIX_ES: + override->segment = PREFIX_ES_; + goto do_next_byte; + case PREFIX_SS: + override->segment = PREFIX_SS_; + goto do_next_byte; + case PREFIX_FS: + override->segment = PREFIX_FS_; + goto do_next_byte; + case PREFIX_GS: + override->segment = PREFIX_GS_; + goto do_next_byte; + case PREFIX_DS: + override->segment = PREFIX_DS_; + goto do_next_byte; /* lock is not a valid prefix for FPU instructions, let the cpu handle it to generate a SIGILL. */ /* case PREFIX_LOCK: */ - /* rep.. prefixes have no meaning for FPU instructions */ - case PREFIX_REPE: - case PREFIX_REPNE: - - do_next_byte: - ip++; - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(byte, ip); - RE_ENTRANT_CHECK_ON; - break; - case FWAIT_OPCODE: - *Byte = byte; - return 1; - default: - if ( (byte & 0xf8) == 0xd8 ) - { - *Byte = byte; - *fpu_eip = ip; - return 1; - } - else - { - /* Not a valid sequence of prefix bytes followed by - an FPU instruction. */ - *Byte = byte; /* Needed for error message. */ - return 0; - } + /* rep.. prefixes have no meaning for FPU instructions */ + case PREFIX_REPE: + case PREFIX_REPNE: + + do_next_byte: + ip++; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(byte, ip); + RE_ENTRANT_CHECK_ON; + break; + case FWAIT_OPCODE: + *Byte = byte; + return 1; + default: + if ((byte & 0xf8) == 0xd8) { + *Byte = byte; + *fpu_eip = ip; + return 1; + } else { + /* Not a valid sequence of prefix bytes followed by + an FPU instruction. */ + *Byte = byte; /* Needed for error message. */ + return 0; + } + } } - } } - -void math_abort(struct info * info, unsigned int signal) +void math_abort(struct info *info, unsigned int signal) { FPU_EIP = FPU_ORIG_EIP; current->thread.trap_no = 16; current->thread.error_code = 0; - send_sig(signal,current,1); + send_sig(signal, current, 1); RE_ENTRANT_CHECK_OFF; - __asm__("movl %0,%%esp ; ret": :"g" (((long) info)-4)); + __asm__("movl %0,%%esp ; ret": :"g"(((long)info) - 4)); #ifdef PARANOID - printk("ERROR: wm-FPU-emu math_abort failed!\n"); + printk("ERROR: wm-FPU-emu math_abort failed!\n"); #endif /* PARANOID */ } - - #define S387 ((struct i387_soft_struct *)s387) #define sstatus_word() \ ((S387->swd & ~SW_Top & 0xffff) | ((S387->ftop << SW_Top_Shift) & SW_Top)) -int restore_i387_soft(void *s387, struct _fpstate __user *buf) +int fpregs_soft_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) { - u_char __user *d = (u_char __user *)buf; - int offset, other, i, tags, regnr, tag, newtop; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, d, 7*4 + 8*10); - if (__copy_from_user(&S387->cwd, d, 7*4)) - return -1; - RE_ENTRANT_CHECK_ON; - - d += 7*4; - - S387->ftop = (S387->swd >> SW_Top_Shift) & 7; - offset = (S387->ftop & 7) * 10; - other = 80 - offset; - - RE_ENTRANT_CHECK_OFF; - /* Copy all registers in stack order. */ - if (__copy_from_user(((u_char *)&S387->st_space)+offset, d, other)) - return -1; - if ( offset ) - if (__copy_from_user((u_char *)&S387->st_space, d+other, offset)) - return -1; - RE_ENTRANT_CHECK_ON; - - /* The tags may need to be corrected now. */ - tags = S387->twd; - newtop = S387->ftop; - for ( i = 0; i < 8; i++ ) - { - regnr = (i+newtop) & 7; - if ( ((tags >> ((regnr & 7)*2)) & 3) != TAG_Empty ) - { - /* The loaded data over-rides all other cases. */ - tag = FPU_tagof((FPU_REG *)((u_char *)S387->st_space + 10*regnr)); - tags &= ~(3 << (regnr*2)); - tags |= (tag & 3) << (regnr*2); + struct i387_soft_struct *s387 = &target->thread.i387.soft; + void *space = s387->st_space; + int ret; + int offset, other, i, tags, regnr, tag, newtop; + + RE_ENTRANT_CHECK_OFF; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, s387, 0, + offsetof(struct i387_soft_struct, st_space)); + RE_ENTRANT_CHECK_ON; + + if (ret) + return ret; + + S387->ftop = (S387->swd >> SW_Top_Shift) & 7; + offset = (S387->ftop & 7) * 10; + other = 80 - offset; + + RE_ENTRANT_CHECK_OFF; + + /* Copy all registers in stack order. */ + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + space + offset, 0, other); + if (!ret && offset) + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + space, 0, offset); + + RE_ENTRANT_CHECK_ON; + + /* The tags may need to be corrected now. */ + tags = S387->twd; + newtop = S387->ftop; + for (i = 0; i < 8; i++) { + regnr = (i + newtop) & 7; + if (((tags >> ((regnr & 7) * 2)) & 3) != TAG_Empty) { + /* The loaded data over-rides all other cases. */ + tag = + FPU_tagof((FPU_REG *) ((u_char *) S387->st_space + + 10 * regnr)); + tags &= ~(3 << (regnr * 2)); + tags |= (tag & 3) << (regnr * 2); + } } - } - S387->twd = tags; + S387->twd = tags; - return 0; + return ret; } - -int save_i387_soft(void *s387, struct _fpstate __user * buf) +int fpregs_soft_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) { - u_char __user *d = (u_char __user *)buf; - int offset = (S387->ftop & 7) * 10, other = 80 - offset; + struct i387_soft_struct *s387 = &target->thread.i387.soft; + const void *space = s387->st_space; + int ret; + int offset = (S387->ftop & 7) * 10, other = 80 - offset; + + RE_ENTRANT_CHECK_OFF; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 7*4 + 8*10); #ifdef PECULIAR_486 - S387->cwd &= ~0xe080; - /* An 80486 sets nearly all of the reserved bits to 1. */ - S387->cwd |= 0xffff0040; - S387->swd = sstatus_word() | 0xffff0000; - S387->twd |= 0xffff0000; - S387->fcs &= ~0xf8000000; - S387->fos |= 0xffff0000; + S387->cwd &= ~0xe080; + /* An 80486 sets nearly all of the reserved bits to 1. */ + S387->cwd |= 0xffff0040; + S387->swd = sstatus_word() | 0xffff0000; + S387->twd |= 0xffff0000; + S387->fcs &= ~0xf8000000; + S387->fos |= 0xffff0000; #endif /* PECULIAR_486 */ - if (__copy_to_user(d, &S387->cwd, 7*4)) - return -1; - RE_ENTRANT_CHECK_ON; - - d += 7*4; - - RE_ENTRANT_CHECK_OFF; - /* Copy all registers in stack order. */ - if (__copy_to_user(d, ((u_char *)&S387->st_space)+offset, other)) - return -1; - if ( offset ) - if (__copy_to_user(d+other, (u_char *)&S387->st_space, offset)) - return -1; - RE_ENTRANT_CHECK_ON; - - return 1; + + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, s387, 0, + offsetof(struct i387_soft_struct, st_space)); + + /* Copy all registers in stack order. */ + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + space + offset, 0, other); + if (!ret) + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + space, 0, offset); + + RE_ENTRANT_CHECK_ON; + + return ret; } diff --git a/arch/x86/math-emu/fpu_etc.c b/arch/x86/math-emu/fpu_etc.c index e3b5d465587..233e5af566f 100644 --- a/arch/x86/math-emu/fpu_etc.c +++ b/arch/x86/math-emu/fpu_etc.c @@ -16,128 +16,115 @@ #include "status_w.h" #include "reg_constant.h" - static void fchs(FPU_REG *st0_ptr, u_char st0tag) { - if ( st0tag ^ TAG_Empty ) - { - signbyte(st0_ptr) ^= SIGN_NEG; - clear_C1(); - } - else - FPU_stack_underflow(); + if (st0tag ^ TAG_Empty) { + signbyte(st0_ptr) ^= SIGN_NEG; + clear_C1(); + } else + FPU_stack_underflow(); } - static void fabs(FPU_REG *st0_ptr, u_char st0tag) { - if ( st0tag ^ TAG_Empty ) - { - setpositive(st0_ptr); - clear_C1(); - } - else - FPU_stack_underflow(); + if (st0tag ^ TAG_Empty) { + setpositive(st0_ptr); + clear_C1(); + } else + FPU_stack_underflow(); } - static void ftst_(FPU_REG *st0_ptr, u_char st0tag) { - switch (st0tag) - { - case TAG_Zero: - setcc(SW_C3); - break; - case TAG_Valid: - if (getsign(st0_ptr) == SIGN_POS) - setcc(0); - else - setcc(SW_C0); - break; - case TAG_Special: - switch ( FPU_Special(st0_ptr) ) - { - case TW_Denormal: - if (getsign(st0_ptr) == SIGN_POS) - setcc(0); - else - setcc(SW_C0); - if ( denormal_operand() < 0 ) - { -#ifdef PECULIAR_486 - /* This is weird! */ - if (getsign(st0_ptr) == SIGN_POS) + switch (st0tag) { + case TAG_Zero: setcc(SW_C3); + break; + case TAG_Valid: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + break; + case TAG_Special: + switch (FPU_Special(st0_ptr)) { + case TW_Denormal: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + if (denormal_operand() < 0) { +#ifdef PECULIAR_486 + /* This is weird! */ + if (getsign(st0_ptr) == SIGN_POS) + setcc(SW_C3); #endif /* PECULIAR_486 */ - return; - } - break; - case TW_NaN: - setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ - EXCEPTION(EX_Invalid); - break; - case TW_Infinity: - if (getsign(st0_ptr) == SIGN_POS) - setcc(0); - else - setcc(SW_C0); - break; - default: - setcc(SW_C0|SW_C2|SW_C3); /* Operand is not comparable */ - EXCEPTION(EX_INTERNAL|0x14); - break; + return; + } + break; + case TW_NaN: + setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */ + EXCEPTION(EX_Invalid); + break; + case TW_Infinity: + if (getsign(st0_ptr) == SIGN_POS) + setcc(0); + else + setcc(SW_C0); + break; + default: + setcc(SW_C0 | SW_C2 | SW_C3); /* Operand is not comparable */ + EXCEPTION(EX_INTERNAL | 0x14); + break; + } + break; + case TAG_Empty: + setcc(SW_C0 | SW_C2 | SW_C3); + EXCEPTION(EX_StackUnder); + break; } - break; - case TAG_Empty: - setcc(SW_C0|SW_C2|SW_C3); - EXCEPTION(EX_StackUnder); - break; - } } - static void fxam(FPU_REG *st0_ptr, u_char st0tag) { - int c = 0; - switch (st0tag) - { - case TAG_Empty: - c = SW_C3|SW_C0; - break; - case TAG_Zero: - c = SW_C3; - break; - case TAG_Valid: - c = SW_C2; - break; - case TAG_Special: - switch ( FPU_Special(st0_ptr) ) - { - case TW_Denormal: - c = SW_C2|SW_C3; /* Denormal */ - break; - case TW_NaN: - /* We also use NaN for unsupported types. */ - if ( (st0_ptr->sigh & 0x80000000) && (exponent(st0_ptr) == EXP_OVER) ) - c = SW_C0; - break; - case TW_Infinity: - c = SW_C2|SW_C0; - break; + int c = 0; + switch (st0tag) { + case TAG_Empty: + c = SW_C3 | SW_C0; + break; + case TAG_Zero: + c = SW_C3; + break; + case TAG_Valid: + c = SW_C2; + break; + case TAG_Special: + switch (FPU_Special(st0_ptr)) { + case TW_Denormal: + c = SW_C2 | SW_C3; /* Denormal */ + break; + case TW_NaN: + /* We also use NaN for unsupported types. */ + if ((st0_ptr->sigh & 0x80000000) + && (exponent(st0_ptr) == EXP_OVER)) + c = SW_C0; + break; + case TW_Infinity: + c = SW_C2 | SW_C0; + break; + } } - } - if ( getsign(st0_ptr) == SIGN_NEG ) - c |= SW_C1; - setcc(c); + if (getsign(st0_ptr) == SIGN_NEG) + c |= SW_C1; + setcc(c); } - static FUNC_ST0 const fp_etc_table[] = { - fchs, fabs, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal, - ftst_, fxam, (FUNC_ST0)FPU_illegal, (FUNC_ST0)FPU_illegal + fchs, fabs, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal, + ftst_, fxam, (FUNC_ST0) FPU_illegal, (FUNC_ST0) FPU_illegal }; void FPU_etc(void) { - (fp_etc_table[FPU_rm])(&st(0), FPU_gettag0()); + (fp_etc_table[FPU_rm]) (&st(0), FPU_gettag0()); } diff --git a/arch/x86/math-emu/fpu_proto.h b/arch/x86/math-emu/fpu_proto.h index 37a8a7fe7e2..aa49b6a0d85 100644 --- a/arch/x86/math-emu/fpu_proto.h +++ b/arch/x86/math-emu/fpu_proto.h @@ -66,7 +66,7 @@ extern int FPU_Special(FPU_REG const *ptr); extern int isNaN(FPU_REG const *ptr); extern void FPU_pop(void); extern int FPU_empty_i(int stnr); -extern int FPU_stackoverflow(FPU_REG **st_new_ptr); +extern int FPU_stackoverflow(FPU_REG ** st_new_ptr); extern void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr); extern void FPU_copy_to_reg1(FPU_REG const *r, u_char tag); extern void FPU_copy_to_reg0(FPU_REG const *r, u_char tag); @@ -75,21 +75,23 @@ extern void FPU_triga(void); extern void FPU_trigb(void); /* get_address.c */ extern void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, fpu_addr_modes addr_modes); + struct address *addr, + fpu_addr_modes addr_modes); extern void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, fpu_addr_modes addr_modes); + struct address *addr, + fpu_addr_modes addr_modes); /* load_store.c */ extern int FPU_load_store(u_char type, fpu_addr_modes addr_modes, - void __user *data_address); + void __user * data_address); /* poly_2xm1.c */ -extern int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result); +extern int poly_2xm1(u_char sign, FPU_REG * arg, FPU_REG *result); /* poly_atan.c */ -extern void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, +extern void poly_atan(FPU_REG * st0_ptr, u_char st0_tag, FPU_REG *st1_ptr, u_char st1_tag); /* poly_l2.c */ extern void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign); extern int poly_l2p1(u_char s0, u_char s1, FPU_REG *r0, FPU_REG *r1, - FPU_REG *d); + FPU_REG * d); /* poly_sin.c */ extern void poly_sine(FPU_REG *st0_ptr); extern void poly_cos(FPU_REG *st0_ptr); @@ -117,10 +119,13 @@ extern int FPU_load_int32(long __user *_s, FPU_REG *loaded_data); extern int FPU_load_int16(short __user *_s, FPU_REG *loaded_data); extern int FPU_load_bcd(u_char __user *s); extern int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, - long double __user *d); -extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat); -extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single); -extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d); + long double __user * d); +extern int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, + double __user * dfloat); +extern int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, + float __user * single); +extern int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, + long long __user * d); extern int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d); extern int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d); extern int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d); @@ -137,4 +142,3 @@ extern int FPU_div(int flags, int regrm, int control_w); /* reg_convert.c */ extern int FPU_to_exp16(FPU_REG const *a, FPU_REG *x); #endif /* _FPU_PROTO_H */ - diff --git a/arch/x86/math-emu/fpu_tags.c b/arch/x86/math-emu/fpu_tags.c index cb436fe20e4..d9c657cd774 100644 --- a/arch/x86/math-emu/fpu_tags.c +++ b/arch/x86/math-emu/fpu_tags.c @@ -14,114 +14,102 @@ #include "fpu_system.h" #include "exception.h" - void FPU_pop(void) { - fpu_tag_word |= 3 << ((top & 7)*2); - top++; + fpu_tag_word |= 3 << ((top & 7) * 2); + top++; } - int FPU_gettag0(void) { - return (fpu_tag_word >> ((top & 7)*2)) & 3; + return (fpu_tag_word >> ((top & 7) * 2)) & 3; } - int FPU_gettagi(int stnr) { - return (fpu_tag_word >> (((top+stnr) & 7)*2)) & 3; + return (fpu_tag_word >> (((top + stnr) & 7) * 2)) & 3; } - int FPU_gettag(int regnr) { - return (fpu_tag_word >> ((regnr & 7)*2)) & 3; + return (fpu_tag_word >> ((regnr & 7) * 2)) & 3; } - void FPU_settag0(int tag) { - int regnr = top; - regnr &= 7; - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + int regnr = top; + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } - void FPU_settagi(int stnr, int tag) { - int regnr = stnr+top; - regnr &= 7; - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + int regnr = stnr + top; + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } - void FPU_settag(int regnr, int tag) { - regnr &= 7; - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + regnr &= 7; + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } - int FPU_Special(FPU_REG const *ptr) { - int exp = exponent(ptr); - - if ( exp == EXP_BIAS+EXP_UNDER ) - return TW_Denormal; - else if ( exp != EXP_BIAS+EXP_OVER ) - return TW_NaN; - else if ( (ptr->sigh == 0x80000000) && (ptr->sigl == 0) ) - return TW_Infinity; - return TW_NaN; + int exp = exponent(ptr); + + if (exp == EXP_BIAS + EXP_UNDER) + return TW_Denormal; + else if (exp != EXP_BIAS + EXP_OVER) + return TW_NaN; + else if ((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) + return TW_Infinity; + return TW_NaN; } - int isNaN(FPU_REG const *ptr) { - return ( (exponent(ptr) == EXP_BIAS+EXP_OVER) - && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0)) ); + return ((exponent(ptr) == EXP_BIAS + EXP_OVER) + && !((ptr->sigh == 0x80000000) && (ptr->sigl == 0))); } - int FPU_empty_i(int stnr) { - int regnr = (top+stnr) & 7; + int regnr = (top + stnr) & 7; - return ((fpu_tag_word >> (regnr*2)) & 3) == TAG_Empty; + return ((fpu_tag_word >> (regnr * 2)) & 3) == TAG_Empty; } - -int FPU_stackoverflow(FPU_REG **st_new_ptr) +int FPU_stackoverflow(FPU_REG ** st_new_ptr) { - *st_new_ptr = &st(-1); + *st_new_ptr = &st(-1); - return ((fpu_tag_word >> (((top - 1) & 7)*2)) & 3) != TAG_Empty; + return ((fpu_tag_word >> (((top - 1) & 7) * 2)) & 3) != TAG_Empty; } - void FPU_copy_to_regi(FPU_REG const *r, u_char tag, int stnr) { - reg_copy(r, &st(stnr)); - FPU_settagi(stnr, tag); + reg_copy(r, &st(stnr)); + FPU_settagi(stnr, tag); } void FPU_copy_to_reg1(FPU_REG const *r, u_char tag) { - reg_copy(r, &st(1)); - FPU_settagi(1, tag); + reg_copy(r, &st(1)); + FPU_settagi(1, tag); } void FPU_copy_to_reg0(FPU_REG const *r, u_char tag) { - int regnr = top; - regnr &= 7; + int regnr = top; + regnr &= 7; - reg_copy(r, &st(0)); + reg_copy(r, &st(0)); - fpu_tag_word &= ~(3 << (regnr*2)); - fpu_tag_word |= (tag & 3) << (regnr*2); + fpu_tag_word &= ~(3 << (regnr * 2)); + fpu_tag_word |= (tag & 3) << (regnr * 2); } diff --git a/arch/x86/math-emu/fpu_trig.c b/arch/x86/math-emu/fpu_trig.c index 403cbde1d42..ecd06680581 100644 --- a/arch/x86/math-emu/fpu_trig.c +++ b/arch/x86/math-emu/fpu_trig.c @@ -15,11 +15,10 @@ #include "fpu_emu.h" #include "status_w.h" #include "control_w.h" -#include "reg_constant.h" +#include "reg_constant.h" static void rem_kernel(unsigned long long st0, unsigned long long *y, - unsigned long long st1, - unsigned long long q, int n); + unsigned long long st1, unsigned long long q, int n); #define BETTER_THAN_486 @@ -33,788 +32,706 @@ static void rem_kernel(unsigned long long st0, unsigned long long *y, precision of the result sometimes degrades to about 63.9 bits */ static int trig_arg(FPU_REG *st0_ptr, int even) { - FPU_REG tmp; - u_char tmptag; - unsigned long long q; - int old_cw = control_word, saved_status = partial_status; - int tag, st0_tag = TAG_Valid; - - if ( exponent(st0_ptr) >= 63 ) - { - partial_status |= SW_C2; /* Reduction incomplete. */ - return -1; - } - - control_word &= ~CW_RC; - control_word |= RC_CHOP; - - setpositive(st0_ptr); - tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f, - SIGN_POS); - - FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow - to 2^64 */ - q = significand(&tmp); - if ( q ) - { - rem_kernel(significand(st0_ptr), - &significand(&tmp), - significand(&CONST_PI2), - q, exponent(st0_ptr) - exponent(&CONST_PI2)); - setexponent16(&tmp, exponent(&CONST_PI2)); - st0_tag = FPU_normalize(&tmp); - FPU_copy_to_reg0(&tmp, st0_tag); - } - - if ( (even && !(q & 1)) || (!even && (q & 1)) ) - { - st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, FULL_PRECISION); + FPU_REG tmp; + u_char tmptag; + unsigned long long q; + int old_cw = control_word, saved_status = partial_status; + int tag, st0_tag = TAG_Valid; + + if (exponent(st0_ptr) >= 63) { + partial_status |= SW_C2; /* Reduction incomplete. */ + return -1; + } -#ifdef BETTER_THAN_486 - /* So far, the results are exact but based upon a 64 bit - precision approximation to pi/2. The technique used - now is equivalent to using an approximation to pi/2 which - is accurate to about 128 bits. */ - if ( (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) || (q > 1) ) - { - /* This code gives the effect of having pi/2 to better than - 128 bits precision. */ - - significand(&tmp) = q + 1; - setexponent16(&tmp, 63); - FPU_normalize(&tmp); - tmptag = - FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, SIGN_POS, - exponent(&CONST_PI2extra) + exponent(&tmp)); - setsign(&tmp, getsign(&CONST_PI2extra)); - st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); - if ( signnegative(st0_ptr) ) - { - /* CONST_PI2extra is negative, so the result of the addition - can be negative. This means that the argument is actually - in a different quadrant. The correction is always < pi/2, - so it can't overflow into yet another quadrant. */ - setpositive(st0_ptr); - q++; - } + control_word &= ~CW_RC; + control_word |= RC_CHOP; + + setpositive(st0_ptr); + tag = FPU_u_div(st0_ptr, &CONST_PI2, &tmp, PR_64_BITS | RC_CHOP | 0x3f, + SIGN_POS); + + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't overflow + to 2^64 */ + q = significand(&tmp); + if (q) { + rem_kernel(significand(st0_ptr), + &significand(&tmp), + significand(&CONST_PI2), + q, exponent(st0_ptr) - exponent(&CONST_PI2)); + setexponent16(&tmp, exponent(&CONST_PI2)); + st0_tag = FPU_normalize(&tmp); + FPU_copy_to_reg0(&tmp, st0_tag); } + + if ((even && !(q & 1)) || (!even && (q & 1))) { + st0_tag = + FPU_sub(REV | LOADED | TAG_Valid, (int)&CONST_PI2, + FULL_PRECISION); + +#ifdef BETTER_THAN_486 + /* So far, the results are exact but based upon a 64 bit + precision approximation to pi/2. The technique used + now is equivalent to using an approximation to pi/2 which + is accurate to about 128 bits. */ + if ((exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64) + || (q > 1)) { + /* This code gives the effect of having pi/2 to better than + 128 bits precision. */ + + significand(&tmp) = q + 1; + setexponent16(&tmp, 63); + FPU_normalize(&tmp); + tmptag = + FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, + FULL_PRECISION, SIGN_POS, + exponent(&CONST_PI2extra) + + exponent(&tmp)); + setsign(&tmp, getsign(&CONST_PI2extra)); + st0_tag = FPU_add(&tmp, tmptag, 0, FULL_PRECISION); + if (signnegative(st0_ptr)) { + /* CONST_PI2extra is negative, so the result of the addition + can be negative. This means that the argument is actually + in a different quadrant. The correction is always < pi/2, + so it can't overflow into yet another quadrant. */ + setpositive(st0_ptr); + q++; + } + } #endif /* BETTER_THAN_486 */ - } + } #ifdef BETTER_THAN_486 - else - { - /* So far, the results are exact but based upon a 64 bit - precision approximation to pi/2. The technique used - now is equivalent to using an approximation to pi/2 which - is accurate to about 128 bits. */ - if ( ((q > 0) && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) - || (q > 1) ) - { - /* This code gives the effect of having p/2 to better than - 128 bits precision. */ - - significand(&tmp) = q; - setexponent16(&tmp, 63); - FPU_normalize(&tmp); /* This must return TAG_Valid */ - tmptag = FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, FULL_PRECISION, - SIGN_POS, - exponent(&CONST_PI2extra) + exponent(&tmp)); - setsign(&tmp, getsign(&CONST_PI2extra)); - st0_tag = FPU_sub(LOADED|(tmptag & 0x0f), (int)&tmp, - FULL_PRECISION); - if ( (exponent(st0_ptr) == exponent(&CONST_PI2)) && - ((st0_ptr->sigh > CONST_PI2.sigh) - || ((st0_ptr->sigh == CONST_PI2.sigh) - && (st0_ptr->sigl > CONST_PI2.sigl))) ) - { - /* CONST_PI2extra is negative, so the result of the - subtraction can be larger than pi/2. This means - that the argument is actually in a different quadrant. - The correction is always < pi/2, so it can't overflow - into yet another quadrant. */ - st0_tag = FPU_sub(REV|LOADED|TAG_Valid, (int)&CONST_PI2, - FULL_PRECISION); - q++; - } + else { + /* So far, the results are exact but based upon a 64 bit + precision approximation to pi/2. The technique used + now is equivalent to using an approximation to pi/2 which + is accurate to about 128 bits. */ + if (((q > 0) + && (exponent(st0_ptr) <= exponent(&CONST_PI2extra) + 64)) + || (q > 1)) { + /* This code gives the effect of having p/2 to better than + 128 bits precision. */ + + significand(&tmp) = q; + setexponent16(&tmp, 63); + FPU_normalize(&tmp); /* This must return TAG_Valid */ + tmptag = + FPU_u_mul(&CONST_PI2extra, &tmp, &tmp, + FULL_PRECISION, SIGN_POS, + exponent(&CONST_PI2extra) + + exponent(&tmp)); + setsign(&tmp, getsign(&CONST_PI2extra)); + st0_tag = FPU_sub(LOADED | (tmptag & 0x0f), (int)&tmp, + FULL_PRECISION); + if ((exponent(st0_ptr) == exponent(&CONST_PI2)) && + ((st0_ptr->sigh > CONST_PI2.sigh) + || ((st0_ptr->sigh == CONST_PI2.sigh) + && (st0_ptr->sigl > CONST_PI2.sigl)))) { + /* CONST_PI2extra is negative, so the result of the + subtraction can be larger than pi/2. This means + that the argument is actually in a different quadrant. + The correction is always < pi/2, so it can't overflow + into yet another quadrant. */ + st0_tag = + FPU_sub(REV | LOADED | TAG_Valid, + (int)&CONST_PI2, FULL_PRECISION); + q++; + } + } } - } #endif /* BETTER_THAN_486 */ - FPU_settag0(st0_tag); - control_word = old_cw; - partial_status = saved_status & ~SW_C2; /* Reduction complete. */ + FPU_settag0(st0_tag); + control_word = old_cw; + partial_status = saved_status & ~SW_C2; /* Reduction complete. */ - return (q & 3) | even; + return (q & 3) | even; } - /* Convert a long to register */ static void convert_l2reg(long const *arg, int deststnr) { - int tag; - long num = *arg; - u_char sign; - FPU_REG *dest = &st(deststnr); - - if (num == 0) - { - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - return; - } - - if (num > 0) - { sign = SIGN_POS; } - else - { num = -num; sign = SIGN_NEG; } - - dest->sigh = num; - dest->sigl = 0; - setexponent16(dest, 31); - tag = FPU_normalize(dest); - FPU_settagi(deststnr, tag); - setsign(dest, sign); - return; -} + int tag; + long num = *arg; + u_char sign; + FPU_REG *dest = &st(deststnr); + if (num == 0) { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + return; + } + + if (num > 0) { + sign = SIGN_POS; + } else { + num = -num; + sign = SIGN_NEG; + } + + dest->sigh = num; + dest->sigl = 0; + setexponent16(dest, 31); + tag = FPU_normalize(dest); + FPU_settagi(deststnr, tag); + setsign(dest, sign); + return; +} static void single_arg_error(FPU_REG *st0_ptr, u_char st0_tag) { - if ( st0_tag == TAG_Empty ) - FPU_stack_underflow(); /* Puts a QNaN in st(0) */ - else if ( st0_tag == TW_NaN ) - real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ + if (st0_tag == TAG_Empty) + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + else if (st0_tag == TW_NaN) + real_1op_NaN(st0_ptr); /* return with a NaN in st(0) */ #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL|0x0112); + else + EXCEPTION(EX_INTERNAL | 0x0112); #endif /* PARANOID */ } - static void single_arg_2_error(FPU_REG *st0_ptr, u_char st0_tag) { - int isNaN; - - switch ( st0_tag ) - { - case TW_NaN: - isNaN = (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000); - if ( isNaN && !(st0_ptr->sigh & 0x40000000) ) /* Signaling ? */ - { - EXCEPTION(EX_Invalid); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - /* Convert to a QNaN */ - st0_ptr->sigh |= 0x40000000; - push(); - FPU_copy_to_reg0(st0_ptr, TAG_Special); - } - } - else if ( isNaN ) - { - /* A QNaN */ - push(); - FPU_copy_to_reg0(st0_ptr, TAG_Special); - } - else - { - /* pseudoNaN or other unsupported */ - EXCEPTION(EX_Invalid); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - push(); - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - } - } - break; /* return with a NaN in st(0) */ + int isNaN; + + switch (st0_tag) { + case TW_NaN: + isNaN = (exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000); + if (isNaN && !(st0_ptr->sigh & 0x40000000)) { /* Signaling ? */ + EXCEPTION(EX_Invalid); + if (control_word & CW_Invalid) { + /* The masked response */ + /* Convert to a QNaN */ + st0_ptr->sigh |= 0x40000000; + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + } + } else if (isNaN) { + /* A QNaN */ + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + } else { + /* pseudoNaN or other unsupported */ + EXCEPTION(EX_Invalid); + if (control_word & CW_Invalid) { + /* The masked response */ + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + push(); + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + } + } + break; /* return with a NaN in st(0) */ #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x0112); + default: + EXCEPTION(EX_INTERNAL | 0x0112); #endif /* PARANOID */ - } + } } - /*---------------------------------------------------------------------------*/ static void f2xm1(FPU_REG *st0_ptr, u_char tag) { - FPU_REG a; + FPU_REG a; - clear_C1(); + clear_C1(); - if ( tag == TAG_Valid ) - { - /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ - if ( exponent(st0_ptr) < 0 ) - { - denormal_arg: + if (tag == TAG_Valid) { + /* For an 80486 FPU, the result is undefined if the arg is >= 1.0 */ + if (exponent(st0_ptr) < 0) { + denormal_arg: - FPU_to_exp16(st0_ptr, &a); + FPU_to_exp16(st0_ptr, &a); - /* poly_2xm1(x) requires 0 < st(0) < 1. */ - poly_2xm1(getsign(st0_ptr), &a, st0_ptr); + /* poly_2xm1(x) requires 0 < st(0) < 1. */ + poly_2xm1(getsign(st0_ptr), &a, st0_ptr); + } + set_precision_flag_up(); /* 80486 appears to always do this */ + return; } - set_precision_flag_up(); /* 80486 appears to always do this */ - return; - } - if ( tag == TAG_Zero ) - return; + if (tag == TAG_Zero) + return; - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); - switch ( tag ) - { - case TW_Denormal: - if ( denormal_operand() < 0 ) - return; - goto denormal_arg; - case TW_Infinity: - if ( signnegative(st0_ptr) ) - { - /* -infinity gives -1 (p16-10) */ - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - setnegative(st0_ptr); + switch (tag) { + case TW_Denormal: + if (denormal_operand() < 0) + return; + goto denormal_arg; + case TW_Infinity: + if (signnegative(st0_ptr)) { + /* -infinity gives -1 (p16-10) */ + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setnegative(st0_ptr); + } + return; + default: + single_arg_error(st0_ptr, tag); } - return; - default: - single_arg_error(st0_ptr, tag); - } } - static void fptan(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st_new_ptr; - int q; - u_char arg_sign = getsign(st0_ptr); - - /* Stack underflow has higher priority */ - if ( st0_tag == TAG_Empty ) - { - FPU_stack_underflow(); /* Puts a QNaN in st(0) */ - if ( control_word & CW_Invalid ) - { - st_new_ptr = &st(-1); - push(); - FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + FPU_REG *st_new_ptr; + int q; + u_char arg_sign = getsign(st0_ptr); + + /* Stack underflow has higher priority */ + if (st0_tag == TAG_Empty) { + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + if (control_word & CW_Invalid) { + st_new_ptr = &st(-1); + push(); + FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + } + return; } - return; - } - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - if ( st0_tag == TAG_Valid ) - { - if ( exponent(st0_ptr) > -40 ) - { - if ( (q = trig_arg(st0_ptr, 0)) == -1 ) - { - /* Operand is out of range */ - return; - } - - poly_tan(st0_ptr); - setsign(st0_ptr, (q & 1) ^ (arg_sign != 0)); - set_precision_flag_up(); /* We do not really know if up or down */ + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; } - else - { - /* For a small arg, the result == the argument */ - /* Underflow may happen */ - denormal_arg: + if (st0_tag == TAG_Valid) { + if (exponent(st0_ptr) > -40) { + if ((q = trig_arg(st0_ptr, 0)) == -1) { + /* Operand is out of range */ + return; + } + + poly_tan(st0_ptr); + setsign(st0_ptr, (q & 1) ^ (arg_sign != 0)); + set_precision_flag_up(); /* We do not really know if up or down */ + } else { + /* For a small arg, the result == the argument */ + /* Underflow may happen */ + + denormal_arg: + + FPU_to_exp16(st0_ptr, st0_ptr); - FPU_to_exp16(st0_ptr, st0_ptr); - - st0_tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); - FPU_settag0(st0_tag); + st0_tag = + FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); + FPU_settag0(st0_tag); + } + push(); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + return; } - push(); - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - return; - } - - if ( st0_tag == TAG_Zero ) - { - push(); - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - setcc(0); - return; - } - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( st0_tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return; - goto denormal_arg; - } - - if ( st0_tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - if ( arith_invalid(0) >= 0 ) - { - st_new_ptr = &st(-1); - push(); - arith_invalid(0); + if (st0_tag == TAG_Zero) { + push(); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setcc(0); + return; + } + + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (st0_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + + goto denormal_arg; } - return; - } - single_arg_2_error(st0_ptr, st0_tag); -} + if (st0_tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + if (arith_invalid(0) >= 0) { + st_new_ptr = &st(-1); + push(); + arith_invalid(0); + } + return; + } + single_arg_2_error(st0_ptr, st0_tag); +} static void fxtract(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st_new_ptr; - u_char sign; - register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - clear_C1(); - - if ( st0_tag == TAG_Valid ) - { - long e; - - push(); - sign = getsign(st1_ptr); - reg_copy(st1_ptr, st_new_ptr); - setexponent16(st_new_ptr, exponent(st_new_ptr)); - - denormal_arg: - - e = exponent16(st_new_ptr); - convert_l2reg(&e, 1); - setexponentpos(st_new_ptr, 0); - setsign(st_new_ptr, sign); - FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */ - return; - } - else if ( st0_tag == TAG_Zero ) - { - sign = getsign(st0_ptr); - - if ( FPU_divide_by_zero(0, SIGN_NEG) < 0 ) - return; + FPU_REG *st_new_ptr; + u_char sign; + register FPU_REG *st1_ptr = st0_ptr; /* anticipate */ - push(); - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - setsign(st_new_ptr, sign); - return; - } + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); + clear_C1(); - if ( st0_tag == TW_Denormal ) - { - if (denormal_operand() < 0 ) - return; + if (st0_tag == TAG_Valid) { + long e; - push(); - sign = getsign(st1_ptr); - FPU_to_exp16(st1_ptr, st_new_ptr); - goto denormal_arg; - } - else if ( st0_tag == TW_Infinity ) - { - sign = getsign(st0_ptr); - setpositive(st0_ptr); - push(); - FPU_copy_to_reg0(&CONST_INF, TAG_Special); - setsign(st_new_ptr, sign); - return; - } - else if ( st0_tag == TW_NaN ) - { - if ( real_1op_NaN(st0_ptr) < 0 ) - return; + push(); + sign = getsign(st1_ptr); + reg_copy(st1_ptr, st_new_ptr); + setexponent16(st_new_ptr, exponent(st_new_ptr)); + + denormal_arg: + + e = exponent16(st_new_ptr); + convert_l2reg(&e, 1); + setexponentpos(st_new_ptr, 0); + setsign(st_new_ptr, sign); + FPU_settag0(TAG_Valid); /* Needed if arg was a denormal */ + return; + } else if (st0_tag == TAG_Zero) { + sign = getsign(st0_ptr); + + if (FPU_divide_by_zero(0, SIGN_NEG) < 0) + return; - push(); - FPU_copy_to_reg0(st0_ptr, TAG_Special); - return; - } - else if ( st0_tag == TAG_Empty ) - { - /* Is this the correct behaviour? */ - if ( control_word & EX_Invalid ) - { - FPU_stack_underflow(); - push(); - FPU_stack_underflow(); + push(); + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(st_new_ptr, sign); + return; + } + + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (st0_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + + push(); + sign = getsign(st1_ptr); + FPU_to_exp16(st1_ptr, st_new_ptr); + goto denormal_arg; + } else if (st0_tag == TW_Infinity) { + sign = getsign(st0_ptr); + setpositive(st0_ptr); + push(); + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + setsign(st_new_ptr, sign); + return; + } else if (st0_tag == TW_NaN) { + if (real_1op_NaN(st0_ptr) < 0) + return; + + push(); + FPU_copy_to_reg0(st0_ptr, TAG_Special); + return; + } else if (st0_tag == TAG_Empty) { + /* Is this the correct behaviour? */ + if (control_word & EX_Invalid) { + FPU_stack_underflow(); + push(); + FPU_stack_underflow(); + } else + EXCEPTION(EX_StackUnder); } - else - EXCEPTION(EX_StackUnder); - } #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL | 0x119); + else + EXCEPTION(EX_INTERNAL | 0x119); #endif /* PARANOID */ } - static void fdecstp(void) { - clear_C1(); - top--; + clear_C1(); + top--; } static void fincstp(void) { - clear_C1(); - top++; + clear_C1(); + top++; } - static void fsqrt_(FPU_REG *st0_ptr, u_char st0_tag) { - int expon; - - clear_C1(); - - if ( st0_tag == TAG_Valid ) - { - u_char tag; - - if (signnegative(st0_ptr)) - { - arith_invalid(0); /* sqrt(negative) is invalid */ - return; - } + int expon; + + clear_C1(); - /* make st(0) in [1.0 .. 4.0) */ - expon = exponent(st0_ptr); - - denormal_arg: - - setexponent16(st0_ptr, (expon & 1)); - - /* Do the computation, the sign of the result will be positive. */ - tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); - addexponent(st0_ptr, expon >> 1); - FPU_settag0(tag); - return; - } - - if ( st0_tag == TAG_Zero ) - return; - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( st0_tag == TW_Infinity ) - { - if ( signnegative(st0_ptr) ) - arith_invalid(0); /* sqrt(-Infinity) is invalid */ - return; - } - else if ( st0_tag == TW_Denormal ) - { - if (signnegative(st0_ptr)) - { - arith_invalid(0); /* sqrt(negative) is invalid */ - return; + if (st0_tag == TAG_Valid) { + u_char tag; + + if (signnegative(st0_ptr)) { + arith_invalid(0); /* sqrt(negative) is invalid */ + return; + } + + /* make st(0) in [1.0 .. 4.0) */ + expon = exponent(st0_ptr); + + denormal_arg: + + setexponent16(st0_ptr, (expon & 1)); + + /* Do the computation, the sign of the result will be positive. */ + tag = wm_sqrt(st0_ptr, 0, 0, control_word, SIGN_POS); + addexponent(st0_ptr, expon >> 1); + FPU_settag0(tag); + return; } - if ( denormal_operand() < 0 ) - return; + if (st0_tag == TAG_Zero) + return; - FPU_to_exp16(st0_ptr, st0_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); - expon = exponent16(st0_ptr); + if (st0_tag == TW_Infinity) { + if (signnegative(st0_ptr)) + arith_invalid(0); /* sqrt(-Infinity) is invalid */ + return; + } else if (st0_tag == TW_Denormal) { + if (signnegative(st0_ptr)) { + arith_invalid(0); /* sqrt(negative) is invalid */ + return; + } - goto denormal_arg; - } + if (denormal_operand() < 0) + return; - single_arg_error(st0_ptr, st0_tag); + FPU_to_exp16(st0_ptr, st0_ptr); -} + expon = exponent16(st0_ptr); + + goto denormal_arg; + } + single_arg_error(st0_ptr, st0_tag); + +} static void frndint_(FPU_REG *st0_ptr, u_char st0_tag) { - int flags, tag; + int flags, tag; - if ( st0_tag == TAG_Valid ) - { - u_char sign; + if (st0_tag == TAG_Valid) { + u_char sign; - denormal_arg: + denormal_arg: - sign = getsign(st0_ptr); + sign = getsign(st0_ptr); - if (exponent(st0_ptr) > 63) - return; + if (exponent(st0_ptr) > 63) + return; + + if (st0_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + } + + /* Fortunately, this can't overflow to 2^64 */ + if ((flags = FPU_round_to_int(st0_ptr, st0_tag))) + set_precision_flag(flags); - if ( st0_tag == TW_Denormal ) - { - if (denormal_operand() < 0 ) - return; + setexponent16(st0_ptr, 63); + tag = FPU_normalize(st0_ptr); + setsign(st0_ptr, sign); + FPU_settag0(tag); + return; } - /* Fortunately, this can't overflow to 2^64 */ - if ( (flags = FPU_round_to_int(st0_ptr, st0_tag)) ) - set_precision_flag(flags); - - setexponent16(st0_ptr, 63); - tag = FPU_normalize(st0_ptr); - setsign(st0_ptr, sign); - FPU_settag0(tag); - return; - } - - if ( st0_tag == TAG_Zero ) - return; - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( st0_tag == TW_Denormal ) - goto denormal_arg; - else if ( st0_tag == TW_Infinity ) - return; - else - single_arg_error(st0_ptr, st0_tag); -} + if (st0_tag == TAG_Zero) + return; + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (st0_tag == TW_Denormal) + goto denormal_arg; + else if (st0_tag == TW_Infinity) + return; + else + single_arg_error(st0_ptr, st0_tag); +} static int fsin(FPU_REG *st0_ptr, u_char tag) { - u_char arg_sign = getsign(st0_ptr); - - if ( tag == TAG_Valid ) - { - int q; - - if ( exponent(st0_ptr) > -40 ) - { - if ( (q = trig_arg(st0_ptr, 0)) == -1 ) - { - /* Operand is out of range */ - return 1; - } - - poly_sine(st0_ptr); - - if (q & 2) - changesign(st0_ptr); - - setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); - - /* We do not really know if up or down */ - set_precision_flag_up(); - return 0; + u_char arg_sign = getsign(st0_ptr); + + if (tag == TAG_Valid) { + int q; + + if (exponent(st0_ptr) > -40) { + if ((q = trig_arg(st0_ptr, 0)) == -1) { + /* Operand is out of range */ + return 1; + } + + poly_sine(st0_ptr); + + if (q & 2) + changesign(st0_ptr); + + setsign(st0_ptr, getsign(st0_ptr) ^ arg_sign); + + /* We do not really know if up or down */ + set_precision_flag_up(); + return 0; + } else { + /* For a small arg, the result == the argument */ + set_precision_flag_up(); /* Must be up. */ + return 0; + } } - else - { - /* For a small arg, the result == the argument */ - set_precision_flag_up(); /* Must be up. */ - return 0; + + if (tag == TAG_Zero) { + setcc(0); + return 0; } - } - - if ( tag == TAG_Zero ) - { - setcc(0); - return 0; - } - - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - - if ( tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return 1; - - /* For a small arg, the result == the argument */ - /* Underflow may happen */ - FPU_to_exp16(st0_ptr, st0_ptr); - - tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); - - FPU_settag0(tag); - - return 0; - } - else if ( tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - arith_invalid(0); - return 1; - } - else - { - single_arg_error(st0_ptr, tag); - return 1; - } -} + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); + + if (tag == TW_Denormal) { + if (denormal_operand() < 0) + return 1; + + /* For a small arg, the result == the argument */ + /* Underflow may happen */ + FPU_to_exp16(st0_ptr, st0_ptr); + + tag = FPU_round(st0_ptr, 1, 0, FULL_PRECISION, arg_sign); + + FPU_settag0(tag); + + return 0; + } else if (tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + arith_invalid(0); + return 1; + } else { + single_arg_error(st0_ptr, tag); + return 1; + } +} static int f_cos(FPU_REG *st0_ptr, u_char tag) { - u_char st0_sign; - - st0_sign = getsign(st0_ptr); - - if ( tag == TAG_Valid ) - { - int q; - - if ( exponent(st0_ptr) > -40 ) - { - if ( (exponent(st0_ptr) < 0) - || ((exponent(st0_ptr) == 0) - && (significand(st0_ptr) <= 0xc90fdaa22168c234LL)) ) - { - poly_cos(st0_ptr); - - /* We do not really know if up or down */ - set_precision_flag_down(); - - return 0; - } - else if ( (q = trig_arg(st0_ptr, FCOS)) != -1 ) - { - poly_sine(st0_ptr); - - if ((q+1) & 2) - changesign(st0_ptr); - - /* We do not really know if up or down */ - set_precision_flag_down(); - - return 0; - } - else - { - /* Operand is out of range */ - return 1; - } - } - else - { - denormal_arg: + u_char st0_sign; + + st0_sign = getsign(st0_ptr); - setcc(0); - FPU_copy_to_reg0(&CONST_1, TAG_Valid); + if (tag == TAG_Valid) { + int q; + + if (exponent(st0_ptr) > -40) { + if ((exponent(st0_ptr) < 0) + || ((exponent(st0_ptr) == 0) + && (significand(st0_ptr) <= + 0xc90fdaa22168c234LL))) { + poly_cos(st0_ptr); + + /* We do not really know if up or down */ + set_precision_flag_down(); + + return 0; + } else if ((q = trig_arg(st0_ptr, FCOS)) != -1) { + poly_sine(st0_ptr); + + if ((q + 1) & 2) + changesign(st0_ptr); + + /* We do not really know if up or down */ + set_precision_flag_down(); + + return 0; + } else { + /* Operand is out of range */ + return 1; + } + } else { + denormal_arg: + + setcc(0); + FPU_copy_to_reg0(&CONST_1, TAG_Valid); #ifdef PECULIAR_486 - set_precision_flag_down(); /* 80486 appears to do this. */ + set_precision_flag_down(); /* 80486 appears to do this. */ #else - set_precision_flag_up(); /* Must be up. */ + set_precision_flag_up(); /* Must be up. */ #endif /* PECULIAR_486 */ - return 0; + return 0; + } + } else if (tag == TAG_Zero) { + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + setcc(0); + return 0; } - } - else if ( tag == TAG_Zero ) - { - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - setcc(0); - return 0; - } - - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - - if ( tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return 1; - - goto denormal_arg; - } - else if ( tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - arith_invalid(0); - return 1; - } - else - { - single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */ - return 1; - } -} + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); + + if (tag == TW_Denormal) { + if (denormal_operand() < 0) + return 1; + + goto denormal_arg; + } else if (tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + arith_invalid(0); + return 1; + } else { + single_arg_error(st0_ptr, tag); /* requires st0_ptr == &st(0) */ + return 1; + } +} static void fcos(FPU_REG *st0_ptr, u_char st0_tag) { - f_cos(st0_ptr, st0_tag); + f_cos(st0_ptr, st0_tag); } - static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st_new_ptr; - FPU_REG arg; - u_char tag; - - /* Stack underflow has higher priority */ - if ( st0_tag == TAG_Empty ) - { - FPU_stack_underflow(); /* Puts a QNaN in st(0) */ - if ( control_word & CW_Invalid ) - { - st_new_ptr = &st(-1); - push(); - FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + FPU_REG *st_new_ptr; + FPU_REG arg; + u_char tag; + + /* Stack underflow has higher priority */ + if (st0_tag == TAG_Empty) { + FPU_stack_underflow(); /* Puts a QNaN in st(0) */ + if (control_word & CW_Invalid) { + st_new_ptr = &st(-1); + push(); + FPU_stack_underflow(); /* Puts a QNaN in the new st(0) */ + } + return; } - return; - } - - if ( STACK_OVERFLOW ) - { FPU_stack_overflow(); return; } - - if ( st0_tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - else - tag = st0_tag; - - if ( tag == TW_NaN ) - { - single_arg_2_error(st0_ptr, TW_NaN); - return; - } - else if ( tag == TW_Infinity ) - { - /* The 80486 treats infinity as an invalid operand */ - if ( arith_invalid(0) >= 0 ) - { - /* Masked response */ - push(); - arith_invalid(0); + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; } - return; - } - - reg_copy(st0_ptr, &arg); - if ( !fsin(st0_ptr, st0_tag) ) - { - push(); - FPU_copy_to_reg0(&arg, st0_tag); - f_cos(&st(0), st0_tag); - } - else - { - /* An error, so restore st(0) */ - FPU_copy_to_reg0(&arg, st0_tag); - } -} + if (st0_tag == TAG_Special) + tag = FPU_Special(st0_ptr); + else + tag = st0_tag; + + if (tag == TW_NaN) { + single_arg_2_error(st0_ptr, TW_NaN); + return; + } else if (tag == TW_Infinity) { + /* The 80486 treats infinity as an invalid operand */ + if (arith_invalid(0) >= 0) { + /* Masked response */ + push(); + arith_invalid(0); + } + return; + } + + reg_copy(st0_ptr, &arg); + if (!fsin(st0_ptr, st0_tag)) { + push(); + FPU_copy_to_reg0(&arg, st0_tag); + f_cos(&st(0), st0_tag); + } else { + /* An error, so restore st(0) */ + FPU_copy_to_reg0(&arg, st0_tag); + } +} /*---------------------------------------------------------------------------*/ /* The following all require two arguments: st(0) and st(1) */ @@ -826,1020 +743,901 @@ static void fsincos(FPU_REG *st0_ptr, u_char st0_tag) result must be zero. */ static void rem_kernel(unsigned long long st0, unsigned long long *y, - unsigned long long st1, - unsigned long long q, int n) + unsigned long long st1, unsigned long long q, int n) { - int dummy; - unsigned long long x; - - x = st0 << n; - - /* Do the required multiplication and subtraction in the one operation */ - - /* lsw x -= lsw st1 * lsw q */ - asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1" - :"=m" (((unsigned *)&x)[0]), "=m" (((unsigned *)&x)[1]), - "=a" (dummy) - :"2" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[0]) - :"%dx"); - /* msw x -= msw st1 * lsw q */ - asm volatile ("mull %3; subl %%eax,%0" - :"=m" (((unsigned *)&x)[1]), "=a" (dummy) - :"1" (((unsigned *)&st1)[1]), "m" (((unsigned *)&q)[0]) - :"%dx"); - /* msw x -= lsw st1 * msw q */ - asm volatile ("mull %3; subl %%eax,%0" - :"=m" (((unsigned *)&x)[1]), "=a" (dummy) - :"1" (((unsigned *)&st1)[0]), "m" (((unsigned *)&q)[1]) - :"%dx"); - - *y = x; + int dummy; + unsigned long long x; + + x = st0 << n; + + /* Do the required multiplication and subtraction in the one operation */ + + /* lsw x -= lsw st1 * lsw q */ + asm volatile ("mull %4; subl %%eax,%0; sbbl %%edx,%1":"=m" + (((unsigned *)&x)[0]), "=m"(((unsigned *)&x)[1]), + "=a"(dummy) + :"2"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[0]) + :"%dx"); + /* msw x -= msw st1 * lsw q */ + asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]), + "=a"(dummy) + :"1"(((unsigned *)&st1)[1]), "m"(((unsigned *)&q)[0]) + :"%dx"); + /* msw x -= lsw st1 * msw q */ + asm volatile ("mull %3; subl %%eax,%0":"=m" (((unsigned *)&x)[1]), + "=a"(dummy) + :"1"(((unsigned *)&st1)[0]), "m"(((unsigned *)&q)[1]) + :"%dx"); + + *y = x; } - /* Remainder of st(0) / st(1) */ /* This routine produces exact results, i.e. there is never any rounding or truncation, etc of the result. */ static void do_fprem(FPU_REG *st0_ptr, u_char st0_tag, int round) { - FPU_REG *st1_ptr = &st(1); - u_char st1_tag = FPU_gettagi(1); - - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - FPU_REG tmp, st0, st1; - u_char st0_sign, st1_sign; - u_char tmptag; - int tag; - int old_cw; - int expdif; - long long q; - unsigned short saved_status; - int cc; - - fprem_valid: - /* Convert registers for internal use. */ - st0_sign = FPU_to_exp16(st0_ptr, &st0); - st1_sign = FPU_to_exp16(st1_ptr, &st1); - expdif = exponent16(&st0) - exponent16(&st1); - - old_cw = control_word; - cc = 0; - - /* We want the status following the denorm tests, but don't want - the status changed by the arithmetic operations. */ - saved_status = partial_status; - control_word &= ~CW_RC; - control_word |= RC_CHOP; - - if ( expdif < 64 ) - { - /* This should be the most common case */ - - if ( expdif > -2 ) - { - u_char sign = st0_sign ^ st1_sign; - tag = FPU_u_div(&st0, &st1, &tmp, - PR_64_BITS | RC_CHOP | 0x3f, - sign); - setsign(&tmp, sign); - - if ( exponent(&tmp) >= 0 ) - { - FPU_round_to_int(&tmp, tag); /* Fortunately, this can't - overflow to 2^64 */ - q = significand(&tmp); - - rem_kernel(significand(&st0), - &significand(&tmp), - significand(&st1), - q, expdif); - - setexponent16(&tmp, exponent16(&st1)); - } - else - { - reg_copy(&st0, &tmp); - q = 0; - } - - if ( (round == RC_RND) && (tmp.sigh & 0xc0000000) ) - { - /* We may need to subtract st(1) once more, - to get a result <= 1/2 of st(1). */ - unsigned long long x; - expdif = exponent16(&st1) - exponent16(&tmp); - if ( expdif <= 1 ) - { - if ( expdif == 0 ) - x = significand(&st1) - significand(&tmp); - else /* expdif is 1 */ - x = (significand(&st1) << 1) - significand(&tmp); - if ( (x < significand(&tmp)) || - /* or equi-distant (from 0 & st(1)) and q is odd */ - ((x == significand(&tmp)) && (q & 1) ) ) - { - st0_sign = ! st0_sign; - significand(&tmp) = x; - q++; + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + FPU_REG tmp, st0, st1; + u_char st0_sign, st1_sign; + u_char tmptag; + int tag; + int old_cw; + int expdif; + long long q; + unsigned short saved_status; + int cc; + + fprem_valid: + /* Convert registers for internal use. */ + st0_sign = FPU_to_exp16(st0_ptr, &st0); + st1_sign = FPU_to_exp16(st1_ptr, &st1); + expdif = exponent16(&st0) - exponent16(&st1); + + old_cw = control_word; + cc = 0; + + /* We want the status following the denorm tests, but don't want + the status changed by the arithmetic operations. */ + saved_status = partial_status; + control_word &= ~CW_RC; + control_word |= RC_CHOP; + + if (expdif < 64) { + /* This should be the most common case */ + + if (expdif > -2) { + u_char sign = st0_sign ^ st1_sign; + tag = FPU_u_div(&st0, &st1, &tmp, + PR_64_BITS | RC_CHOP | 0x3f, + sign); + setsign(&tmp, sign); + + if (exponent(&tmp) >= 0) { + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't + overflow to 2^64 */ + q = significand(&tmp); + + rem_kernel(significand(&st0), + &significand(&tmp), + significand(&st1), + q, expdif); + + setexponent16(&tmp, exponent16(&st1)); + } else { + reg_copy(&st0, &tmp); + q = 0; + } + + if ((round == RC_RND) + && (tmp.sigh & 0xc0000000)) { + /* We may need to subtract st(1) once more, + to get a result <= 1/2 of st(1). */ + unsigned long long x; + expdif = + exponent16(&st1) - exponent16(&tmp); + if (expdif <= 1) { + if (expdif == 0) + x = significand(&st1) - + significand(&tmp); + else /* expdif is 1 */ + x = (significand(&st1) + << 1) - + significand(&tmp); + if ((x < significand(&tmp)) || + /* or equi-distant (from 0 & st(1)) and q is odd */ + ((x == significand(&tmp)) + && (q & 1))) { + st0_sign = !st0_sign; + significand(&tmp) = x; + q++; + } + } + } + + if (q & 4) + cc |= SW_C0; + if (q & 2) + cc |= SW_C3; + if (q & 1) + cc |= SW_C1; + } else { + control_word = old_cw; + setcc(0); + return; } - } - } - - if (q & 4) cc |= SW_C0; - if (q & 2) cc |= SW_C3; - if (q & 1) cc |= SW_C1; - } - else - { - control_word = old_cw; - setcc(0); - return; - } - } - else - { - /* There is a large exponent difference ( >= 64 ) */ - /* To make much sense, the code in this section should - be done at high precision. */ - int exp_1, N; - u_char sign; - - /* prevent overflow here */ - /* N is 'a number between 32 and 63' (p26-113) */ - reg_copy(&st0, &tmp); - tmptag = st0_tag; - N = (expdif & 0x0000001f) + 32; /* This choice gives results - identical to an AMD 486 */ - setexponent16(&tmp, N); - exp_1 = exponent16(&st1); - setexponent16(&st1, 0); - expdif -= N; - - sign = getsign(&tmp) ^ st1_sign; - tag = FPU_u_div(&tmp, &st1, &tmp, PR_64_BITS | RC_CHOP | 0x3f, - sign); - setsign(&tmp, sign); - - FPU_round_to_int(&tmp, tag); /* Fortunately, this can't - overflow to 2^64 */ - - rem_kernel(significand(&st0), - &significand(&tmp), - significand(&st1), - significand(&tmp), - exponent(&tmp) - ); - setexponent16(&tmp, exp_1 + expdif); - - /* It is possible for the operation to be complete here. - What does the IEEE standard say? The Intel 80486 manual - implies that the operation will never be completed at this - point, and the behaviour of a real 80486 confirms this. - */ - if ( !(tmp.sigh | tmp.sigl) ) - { - /* The result is zero */ - control_word = old_cw; - partial_status = saved_status; - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - setsign(&st0, st0_sign); + } else { + /* There is a large exponent difference ( >= 64 ) */ + /* To make much sense, the code in this section should + be done at high precision. */ + int exp_1, N; + u_char sign; + + /* prevent overflow here */ + /* N is 'a number between 32 and 63' (p26-113) */ + reg_copy(&st0, &tmp); + tmptag = st0_tag; + N = (expdif & 0x0000001f) + 32; /* This choice gives results + identical to an AMD 486 */ + setexponent16(&tmp, N); + exp_1 = exponent16(&st1); + setexponent16(&st1, 0); + expdif -= N; + + sign = getsign(&tmp) ^ st1_sign; + tag = + FPU_u_div(&tmp, &st1, &tmp, + PR_64_BITS | RC_CHOP | 0x3f, sign); + setsign(&tmp, sign); + + FPU_round_to_int(&tmp, tag); /* Fortunately, this can't + overflow to 2^64 */ + + rem_kernel(significand(&st0), + &significand(&tmp), + significand(&st1), + significand(&tmp), exponent(&tmp) + ); + setexponent16(&tmp, exp_1 + expdif); + + /* It is possible for the operation to be complete here. + What does the IEEE standard say? The Intel 80486 manual + implies that the operation will never be completed at this + point, and the behaviour of a real 80486 confirms this. + */ + if (!(tmp.sigh | tmp.sigl)) { + /* The result is zero */ + control_word = old_cw; + partial_status = saved_status; + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(&st0, st0_sign); #ifdef PECULIAR_486 - setcc(SW_C2); + setcc(SW_C2); #else - setcc(0); + setcc(0); #endif /* PECULIAR_486 */ - return; - } - cc = SW_C2; - } + return; + } + cc = SW_C2; + } - control_word = old_cw; - partial_status = saved_status; - tag = FPU_normalize_nuo(&tmp); - reg_copy(&tmp, st0_ptr); - - /* The only condition to be looked for is underflow, - and it can occur here only if underflow is unmasked. */ - if ( (exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) - && !(control_word & CW_Underflow) ) - { - setcc(cc); - tag = arith_underflow(st0_ptr); - setsign(st0_ptr, st0_sign); - FPU_settag0(tag); - return; - } - else if ( (exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero) ) - { - stdexp(st0_ptr); - setsign(st0_ptr, st0_sign); - } - else - { - tag = FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign); - } - FPU_settag0(tag); - setcc(cc); + control_word = old_cw; + partial_status = saved_status; + tag = FPU_normalize_nuo(&tmp); + reg_copy(&tmp, st0_ptr); + + /* The only condition to be looked for is underflow, + and it can occur here only if underflow is unmasked. */ + if ((exponent16(&tmp) <= EXP_UNDER) && (tag != TAG_Zero) + && !(control_word & CW_Underflow)) { + setcc(cc); + tag = arith_underflow(st0_ptr); + setsign(st0_ptr, st0_sign); + FPU_settag0(tag); + return; + } else if ((exponent16(&tmp) > EXP_UNDER) || (tag == TAG_Zero)) { + stdexp(st0_ptr); + setsign(st0_ptr, st0_sign); + } else { + tag = + FPU_round(st0_ptr, 0, 0, FULL_PRECISION, st0_sign); + } + FPU_settag0(tag); + setcc(cc); - return; - } + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) - || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return; - goto fprem_valid; - } - else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow(); - return; - } - else if ( st0_tag == TAG_Zero ) - { - if ( st1_tag == TAG_Valid ) - { - setcc(0); return; - } - else if ( st1_tag == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return; - setcc(0); return; - } - else if ( st1_tag == TAG_Zero ) - { arith_invalid(0); return; } /* fprem(?,0) always invalid */ - else if ( st1_tag == TW_Infinity ) - { setcc(0); return; } - } - else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) - { - if ( st1_tag == TAG_Zero ) - { - arith_invalid(0); /* fprem(Valid,Zero) is invalid */ - return; - } - else if ( st1_tag != TW_NaN ) - { - if ( ((st0_tag == TW_Denormal) || (st1_tag == TW_Denormal)) - && (denormal_operand() < 0) ) - return; - - if ( st1_tag == TW_Infinity ) - { - /* fprem(Valid,Infinity) is o.k. */ - setcc(0); return; - } - } - } - else if ( st0_tag == TW_Infinity ) - { - if ( st1_tag != TW_NaN ) - { - arith_invalid(0); /* fprem(Infinity,?) is invalid */ - return; + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) { + if (denormal_operand() < 0) + return; + goto fprem_valid; + } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) { + FPU_stack_underflow(); + return; + } else if (st0_tag == TAG_Zero) { + if (st1_tag == TAG_Valid) { + setcc(0); + return; + } else if (st1_tag == TW_Denormal) { + if (denormal_operand() < 0) + return; + setcc(0); + return; + } else if (st1_tag == TAG_Zero) { + arith_invalid(0); + return; + } /* fprem(?,0) always invalid */ + else if (st1_tag == TW_Infinity) { + setcc(0); + return; + } + } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) { + if (st1_tag == TAG_Zero) { + arith_invalid(0); /* fprem(Valid,Zero) is invalid */ + return; + } else if (st1_tag != TW_NaN) { + if (((st0_tag == TW_Denormal) + || (st1_tag == TW_Denormal)) + && (denormal_operand() < 0)) + return; + + if (st1_tag == TW_Infinity) { + /* fprem(Valid,Infinity) is o.k. */ + setcc(0); + return; + } + } + } else if (st0_tag == TW_Infinity) { + if (st1_tag != TW_NaN) { + arith_invalid(0); /* fprem(Infinity,?) is invalid */ + return; + } } - } - /* One of the registers must contain a NaN if we got here. */ + /* One of the registers must contain a NaN if we got here. */ #ifdef PARANOID - if ( (st0_tag != TW_NaN) && (st1_tag != TW_NaN) ) - EXCEPTION(EX_INTERNAL | 0x118); + if ((st0_tag != TW_NaN) && (st1_tag != TW_NaN)) + EXCEPTION(EX_INTERNAL | 0x118); #endif /* PARANOID */ - real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); + real_2op_NaN(st1_ptr, st1_tag, 0, st1_ptr); } - /* ST(1) <- ST(1) * log ST; pop ST */ static void fyl2x(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st1_ptr = &st(1), exponent; - u_char st1_tag = FPU_gettagi(1); - u_char sign; - int e, tag; - - clear_C1(); - - if ( (st0_tag == TAG_Valid) && (st1_tag == TAG_Valid) ) - { - both_valid: - /* Both regs are Valid or Denormal */ - if ( signpositive(st0_ptr) ) - { - if ( st0_tag == TW_Denormal ) - FPU_to_exp16(st0_ptr, st0_ptr); - else - /* Convert st(0) for internal use. */ - setexponent16(st0_ptr, exponent(st0_ptr)); - - if ( (st0_ptr->sigh == 0x80000000) && (st0_ptr->sigl == 0) ) - { - /* Special case. The result can be precise. */ - u_char esign; - e = exponent16(st0_ptr); - if ( e >= 0 ) - { - exponent.sigh = e; - esign = SIGN_POS; - } - else - { - exponent.sigh = -e; - esign = SIGN_NEG; + FPU_REG *st1_ptr = &st(1), exponent; + u_char st1_tag = FPU_gettagi(1); + u_char sign; + int e, tag; + + clear_C1(); + + if ((st0_tag == TAG_Valid) && (st1_tag == TAG_Valid)) { + both_valid: + /* Both regs are Valid or Denormal */ + if (signpositive(st0_ptr)) { + if (st0_tag == TW_Denormal) + FPU_to_exp16(st0_ptr, st0_ptr); + else + /* Convert st(0) for internal use. */ + setexponent16(st0_ptr, exponent(st0_ptr)); + + if ((st0_ptr->sigh == 0x80000000) + && (st0_ptr->sigl == 0)) { + /* Special case. The result can be precise. */ + u_char esign; + e = exponent16(st0_ptr); + if (e >= 0) { + exponent.sigh = e; + esign = SIGN_POS; + } else { + exponent.sigh = -e; + esign = SIGN_NEG; + } + exponent.sigl = 0; + setexponent16(&exponent, 31); + tag = FPU_normalize_nuo(&exponent); + stdexp(&exponent); + setsign(&exponent, esign); + tag = + FPU_mul(&exponent, tag, 1, FULL_PRECISION); + if (tag >= 0) + FPU_settagi(1, tag); + } else { + /* The usual case */ + sign = getsign(st1_ptr); + if (st1_tag == TW_Denormal) + FPU_to_exp16(st1_ptr, st1_ptr); + else + /* Convert st(1) for internal use. */ + setexponent16(st1_ptr, + exponent(st1_ptr)); + poly_l2(st0_ptr, st1_ptr, sign); + } + } else { + /* negative */ + if (arith_invalid(1) < 0) + return; } - exponent.sigl = 0; - setexponent16(&exponent, 31); - tag = FPU_normalize_nuo(&exponent); - stdexp(&exponent); - setsign(&exponent, esign); - tag = FPU_mul(&exponent, tag, 1, FULL_PRECISION); - if ( tag >= 0 ) - FPU_settagi(1, tag); - } - else - { - /* The usual case */ - sign = getsign(st1_ptr); - if ( st1_tag == TW_Denormal ) - FPU_to_exp16(st1_ptr, st1_ptr); - else - /* Convert st(1) for internal use. */ - setexponent16(st1_ptr, exponent(st1_ptr)); - poly_l2(st0_ptr, st1_ptr, sign); - } - } - else - { - /* negative */ - if ( arith_invalid(1) < 0 ) - return; - } - FPU_pop(); - - return; - } - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); - - if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow_pop(1); - return; - } - else if ( (st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal) ) - { - if ( st0_tag == TAG_Zero ) - { - if ( st1_tag == TAG_Zero ) - { - /* Both args zero is invalid */ - if ( arith_invalid(1) < 0 ) - return; - } - else - { - u_char sign; - sign = getsign(st1_ptr)^SIGN_NEG; - if ( FPU_divide_by_zero(1, sign) < 0 ) - return; + FPU_pop(); - setsign(st1_ptr, sign); - } - } - else if ( st1_tag == TAG_Zero ) - { - /* st(1) contains zero, st(0) valid <> 0 */ - /* Zero is the valid answer */ - sign = getsign(st1_ptr); - - if ( signnegative(st0_ptr) ) - { - /* log(negative) */ - if ( arith_invalid(1) < 0 ) return; - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - else - { - if ( exponent(st0_ptr) < 0 ) - sign ^= SIGN_NEG; - - FPU_copy_to_reg1(&CONST_Z, TAG_Zero); - setsign(st1_ptr, sign); - } } - else - { - /* One or both operands are denormals. */ - if ( denormal_operand() < 0 ) - return; - goto both_valid; - } - } - else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - /* One or both arg must be an infinity */ - else if ( st0_tag == TW_Infinity ) - { - if ( (signnegative(st0_ptr)) || (st1_tag == TAG_Zero) ) - { - /* log(-infinity) or 0*log(infinity) */ - if ( arith_invalid(1) < 0 ) - return; - } - else - { - u_char sign = getsign(st1_ptr); - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - FPU_copy_to_reg1(&CONST_INF, TAG_Special); - setsign(st1_ptr, sign); - } - } - /* st(1) must be infinity here */ - else if ( ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) - && ( signpositive(st0_ptr) ) ) - { - if ( exponent(st0_ptr) >= 0 ) - { - if ( (exponent(st0_ptr) == 0) && - (st0_ptr->sigh == 0x80000000) && - (st0_ptr->sigl == 0) ) - { - /* st(0) holds 1.0 */ - /* infinity*log(1) */ - if ( arith_invalid(1) < 0 ) + if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) { + FPU_stack_underflow_pop(1); return; - } - /* else st(0) is positive and > 1.0 */ + } else if ((st0_tag <= TW_Denormal) && (st1_tag <= TW_Denormal)) { + if (st0_tag == TAG_Zero) { + if (st1_tag == TAG_Zero) { + /* Both args zero is invalid */ + if (arith_invalid(1) < 0) + return; + } else { + u_char sign; + sign = getsign(st1_ptr) ^ SIGN_NEG; + if (FPU_divide_by_zero(1, sign) < 0) + return; + + setsign(st1_ptr, sign); + } + } else if (st1_tag == TAG_Zero) { + /* st(1) contains zero, st(0) valid <> 0 */ + /* Zero is the valid answer */ + sign = getsign(st1_ptr); + + if (signnegative(st0_ptr)) { + /* log(negative) */ + if (arith_invalid(1) < 0) + return; + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + else { + if (exponent(st0_ptr) < 0) + sign ^= SIGN_NEG; + + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + setsign(st1_ptr, sign); + } + } else { + /* One or both operands are denormals. */ + if (denormal_operand() < 0) + return; + goto both_valid; + } + } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + } + /* One or both arg must be an infinity */ + else if (st0_tag == TW_Infinity) { + if ((signnegative(st0_ptr)) || (st1_tag == TAG_Zero)) { + /* log(-infinity) or 0*log(infinity) */ + if (arith_invalid(1) < 0) + return; + } else { + u_char sign = getsign(st1_ptr); + + if ((st1_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + FPU_copy_to_reg1(&CONST_INF, TAG_Special); + setsign(st1_ptr, sign); + } } - else - { - /* st(0) is positive and < 1.0 */ + /* st(1) must be infinity here */ + else if (((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) + && (signpositive(st0_ptr))) { + if (exponent(st0_ptr) >= 0) { + if ((exponent(st0_ptr) == 0) && + (st0_ptr->sigh == 0x80000000) && + (st0_ptr->sigl == 0)) { + /* st(0) holds 1.0 */ + /* infinity*log(1) */ + if (arith_invalid(1) < 0) + return; + } + /* else st(0) is positive and > 1.0 */ + } else { + /* st(0) is positive and < 1.0 */ - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; - changesign(st1_ptr); - } - } - else - { - /* st(0) must be zero or negative */ - if ( st0_tag == TAG_Zero ) - { - /* This should be invalid, but a real 80486 is happy with it. */ + changesign(st1_ptr); + } + } else { + /* st(0) must be zero or negative */ + if (st0_tag == TAG_Zero) { + /* This should be invalid, but a real 80486 is happy with it. */ #ifndef PECULIAR_486 - sign = getsign(st1_ptr); - if ( FPU_divide_by_zero(1, sign) < 0 ) - return; + sign = getsign(st1_ptr); + if (FPU_divide_by_zero(1, sign) < 0) + return; #endif /* PECULIAR_486 */ - changesign(st1_ptr); + changesign(st1_ptr); + } else if (arith_invalid(1) < 0) /* log(negative) */ + return; } - else if ( arith_invalid(1) < 0 ) /* log(negative) */ - return; - } - FPU_pop(); + FPU_pop(); } - static void fpatan(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st1_ptr = &st(1); - u_char st1_tag = FPU_gettagi(1); - int tag; + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + int tag; - clear_C1(); - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - valid_atan: + clear_C1(); + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + valid_atan: - poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); + poly_atan(st0_ptr, st0_tag, st1_ptr, st1_tag); - FPU_pop(); + FPU_pop(); - return; - } + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) - || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return; + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) { + if (denormal_operand() < 0) + return; - goto valid_atan; - } - else if ( (st0_tag == TAG_Empty) || (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow_pop(1); - return; - } - else if ( (st0_tag == TW_NaN) || (st1_tag == TW_NaN) ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0 ) - FPU_pop(); - return; - } - else if ( (st0_tag == TW_Infinity) || (st1_tag == TW_Infinity) ) - { - u_char sign = getsign(st1_ptr); - if ( st0_tag == TW_Infinity ) - { - if ( st1_tag == TW_Infinity ) - { - if ( signpositive(st0_ptr) ) - { - FPU_copy_to_reg1(&CONST_PI4, TAG_Valid); - } - else - { - setpositive(st1_ptr); - tag = FPU_u_add(&CONST_PI4, &CONST_PI2, st1_ptr, - FULL_PRECISION, SIGN_POS, - exponent(&CONST_PI4), exponent(&CONST_PI2)); - if ( tag >= 0 ) - FPU_settagi(1, tag); - } - } - else - { - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) + goto valid_atan; + } else if ((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) { + FPU_stack_underflow_pop(1); + return; + } else if ((st0_tag == TW_NaN) || (st1_tag == TW_NaN)) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) >= 0) + FPU_pop(); return; + } else if ((st0_tag == TW_Infinity) || (st1_tag == TW_Infinity)) { + u_char sign = getsign(st1_ptr); + if (st0_tag == TW_Infinity) { + if (st1_tag == TW_Infinity) { + if (signpositive(st0_ptr)) { + FPU_copy_to_reg1(&CONST_PI4, TAG_Valid); + } else { + setpositive(st1_ptr); + tag = + FPU_u_add(&CONST_PI4, &CONST_PI2, + st1_ptr, FULL_PRECISION, + SIGN_POS, + exponent(&CONST_PI4), + exponent(&CONST_PI2)); + if (tag >= 0) + FPU_settagi(1, tag); + } + } else { + if ((st1_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + if (signpositive(st0_ptr)) { + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + setsign(st1_ptr, sign); /* An 80486 preserves the sign */ + FPU_pop(); + return; + } else { + FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + } + } + } else { + /* st(1) is infinity, st(0) not infinity */ + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; - if ( signpositive(st0_ptr) ) - { - FPU_copy_to_reg1(&CONST_Z, TAG_Zero); - setsign(st1_ptr, sign); /* An 80486 preserves the sign */ - FPU_pop(); - return; + FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); } - else - { - FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + setsign(st1_ptr, sign); + } else if (st1_tag == TAG_Zero) { + /* st(0) must be valid or zero */ + u_char sign = getsign(st1_ptr); + + if ((st0_tag == TW_Denormal) && (denormal_operand() < 0)) + return; + + if (signpositive(st0_ptr)) { + /* An 80486 preserves the sign */ + FPU_pop(); + return; } - } - } - else - { - /* st(1) is infinity, st(0) not infinity */ - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); - } - setsign(st1_ptr, sign); - } - else if ( st1_tag == TAG_Zero ) - { - /* st(0) must be valid or zero */ - u_char sign = getsign(st1_ptr); - - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + FPU_copy_to_reg1(&CONST_PI, TAG_Valid); + setsign(st1_ptr, sign); + } else if (st0_tag == TAG_Zero) { + /* st(1) must be TAG_Valid here */ + u_char sign = getsign(st1_ptr); - if ( signpositive(st0_ptr) ) - { - /* An 80486 preserves the sign */ - FPU_pop(); - return; - } + if ((st1_tag == TW_Denormal) && (denormal_operand() < 0)) + return; - FPU_copy_to_reg1(&CONST_PI, TAG_Valid); - setsign(st1_ptr, sign); - } - else if ( st0_tag == TAG_Zero ) - { - /* st(1) must be TAG_Valid here */ - u_char sign = getsign(st1_ptr); - - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - - FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); - setsign(st1_ptr, sign); - } + FPU_copy_to_reg1(&CONST_PI2, TAG_Valid); + setsign(st1_ptr, sign); + } #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL | 0x125); + else + EXCEPTION(EX_INTERNAL | 0x125); #endif /* PARANOID */ - FPU_pop(); - set_precision_flag_up(); /* We do not really know if up or down */ + FPU_pop(); + set_precision_flag_up(); /* We do not really know if up or down */ } - static void fprem(FPU_REG *st0_ptr, u_char st0_tag) { - do_fprem(st0_ptr, st0_tag, RC_CHOP); + do_fprem(st0_ptr, st0_tag, RC_CHOP); } - static void fprem1(FPU_REG *st0_ptr, u_char st0_tag) { - do_fprem(st0_ptr, st0_tag, RC_RND); + do_fprem(st0_ptr, st0_tag, RC_RND); } - static void fyl2xp1(FPU_REG *st0_ptr, u_char st0_tag) { - u_char sign, sign1; - FPU_REG *st1_ptr = &st(1), a, b; - u_char st1_tag = FPU_gettagi(1); + u_char sign, sign1; + FPU_REG *st1_ptr = &st(1), a, b; + u_char st1_tag = FPU_gettagi(1); - clear_C1(); - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - valid_yl2xp1: + clear_C1(); + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + valid_yl2xp1: - sign = getsign(st0_ptr); - sign1 = getsign(st1_ptr); + sign = getsign(st0_ptr); + sign1 = getsign(st1_ptr); - FPU_to_exp16(st0_ptr, &a); - FPU_to_exp16(st1_ptr, &b); + FPU_to_exp16(st0_ptr, &a); + FPU_to_exp16(st1_ptr, &b); - if ( poly_l2p1(sign, sign1, &a, &b, st1_ptr) ) - return; + if (poly_l2p1(sign, sign1, &a, &b, st1_ptr)) + return; - FPU_pop(); - return; - } + FPU_pop(); + return; + } - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); - if ( ((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) + if (((st0_tag == TAG_Valid) && (st1_tag == TW_Denormal)) || ((st0_tag == TW_Denormal) && (st1_tag == TAG_Valid)) - || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return; - - goto valid_yl2xp1; - } - else if ( (st0_tag == TAG_Empty) | (st1_tag == TAG_Empty) ) - { - FPU_stack_underflow_pop(1); - return; - } - else if ( st0_tag == TAG_Zero ) - { - switch ( st1_tag ) - { - case TW_Denormal: - if ( denormal_operand() < 0 ) - return; - - case TAG_Zero: - case TAG_Valid: - setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); - FPU_copy_to_reg1(st0_ptr, st0_tag); - break; - - case TW_Infinity: - /* Infinity*log(1) */ - if ( arith_invalid(1) < 0 ) - return; - break; + || ((st0_tag == TW_Denormal) && (st1_tag == TW_Denormal))) { + if (denormal_operand() < 0) + return; - case TW_NaN: - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - break; - - default: + goto valid_yl2xp1; + } else if ((st0_tag == TAG_Empty) | (st1_tag == TAG_Empty)) { + FPU_stack_underflow_pop(1); + return; + } else if (st0_tag == TAG_Zero) { + switch (st1_tag) { + case TW_Denormal: + if (denormal_operand() < 0) + return; + + case TAG_Zero: + case TAG_Valid: + setsign(st0_ptr, getsign(st0_ptr) ^ getsign(st1_ptr)); + FPU_copy_to_reg1(st0_ptr, st0_tag); + break; + + case TW_Infinity: + /* Infinity*log(1) */ + if (arith_invalid(1) < 0) + return; + break; + + case TW_NaN: + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + break; + + default: #ifdef PARANOID - EXCEPTION(EX_INTERNAL | 0x116); - return; + EXCEPTION(EX_INTERNAL | 0x116); + return; #endif /* PARANOID */ - break; - } - } - else if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) - { - switch ( st1_tag ) - { - case TAG_Zero: - if ( signnegative(st0_ptr) ) - { - if ( exponent(st0_ptr) >= 0 ) - { - /* st(0) holds <= -1.0 */ -#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ - changesign(st1_ptr); + break; + } + } else if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) { + switch (st1_tag) { + case TAG_Zero: + if (signnegative(st0_ptr)) { + if (exponent(st0_ptr) >= 0) { + /* st(0) holds <= -1.0 */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); #else - if ( arith_invalid(1) < 0 ) - return; + if (arith_invalid(1) < 0) + return; #endif /* PECULIAR_486 */ - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - else - changesign(st1_ptr); - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - break; - - case TW_Infinity: - if ( signnegative(st0_ptr) ) - { - if ( (exponent(st0_ptr) >= 0) && - !((st0_ptr->sigh == 0x80000000) && - (st0_ptr->sigl == 0)) ) - { - /* st(0) holds < -1.0 */ -#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ - changesign(st1_ptr); + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + else + changesign(st1_ptr); + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + break; + + case TW_Infinity: + if (signnegative(st0_ptr)) { + if ((exponent(st0_ptr) >= 0) && + !((st0_ptr->sigh == 0x80000000) && + (st0_ptr->sigl == 0))) { + /* st(0) holds < -1.0 */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); #else - if ( arith_invalid(1) < 0 ) return; + if (arith_invalid(1) < 0) + return; #endif /* PECULIAR_486 */ + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + else + changesign(st1_ptr); + } else if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + break; + + case TW_NaN: + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - else - changesign(st1_ptr); - } - else if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - break; - - case TW_NaN: - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - } - else if ( st0_tag == TW_NaN ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - else if ( st0_tag == TW_Infinity ) - { - if ( st1_tag == TW_NaN ) - { - if ( real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0 ) - return; - } - else if ( signnegative(st0_ptr) ) - { + } else if (st0_tag == TW_NaN) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + } else if (st0_tag == TW_Infinity) { + if (st1_tag == TW_NaN) { + if (real_2op_NaN(st0_ptr, st0_tag, 1, st0_ptr) < 0) + return; + } else if (signnegative(st0_ptr)) { #ifndef PECULIAR_486 - /* This should have higher priority than denormals, but... */ - if ( arith_invalid(1) < 0 ) /* log(-infinity) */ - return; + /* This should have higher priority than denormals, but... */ + if (arith_invalid(1) < 0) /* log(-infinity) */ + return; #endif /* PECULIAR_486 */ - if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + if ((st1_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; #ifdef PECULIAR_486 - /* Denormal operands actually get higher priority */ - if ( arith_invalid(1) < 0 ) /* log(-infinity) */ - return; + /* Denormal operands actually get higher priority */ + if (arith_invalid(1) < 0) /* log(-infinity) */ + return; #endif /* PECULIAR_486 */ - } - else if ( st1_tag == TAG_Zero ) - { - /* log(infinity) */ - if ( arith_invalid(1) < 0 ) - return; - } - - /* st(1) must be valid here. */ + } else if (st1_tag == TAG_Zero) { + /* log(infinity) */ + if (arith_invalid(1) < 0) + return; + } - else if ( (st1_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; + /* st(1) must be valid here. */ + + else if ((st1_tag == TW_Denormal) && (denormal_operand() < 0)) + return; - /* The Manual says that log(Infinity) is invalid, but a real - 80486 sensibly says that it is o.k. */ - else - { - u_char sign = getsign(st1_ptr); - FPU_copy_to_reg1(&CONST_INF, TAG_Special); - setsign(st1_ptr, sign); + /* The Manual says that log(Infinity) is invalid, but a real + 80486 sensibly says that it is o.k. */ + else { + u_char sign = getsign(st1_ptr); + FPU_copy_to_reg1(&CONST_INF, TAG_Special); + setsign(st1_ptr, sign); + } } - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL | 0x117); - return; - } + else { + EXCEPTION(EX_INTERNAL | 0x117); + return; + } #endif /* PARANOID */ - FPU_pop(); - return; + FPU_pop(); + return; } - static void fscale(FPU_REG *st0_ptr, u_char st0_tag) { - FPU_REG *st1_ptr = &st(1); - u_char st1_tag = FPU_gettagi(1); - int old_cw = control_word; - u_char sign = getsign(st0_ptr); - - clear_C1(); - if ( !((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid)) ) - { - long scale; - FPU_REG tmp; - - /* Convert register for internal use. */ - setexponent16(st0_ptr, exponent(st0_ptr)); - - valid_scale: - - if ( exponent(st1_ptr) > 30 ) - { - /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ - - if ( signpositive(st1_ptr) ) - { - EXCEPTION(EX_Overflow); - FPU_copy_to_reg0(&CONST_INF, TAG_Special); - } - else - { - EXCEPTION(EX_Underflow); - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - } - setsign(st0_ptr, sign); - return; - } - - control_word &= ~CW_RC; - control_word |= RC_CHOP; - reg_copy(st1_ptr, &tmp); - FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */ - control_word = old_cw; - scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl; - scale += exponent16(st0_ptr); - - setexponent16(st0_ptr, scale); - - /* Use FPU_round() to properly detect under/overflow etc */ - FPU_round(st0_ptr, 0, 0, control_word, sign); - - return; - } - - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - if ( st1_tag == TAG_Special ) - st1_tag = FPU_Special(st1_ptr); - - if ( (st0_tag == TAG_Valid) || (st0_tag == TW_Denormal) ) - { - switch ( st1_tag ) - { - case TAG_Valid: - /* st(0) must be a denormal */ - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - - FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */ - goto valid_scale; - - case TAG_Zero: - if ( st0_tag == TW_Denormal ) - denormal_operand(); - return; - - case TW_Denormal: - denormal_operand(); - return; - - case TW_Infinity: - if ( (st0_tag == TW_Denormal) && (denormal_operand() < 0) ) - return; - - if ( signpositive(st1_ptr) ) - FPU_copy_to_reg0(&CONST_INF, TAG_Special); - else - FPU_copy_to_reg0(&CONST_Z, TAG_Zero); - setsign(st0_ptr, sign); - return; + FPU_REG *st1_ptr = &st(1); + u_char st1_tag = FPU_gettagi(1); + int old_cw = control_word; + u_char sign = getsign(st0_ptr); + + clear_C1(); + if (!((st0_tag ^ TAG_Valid) | (st1_tag ^ TAG_Valid))) { + long scale; + FPU_REG tmp; + + /* Convert register for internal use. */ + setexponent16(st0_ptr, exponent(st0_ptr)); + + valid_scale: + + if (exponent(st1_ptr) > 30) { + /* 2^31 is far too large, would require 2^(2^30) or 2^(-2^30) */ + + if (signpositive(st1_ptr)) { + EXCEPTION(EX_Overflow); + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + } else { + EXCEPTION(EX_Underflow); + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + } + setsign(st0_ptr, sign); + return; + } - case TW_NaN: - real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); - return; - } - } - else if ( st0_tag == TAG_Zero ) - { - switch ( st1_tag ) - { - case TAG_Valid: - case TAG_Zero: - return; + control_word &= ~CW_RC; + control_word |= RC_CHOP; + reg_copy(st1_ptr, &tmp); + FPU_round_to_int(&tmp, st1_tag); /* This can never overflow here */ + control_word = old_cw; + scale = signnegative(st1_ptr) ? -tmp.sigl : tmp.sigl; + scale += exponent16(st0_ptr); - case TW_Denormal: - denormal_operand(); - return; + setexponent16(st0_ptr, scale); - case TW_Infinity: - if ( signpositive(st1_ptr) ) - arith_invalid(0); /* Zero scaled by +Infinity */ - return; + /* Use FPU_round() to properly detect under/overflow etc */ + FPU_round(st0_ptr, 0, 0, control_word, sign); - case TW_NaN: - real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); - return; + return; } - } - else if ( st0_tag == TW_Infinity ) - { - switch ( st1_tag ) - { - case TAG_Valid: - case TAG_Zero: - return; - - case TW_Denormal: - denormal_operand(); - return; - case TW_Infinity: - if ( signnegative(st1_ptr) ) - arith_invalid(0); /* Infinity scaled by -Infinity */ - return; - - case TW_NaN: - real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); - return; + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + if (st1_tag == TAG_Special) + st1_tag = FPU_Special(st1_ptr); + + if ((st0_tag == TAG_Valid) || (st0_tag == TW_Denormal)) { + switch (st1_tag) { + case TAG_Valid: + /* st(0) must be a denormal */ + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + FPU_to_exp16(st0_ptr, st0_ptr); /* Will not be left on stack */ + goto valid_scale; + + case TAG_Zero: + if (st0_tag == TW_Denormal) + denormal_operand(); + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if ((st0_tag == TW_Denormal) + && (denormal_operand() < 0)) + return; + + if (signpositive(st1_ptr)) + FPU_copy_to_reg0(&CONST_INF, TAG_Special); + else + FPU_copy_to_reg0(&CONST_Z, TAG_Zero); + setsign(st0_ptr, sign); + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } else if (st0_tag == TAG_Zero) { + switch (st1_tag) { + case TAG_Valid: + case TAG_Zero: + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if (signpositive(st1_ptr)) + arith_invalid(0); /* Zero scaled by +Infinity */ + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } else if (st0_tag == TW_Infinity) { + switch (st1_tag) { + case TAG_Valid: + case TAG_Zero: + return; + + case TW_Denormal: + denormal_operand(); + return; + + case TW_Infinity: + if (signnegative(st1_ptr)) + arith_invalid(0); /* Infinity scaled by -Infinity */ + return; + + case TW_NaN: + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } + } else if (st0_tag == TW_NaN) { + if (st1_tag != TAG_Empty) { + real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); + return; + } } - } - else if ( st0_tag == TW_NaN ) - { - if ( st1_tag != TAG_Empty ) - { real_2op_NaN(st1_ptr, st1_tag, 0, st0_ptr); return; } - } - #ifdef PARANOID - if ( !((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty)) ) - { - EXCEPTION(EX_INTERNAL | 0x115); - return; - } + if (!((st0_tag == TAG_Empty) || (st1_tag == TAG_Empty))) { + EXCEPTION(EX_INTERNAL | 0x115); + return; + } #endif - /* At least one of st(0), st(1) must be empty */ - FPU_stack_underflow(); + /* At least one of st(0), st(1) must be empty */ + FPU_stack_underflow(); } - /*---------------------------------------------------------------------------*/ static FUNC_ST0 const trig_table_a[] = { - f2xm1, fyl2x, fptan, fpatan, - fxtract, fprem1, (FUNC_ST0)fdecstp, (FUNC_ST0)fincstp + f2xm1, fyl2x, fptan, fpatan, + fxtract, fprem1, (FUNC_ST0) fdecstp, (FUNC_ST0) fincstp }; void FPU_triga(void) { - (trig_table_a[FPU_rm])(&st(0), FPU_gettag0()); + (trig_table_a[FPU_rm]) (&st(0), FPU_gettag0()); } - -static FUNC_ST0 const trig_table_b[] = - { - fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0)fsin, fcos - }; +static FUNC_ST0 const trig_table_b[] = { + fprem, fyl2xp1, fsqrt_, fsincos, frndint_, fscale, (FUNC_ST0) fsin, fcos +}; void FPU_trigb(void) { - (trig_table_b[FPU_rm])(&st(0), FPU_gettag0()); + (trig_table_b[FPU_rm]) (&st(0), FPU_gettag0()); } diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 2e2c51a8bd3..d701e2b39e4 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -17,7 +17,6 @@ | other processes using the emulator while swapping is in progress. | +---------------------------------------------------------------------------*/ - #include <linux/stddef.h> #include <asm/uaccess.h> @@ -27,31 +26,30 @@ #include "exception.h" #include "fpu_emu.h" - #define FPU_WRITE_BIT 0x10 static int reg_offset[] = { - offsetof(struct info,___eax), - offsetof(struct info,___ecx), - offsetof(struct info,___edx), - offsetof(struct info,___ebx), - offsetof(struct info,___esp), - offsetof(struct info,___ebp), - offsetof(struct info,___esi), - offsetof(struct info,___edi) + offsetof(struct info, ___eax), + offsetof(struct info, ___ecx), + offsetof(struct info, ___edx), + offsetof(struct info, ___ebx), + offsetof(struct info, ___esp), + offsetof(struct info, ___ebp), + offsetof(struct info, ___esi), + offsetof(struct info, ___edi) }; #define REG_(x) (*(long *)(reg_offset[(x)]+(u_char *) FPU_info)) static int reg_offset_vm86[] = { - offsetof(struct info,___cs), - offsetof(struct info,___vm86_ds), - offsetof(struct info,___vm86_es), - offsetof(struct info,___vm86_fs), - offsetof(struct info,___vm86_gs), - offsetof(struct info,___ss), - offsetof(struct info,___vm86_ds) - }; + offsetof(struct info, ___cs), + offsetof(struct info, ___vm86_ds), + offsetof(struct info, ___vm86_es), + offsetof(struct info, ___vm86_fs), + offsetof(struct info, ___vm86_gs), + offsetof(struct info, ___ss), + offsetof(struct info, ___vm86_ds) +}; #define VM86_REG_(x) (*(unsigned short *) \ (reg_offset_vm86[((unsigned)x)]+(u_char *) FPU_info)) @@ -60,158 +58,141 @@ static int reg_offset_vm86[] = { #define ___GS ___ds static int reg_offset_pm[] = { - offsetof(struct info,___cs), - offsetof(struct info,___ds), - offsetof(struct info,___es), - offsetof(struct info,___fs), - offsetof(struct info,___GS), - offsetof(struct info,___ss), - offsetof(struct info,___ds) - }; + offsetof(struct info, ___cs), + offsetof(struct info, ___ds), + offsetof(struct info, ___es), + offsetof(struct info, ___fs), + offsetof(struct info, ___GS), + offsetof(struct info, ___ss), + offsetof(struct info, ___ds) +}; #define PM_REG_(x) (*(unsigned short *) \ (reg_offset_pm[((unsigned)x)]+(u_char *) FPU_info)) - /* Decode the SIB byte. This function assumes mod != 0 */ static int sib(int mod, unsigned long *fpu_eip) { - u_char ss,index,base; - long offset; - - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */ - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - ss = base >> 6; - index = (base >> 3) & 7; - base &= 7; - - if ((mod == 0) && (base == 5)) - offset = 0; /* No base register */ - else - offset = REG_(base); - - if (index == 4) - { - /* No index register */ - /* A non-zero ss is illegal */ - if ( ss ) - EXCEPTION(EX_Invalid); - } - else - { - offset += (REG_(index)) << ss; - } - - if (mod == 1) - { - /* 8 bit signed displacement */ - long displacement; - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(displacement, (signed char __user *) (*fpu_eip)); - offset += displacement; - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - } - else if (mod == 2 || base == 5) /* The second condition also has mod==0 */ - { - /* 32 bit displacement */ - long displacement; - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(4); - FPU_get_user(displacement, (long __user *) (*fpu_eip)); - offset += displacement; - RE_ENTRANT_CHECK_ON; - (*fpu_eip) += 4; - } - - return offset; -} + u_char ss, index, base; + long offset; + + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(base, (u_char __user *) (*fpu_eip)); /* The SIB byte */ + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + ss = base >> 6; + index = (base >> 3) & 7; + base &= 7; + + if ((mod == 0) && (base == 5)) + offset = 0; /* No base register */ + else + offset = REG_(base); + + if (index == 4) { + /* No index register */ + /* A non-zero ss is illegal */ + if (ss) + EXCEPTION(EX_Invalid); + } else { + offset += (REG_(index)) << ss; + } + + if (mod == 1) { + /* 8 bit signed displacement */ + long displacement; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(displacement, (signed char __user *)(*fpu_eip)); + offset += displacement; + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + } else if (mod == 2 || base == 5) { /* The second condition also has mod==0 */ + /* 32 bit displacement */ + long displacement; + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(displacement, (long __user *)(*fpu_eip)); + offset += displacement; + RE_ENTRANT_CHECK_ON; + (*fpu_eip) += 4; + } + return offset; +} -static unsigned long vm86_segment(u_char segment, - struct address *addr) +static unsigned long vm86_segment(u_char segment, struct address *addr) { - segment--; + segment--; #ifdef PARANOID - if ( segment > PREFIX_SS_ ) - { - EXCEPTION(EX_INTERNAL|0x130); - math_abort(FPU_info,SIGSEGV); - } + if (segment > PREFIX_SS_) { + EXCEPTION(EX_INTERNAL | 0x130); + math_abort(FPU_info, SIGSEGV); + } #endif /* PARANOID */ - addr->selector = VM86_REG_(segment); - return (unsigned long)VM86_REG_(segment) << 4; + addr->selector = VM86_REG_(segment); + return (unsigned long)VM86_REG_(segment) << 4; } - /* This should work for 16 and 32 bit protected mode. */ static long pm_address(u_char FPU_modrm, u_char segment, struct address *addr, long offset) -{ - struct desc_struct descriptor; - unsigned long base_address, limit, address, seg_top; +{ + struct desc_struct descriptor; + unsigned long base_address, limit, address, seg_top; - segment--; + segment--; #ifdef PARANOID - /* segment is unsigned, so this also detects if segment was 0: */ - if ( segment > PREFIX_SS_ ) - { - EXCEPTION(EX_INTERNAL|0x132); - math_abort(FPU_info,SIGSEGV); - } + /* segment is unsigned, so this also detects if segment was 0: */ + if (segment > PREFIX_SS_) { + EXCEPTION(EX_INTERNAL | 0x132); + math_abort(FPU_info, SIGSEGV); + } #endif /* PARANOID */ - switch ( segment ) - { - /* gs isn't used by the kernel, so it still has its - user-space value. */ - case PREFIX_GS_-1: - /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ - savesegment(gs, addr->selector); - break; - default: - addr->selector = PM_REG_(segment); - } - - descriptor = LDT_DESCRIPTOR(PM_REG_(segment)); - base_address = SEG_BASE_ADDR(descriptor); - address = base_address + offset; - limit = base_address - + (SEG_LIMIT(descriptor)+1) * SEG_GRANULARITY(descriptor) - 1; - if ( limit < base_address ) limit = 0xffffffff; - - if ( SEG_EXPAND_DOWN(descriptor) ) - { - if ( SEG_G_BIT(descriptor) ) - seg_top = 0xffffffff; - else - { - seg_top = base_address + (1 << 20); - if ( seg_top < base_address ) seg_top = 0xffffffff; + switch (segment) { + /* gs isn't used by the kernel, so it still has its + user-space value. */ + case PREFIX_GS_ - 1: + /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */ + savesegment(gs, addr->selector); + break; + default: + addr->selector = PM_REG_(segment); } - access_limit = - (address <= limit) || (address >= seg_top) ? 0 : - ((seg_top-address) >= 255 ? 255 : seg_top-address); - } - else - { - access_limit = - (address > limit) || (address < base_address) ? 0 : - ((limit-address) >= 254 ? 255 : limit-address+1); - } - if ( SEG_EXECUTE_ONLY(descriptor) || - (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT)) ) - { - access_limit = 0; - } - return address; -} + descriptor = LDT_DESCRIPTOR(PM_REG_(segment)); + base_address = SEG_BASE_ADDR(descriptor); + address = base_address + offset; + limit = base_address + + (SEG_LIMIT(descriptor) + 1) * SEG_GRANULARITY(descriptor) - 1; + if (limit < base_address) + limit = 0xffffffff; + + if (SEG_EXPAND_DOWN(descriptor)) { + if (SEG_G_BIT(descriptor)) + seg_top = 0xffffffff; + else { + seg_top = base_address + (1 << 20); + if (seg_top < base_address) + seg_top = 0xffffffff; + } + access_limit = + (address <= limit) || (address >= seg_top) ? 0 : + ((seg_top - address) >= 255 ? 255 : seg_top - address); + } else { + access_limit = + (address > limit) || (address < base_address) ? 0 : + ((limit - address) >= 254 ? 255 : limit - address + 1); + } + if (SEG_EXECUTE_ONLY(descriptor) || + (!SEG_WRITE_PERM(descriptor) && (FPU_modrm & FPU_WRITE_BIT))) { + access_limit = 0; + } + return address; +} /* MOD R/M byte: MOD == 3 has a special use for the FPU @@ -221,7 +202,6 @@ static long pm_address(u_char FPU_modrm, u_char segment, ..... ......... ......... MOD OPCODE(2) R/M - SIB byte 7 6 5 4 3 2 1 0 @@ -231,208 +211,194 @@ static long pm_address(u_char FPU_modrm, u_char segment, */ void __user *FPU_get_address(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, - fpu_addr_modes addr_modes) + struct address *addr, fpu_addr_modes addr_modes) +{ + u_char mod; + unsigned rm = FPU_modrm & 7; + long *cpu_reg_ptr; + int address = 0; /* Initialized just to stop compiler warnings. */ + + /* Memory accessed via the cs selector is write protected + in `non-segmented' 32 bit protected mode. */ + if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) + && (addr_modes.override.segment == PREFIX_CS_)) { + math_abort(FPU_info, SIGSEGV); + } + + addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ + + mod = (FPU_modrm >> 6) & 3; + + if (rm == 4 && mod != 3) { + address = sib(mod, fpu_eip); + } else { + cpu_reg_ptr = ®_(rm); + switch (mod) { + case 0: + if (rm == 5) { + /* Special case: disp32 */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(address, + (unsigned long __user + *)(*fpu_eip)); + (*fpu_eip) += 4; + RE_ENTRANT_CHECK_ON; + addr->offset = address; + return (void __user *)address; + } else { + address = *cpu_reg_ptr; /* Just return the contents + of the cpu register */ + addr->offset = address; + return (void __user *)address; + } + case 1: + /* 8 bit signed displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(address, (signed char __user *)(*fpu_eip)); + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + break; + case 2: + /* 32 bit displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(4); + FPU_get_user(address, (long __user *)(*fpu_eip)); + (*fpu_eip) += 4; + RE_ENTRANT_CHECK_ON; + break; + case 3: + /* Not legal for the FPU */ + EXCEPTION(EX_Invalid); + } + address += *cpu_reg_ptr; + } + + addr->offset = address; + + switch (addr_modes.default_mode) { + case 0: + break; + case VM86: + address += vm86_segment(addr_modes.override.segment, addr); + break; + case PM16: + case SEG32: + address = pm_address(FPU_modrm, addr_modes.override.segment, + addr, address); + break; + default: + EXCEPTION(EX_INTERNAL | 0x133); + } + + return (void __user *)address; +} + +void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, + struct address *addr, fpu_addr_modes addr_modes) { - u_char mod; - unsigned rm = FPU_modrm & 7; - long *cpu_reg_ptr; - int address = 0; /* Initialized just to stop compiler warnings. */ - - /* Memory accessed via the cs selector is write protected - in `non-segmented' 32 bit protected mode. */ - if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) - && (addr_modes.override.segment == PREFIX_CS_) ) - { - math_abort(FPU_info,SIGSEGV); - } - - addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ - - mod = (FPU_modrm >> 6) & 3; - - if (rm == 4 && mod != 3) - { - address = sib(mod, fpu_eip); - } - else - { - cpu_reg_ptr = & REG_(rm); - switch (mod) - { + u_char mod; + unsigned rm = FPU_modrm & 7; + int address = 0; /* Default used for mod == 0 */ + + /* Memory accessed via the cs selector is write protected + in `non-segmented' 32 bit protected mode. */ + if (!addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) + && (addr_modes.override.segment == PREFIX_CS_)) { + math_abort(FPU_info, SIGSEGV); + } + + addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ + + mod = (FPU_modrm >> 6) & 3; + + switch (mod) { case 0: - if (rm == 5) - { - /* Special case: disp32 */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(4); - FPU_get_user(address, (unsigned long __user *) (*fpu_eip)); - (*fpu_eip) += 4; - RE_ENTRANT_CHECK_ON; - addr->offset = address; - return (void __user *) address; - } - else - { - address = *cpu_reg_ptr; /* Just return the contents - of the cpu register */ - addr->offset = address; - return (void __user *) address; - } + if (rm == 6) { + /* Special case: disp16 */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(2); + FPU_get_user(address, + (unsigned short __user *)(*fpu_eip)); + (*fpu_eip) += 2; + RE_ENTRANT_CHECK_ON; + goto add_segment; + } + break; case 1: - /* 8 bit signed displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(address, (signed char __user *) (*fpu_eip)); - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - break; + /* 8 bit signed displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(1); + FPU_get_user(address, (signed char __user *)(*fpu_eip)); + RE_ENTRANT_CHECK_ON; + (*fpu_eip)++; + break; case 2: - /* 32 bit displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(4); - FPU_get_user(address, (long __user *) (*fpu_eip)); - (*fpu_eip) += 4; - RE_ENTRANT_CHECK_ON; - break; + /* 16 bit displacement */ + RE_ENTRANT_CHECK_OFF; + FPU_code_access_ok(2); + FPU_get_user(address, (unsigned short __user *)(*fpu_eip)); + (*fpu_eip) += 2; + RE_ENTRANT_CHECK_ON; + break; case 3: - /* Not legal for the FPU */ - EXCEPTION(EX_Invalid); + /* Not legal for the FPU */ + EXCEPTION(EX_Invalid); + break; + } + switch (rm) { + case 0: + address += FPU_info->___ebx + FPU_info->___esi; + break; + case 1: + address += FPU_info->___ebx + FPU_info->___edi; + break; + case 2: + address += FPU_info->___ebp + FPU_info->___esi; + if (addr_modes.override.segment == PREFIX_DEFAULT) + addr_modes.override.segment = PREFIX_SS_; + break; + case 3: + address += FPU_info->___ebp + FPU_info->___edi; + if (addr_modes.override.segment == PREFIX_DEFAULT) + addr_modes.override.segment = PREFIX_SS_; + break; + case 4: + address += FPU_info->___esi; + break; + case 5: + address += FPU_info->___edi; + break; + case 6: + address += FPU_info->___ebp; + if (addr_modes.override.segment == PREFIX_DEFAULT) + addr_modes.override.segment = PREFIX_SS_; + break; + case 7: + address += FPU_info->___ebx; + break; } - address += *cpu_reg_ptr; - } - - addr->offset = address; - - switch ( addr_modes.default_mode ) - { - case 0: - break; - case VM86: - address += vm86_segment(addr_modes.override.segment, addr); - break; - case PM16: - case SEG32: - address = pm_address(FPU_modrm, addr_modes.override.segment, - addr, address); - break; - default: - EXCEPTION(EX_INTERNAL|0x133); - } - - return (void __user *)address; -} + add_segment: + address &= 0xffff; -void __user *FPU_get_address_16(u_char FPU_modrm, unsigned long *fpu_eip, - struct address *addr, - fpu_addr_modes addr_modes) -{ - u_char mod; - unsigned rm = FPU_modrm & 7; - int address = 0; /* Default used for mod == 0 */ - - /* Memory accessed via the cs selector is write protected - in `non-segmented' 32 bit protected mode. */ - if ( !addr_modes.default_mode && (FPU_modrm & FPU_WRITE_BIT) - && (addr_modes.override.segment == PREFIX_CS_) ) - { - math_abort(FPU_info,SIGSEGV); - } - - addr->selector = FPU_DS; /* Default, for 32 bit non-segmented mode. */ - - mod = (FPU_modrm >> 6) & 3; - - switch (mod) - { - case 0: - if (rm == 6) - { - /* Special case: disp16 */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(2); - FPU_get_user(address, (unsigned short __user *) (*fpu_eip)); - (*fpu_eip) += 2; - RE_ENTRANT_CHECK_ON; - goto add_segment; + addr->offset = address; + + switch (addr_modes.default_mode) { + case 0: + break; + case VM86: + address += vm86_segment(addr_modes.override.segment, addr); + break; + case PM16: + case SEG32: + address = pm_address(FPU_modrm, addr_modes.override.segment, + addr, address); + break; + default: + EXCEPTION(EX_INTERNAL | 0x131); } - break; - case 1: - /* 8 bit signed displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(1); - FPU_get_user(address, (signed char __user *) (*fpu_eip)); - RE_ENTRANT_CHECK_ON; - (*fpu_eip)++; - break; - case 2: - /* 16 bit displacement */ - RE_ENTRANT_CHECK_OFF; - FPU_code_access_ok(2); - FPU_get_user(address, (unsigned short __user *) (*fpu_eip)); - (*fpu_eip) += 2; - RE_ENTRANT_CHECK_ON; - break; - case 3: - /* Not legal for the FPU */ - EXCEPTION(EX_Invalid); - break; - } - switch ( rm ) - { - case 0: - address += FPU_info->___ebx + FPU_info->___esi; - break; - case 1: - address += FPU_info->___ebx + FPU_info->___edi; - break; - case 2: - address += FPU_info->___ebp + FPU_info->___esi; - if ( addr_modes.override.segment == PREFIX_DEFAULT ) - addr_modes.override.segment = PREFIX_SS_; - break; - case 3: - address += FPU_info->___ebp + FPU_info->___edi; - if ( addr_modes.override.segment == PREFIX_DEFAULT ) - addr_modes.override.segment = PREFIX_SS_; - break; - case 4: - address += FPU_info->___esi; - break; - case 5: - address += FPU_info->___edi; - break; - case 6: - address += FPU_info->___ebp; - if ( addr_modes.override.segment == PREFIX_DEFAULT ) - addr_modes.override.segment = PREFIX_SS_; - break; - case 7: - address += FPU_info->___ebx; - break; - } - - add_segment: - address &= 0xffff; - - addr->offset = address; - - switch ( addr_modes.default_mode ) - { - case 0: - break; - case VM86: - address += vm86_segment(addr_modes.override.segment, addr); - break; - case PM16: - case SEG32: - address = pm_address(FPU_modrm, addr_modes.override.segment, - addr, address); - break; - default: - EXCEPTION(EX_INTERNAL|0x131); - } - - return (void __user *)address ; + + return (void __user *)address; } diff --git a/arch/x86/math-emu/load_store.c b/arch/x86/math-emu/load_store.c index eebd6fb1c8a..2931ff35521 100644 --- a/arch/x86/math-emu/load_store.c +++ b/arch/x86/math-emu/load_store.c @@ -26,247 +26,257 @@ #include "status_w.h" #include "control_w.h" - -#define _NONE_ 0 /* st0_ptr etc not needed */ -#define _REG0_ 1 /* Will be storing st(0) */ -#define _PUSH_ 3 /* Need to check for space to push onto stack */ -#define _null_ 4 /* Function illegal or not implemented */ +#define _NONE_ 0 /* st0_ptr etc not needed */ +#define _REG0_ 1 /* Will be storing st(0) */ +#define _PUSH_ 3 /* Need to check for space to push onto stack */ +#define _null_ 4 /* Function illegal or not implemented */ #define pop_0() { FPU_settag0(TAG_Empty); top++; } - static u_char const type_table[32] = { - _PUSH_, _PUSH_, _PUSH_, _PUSH_, - _null_, _null_, _null_, _null_, - _REG0_, _REG0_, _REG0_, _REG0_, - _REG0_, _REG0_, _REG0_, _REG0_, - _NONE_, _null_, _NONE_, _PUSH_, - _NONE_, _PUSH_, _null_, _PUSH_, - _NONE_, _null_, _NONE_, _REG0_, - _NONE_, _REG0_, _NONE_, _REG0_ - }; + _PUSH_, _PUSH_, _PUSH_, _PUSH_, + _null_, _null_, _null_, _null_, + _REG0_, _REG0_, _REG0_, _REG0_, + _REG0_, _REG0_, _REG0_, _REG0_, + _NONE_, _null_, _NONE_, _PUSH_, + _NONE_, _PUSH_, _null_, _PUSH_, + _NONE_, _null_, _NONE_, _REG0_, + _NONE_, _REG0_, _NONE_, _REG0_ +}; u_char const data_sizes_16[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, - 14, 0, 94, 10, 2, 10, 0, 8, - 14, 0, 94, 10, 2, 10, 2, 8 + 4, 4, 8, 2, 0, 0, 0, 0, + 4, 4, 8, 2, 4, 4, 8, 2, + 14, 0, 94, 10, 2, 10, 0, 8, + 14, 0, 94, 10, 2, 10, 2, 8 }; static u_char const data_sizes_32[32] = { - 4, 4, 8, 2, 0, 0, 0, 0, - 4, 4, 8, 2, 4, 4, 8, 2, - 28, 0,108, 10, 2, 10, 0, 8, - 28, 0,108, 10, 2, 10, 2, 8 + 4, 4, 8, 2, 0, 0, 0, 0, + 4, 4, 8, 2, 4, 4, 8, 2, + 28, 0, 108, 10, 2, 10, 0, 8, + 28, 0, 108, 10, 2, 10, 2, 8 }; int FPU_load_store(u_char type, fpu_addr_modes addr_modes, - void __user *data_address) + void __user * data_address) { - FPU_REG loaded_data; - FPU_REG *st0_ptr; - u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ - u_char loaded_tag; + FPU_REG loaded_data; + FPU_REG *st0_ptr; + u_char st0_tag = TAG_Empty; /* This is just to stop a gcc warning. */ + u_char loaded_tag; - st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ + st0_ptr = NULL; /* Initialized just to stop compiler warnings. */ - if ( addr_modes.default_mode & PROTECTED ) - { - if ( addr_modes.default_mode == SEG32 ) - { - if ( access_limit < data_sizes_32[type] ) - math_abort(FPU_info,SIGSEGV); - } - else if ( addr_modes.default_mode == PM16 ) - { - if ( access_limit < data_sizes_16[type] ) - math_abort(FPU_info,SIGSEGV); - } + if (addr_modes.default_mode & PROTECTED) { + if (addr_modes.default_mode == SEG32) { + if (access_limit < data_sizes_32[type]) + math_abort(FPU_info, SIGSEGV); + } else if (addr_modes.default_mode == PM16) { + if (access_limit < data_sizes_16[type]) + math_abort(FPU_info, SIGSEGV); + } #ifdef PARANOID - else - EXCEPTION(EX_INTERNAL|0x140); + else + EXCEPTION(EX_INTERNAL | 0x140); #endif /* PARANOID */ - } + } - switch ( type_table[type] ) - { - case _NONE_: - break; - case _REG0_: - st0_ptr = &st(0); /* Some of these instructions pop after - storing */ - st0_tag = FPU_gettag0(); - break; - case _PUSH_: - { - if ( FPU_gettagi(-1) != TAG_Empty ) - { FPU_stack_overflow(); return 0; } - top--; - st0_ptr = &st(0); - } - break; - case _null_: - FPU_illegal(); - return 0; + switch (type_table[type]) { + case _NONE_: + break; + case _REG0_: + st0_ptr = &st(0); /* Some of these instructions pop after + storing */ + st0_tag = FPU_gettag0(); + break; + case _PUSH_: + { + if (FPU_gettagi(-1) != TAG_Empty) { + FPU_stack_overflow(); + return 0; + } + top--; + st0_ptr = &st(0); + } + break; + case _null_: + FPU_illegal(); + return 0; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x141); - return 0; + default: + EXCEPTION(EX_INTERNAL | 0x141); + return 0; #endif /* PARANOID */ - } - - switch ( type ) - { - case 000: /* fld m32real */ - clear_C1(); - loaded_tag = FPU_load_single((float __user *)data_address, &loaded_data); - if ( (loaded_tag == TAG_Special) - && isNaN(&loaded_data) - && (real_1op_NaN(&loaded_data) < 0) ) - { - top++; - break; - } - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 001: /* fild m32int */ - clear_C1(); - loaded_tag = FPU_load_int32((long __user *)data_address, &loaded_data); - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 002: /* fld m64real */ - clear_C1(); - loaded_tag = FPU_load_double((double __user *)data_address, &loaded_data); - if ( (loaded_tag == TAG_Special) - && isNaN(&loaded_data) - && (real_1op_NaN(&loaded_data) < 0) ) - { - top++; - break; } - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 003: /* fild m16int */ - clear_C1(); - loaded_tag = FPU_load_int16((short __user *)data_address, &loaded_data); - FPU_copy_to_reg0(&loaded_data, loaded_tag); - break; - case 010: /* fst m32real */ - clear_C1(); - FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address); - break; - case 011: /* fist m32int */ - clear_C1(); - FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); - break; - case 012: /* fst m64real */ - clear_C1(); - FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address); - break; - case 013: /* fist m16int */ - clear_C1(); - FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); - break; - case 014: /* fstp m32real */ - clear_C1(); - if ( FPU_store_single(st0_ptr, st0_tag, (float __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 015: /* fistp m32int */ - clear_C1(); - if ( FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 016: /* fstp m64real */ - clear_C1(); - if ( FPU_store_double(st0_ptr, st0_tag, (double __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 017: /* fistp m16int */ - clear_C1(); - if ( FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 020: /* fldenv m14/28byte */ - fldenv(addr_modes, (u_char __user *)data_address); - /* Ensure that the values just loaded are not changed by - fix-up operations. */ - return 1; - case 022: /* frstor m94/108byte */ - frstor(addr_modes, (u_char __user *)data_address); - /* Ensure that the values just loaded are not changed by - fix-up operations. */ - return 1; - case 023: /* fbld m80dec */ - clear_C1(); - loaded_tag = FPU_load_bcd((u_char __user *)data_address); - FPU_settag0(loaded_tag); - break; - case 024: /* fldcw */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, data_address, 2); - FPU_get_user(control_word, (unsigned short __user *) data_address); - RE_ENTRANT_CHECK_ON; - if ( partial_status & ~control_word & CW_Exceptions ) - partial_status |= (SW_Summary | SW_Backward); - else - partial_status &= ~(SW_Summary | SW_Backward); + + switch (type) { + case 000: /* fld m32real */ + clear_C1(); + loaded_tag = + FPU_load_single((float __user *)data_address, &loaded_data); + if ((loaded_tag == TAG_Special) + && isNaN(&loaded_data) + && (real_1op_NaN(&loaded_data) < 0)) { + top++; + break; + } + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 001: /* fild m32int */ + clear_C1(); + loaded_tag = + FPU_load_int32((long __user *)data_address, &loaded_data); + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 002: /* fld m64real */ + clear_C1(); + loaded_tag = + FPU_load_double((double __user *)data_address, + &loaded_data); + if ((loaded_tag == TAG_Special) + && isNaN(&loaded_data) + && (real_1op_NaN(&loaded_data) < 0)) { + top++; + break; + } + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 003: /* fild m16int */ + clear_C1(); + loaded_tag = + FPU_load_int16((short __user *)data_address, &loaded_data); + FPU_copy_to_reg0(&loaded_data, loaded_tag); + break; + case 010: /* fst m32real */ + clear_C1(); + FPU_store_single(st0_ptr, st0_tag, + (float __user *)data_address); + break; + case 011: /* fist m32int */ + clear_C1(); + FPU_store_int32(st0_ptr, st0_tag, (long __user *)data_address); + break; + case 012: /* fst m64real */ + clear_C1(); + FPU_store_double(st0_ptr, st0_tag, + (double __user *)data_address); + break; + case 013: /* fist m16int */ + clear_C1(); + FPU_store_int16(st0_ptr, st0_tag, (short __user *)data_address); + break; + case 014: /* fstp m32real */ + clear_C1(); + if (FPU_store_single + (st0_ptr, st0_tag, (float __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 015: /* fistp m32int */ + clear_C1(); + if (FPU_store_int32 + (st0_ptr, st0_tag, (long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 016: /* fstp m64real */ + clear_C1(); + if (FPU_store_double + (st0_ptr, st0_tag, (double __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 017: /* fistp m16int */ + clear_C1(); + if (FPU_store_int16 + (st0_ptr, st0_tag, (short __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 020: /* fldenv m14/28byte */ + fldenv(addr_modes, (u_char __user *) data_address); + /* Ensure that the values just loaded are not changed by + fix-up operations. */ + return 1; + case 022: /* frstor m94/108byte */ + frstor(addr_modes, (u_char __user *) data_address); + /* Ensure that the values just loaded are not changed by + fix-up operations. */ + return 1; + case 023: /* fbld m80dec */ + clear_C1(); + loaded_tag = FPU_load_bcd((u_char __user *) data_address); + FPU_settag0(loaded_tag); + break; + case 024: /* fldcw */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, data_address, 2); + FPU_get_user(control_word, + (unsigned short __user *)data_address); + RE_ENTRANT_CHECK_ON; + if (partial_status & ~control_word & CW_Exceptions) + partial_status |= (SW_Summary | SW_Backward); + else + partial_status &= ~(SW_Summary | SW_Backward); #ifdef PECULIAR_486 - control_word |= 0x40; /* An 80486 appears to always set this bit */ + control_word |= 0x40; /* An 80486 appears to always set this bit */ #endif /* PECULIAR_486 */ - return 1; - case 025: /* fld m80real */ - clear_C1(); - loaded_tag = FPU_load_extended((long double __user *)data_address, 0); - FPU_settag0(loaded_tag); - break; - case 027: /* fild m64int */ - clear_C1(); - loaded_tag = FPU_load_int64((long long __user *)data_address); - if (loaded_tag == TAG_Error) + return 1; + case 025: /* fld m80real */ + clear_C1(); + loaded_tag = + FPU_load_extended((long double __user *)data_address, 0); + FPU_settag0(loaded_tag); + break; + case 027: /* fild m64int */ + clear_C1(); + loaded_tag = FPU_load_int64((long long __user *)data_address); + if (loaded_tag == TAG_Error) + return 0; + FPU_settag0(loaded_tag); + break; + case 030: /* fstenv m14/28byte */ + fstenv(addr_modes, (u_char __user *) data_address); + return 1; + case 032: /* fsave */ + fsave(addr_modes, (u_char __user *) data_address); + return 1; + case 033: /* fbstp m80dec */ + clear_C1(); + if (FPU_store_bcd + (st0_ptr, st0_tag, (u_char __user *) data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 034: /* fstcw m16int */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, data_address, 2); + FPU_put_user(control_word, + (unsigned short __user *)data_address); + RE_ENTRANT_CHECK_ON; + return 1; + case 035: /* fstp m80real */ + clear_C1(); + if (FPU_store_extended + (st0_ptr, st0_tag, (long double __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + case 036: /* fstsw m2byte */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, data_address, 2); + FPU_put_user(status_word(), + (unsigned short __user *)data_address); + RE_ENTRANT_CHECK_ON; + return 1; + case 037: /* fistp m64int */ + clear_C1(); + if (FPU_store_int64 + (st0_ptr, st0_tag, (long long __user *)data_address)) + pop_0(); /* pop only if the number was actually stored + (see the 80486 manual p16-28) */ + break; + } return 0; - FPU_settag0(loaded_tag); - break; - case 030: /* fstenv m14/28byte */ - fstenv(addr_modes, (u_char __user *)data_address); - return 1; - case 032: /* fsave */ - fsave(addr_modes, (u_char __user *)data_address); - return 1; - case 033: /* fbstp m80dec */ - clear_C1(); - if ( FPU_store_bcd(st0_ptr, st0_tag, (u_char __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 034: /* fstcw m16int */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,data_address,2); - FPU_put_user(control_word, (unsigned short __user *) data_address); - RE_ENTRANT_CHECK_ON; - return 1; - case 035: /* fstp m80real */ - clear_C1(); - if ( FPU_store_extended(st0_ptr, st0_tag, (long double __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - case 036: /* fstsw m2byte */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,data_address,2); - FPU_put_user(status_word(),(unsigned short __user *) data_address); - RE_ENTRANT_CHECK_ON; - return 1; - case 037: /* fistp m64int */ - clear_C1(); - if ( FPU_store_int64(st0_ptr, st0_tag, (long long __user *)data_address) ) - pop_0(); /* pop only if the number was actually stored - (see the 80486 manual p16-28) */ - break; - } - return 0; } diff --git a/arch/x86/math-emu/poly.h b/arch/x86/math-emu/poly.h index 4db79811492..168eb44c93c 100644 --- a/arch/x86/math-emu/poly.h +++ b/arch/x86/math-emu/poly.h @@ -21,9 +21,9 @@ allows. 9-byte would probably be sufficient. */ typedef struct { - unsigned long lsw; - unsigned long midw; - unsigned long msw; + unsigned long lsw; + unsigned long midw; + unsigned long msw; } Xsig; asmlinkage void mul64(unsigned long long const *a, unsigned long long const *b, @@ -49,7 +49,6 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); /* Macro to access the 8 ms bytes of an Xsig as a long long */ #define XSIG_LL(x) (*(unsigned long long *)&x.midw) - /* Need to run gcc with optimizations on to get these to actually be in-line. @@ -63,59 +62,53 @@ asmlinkage void div_Xsig(Xsig *x1, const Xsig *x2, const Xsig *dest); static inline unsigned long mul_32_32(const unsigned long arg1, const unsigned long arg2) { - int retval; - asm volatile ("mull %2; movl %%edx,%%eax" \ - :"=a" (retval) \ - :"0" (arg1), "g" (arg2) \ - :"dx"); - return retval; + int retval; + asm volatile ("mull %2; movl %%edx,%%eax":"=a" (retval) + :"0"(arg1), "g"(arg2) + :"dx"); + return retval; } - /* Add the 12 byte Xsig x2 to Xsig dest, with no checks for overflow. */ static inline void add_Xsig_Xsig(Xsig *dest, const Xsig *x2) { - asm volatile ("movl %1,%%edi; movl %2,%%esi;\n" - "movl (%%esi),%%eax; addl %%eax,(%%edi);\n" - "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n" - "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n" - :"=g" (*dest):"g" (dest), "g" (x2) - :"ax","si","di"); + asm volatile ("movl %1,%%edi; movl %2,%%esi;\n" + "movl (%%esi),%%eax; addl %%eax,(%%edi);\n" + "movl 4(%%esi),%%eax; adcl %%eax,4(%%edi);\n" + "movl 8(%%esi),%%eax; adcl %%eax,8(%%edi);\n":"=g" + (*dest):"g"(dest), "g"(x2) + :"ax", "si", "di"); } - /* Add the 12 byte Xsig x2 to Xsig dest, adjust exp if overflow occurs. */ /* Note: the constraints in the asm statement didn't always work properly with gcc 2.5.8. Changing from using edi to using ecx got around the problem, but keep fingers crossed! */ static inline void add_two_Xsig(Xsig *dest, const Xsig *x2, long int *exp) { - asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n" - "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n" - "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n" - "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n" - "jnc 0f;\n" - "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n" - "movl %4,%%ecx; incl (%%ecx)\n" - "movl $1,%%eax; jmp 1f;\n" - "0: xorl %%eax,%%eax;\n" - "1:\n" - :"=g" (*exp), "=g" (*dest) - :"g" (dest), "g" (x2), "g" (exp) - :"cx","si","ax"); + asm volatile ("movl %2,%%ecx; movl %3,%%esi;\n" + "movl (%%esi),%%eax; addl %%eax,(%%ecx);\n" + "movl 4(%%esi),%%eax; adcl %%eax,4(%%ecx);\n" + "movl 8(%%esi),%%eax; adcl %%eax,8(%%ecx);\n" + "jnc 0f;\n" + "rcrl 8(%%ecx); rcrl 4(%%ecx); rcrl (%%ecx)\n" + "movl %4,%%ecx; incl (%%ecx)\n" + "movl $1,%%eax; jmp 1f;\n" + "0: xorl %%eax,%%eax;\n" "1:\n":"=g" (*exp), "=g"(*dest) + :"g"(dest), "g"(x2), "g"(exp) + :"cx", "si", "ax"); } - /* Negate (subtract from 1.0) the 12 byte Xsig */ /* This is faster in a loop on my 386 than using the "neg" instruction. */ static inline void negate_Xsig(Xsig *x) { - asm volatile("movl %1,%%esi;\n" - "xorl %%ecx,%%ecx;\n" - "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n" - "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n" - "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n" - :"=g" (*x):"g" (x):"si","ax","cx"); + asm volatile ("movl %1,%%esi;\n" + "xorl %%ecx,%%ecx;\n" + "movl %%ecx,%%eax; subl (%%esi),%%eax; movl %%eax,(%%esi);\n" + "movl %%ecx,%%eax; sbbl 4(%%esi),%%eax; movl %%eax,4(%%esi);\n" + "movl %%ecx,%%eax; sbbl 8(%%esi),%%eax; movl %%eax,8(%%esi);\n":"=g" + (*x):"g"(x):"si", "ax", "cx"); } #endif /* _POLY_H */ diff --git a/arch/x86/math-emu/poly_2xm1.c b/arch/x86/math-emu/poly_2xm1.c index 9766ad5e974..b00e9e10cdc 100644 --- a/arch/x86/math-emu/poly_2xm1.c +++ b/arch/x86/math-emu/poly_2xm1.c @@ -17,21 +17,19 @@ #include "control_w.h" #include "poly.h" - #define HIPOWER 11 -static const unsigned long long lterms[HIPOWER] = -{ - 0x0000000000000000LL, /* This term done separately as 12 bytes */ - 0xf5fdeffc162c7543LL, - 0x1c6b08d704a0bfa6LL, - 0x0276556df749cc21LL, - 0x002bb0ffcf14f6b8LL, - 0x0002861225ef751cLL, - 0x00001ffcbfcd5422LL, - 0x00000162c005d5f1LL, - 0x0000000da96ccb1bLL, - 0x0000000078d1b897LL, - 0x000000000422b029LL +static const unsigned long long lterms[HIPOWER] = { + 0x0000000000000000LL, /* This term done separately as 12 bytes */ + 0xf5fdeffc162c7543LL, + 0x1c6b08d704a0bfa6LL, + 0x0276556df749cc21LL, + 0x002bb0ffcf14f6b8LL, + 0x0002861225ef751cLL, + 0x00001ffcbfcd5422LL, + 0x00000162c005d5f1LL, + 0x0000000da96ccb1bLL, + 0x0000000078d1b897LL, + 0x000000000422b029LL }; static const Xsig hiterm = MK_XSIG(0xb17217f7, 0xd1cf79ab, 0xc8a39194); @@ -45,112 +43,103 @@ static const Xsig shiftterm2 = MK_XSIG(0xb504f333, 0xf9de6484, 0x597d89b3); static const Xsig shiftterm3 = MK_XSIG(0xd744fcca, 0xd69d6af4, 0x39a68bb9); static const Xsig *shiftterm[] = { &shiftterm0, &shiftterm1, - &shiftterm2, &shiftterm3 }; - + &shiftterm2, &shiftterm3 +}; /*--- poly_2xm1() -----------------------------------------------------------+ | Requires st(0) which is TAG_Valid and < 1. | +---------------------------------------------------------------------------*/ -int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) +int poly_2xm1(u_char sign, FPU_REG *arg, FPU_REG *result) { - long int exponent, shift; - unsigned long long Xll; - Xsig accumulator, Denom, argSignif; - u_char tag; + long int exponent, shift; + unsigned long long Xll; + Xsig accumulator, Denom, argSignif; + u_char tag; - exponent = exponent16(arg); + exponent = exponent16(arg); #ifdef PARANOID - if ( exponent >= 0 ) /* Don't want a |number| >= 1.0 */ - { - /* Number negative, too large, or not Valid. */ - EXCEPTION(EX_INTERNAL|0x127); - return 1; - } + if (exponent >= 0) { /* Don't want a |number| >= 1.0 */ + /* Number negative, too large, or not Valid. */ + EXCEPTION(EX_INTERNAL | 0x127); + return 1; + } #endif /* PARANOID */ - argSignif.lsw = 0; - XSIG_LL(argSignif) = Xll = significand(arg); - - if ( exponent == -1 ) - { - shift = (argSignif.msw & 0x40000000) ? 3 : 2; - /* subtract 0.5 or 0.75 */ - exponent -= 2; - XSIG_LL(argSignif) <<= 2; - Xll <<= 2; - } - else if ( exponent == -2 ) - { - shift = 1; - /* subtract 0.25 */ - exponent--; - XSIG_LL(argSignif) <<= 1; - Xll <<= 1; - } - else - shift = 0; - - if ( exponent < -2 ) - { - /* Shift the argument right by the required places. */ - if ( FPU_shrx(&Xll, -2-exponent) >= 0x80000000U ) - Xll++; /* round up */ - } - - accumulator.lsw = accumulator.midw = accumulator.msw = 0; - polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER-1); - mul_Xsig_Xsig(&accumulator, &argSignif); - shr_Xsig(&accumulator, 3); - - mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ - add_two_Xsig(&accumulator, &argSignif, &exponent); - - if ( shift ) - { - /* The argument is large, use the identity: - f(x+a) = f(a) * (f(x) + 1) - 1; - */ - shr_Xsig(&accumulator, - exponent); - accumulator.msw |= 0x80000000; /* add 1.0 */ - mul_Xsig_Xsig(&accumulator, shiftterm[shift]); - accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ - exponent = 1; - } - - if ( sign != SIGN_POS ) - { - /* The argument is negative, use the identity: - f(-x) = -f(x) / (1 + f(x)) - */ - Denom.lsw = accumulator.lsw; - XSIG_LL(Denom) = XSIG_LL(accumulator); - if ( exponent < 0 ) - shr_Xsig(&Denom, - exponent); - else if ( exponent > 0 ) - { - /* exponent must be 1 here */ - XSIG_LL(Denom) <<= 1; - if ( Denom.lsw & 0x80000000 ) - XSIG_LL(Denom) |= 1; - (Denom.lsw) <<= 1; + argSignif.lsw = 0; + XSIG_LL(argSignif) = Xll = significand(arg); + + if (exponent == -1) { + shift = (argSignif.msw & 0x40000000) ? 3 : 2; + /* subtract 0.5 or 0.75 */ + exponent -= 2; + XSIG_LL(argSignif) <<= 2; + Xll <<= 2; + } else if (exponent == -2) { + shift = 1; + /* subtract 0.25 */ + exponent--; + XSIG_LL(argSignif) <<= 1; + Xll <<= 1; + } else + shift = 0; + + if (exponent < -2) { + /* Shift the argument right by the required places. */ + if (FPU_shrx(&Xll, -2 - exponent) >= 0x80000000U) + Xll++; /* round up */ + } + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + polynomial_Xsig(&accumulator, &Xll, lterms, HIPOWER - 1); + mul_Xsig_Xsig(&accumulator, &argSignif); + shr_Xsig(&accumulator, 3); + + mul_Xsig_Xsig(&argSignif, &hiterm); /* The leading term */ + add_two_Xsig(&accumulator, &argSignif, &exponent); + + if (shift) { + /* The argument is large, use the identity: + f(x+a) = f(a) * (f(x) + 1) - 1; + */ + shr_Xsig(&accumulator, -exponent); + accumulator.msw |= 0x80000000; /* add 1.0 */ + mul_Xsig_Xsig(&accumulator, shiftterm[shift]); + accumulator.msw &= 0x3fffffff; /* subtract 1.0 */ + exponent = 1; + } + + if (sign != SIGN_POS) { + /* The argument is negative, use the identity: + f(-x) = -f(x) / (1 + f(x)) + */ + Denom.lsw = accumulator.lsw; + XSIG_LL(Denom) = XSIG_LL(accumulator); + if (exponent < 0) + shr_Xsig(&Denom, -exponent); + else if (exponent > 0) { + /* exponent must be 1 here */ + XSIG_LL(Denom) <<= 1; + if (Denom.lsw & 0x80000000) + XSIG_LL(Denom) |= 1; + (Denom.lsw) <<= 1; + } + Denom.msw |= 0x80000000; /* add 1.0 */ + div_Xsig(&accumulator, &Denom, &accumulator); } - Denom.msw |= 0x80000000; /* add 1.0 */ - div_Xsig(&accumulator, &Denom, &accumulator); - } - /* Convert to 64 bit signed-compatible */ - exponent += round_Xsig(&accumulator); + /* Convert to 64 bit signed-compatible */ + exponent += round_Xsig(&accumulator); - result = &st(0); - significand(result) = XSIG_LL(accumulator); - setexponent16(result, exponent); + result = &st(0); + significand(result) = XSIG_LL(accumulator); + setexponent16(result, exponent); - tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); + tag = FPU_round(result, 1, 0, FULL_PRECISION, sign); - setsign(result, sign); - FPU_settag0(tag); + setsign(result, sign); + FPU_settag0(tag); - return 0; + return 0; } diff --git a/arch/x86/math-emu/poly_atan.c b/arch/x86/math-emu/poly_atan.c index 82f702952f6..20c28e58e2d 100644 --- a/arch/x86/math-emu/poly_atan.c +++ b/arch/x86/math-emu/poly_atan.c @@ -18,28 +18,25 @@ #include "control_w.h" #include "poly.h" - #define HIPOWERon 6 /* odd poly, negative terms */ -static const unsigned long long oddnegterms[HIPOWERon] = -{ - 0x0000000000000000LL, /* Dummy (not for - 1.0) */ - 0x015328437f756467LL, - 0x0005dda27b73dec6LL, - 0x0000226bf2bfb91aLL, - 0x000000ccc439c5f7LL, - 0x0000000355438407LL -} ; +static const unsigned long long oddnegterms[HIPOWERon] = { + 0x0000000000000000LL, /* Dummy (not for - 1.0) */ + 0x015328437f756467LL, + 0x0005dda27b73dec6LL, + 0x0000226bf2bfb91aLL, + 0x000000ccc439c5f7LL, + 0x0000000355438407LL +}; #define HIPOWERop 6 /* odd poly, positive terms */ -static const unsigned long long oddplterms[HIPOWERop] = -{ +static const unsigned long long oddplterms[HIPOWERop] = { /* 0xaaaaaaaaaaaaaaabLL, transferred to fixedpterm[] */ - 0x0db55a71875c9ac2LL, - 0x0029fce2d67880b0LL, - 0x0000dfd3908b4596LL, - 0x00000550fd61dab4LL, - 0x0000001c9422b3f9LL, - 0x000000003e3301e1LL + 0x0db55a71875c9ac2LL, + 0x0029fce2d67880b0LL, + 0x0000dfd3908b4596LL, + 0x00000550fd61dab4LL, + 0x0000001c9422b3f9LL, + 0x000000003e3301e1LL }; static const unsigned long long denomterm = 0xebd9b842c5c53a0eLL; @@ -48,182 +45,164 @@ static const Xsig fixedpterm = MK_XSIG(0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa); static const Xsig pi_signif = MK_XSIG(0xc90fdaa2, 0x2168c234, 0xc4c6628b); - /*--- poly_atan() -----------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, - FPU_REG *st1_ptr, u_char st1_tag) +void poly_atan(FPU_REG *st0_ptr, u_char st0_tag, + FPU_REG *st1_ptr, u_char st1_tag) { - u_char transformed, inverted, - sign1, sign2; - int exponent; - long int dummy_exp; - Xsig accumulator, Numer, Denom, accumulatore, argSignif, - argSq, argSqSq; - u_char tag; - - sign1 = getsign(st0_ptr); - sign2 = getsign(st1_ptr); - if ( st0_tag == TAG_Valid ) - { - exponent = exponent(st0_ptr); - } - else - { - /* This gives non-compatible stack contents... */ - FPU_to_exp16(st0_ptr, st0_ptr); - exponent = exponent16(st0_ptr); - } - if ( st1_tag == TAG_Valid ) - { - exponent -= exponent(st1_ptr); - } - else - { - /* This gives non-compatible stack contents... */ - FPU_to_exp16(st1_ptr, st1_ptr); - exponent -= exponent16(st1_ptr); - } - - if ( (exponent < 0) || ((exponent == 0) && - ((st0_ptr->sigh < st1_ptr->sigh) || - ((st0_ptr->sigh == st1_ptr->sigh) && - (st0_ptr->sigl < st1_ptr->sigl))) ) ) - { - inverted = 1; - Numer.lsw = Denom.lsw = 0; - XSIG_LL(Numer) = significand(st0_ptr); - XSIG_LL(Denom) = significand(st1_ptr); - } - else - { - inverted = 0; - exponent = -exponent; - Numer.lsw = Denom.lsw = 0; - XSIG_LL(Numer) = significand(st1_ptr); - XSIG_LL(Denom) = significand(st0_ptr); - } - div_Xsig(&Numer, &Denom, &argSignif); - exponent += norm_Xsig(&argSignif); - - if ( (exponent >= -1) - || ((exponent == -2) && (argSignif.msw > 0xd413ccd0)) ) - { - /* The argument is greater than sqrt(2)-1 (=0.414213562...) */ - /* Convert the argument by an identity for atan */ - transformed = 1; - - if ( exponent >= 0 ) - { + u_char transformed, inverted, sign1, sign2; + int exponent; + long int dummy_exp; + Xsig accumulator, Numer, Denom, accumulatore, argSignif, argSq, argSqSq; + u_char tag; + + sign1 = getsign(st0_ptr); + sign2 = getsign(st1_ptr); + if (st0_tag == TAG_Valid) { + exponent = exponent(st0_ptr); + } else { + /* This gives non-compatible stack contents... */ + FPU_to_exp16(st0_ptr, st0_ptr); + exponent = exponent16(st0_ptr); + } + if (st1_tag == TAG_Valid) { + exponent -= exponent(st1_ptr); + } else { + /* This gives non-compatible stack contents... */ + FPU_to_exp16(st1_ptr, st1_ptr); + exponent -= exponent16(st1_ptr); + } + + if ((exponent < 0) || ((exponent == 0) && + ((st0_ptr->sigh < st1_ptr->sigh) || + ((st0_ptr->sigh == st1_ptr->sigh) && + (st0_ptr->sigl < st1_ptr->sigl))))) { + inverted = 1; + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = significand(st0_ptr); + XSIG_LL(Denom) = significand(st1_ptr); + } else { + inverted = 0; + exponent = -exponent; + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = significand(st1_ptr); + XSIG_LL(Denom) = significand(st0_ptr); + } + div_Xsig(&Numer, &Denom, &argSignif); + exponent += norm_Xsig(&argSignif); + + if ((exponent >= -1) + || ((exponent == -2) && (argSignif.msw > 0xd413ccd0))) { + /* The argument is greater than sqrt(2)-1 (=0.414213562...) */ + /* Convert the argument by an identity for atan */ + transformed = 1; + + if (exponent >= 0) { #ifdef PARANOID - if ( !( (exponent == 0) && - (argSignif.lsw == 0) && (argSignif.midw == 0) && - (argSignif.msw == 0x80000000) ) ) - { - EXCEPTION(EX_INTERNAL|0x104); /* There must be a logic error */ - return; - } + if (!((exponent == 0) && + (argSignif.lsw == 0) && (argSignif.midw == 0) && + (argSignif.msw == 0x80000000))) { + EXCEPTION(EX_INTERNAL | 0x104); /* There must be a logic error */ + return; + } #endif /* PARANOID */ - argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ + argSignif.msw = 0; /* Make the transformed arg -> 0.0 */ + } else { + Numer.lsw = Denom.lsw = argSignif.lsw; + XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); + + if (exponent < -1) + shr_Xsig(&Numer, -1 - exponent); + negate_Xsig(&Numer); + + shr_Xsig(&Denom, -exponent); + Denom.msw |= 0x80000000; + + div_Xsig(&Numer, &Denom, &argSignif); + + exponent = -1 + norm_Xsig(&argSignif); + } + } else { + transformed = 0; + } + + argSq.lsw = argSignif.lsw; + argSq.midw = argSignif.midw; + argSq.msw = argSignif.msw; + mul_Xsig_Xsig(&argSq, &argSq); + + argSqSq.lsw = argSq.lsw; + argSqSq.midw = argSq.midw; + argSqSq.msw = argSq.msw; + mul_Xsig_Xsig(&argSqSq, &argSqSq); + + accumulatore.lsw = argSq.lsw; + XSIG_LL(accumulatore) = XSIG_LL(argSq); + + shr_Xsig(&argSq, 2 * (-1 - exponent - 1)); + shr_Xsig(&argSqSq, 4 * (-1 - exponent - 1)); + + /* Now have argSq etc with binary point at the left + .1xxxxxxxx */ + + /* Do the basic fixed point polynomial evaluation */ + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), + oddplterms, HIPOWERop - 1); + mul64_Xsig(&accumulator, &XSIG_LL(argSq)); + negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, + HIPOWERon - 1); + negate_Xsig(&accumulator); + add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp); + + mul64_Xsig(&accumulatore, &denomterm); + shr_Xsig(&accumulatore, 1 + 2 * (-1 - exponent)); + accumulatore.msw |= 0x80000000; + + div_Xsig(&accumulator, &accumulatore, &accumulator); + + mul_Xsig_Xsig(&accumulator, &argSignif); + mul_Xsig_Xsig(&accumulator, &argSq); + + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &argSignif); + + if (transformed) { + /* compute pi/4 - accumulator */ + shr_Xsig(&accumulator, -1 - exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = -1; + } + + if (inverted) { + /* compute pi/2 - accumulator */ + shr_Xsig(&accumulator, -exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = 0; } - else - { - Numer.lsw = Denom.lsw = argSignif.lsw; - XSIG_LL(Numer) = XSIG_LL(Denom) = XSIG_LL(argSignif); - - if ( exponent < -1 ) - shr_Xsig(&Numer, -1-exponent); - negate_Xsig(&Numer); - - shr_Xsig(&Denom, -exponent); - Denom.msw |= 0x80000000; - - div_Xsig(&Numer, &Denom, &argSignif); - - exponent = -1 + norm_Xsig(&argSignif); + + if (sign1) { + /* compute pi - accumulator */ + shr_Xsig(&accumulator, 1 - exponent); + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &pi_signif); + exponent = 1; } - } - else - { - transformed = 0; - } - - argSq.lsw = argSignif.lsw; argSq.midw = argSignif.midw; - argSq.msw = argSignif.msw; - mul_Xsig_Xsig(&argSq, &argSq); - - argSqSq.lsw = argSq.lsw; argSqSq.midw = argSq.midw; argSqSq.msw = argSq.msw; - mul_Xsig_Xsig(&argSqSq, &argSqSq); - - accumulatore.lsw = argSq.lsw; - XSIG_LL(accumulatore) = XSIG_LL(argSq); - - shr_Xsig(&argSq, 2*(-1-exponent-1)); - shr_Xsig(&argSqSq, 4*(-1-exponent-1)); - - /* Now have argSq etc with binary point at the left - .1xxxxxxxx */ - - /* Do the basic fixed point polynomial evaluation */ - accumulator.msw = accumulator.midw = accumulator.lsw = 0; - polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), - oddplterms, HIPOWERop-1); - mul64_Xsig(&accumulator, &XSIG_LL(argSq)); - negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argSqSq), oddnegterms, HIPOWERon-1); - negate_Xsig(&accumulator); - add_two_Xsig(&accumulator, &fixedpterm, &dummy_exp); - - mul64_Xsig(&accumulatore, &denomterm); - shr_Xsig(&accumulatore, 1 + 2*(-1-exponent)); - accumulatore.msw |= 0x80000000; - - div_Xsig(&accumulator, &accumulatore, &accumulator); - - mul_Xsig_Xsig(&accumulator, &argSignif); - mul_Xsig_Xsig(&accumulator, &argSq); - - shr_Xsig(&accumulator, 3); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &argSignif); - - if ( transformed ) - { - /* compute pi/4 - accumulator */ - shr_Xsig(&accumulator, -1-exponent); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &pi_signif); - exponent = -1; - } - - if ( inverted ) - { - /* compute pi/2 - accumulator */ - shr_Xsig(&accumulator, -exponent); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &pi_signif); - exponent = 0; - } - - if ( sign1 ) - { - /* compute pi - accumulator */ - shr_Xsig(&accumulator, 1 - exponent); - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &pi_signif); - exponent = 1; - } - - exponent += round_Xsig(&accumulator); - - significand(st1_ptr) = XSIG_LL(accumulator); - setexponent16(st1_ptr, exponent); - - tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2); - FPU_settagi(1, tag); - - set_precision_flag_up(); /* We do not really know if up or down, - use this as the default. */ + + exponent += round_Xsig(&accumulator); + + significand(st1_ptr) = XSIG_LL(accumulator); + setexponent16(st1_ptr, exponent); + + tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign2); + FPU_settagi(1, tag); + + set_precision_flag_up(); /* We do not really know if up or down, + use this as the default. */ } diff --git a/arch/x86/math-emu/poly_l2.c b/arch/x86/math-emu/poly_l2.c index dd00e1d5b07..8e2ff4b28a0 100644 --- a/arch/x86/math-emu/poly_l2.c +++ b/arch/x86/math-emu/poly_l2.c @@ -10,7 +10,6 @@ | | +---------------------------------------------------------------------------*/ - #include "exception.h" #include "reg_constant.h" #include "fpu_emu.h" @@ -18,184 +17,163 @@ #include "control_w.h" #include "poly.h" - static void log2_kernel(FPU_REG const *arg, u_char argsign, - Xsig *accum_result, long int *expon); - + Xsig * accum_result, long int *expon); /*--- poly_l2() -------------------------------------------------------------+ | Base 2 logarithm by a polynomial approximation. | +---------------------------------------------------------------------------*/ -void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) +void poly_l2(FPU_REG *st0_ptr, FPU_REG *st1_ptr, u_char st1_sign) { - long int exponent, expon, expon_expon; - Xsig accumulator, expon_accum, yaccum; - u_char sign, argsign; - FPU_REG x; - int tag; - - exponent = exponent16(st0_ptr); - - /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ - if ( st0_ptr->sigh > (unsigned)0xb504f334 ) - { - /* Treat as sqrt(2)/2 < st0_ptr < 1 */ - significand(&x) = - significand(st0_ptr); - setexponent16(&x, -1); - exponent++; - argsign = SIGN_NEG; - } - else - { - /* Treat as 1 <= st0_ptr < sqrt(2) */ - x.sigh = st0_ptr->sigh - 0x80000000; - x.sigl = st0_ptr->sigl; - setexponent16(&x, 0); - argsign = SIGN_POS; - } - tag = FPU_normalize_nuo(&x); - - if ( tag == TAG_Zero ) - { - expon = 0; - accumulator.msw = accumulator.midw = accumulator.lsw = 0; - } - else - { - log2_kernel(&x, argsign, &accumulator, &expon); - } - - if ( exponent < 0 ) - { - sign = SIGN_NEG; - exponent = -exponent; - } - else - sign = SIGN_POS; - expon_accum.msw = exponent; expon_accum.midw = expon_accum.lsw = 0; - if ( exponent ) - { - expon_expon = 31 + norm_Xsig(&expon_accum); - shr_Xsig(&accumulator, expon_expon - expon); - - if ( sign ^ argsign ) - negate_Xsig(&accumulator); - add_Xsig_Xsig(&accumulator, &expon_accum); - } - else - { - expon_expon = expon; - sign = argsign; - } - - yaccum.lsw = 0; XSIG_LL(yaccum) = significand(st1_ptr); - mul_Xsig_Xsig(&accumulator, &yaccum); - - expon_expon += round_Xsig(&accumulator); - - if ( accumulator.msw == 0 ) - { - FPU_copy_to_reg1(&CONST_Z, TAG_Zero); - return; - } - - significand(st1_ptr) = XSIG_LL(accumulator); - setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1); - - tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign); - FPU_settagi(1, tag); - - set_precision_flag_up(); /* 80486 appears to always do this */ - - return; + long int exponent, expon, expon_expon; + Xsig accumulator, expon_accum, yaccum; + u_char sign, argsign; + FPU_REG x; + int tag; + + exponent = exponent16(st0_ptr); + + /* From st0_ptr, make a number > sqrt(2)/2 and < sqrt(2) */ + if (st0_ptr->sigh > (unsigned)0xb504f334) { + /* Treat as sqrt(2)/2 < st0_ptr < 1 */ + significand(&x) = -significand(st0_ptr); + setexponent16(&x, -1); + exponent++; + argsign = SIGN_NEG; + } else { + /* Treat as 1 <= st0_ptr < sqrt(2) */ + x.sigh = st0_ptr->sigh - 0x80000000; + x.sigl = st0_ptr->sigl; + setexponent16(&x, 0); + argsign = SIGN_POS; + } + tag = FPU_normalize_nuo(&x); -} + if (tag == TAG_Zero) { + expon = 0; + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + } else { + log2_kernel(&x, argsign, &accumulator, &expon); + } + + if (exponent < 0) { + sign = SIGN_NEG; + exponent = -exponent; + } else + sign = SIGN_POS; + expon_accum.msw = exponent; + expon_accum.midw = expon_accum.lsw = 0; + if (exponent) { + expon_expon = 31 + norm_Xsig(&expon_accum); + shr_Xsig(&accumulator, expon_expon - expon); + + if (sign ^ argsign) + negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &expon_accum); + } else { + expon_expon = expon; + sign = argsign; + } + + yaccum.lsw = 0; + XSIG_LL(yaccum) = significand(st1_ptr); + mul_Xsig_Xsig(&accumulator, &yaccum); + + expon_expon += round_Xsig(&accumulator); + + if (accumulator.msw == 0) { + FPU_copy_to_reg1(&CONST_Z, TAG_Zero); + return; + } + + significand(st1_ptr) = XSIG_LL(accumulator); + setexponent16(st1_ptr, expon_expon + exponent16(st1_ptr) + 1); + tag = FPU_round(st1_ptr, 1, 0, FULL_PRECISION, sign ^ st1_sign); + FPU_settagi(1, tag); + + set_precision_flag_up(); /* 80486 appears to always do this */ + + return; + +} /*--- poly_l2p1() -----------------------------------------------------------+ | Base 2 logarithm by a polynomial approximation. | | log2(x+1) | +---------------------------------------------------------------------------*/ -int poly_l2p1(u_char sign0, u_char sign1, - FPU_REG *st0_ptr, FPU_REG *st1_ptr, FPU_REG *dest) +int poly_l2p1(u_char sign0, u_char sign1, + FPU_REG * st0_ptr, FPU_REG * st1_ptr, FPU_REG * dest) { - u_char tag; - long int exponent; - Xsig accumulator, yaccum; + u_char tag; + long int exponent; + Xsig accumulator, yaccum; - if ( exponent16(st0_ptr) < 0 ) - { - log2_kernel(st0_ptr, sign0, &accumulator, &exponent); + if (exponent16(st0_ptr) < 0) { + log2_kernel(st0_ptr, sign0, &accumulator, &exponent); - yaccum.lsw = 0; - XSIG_LL(yaccum) = significand(st1_ptr); - mul_Xsig_Xsig(&accumulator, &yaccum); + yaccum.lsw = 0; + XSIG_LL(yaccum) = significand(st1_ptr); + mul_Xsig_Xsig(&accumulator, &yaccum); - exponent += round_Xsig(&accumulator); + exponent += round_Xsig(&accumulator); - exponent += exponent16(st1_ptr) + 1; - if ( exponent < EXP_WAY_UNDER ) exponent = EXP_WAY_UNDER; + exponent += exponent16(st1_ptr) + 1; + if (exponent < EXP_WAY_UNDER) + exponent = EXP_WAY_UNDER; - significand(dest) = XSIG_LL(accumulator); - setexponent16(dest, exponent); + significand(dest) = XSIG_LL(accumulator); + setexponent16(dest, exponent); - tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); - FPU_settagi(1, tag); + tag = FPU_round(dest, 1, 0, FULL_PRECISION, sign0 ^ sign1); + FPU_settagi(1, tag); - if ( tag == TAG_Valid ) - set_precision_flag_up(); /* 80486 appears to always do this */ - } - else - { - /* The magnitude of st0_ptr is far too large. */ + if (tag == TAG_Valid) + set_precision_flag_up(); /* 80486 appears to always do this */ + } else { + /* The magnitude of st0_ptr is far too large. */ - if ( sign0 != SIGN_POS ) - { - /* Trying to get the log of a negative number. */ -#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ - changesign(st1_ptr); + if (sign0 != SIGN_POS) { + /* Trying to get the log of a negative number. */ +#ifdef PECULIAR_486 /* Stupid 80486 doesn't worry about log(negative). */ + changesign(st1_ptr); #else - if ( arith_invalid(1) < 0 ) - return 1; + if (arith_invalid(1) < 0) + return 1; #endif /* PECULIAR_486 */ - } + } - /* 80486 appears to do this */ - if ( sign0 == SIGN_NEG ) - set_precision_flag_down(); - else - set_precision_flag_up(); - } + /* 80486 appears to do this */ + if (sign0 == SIGN_NEG) + set_precision_flag_down(); + else + set_precision_flag_up(); + } - if ( exponent(dest) <= EXP_UNDER ) - EXCEPTION(EX_Underflow); + if (exponent(dest) <= EXP_UNDER) + EXCEPTION(EX_Underflow); - return 0; + return 0; } - - - #undef HIPOWER #define HIPOWER 10 -static const unsigned long long logterms[HIPOWER] = -{ - 0x2a8eca5705fc2ef0LL, - 0xf6384ee1d01febceLL, - 0x093bb62877cdf642LL, - 0x006985d8a9ec439bLL, - 0x0005212c4f55a9c8LL, - 0x00004326a16927f0LL, - 0x0000038d1d80a0e7LL, - 0x0000003141cc80c6LL, - 0x00000002b1668c9fLL, - 0x000000002c7a46aaLL +static const unsigned long long logterms[HIPOWER] = { + 0x2a8eca5705fc2ef0LL, + 0xf6384ee1d01febceLL, + 0x093bb62877cdf642LL, + 0x006985d8a9ec439bLL, + 0x0005212c4f55a9c8LL, + 0x00004326a16927f0LL, + 0x0000038d1d80a0e7LL, + 0x0000003141cc80c6LL, + 0x00000002b1668c9fLL, + 0x000000002c7a46aaLL }; static const unsigned long leadterm = 0xb8000000; - /*--- log2_kernel() ---------------------------------------------------------+ | Base 2 logarithm by a polynomial approximation. | | log2(x+1) | @@ -203,70 +181,64 @@ static const unsigned long leadterm = 0xb8000000; static void log2_kernel(FPU_REG const *arg, u_char argsign, Xsig *accum_result, long int *expon) { - long int exponent, adj; - unsigned long long Xsq; - Xsig accumulator, Numer, Denom, argSignif, arg_signif; - - exponent = exponent16(arg); - Numer.lsw = Denom.lsw = 0; - XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); - if ( argsign == SIGN_POS ) - { - shr_Xsig(&Denom, 2 - (1 + exponent)); - Denom.msw |= 0x80000000; - div_Xsig(&Numer, &Denom, &argSignif); - } - else - { - shr_Xsig(&Denom, 1 - (1 + exponent)); - negate_Xsig(&Denom); - if ( Denom.msw & 0x80000000 ) - { - div_Xsig(&Numer, &Denom, &argSignif); - exponent ++; - } - else - { - /* Denom must be 1.0 */ - argSignif.lsw = Numer.lsw; argSignif.midw = Numer.midw; - argSignif.msw = Numer.msw; + long int exponent, adj; + unsigned long long Xsq; + Xsig accumulator, Numer, Denom, argSignif, arg_signif; + + exponent = exponent16(arg); + Numer.lsw = Denom.lsw = 0; + XSIG_LL(Numer) = XSIG_LL(Denom) = significand(arg); + if (argsign == SIGN_POS) { + shr_Xsig(&Denom, 2 - (1 + exponent)); + Denom.msw |= 0x80000000; + div_Xsig(&Numer, &Denom, &argSignif); + } else { + shr_Xsig(&Denom, 1 - (1 + exponent)); + negate_Xsig(&Denom); + if (Denom.msw & 0x80000000) { + div_Xsig(&Numer, &Denom, &argSignif); + exponent++; + } else { + /* Denom must be 1.0 */ + argSignif.lsw = Numer.lsw; + argSignif.midw = Numer.midw; + argSignif.msw = Numer.msw; + } } - } #ifndef PECULIAR_486 - /* Should check here that |local_arg| is within the valid range */ - if ( exponent >= -2 ) - { - if ( (exponent > -2) || - (argSignif.msw > (unsigned)0xafb0ccc0) ) - { - /* The argument is too large */ + /* Should check here that |local_arg| is within the valid range */ + if (exponent >= -2) { + if ((exponent > -2) || (argSignif.msw > (unsigned)0xafb0ccc0)) { + /* The argument is too large */ + } } - } #endif /* PECULIAR_486 */ - arg_signif.lsw = argSignif.lsw; XSIG_LL(arg_signif) = XSIG_LL(argSignif); - adj = norm_Xsig(&argSignif); - accumulator.lsw = argSignif.lsw; XSIG_LL(accumulator) = XSIG_LL(argSignif); - mul_Xsig_Xsig(&accumulator, &accumulator); - shr_Xsig(&accumulator, 2*(-1 - (1 + exponent + adj))); - Xsq = XSIG_LL(accumulator); - if ( accumulator.lsw & 0x80000000 ) - Xsq++; - - accumulator.msw = accumulator.midw = accumulator.lsw = 0; - /* Do the basic fixed point polynomial evaluation */ - polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER-1); - - mul_Xsig_Xsig(&accumulator, &argSignif); - shr_Xsig(&accumulator, 6 - adj); - - mul32_Xsig(&arg_signif, leadterm); - add_two_Xsig(&accumulator, &arg_signif, &exponent); - - *expon = exponent + 1; - accum_result->lsw = accumulator.lsw; - accum_result->midw = accumulator.midw; - accum_result->msw = accumulator.msw; + arg_signif.lsw = argSignif.lsw; + XSIG_LL(arg_signif) = XSIG_LL(argSignif); + adj = norm_Xsig(&argSignif); + accumulator.lsw = argSignif.lsw; + XSIG_LL(accumulator) = XSIG_LL(argSignif); + mul_Xsig_Xsig(&accumulator, &accumulator); + shr_Xsig(&accumulator, 2 * (-1 - (1 + exponent + adj))); + Xsq = XSIG_LL(accumulator); + if (accumulator.lsw & 0x80000000) + Xsq++; + + accumulator.msw = accumulator.midw = accumulator.lsw = 0; + /* Do the basic fixed point polynomial evaluation */ + polynomial_Xsig(&accumulator, &Xsq, logterms, HIPOWER - 1); + + mul_Xsig_Xsig(&accumulator, &argSignif); + shr_Xsig(&accumulator, 6 - adj); + + mul32_Xsig(&arg_signif, leadterm); + add_two_Xsig(&accumulator, &arg_signif, &exponent); + + *expon = exponent + 1; + accum_result->lsw = accumulator.lsw; + accum_result->midw = accumulator.midw; + accum_result->msw = accumulator.msw; } diff --git a/arch/x86/math-emu/poly_sin.c b/arch/x86/math-emu/poly_sin.c index a36313fb06f..b862039c728 100644 --- a/arch/x86/math-emu/poly_sin.c +++ b/arch/x86/math-emu/poly_sin.c @@ -11,7 +11,6 @@ | | +---------------------------------------------------------------------------*/ - #include "exception.h" #include "reg_constant.h" #include "fpu_emu.h" @@ -19,379 +18,361 @@ #include "control_w.h" #include "poly.h" - #define N_COEFF_P 4 #define N_COEFF_N 4 -static const unsigned long long pos_terms_l[N_COEFF_P] = -{ - 0xaaaaaaaaaaaaaaabLL, - 0x00d00d00d00cf906LL, - 0x000006b99159a8bbLL, - 0x000000000d7392e6LL +static const unsigned long long pos_terms_l[N_COEFF_P] = { + 0xaaaaaaaaaaaaaaabLL, + 0x00d00d00d00cf906LL, + 0x000006b99159a8bbLL, + 0x000000000d7392e6LL }; -static const unsigned long long neg_terms_l[N_COEFF_N] = -{ - 0x2222222222222167LL, - 0x0002e3bc74aab624LL, - 0x0000000b09229062LL, - 0x00000000000c7973LL +static const unsigned long long neg_terms_l[N_COEFF_N] = { + 0x2222222222222167LL, + 0x0002e3bc74aab624LL, + 0x0000000b09229062LL, + 0x00000000000c7973LL }; - - #define N_COEFF_PH 4 #define N_COEFF_NH 4 -static const unsigned long long pos_terms_h[N_COEFF_PH] = -{ - 0x0000000000000000LL, - 0x05b05b05b05b0406LL, - 0x000049f93edd91a9LL, - 0x00000000c9c9ed62LL +static const unsigned long long pos_terms_h[N_COEFF_PH] = { + 0x0000000000000000LL, + 0x05b05b05b05b0406LL, + 0x000049f93edd91a9LL, + 0x00000000c9c9ed62LL }; -static const unsigned long long neg_terms_h[N_COEFF_NH] = -{ - 0xaaaaaaaaaaaaaa98LL, - 0x001a01a01a019064LL, - 0x0000008f76c68a77LL, - 0x0000000000d58f5eLL +static const unsigned long long neg_terms_h[N_COEFF_NH] = { + 0xaaaaaaaaaaaaaa98LL, + 0x001a01a01a019064LL, + 0x0000008f76c68a77LL, + 0x0000000000d58f5eLL }; - /*--- poly_sine() -----------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_sine(FPU_REG *st0_ptr) +void poly_sine(FPU_REG *st0_ptr) { - int exponent, echange; - Xsig accumulator, argSqrd, argTo4; - unsigned long fix_up, adj; - unsigned long long fixed_arg; - FPU_REG result; + int exponent, echange; + Xsig accumulator, argSqrd, argTo4; + unsigned long fix_up, adj; + unsigned long long fixed_arg; + FPU_REG result; - exponent = exponent(st0_ptr); + exponent = exponent(st0_ptr); - accumulator.lsw = accumulator.midw = accumulator.msw = 0; + accumulator.lsw = accumulator.midw = accumulator.msw = 0; - /* Split into two ranges, for arguments below and above 1.0 */ - /* The boundary between upper and lower is approx 0.88309101259 */ - if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa)) ) - { - /* The argument is <= 0.88309101259 */ + /* Split into two ranges, for arguments below and above 1.0 */ + /* The boundary between upper and lower is approx 0.88309101259 */ + if ((exponent < -1) + || ((exponent == -1) && (st0_ptr->sigh <= 0xe21240aa))) { + /* The argument is <= 0.88309101259 */ + + argSqrd.msw = st0_ptr->sigh; + argSqrd.midw = st0_ptr->sigl; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &significand(st0_ptr)); + shr_Xsig(&argSqrd, 2 * (-1 - exponent)); + argTo4.msw = argSqrd.msw; + argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); - argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &significand(st0_ptr)); - shr_Xsig(&argSqrd, 2*(-1-exponent)); - argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; - argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, + N_COEFF_N - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, - N_COEFF_N-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, + N_COEFF_P - 1); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, - N_COEFF_P-1); + shr_Xsig(&accumulator, 2); /* Divide by four */ + accumulator.msw |= 0x80000000; /* Add 1.0 */ - shr_Xsig(&accumulator, 2); /* Divide by four */ - accumulator.msw |= 0x80000000; /* Add 1.0 */ + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); + /* Divide by four, FPU_REG compatible, etc */ + exponent = 3 * exponent; - /* Divide by four, FPU_REG compatible, etc */ - exponent = 3*exponent; + /* The minimum exponent difference is 3 */ + shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); - /* The minimum exponent difference is 3 */ - shr_Xsig(&accumulator, exponent(st0_ptr) - exponent); + negate_Xsig(&accumulator); + XSIG_LL(accumulator) += significand(st0_ptr); - negate_Xsig(&accumulator); - XSIG_LL(accumulator) += significand(st0_ptr); + echange = round_Xsig(&accumulator); - echange = round_Xsig(&accumulator); + setexponentpos(&result, exponent(st0_ptr) + echange); + } else { + /* The argument is > 0.88309101259 */ + /* We use sin(st(0)) = cos(pi/2-st(0)) */ - setexponentpos(&result, exponent(st0_ptr) + echange); - } - else - { - /* The argument is > 0.88309101259 */ - /* We use sin(st(0)) = cos(pi/2-st(0)) */ + fixed_arg = significand(st0_ptr); - fixed_arg = significand(st0_ptr); + if (exponent == 0) { + /* The argument is >= 1.0 */ - if ( exponent == 0 ) - { - /* The argument is >= 1.0 */ + /* Put the binary point at the left. */ + fixed_arg <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + fixed_arg = 0x921fb54442d18469LL - fixed_arg; + /* There is a special case which arises due to rounding, to fix here. */ + if (fixed_arg == 0xffffffffffffffffLL) + fixed_arg = 0; - /* Put the binary point at the left. */ - fixed_arg <<= 1; - } - /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ - fixed_arg = 0x921fb54442d18469LL - fixed_arg; - /* There is a special case which arises due to rounding, to fix here. */ - if ( fixed_arg == 0xffffffffffffffffLL ) - fixed_arg = 0; + XSIG_LL(argSqrd) = fixed_arg; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &fixed_arg); - XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &fixed_arg); + XSIG_LL(argTo4) = XSIG_LL(argSqrd); + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); - XSIG_LL(argTo4) = XSIG_LL(argSqrd); argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, + N_COEFF_NH - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, - N_COEFF_NH-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, + N_COEFF_PH - 1); + negate_Xsig(&accumulator); - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, - N_COEFF_PH-1); - negate_Xsig(&accumulator); + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); - shr_Xsig(&accumulator, 3); - negate_Xsig(&accumulator); + add_Xsig_Xsig(&accumulator, &argSqrd); - add_Xsig_Xsig(&accumulator, &argSqrd); + shr_Xsig(&accumulator, 1); - shr_Xsig(&accumulator, 1); + accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ + negate_Xsig(&accumulator); - accumulator.lsw |= 1; /* A zero accumulator here would cause problems */ - negate_Xsig(&accumulator); + /* The basic computation is complete. Now fix the answer to + compensate for the error due to the approximation used for + pi/2 + */ - /* The basic computation is complete. Now fix the answer to - compensate for the error due to the approximation used for - pi/2 - */ + /* This has an exponent of -65 */ + fix_up = 0x898cc517; + /* The fix-up needs to be improved for larger args */ + if (argSqrd.msw & 0xffc00000) { + /* Get about 32 bit precision in these: */ + fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6; + } + fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg)); - /* This has an exponent of -65 */ - fix_up = 0x898cc517; - /* The fix-up needs to be improved for larger args */ - if ( argSqrd.msw & 0xffc00000 ) - { - /* Get about 32 bit precision in these: */ - fix_up -= mul_32_32(0x898cc517, argSqrd.msw) / 6; - } - fix_up = mul_32_32(fix_up, LL_MSW(fixed_arg)); + adj = accumulator.lsw; /* temp save */ + accumulator.lsw -= fix_up; + if (accumulator.lsw > adj) + XSIG_LL(accumulator)--; - adj = accumulator.lsw; /* temp save */ - accumulator.lsw -= fix_up; - if ( accumulator.lsw > adj ) - XSIG_LL(accumulator) --; + echange = round_Xsig(&accumulator); - echange = round_Xsig(&accumulator); - - setexponentpos(&result, echange - 1); - } + setexponentpos(&result, echange - 1); + } - significand(&result) = XSIG_LL(accumulator); - setsign(&result, getsign(st0_ptr)); - FPU_copy_to_reg0(&result, TAG_Valid); + significand(&result) = XSIG_LL(accumulator); + setsign(&result, getsign(st0_ptr)); + FPU_copy_to_reg0(&result, TAG_Valid); #ifdef PARANOID - if ( (exponent(&result) >= 0) - && (significand(&result) > 0x8000000000000000LL) ) - { - EXCEPTION(EX_INTERNAL|0x150); - } + if ((exponent(&result) >= 0) + && (significand(&result) > 0x8000000000000000LL)) { + EXCEPTION(EX_INTERNAL | 0x150); + } #endif /* PARANOID */ } - - /*--- poly_cos() ------------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_cos(FPU_REG *st0_ptr) +void poly_cos(FPU_REG *st0_ptr) { - FPU_REG result; - long int exponent, exp2, echange; - Xsig accumulator, argSqrd, fix_up, argTo4; - unsigned long long fixed_arg; + FPU_REG result; + long int exponent, exp2, echange; + Xsig accumulator, argSqrd, fix_up, argTo4; + unsigned long long fixed_arg; #ifdef PARANOID - if ( (exponent(st0_ptr) > 0) - || ((exponent(st0_ptr) == 0) - && (significand(st0_ptr) > 0xc90fdaa22168c234LL)) ) - { - EXCEPTION(EX_Invalid); - FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); - return; - } -#endif /* PARANOID */ - - exponent = exponent(st0_ptr); - - accumulator.lsw = accumulator.midw = accumulator.msw = 0; - - if ( (exponent < -1) || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54)) ) - { - /* arg is < 0.687705 */ - - argSqrd.msw = st0_ptr->sigh; argSqrd.midw = st0_ptr->sigl; - argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &significand(st0_ptr)); - - if ( exponent < -1 ) - { - /* shift the argument right by the required places */ - shr_Xsig(&argSqrd, 2*(-1-exponent)); - } - - argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; - argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, - N_COEFF_NH-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, - N_COEFF_PH-1); - negate_Xsig(&accumulator); - - mul64_Xsig(&accumulator, &significand(st0_ptr)); - mul64_Xsig(&accumulator, &significand(st0_ptr)); - shr_Xsig(&accumulator, -2*(1+exponent)); - - shr_Xsig(&accumulator, 3); - negate_Xsig(&accumulator); - - add_Xsig_Xsig(&accumulator, &argSqrd); - - shr_Xsig(&accumulator, 1); - - /* It doesn't matter if accumulator is all zero here, the - following code will work ok */ - negate_Xsig(&accumulator); - - if ( accumulator.lsw & 0x80000000 ) - XSIG_LL(accumulator) ++; - if ( accumulator.msw == 0 ) - { - /* The result is 1.0 */ - FPU_copy_to_reg0(&CONST_1, TAG_Valid); - return; - } - else - { - significand(&result) = XSIG_LL(accumulator); - - /* will be a valid positive nr with expon = -1 */ - setexponentpos(&result, -1); - } - } - else - { - fixed_arg = significand(st0_ptr); - - if ( exponent == 0 ) - { - /* The argument is >= 1.0 */ - - /* Put the binary point at the left. */ - fixed_arg <<= 1; - } - /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ - fixed_arg = 0x921fb54442d18469LL - fixed_arg; - /* There is a special case which arises due to rounding, to fix here. */ - if ( fixed_arg == 0xffffffffffffffffLL ) - fixed_arg = 0; - - exponent = -1; - exp2 = -1; - - /* A shift is needed here only for a narrow range of arguments, - i.e. for fixed_arg approx 2^-32, but we pick up more... */ - if ( !(LL_MSW(fixed_arg) & 0xffff0000) ) - { - fixed_arg <<= 16; - exponent -= 16; - exp2 -= 16; + if ((exponent(st0_ptr) > 0) + || ((exponent(st0_ptr) == 0) + && (significand(st0_ptr) > 0xc90fdaa22168c234LL))) { + EXCEPTION(EX_Invalid); + FPU_copy_to_reg0(&CONST_QNaN, TAG_Special); + return; } +#endif /* PARANOID */ - XSIG_LL(argSqrd) = fixed_arg; argSqrd.lsw = 0; - mul64_Xsig(&argSqrd, &fixed_arg); - - if ( exponent < -1 ) - { - /* shift the argument right by the required places */ - shr_Xsig(&argSqrd, 2*(-1-exponent)); - } - - argTo4.msw = argSqrd.msw; argTo4.midw = argSqrd.midw; - argTo4.lsw = argSqrd.lsw; - mul_Xsig_Xsig(&argTo4, &argTo4); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, - N_COEFF_N-1); - mul_Xsig_Xsig(&accumulator, &argSqrd); - negate_Xsig(&accumulator); - - polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, - N_COEFF_P-1); - - shr_Xsig(&accumulator, 2); /* Divide by four */ - accumulator.msw |= 0x80000000; /* Add 1.0 */ - - mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); - mul64_Xsig(&accumulator, &fixed_arg); - - /* Divide by four, FPU_REG compatible, etc */ - exponent = 3*exponent; - - /* The minimum exponent difference is 3 */ - shr_Xsig(&accumulator, exp2 - exponent); - - negate_Xsig(&accumulator); - XSIG_LL(accumulator) += fixed_arg; - - /* The basic computation is complete. Now fix the answer to - compensate for the error due to the approximation used for - pi/2 - */ - - /* This has an exponent of -65 */ - XSIG_LL(fix_up) = 0x898cc51701b839a2ll; - fix_up.lsw = 0; - - /* The fix-up needs to be improved for larger args */ - if ( argSqrd.msw & 0xffc00000 ) - { - /* Get about 32 bit precision in these: */ - fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; - fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; + exponent = exponent(st0_ptr); + + accumulator.lsw = accumulator.midw = accumulator.msw = 0; + + if ((exponent < -1) + || ((exponent == -1) && (st0_ptr->sigh <= 0xb00d6f54))) { + /* arg is < 0.687705 */ + + argSqrd.msw = st0_ptr->sigh; + argSqrd.midw = st0_ptr->sigl; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &significand(st0_ptr)); + + if (exponent < -1) { + /* shift the argument right by the required places */ + shr_Xsig(&argSqrd, 2 * (-1 - exponent)); + } + + argTo4.msw = argSqrd.msw; + argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_h, + N_COEFF_NH - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_h, + N_COEFF_PH - 1); + negate_Xsig(&accumulator); + + mul64_Xsig(&accumulator, &significand(st0_ptr)); + mul64_Xsig(&accumulator, &significand(st0_ptr)); + shr_Xsig(&accumulator, -2 * (1 + exponent)); + + shr_Xsig(&accumulator, 3); + negate_Xsig(&accumulator); + + add_Xsig_Xsig(&accumulator, &argSqrd); + + shr_Xsig(&accumulator, 1); + + /* It doesn't matter if accumulator is all zero here, the + following code will work ok */ + negate_Xsig(&accumulator); + + if (accumulator.lsw & 0x80000000) + XSIG_LL(accumulator)++; + if (accumulator.msw == 0) { + /* The result is 1.0 */ + FPU_copy_to_reg0(&CONST_1, TAG_Valid); + return; + } else { + significand(&result) = XSIG_LL(accumulator); + + /* will be a valid positive nr with expon = -1 */ + setexponentpos(&result, -1); + } + } else { + fixed_arg = significand(st0_ptr); + + if (exponent == 0) { + /* The argument is >= 1.0 */ + + /* Put the binary point at the left. */ + fixed_arg <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + fixed_arg = 0x921fb54442d18469LL - fixed_arg; + /* There is a special case which arises due to rounding, to fix here. */ + if (fixed_arg == 0xffffffffffffffffLL) + fixed_arg = 0; + + exponent = -1; + exp2 = -1; + + /* A shift is needed here only for a narrow range of arguments, + i.e. for fixed_arg approx 2^-32, but we pick up more... */ + if (!(LL_MSW(fixed_arg) & 0xffff0000)) { + fixed_arg <<= 16; + exponent -= 16; + exp2 -= 16; + } + + XSIG_LL(argSqrd) = fixed_arg; + argSqrd.lsw = 0; + mul64_Xsig(&argSqrd, &fixed_arg); + + if (exponent < -1) { + /* shift the argument right by the required places */ + shr_Xsig(&argSqrd, 2 * (-1 - exponent)); + } + + argTo4.msw = argSqrd.msw; + argTo4.midw = argSqrd.midw; + argTo4.lsw = argSqrd.lsw; + mul_Xsig_Xsig(&argTo4, &argTo4); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), neg_terms_l, + N_COEFF_N - 1); + mul_Xsig_Xsig(&accumulator, &argSqrd); + negate_Xsig(&accumulator); + + polynomial_Xsig(&accumulator, &XSIG_LL(argTo4), pos_terms_l, + N_COEFF_P - 1); + + shr_Xsig(&accumulator, 2); /* Divide by four */ + accumulator.msw |= 0x80000000; /* Add 1.0 */ + + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + mul64_Xsig(&accumulator, &fixed_arg); + + /* Divide by four, FPU_REG compatible, etc */ + exponent = 3 * exponent; + + /* The minimum exponent difference is 3 */ + shr_Xsig(&accumulator, exp2 - exponent); + + negate_Xsig(&accumulator); + XSIG_LL(accumulator) += fixed_arg; + + /* The basic computation is complete. Now fix the answer to + compensate for the error due to the approximation used for + pi/2 + */ + + /* This has an exponent of -65 */ + XSIG_LL(fix_up) = 0x898cc51701b839a2ll; + fix_up.lsw = 0; + + /* The fix-up needs to be improved for larger args */ + if (argSqrd.msw & 0xffc00000) { + /* Get about 32 bit precision in these: */ + fix_up.msw -= mul_32_32(0x898cc517, argSqrd.msw) / 2; + fix_up.msw += mul_32_32(0x898cc517, argTo4.msw) / 24; + } + + exp2 += norm_Xsig(&accumulator); + shr_Xsig(&accumulator, 1); /* Prevent overflow */ + exp2++; + shr_Xsig(&fix_up, 65 + exp2); + + add_Xsig_Xsig(&accumulator, &fix_up); + + echange = round_Xsig(&accumulator); + + setexponentpos(&result, exp2 + echange); + significand(&result) = XSIG_LL(accumulator); } - exp2 += norm_Xsig(&accumulator); - shr_Xsig(&accumulator, 1); /* Prevent overflow */ - exp2++; - shr_Xsig(&fix_up, 65 + exp2); - - add_Xsig_Xsig(&accumulator, &fix_up); - - echange = round_Xsig(&accumulator); - - setexponentpos(&result, exp2 + echange); - significand(&result) = XSIG_LL(accumulator); - } - - FPU_copy_to_reg0(&result, TAG_Valid); + FPU_copy_to_reg0(&result, TAG_Valid); #ifdef PARANOID - if ( (exponent(&result) >= 0) - && (significand(&result) > 0x8000000000000000LL) ) - { - EXCEPTION(EX_INTERNAL|0x151); - } + if ((exponent(&result) >= 0) + && (significand(&result) > 0x8000000000000000LL)) { + EXCEPTION(EX_INTERNAL | 0x151); + } #endif /* PARANOID */ } diff --git a/arch/x86/math-emu/poly_tan.c b/arch/x86/math-emu/poly_tan.c index 8df3e03b6e6..1875763e0c0 100644 --- a/arch/x86/math-emu/poly_tan.c +++ b/arch/x86/math-emu/poly_tan.c @@ -17,206 +17,196 @@ #include "control_w.h" #include "poly.h" - #define HiPOWERop 3 /* odd poly, positive terms */ -static const unsigned long long oddplterm[HiPOWERop] = -{ - 0x0000000000000000LL, - 0x0051a1cf08fca228LL, - 0x0000000071284ff7LL +static const unsigned long long oddplterm[HiPOWERop] = { + 0x0000000000000000LL, + 0x0051a1cf08fca228LL, + 0x0000000071284ff7LL }; #define HiPOWERon 2 /* odd poly, negative terms */ -static const unsigned long long oddnegterm[HiPOWERon] = -{ - 0x1291a9a184244e80LL, - 0x0000583245819c21LL +static const unsigned long long oddnegterm[HiPOWERon] = { + 0x1291a9a184244e80LL, + 0x0000583245819c21LL }; #define HiPOWERep 2 /* even poly, positive terms */ -static const unsigned long long evenplterm[HiPOWERep] = -{ - 0x0e848884b539e888LL, - 0x00003c7f18b887daLL +static const unsigned long long evenplterm[HiPOWERep] = { + 0x0e848884b539e888LL, + 0x00003c7f18b887daLL }; #define HiPOWERen 2 /* even poly, negative terms */ -static const unsigned long long evennegterm[HiPOWERen] = -{ - 0xf1f0200fd51569ccLL, - 0x003afb46105c4432LL +static const unsigned long long evennegterm[HiPOWERen] = { + 0xf1f0200fd51569ccLL, + 0x003afb46105c4432LL }; static const unsigned long long twothirds = 0xaaaaaaaaaaaaaaabLL; - /*--- poly_tan() ------------------------------------------------------------+ | | +---------------------------------------------------------------------------*/ -void poly_tan(FPU_REG *st0_ptr) +void poly_tan(FPU_REG *st0_ptr) { - long int exponent; - int invert; - Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, - argSignif, fix_up; - unsigned long adj; + long int exponent; + int invert; + Xsig argSq, argSqSq, accumulatoro, accumulatore, accum, + argSignif, fix_up; + unsigned long adj; - exponent = exponent(st0_ptr); + exponent = exponent(st0_ptr); #ifdef PARANOID - if ( signnegative(st0_ptr) ) /* Can't hack a number < 0.0 */ - { arith_invalid(0); return; } /* Need a positive number */ + if (signnegative(st0_ptr)) { /* Can't hack a number < 0.0 */ + arith_invalid(0); + return; + } /* Need a positive number */ #endif /* PARANOID */ - /* Split the problem into two domains, smaller and larger than pi/4 */ - if ( (exponent == 0) || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2)) ) - { - /* The argument is greater than (approx) pi/4 */ - invert = 1; - accum.lsw = 0; - XSIG_LL(accum) = significand(st0_ptr); - - if ( exponent == 0 ) - { - /* The argument is >= 1.0 */ - /* Put the binary point at the left. */ - XSIG_LL(accum) <<= 1; - } - /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ - XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); - /* This is a special case which arises due to rounding. */ - if ( XSIG_LL(accum) == 0xffffffffffffffffLL ) - { - FPU_settag0(TAG_Valid); - significand(st0_ptr) = 0x8a51e04daabda360LL; - setexponent16(st0_ptr, (0x41 + EXTENDED_Ebias) | SIGN_Negative); - return; + /* Split the problem into two domains, smaller and larger than pi/4 */ + if ((exponent == 0) + || ((exponent == -1) && (st0_ptr->sigh > 0xc90fdaa2))) { + /* The argument is greater than (approx) pi/4 */ + invert = 1; + accum.lsw = 0; + XSIG_LL(accum) = significand(st0_ptr); + + if (exponent == 0) { + /* The argument is >= 1.0 */ + /* Put the binary point at the left. */ + XSIG_LL(accum) <<= 1; + } + /* pi/2 in hex is: 1.921fb54442d18469 898CC51701B839A2 52049C1 */ + XSIG_LL(accum) = 0x921fb54442d18469LL - XSIG_LL(accum); + /* This is a special case which arises due to rounding. */ + if (XSIG_LL(accum) == 0xffffffffffffffffLL) { + FPU_settag0(TAG_Valid); + significand(st0_ptr) = 0x8a51e04daabda360LL; + setexponent16(st0_ptr, + (0x41 + EXTENDED_Ebias) | SIGN_Negative); + return; + } + + argSignif.lsw = accum.lsw; + XSIG_LL(argSignif) = XSIG_LL(accum); + exponent = -1 + norm_Xsig(&argSignif); + } else { + invert = 0; + argSignif.lsw = 0; + XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); + + if (exponent < -1) { + /* shift the argument right by the required places */ + if (FPU_shrx(&XSIG_LL(accum), -1 - exponent) >= + 0x80000000U) + XSIG_LL(accum)++; /* round up */ + } } - argSignif.lsw = accum.lsw; - XSIG_LL(argSignif) = XSIG_LL(accum); - exponent = -1 + norm_Xsig(&argSignif); - } - else - { - invert = 0; - argSignif.lsw = 0; - XSIG_LL(accum) = XSIG_LL(argSignif) = significand(st0_ptr); - - if ( exponent < -1 ) - { - /* shift the argument right by the required places */ - if ( FPU_shrx(&XSIG_LL(accum), -1-exponent) >= 0x80000000U ) - XSIG_LL(accum) ++; /* round up */ - } - } - - XSIG_LL(argSq) = XSIG_LL(accum); argSq.lsw = accum.lsw; - mul_Xsig_Xsig(&argSq, &argSq); - XSIG_LL(argSqSq) = XSIG_LL(argSq); argSqSq.lsw = argSq.lsw; - mul_Xsig_Xsig(&argSqSq, &argSqSq); - - /* Compute the negative terms for the numerator polynomial */ - accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; - polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, HiPOWERon-1); - mul_Xsig_Xsig(&accumulatoro, &argSq); - negate_Xsig(&accumulatoro); - /* Add the positive terms */ - polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, HiPOWERop-1); - - - /* Compute the positive terms for the denominator polynomial */ - accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; - polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, HiPOWERep-1); - mul_Xsig_Xsig(&accumulatore, &argSq); - negate_Xsig(&accumulatore); - /* Add the negative terms */ - polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, HiPOWERen-1); - /* Multiply by arg^2 */ - mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); - mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); - /* de-normalize and divide by 2 */ - shr_Xsig(&accumulatore, -2*(1+exponent) + 1); - negate_Xsig(&accumulatore); /* This does 1 - accumulator */ - - /* Now find the ratio. */ - if ( accumulatore.msw == 0 ) - { - /* accumulatoro must contain 1.0 here, (actually, 0) but it - really doesn't matter what value we use because it will - have negligible effect in later calculations - */ - XSIG_LL(accum) = 0x8000000000000000LL; - accum.lsw = 0; - } - else - { - div_Xsig(&accumulatoro, &accumulatore, &accum); - } - - /* Multiply by 1/3 * arg^3 */ - mul64_Xsig(&accum, &XSIG_LL(argSignif)); - mul64_Xsig(&accum, &XSIG_LL(argSignif)); - mul64_Xsig(&accum, &XSIG_LL(argSignif)); - mul64_Xsig(&accum, &twothirds); - shr_Xsig(&accum, -2*(exponent+1)); - - /* tan(arg) = arg + accum */ - add_two_Xsig(&accum, &argSignif, &exponent); - - if ( invert ) - { - /* We now have the value of tan(pi_2 - arg) where pi_2 is an - approximation for pi/2 - */ - /* The next step is to fix the answer to compensate for the - error due to the approximation used for pi/2 - */ - - /* This is (approx) delta, the error in our approx for pi/2 - (see above). It has an exponent of -65 - */ - XSIG_LL(fix_up) = 0x898cc51701b839a2LL; - fix_up.lsw = 0; - - if ( exponent == 0 ) - adj = 0xffffffff; /* We want approx 1.0 here, but - this is close enough. */ - else if ( exponent > -30 ) - { - adj = accum.msw >> -(exponent+1); /* tan */ - adj = mul_32_32(adj, adj); /* tan^2 */ + XSIG_LL(argSq) = XSIG_LL(accum); + argSq.lsw = accum.lsw; + mul_Xsig_Xsig(&argSq, &argSq); + XSIG_LL(argSqSq) = XSIG_LL(argSq); + argSqSq.lsw = argSq.lsw; + mul_Xsig_Xsig(&argSqSq, &argSqSq); + + /* Compute the negative terms for the numerator polynomial */ + accumulatoro.msw = accumulatoro.midw = accumulatoro.lsw = 0; + polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddnegterm, + HiPOWERon - 1); + mul_Xsig_Xsig(&accumulatoro, &argSq); + negate_Xsig(&accumulatoro); + /* Add the positive terms */ + polynomial_Xsig(&accumulatoro, &XSIG_LL(argSqSq), oddplterm, + HiPOWERop - 1); + + /* Compute the positive terms for the denominator polynomial */ + accumulatore.msw = accumulatore.midw = accumulatore.lsw = 0; + polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evenplterm, + HiPOWERep - 1); + mul_Xsig_Xsig(&accumulatore, &argSq); + negate_Xsig(&accumulatore); + /* Add the negative terms */ + polynomial_Xsig(&accumulatore, &XSIG_LL(argSqSq), evennegterm, + HiPOWERen - 1); + /* Multiply by arg^2 */ + mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); + mul64_Xsig(&accumulatore, &XSIG_LL(argSignif)); + /* de-normalize and divide by 2 */ + shr_Xsig(&accumulatore, -2 * (1 + exponent) + 1); + negate_Xsig(&accumulatore); /* This does 1 - accumulator */ + + /* Now find the ratio. */ + if (accumulatore.msw == 0) { + /* accumulatoro must contain 1.0 here, (actually, 0) but it + really doesn't matter what value we use because it will + have negligible effect in later calculations + */ + XSIG_LL(accum) = 0x8000000000000000LL; + accum.lsw = 0; + } else { + div_Xsig(&accumulatoro, &accumulatore, &accum); } - else - adj = 0; - adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ - - fix_up.msw += adj; - if ( !(fix_up.msw & 0x80000000) ) /* did fix_up overflow ? */ - { - /* Yes, we need to add an msb */ - shr_Xsig(&fix_up, 1); - fix_up.msw |= 0x80000000; - shr_Xsig(&fix_up, 64 + exponent); + + /* Multiply by 1/3 * arg^3 */ + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &XSIG_LL(argSignif)); + mul64_Xsig(&accum, &twothirds); + shr_Xsig(&accum, -2 * (exponent + 1)); + + /* tan(arg) = arg + accum */ + add_two_Xsig(&accum, &argSignif, &exponent); + + if (invert) { + /* We now have the value of tan(pi_2 - arg) where pi_2 is an + approximation for pi/2 + */ + /* The next step is to fix the answer to compensate for the + error due to the approximation used for pi/2 + */ + + /* This is (approx) delta, the error in our approx for pi/2 + (see above). It has an exponent of -65 + */ + XSIG_LL(fix_up) = 0x898cc51701b839a2LL; + fix_up.lsw = 0; + + if (exponent == 0) + adj = 0xffffffff; /* We want approx 1.0 here, but + this is close enough. */ + else if (exponent > -30) { + adj = accum.msw >> -(exponent + 1); /* tan */ + adj = mul_32_32(adj, adj); /* tan^2 */ + } else + adj = 0; + adj = mul_32_32(0x898cc517, adj); /* delta * tan^2 */ + + fix_up.msw += adj; + if (!(fix_up.msw & 0x80000000)) { /* did fix_up overflow ? */ + /* Yes, we need to add an msb */ + shr_Xsig(&fix_up, 1); + fix_up.msw |= 0x80000000; + shr_Xsig(&fix_up, 64 + exponent); + } else + shr_Xsig(&fix_up, 65 + exponent); + + add_two_Xsig(&accum, &fix_up, &exponent); + + /* accum now contains tan(pi/2 - arg). + Use tan(arg) = 1.0 / tan(pi/2 - arg) + */ + accumulatoro.lsw = accumulatoro.midw = 0; + accumulatoro.msw = 0x80000000; + div_Xsig(&accumulatoro, &accum, &accum); + exponent = -exponent - 1; } - else - shr_Xsig(&fix_up, 65 + exponent); - - add_two_Xsig(&accum, &fix_up, &exponent); - - /* accum now contains tan(pi/2 - arg). - Use tan(arg) = 1.0 / tan(pi/2 - arg) - */ - accumulatoro.lsw = accumulatoro.midw = 0; - accumulatoro.msw = 0x80000000; - div_Xsig(&accumulatoro, &accum, &accum); - exponent = - exponent - 1; - } - - /* Transfer the result */ - round_Xsig(&accum); - FPU_settag0(TAG_Valid); - significand(st0_ptr) = XSIG_LL(accum); - setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */ + + /* Transfer the result */ + round_Xsig(&accum); + FPU_settag0(TAG_Valid); + significand(st0_ptr) = XSIG_LL(accum); + setexponent16(st0_ptr, exponent + EXTENDED_Ebias); /* Result is positive. */ } diff --git a/arch/x86/math-emu/reg_add_sub.c b/arch/x86/math-emu/reg_add_sub.c index 7cd3b37ac08..deea48b9f13 100644 --- a/arch/x86/math-emu/reg_add_sub.c +++ b/arch/x86/math-emu/reg_add_sub.c @@ -27,7 +27,7 @@ static int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, FPU_REG const *b, u_char tagb, u_char signb, - FPU_REG *dest, int deststnr, int control_w); + FPU_REG * dest, int deststnr, int control_w); /* Operates on st(0) and st(n), or on st(0) and temporary data. @@ -35,340 +35,299 @@ int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, */ int FPU_add(FPU_REG const *b, u_char tagb, int deststnr, int control_w) { - FPU_REG *a = &st(0); - FPU_REG *dest = &st(deststnr); - u_char signb = getsign(b); - u_char taga = FPU_gettag0(); - u_char signa = getsign(a); - u_char saved_sign = getsign(dest); - int diff, tag, expa, expb; - - if ( !(taga | tagb) ) - { - expa = exponent(a); - expb = exponent(b); - - valid_add: - /* Both registers are valid */ - if (!(signa ^ signb)) - { - /* signs are the same */ - tag = FPU_u_add(a, b, dest, control_w, signa, expa, expb); - } - else - { - /* The signs are different, so do a subtraction */ - diff = expa - expb; - if (!diff) - { - diff = a->sigh - b->sigh; /* This works only if the ms bits - are identical. */ - if (!diff) - { - diff = a->sigl > b->sigl; - if (!diff) - diff = -(a->sigl < b->sigl); + FPU_REG *a = &st(0); + FPU_REG *dest = &st(deststnr); + u_char signb = getsign(b); + u_char taga = FPU_gettag0(); + u_char signa = getsign(a); + u_char saved_sign = getsign(dest); + int diff, tag, expa, expb; + + if (!(taga | tagb)) { + expa = exponent(a); + expb = exponent(b); + + valid_add: + /* Both registers are valid */ + if (!(signa ^ signb)) { + /* signs are the same */ + tag = + FPU_u_add(a, b, dest, control_w, signa, expa, expb); + } else { + /* The signs are different, so do a subtraction */ + diff = expa - expb; + if (!diff) { + diff = a->sigh - b->sigh; /* This works only if the ms bits + are identical. */ + if (!diff) { + diff = a->sigl > b->sigl; + if (!diff) + diff = -(a->sigl < b->sigl); + } + } + + if (diff > 0) { + tag = + FPU_u_sub(a, b, dest, control_w, signa, + expa, expb); + } else if (diff < 0) { + tag = + FPU_u_sub(b, a, dest, control_w, signb, + expb, expa); + } else { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + /* sign depends upon rounding mode */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + return TAG_Zero; + } } - } - - if (diff > 0) - { - tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); - } - else if ( diff < 0 ) - { - tag = FPU_u_sub(b, a, dest, control_w, signb, expb, expa); - } - else - { - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - /* sign depends upon rounding mode */ - setsign(dest, ((control_w & CW_RC) != RC_DOWN) - ? SIGN_POS : SIGN_NEG); - return TAG_Zero; - } - } - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; } - FPU_settagi(deststnr, tag); - return tag; - } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - FPU_REG x, y; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + FPU_REG x, y; + + if (denormal_operand() < 0) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + a = &x; + b = &y; + expa = exponent16(a); + expb = exponent16(b); + goto valid_add; + } - if ( denormal_operand() < 0 ) - return FPU_Exception; + if ((taga == TW_NaN) || (tagb == TW_NaN)) { + if (deststnr == 0) + return real_2op_NaN(b, tagb, deststnr, a); + else + return real_2op_NaN(a, taga, deststnr, a); + } - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - a = &x; - b = &y; - expa = exponent16(a); - expb = exponent16(b); - goto valid_add; - } - - if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - if ( deststnr == 0 ) - return real_2op_NaN(b, tagb, deststnr, a); - else - return real_2op_NaN(a, taga, deststnr, a); - } - - return add_sub_specials(a, taga, signa, b, tagb, signb, - dest, deststnr, control_w); + return add_sub_specials(a, taga, signa, b, tagb, signb, + dest, deststnr, control_w); } - /* Subtract b from a. (a-b) -> dest */ int FPU_sub(int flags, int rm, int control_w) { - FPU_REG const *a, *b; - FPU_REG *dest; - u_char taga, tagb, signa, signb, saved_sign, sign; - int diff, tag = 0, expa, expb, deststnr; - - a = &st(0); - taga = FPU_gettag0(); - - deststnr = 0; - if ( flags & LOADED ) - { - b = (FPU_REG *)rm; - tagb = flags & 0x0f; - } - else - { - b = &st(rm); - tagb = FPU_gettagi(rm); - - if ( flags & DEST_RM ) - deststnr = rm; - } - - signa = getsign(a); - signb = getsign(b); - - if ( flags & REV ) - { - signa ^= SIGN_NEG; - signb ^= SIGN_NEG; - } - - dest = &st(deststnr); - saved_sign = getsign(dest); - - if ( !(taga | tagb) ) - { - expa = exponent(a); - expb = exponent(b); - - valid_subtract: - /* Both registers are valid */ - - diff = expa - expb; - - if (!diff) - { - diff = a->sigh - b->sigh; /* Works only if ms bits are identical */ - if (!diff) - { - diff = a->sigl > b->sigl; - if (!diff) - diff = -(a->sigl < b->sigl); - } + FPU_REG const *a, *b; + FPU_REG *dest; + u_char taga, tagb, signa, signb, saved_sign, sign; + int diff, tag = 0, expa, expb, deststnr; + + a = &st(0); + taga = FPU_gettag0(); + + deststnr = 0; + if (flags & LOADED) { + b = (FPU_REG *) rm; + tagb = flags & 0x0f; + } else { + b = &st(rm); + tagb = FPU_gettagi(rm); + + if (flags & DEST_RM) + deststnr = rm; } - switch ( (((int)signa)*2 + signb) / SIGN_NEG ) - { - case 0: /* P - P */ - case 3: /* N - N */ - if (diff > 0) - { - /* |a| > |b| */ - tag = FPU_u_sub(a, b, dest, control_w, signa, expa, expb); - } - else if ( diff == 0 ) - { - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - - /* sign depends upon rounding mode */ - setsign(dest, ((control_w & CW_RC) != RC_DOWN) - ? SIGN_POS : SIGN_NEG); - return TAG_Zero; - } - else - { - sign = signa ^ SIGN_NEG; - tag = FPU_u_sub(b, a, dest, control_w, sign, expb, expa); - } - break; - case 1: /* P - N */ - tag = FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, expb); - break; - case 2: /* N - P */ - tag = FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, expb); - break; + signa = getsign(a); + signb = getsign(b); + + if (flags & REV) { + signa ^= SIGN_NEG; + signb ^= SIGN_NEG; + } + + dest = &st(deststnr); + saved_sign = getsign(dest); + + if (!(taga | tagb)) { + expa = exponent(a); + expb = exponent(b); + + valid_subtract: + /* Both registers are valid */ + + diff = expa - expb; + + if (!diff) { + diff = a->sigh - b->sigh; /* Works only if ms bits are identical */ + if (!diff) { + diff = a->sigl > b->sigl; + if (!diff) + diff = -(a->sigl < b->sigl); + } + } + + switch ((((int)signa) * 2 + signb) / SIGN_NEG) { + case 0: /* P - P */ + case 3: /* N - N */ + if (diff > 0) { + /* |a| > |b| */ + tag = + FPU_u_sub(a, b, dest, control_w, signa, + expa, expb); + } else if (diff == 0) { + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + + /* sign depends upon rounding mode */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + return TAG_Zero; + } else { + sign = signa ^ SIGN_NEG; + tag = + FPU_u_sub(b, a, dest, control_w, sign, expb, + expa); + } + break; + case 1: /* P - N */ + tag = + FPU_u_add(a, b, dest, control_w, SIGN_POS, expa, + expb); + break; + case 2: /* N - P */ + tag = + FPU_u_add(a, b, dest, control_w, SIGN_NEG, expa, + expb); + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x111); - return -1; + default: + EXCEPTION(EX_INTERNAL | 0x111); + return -1; #endif + } + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; } - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; - } - FPU_settagi(deststnr, tag); - return tag; - } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - FPU_REG x, y; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + FPU_REG x, y; - if ( denormal_operand() < 0 ) - return FPU_Exception; + if (denormal_operand() < 0) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + a = &x; + b = &y; + expa = exponent16(a); + expb = exponent16(b); - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - a = &x; - b = &y; - expa = exponent16(a); - expb = exponent16(b); - - goto valid_subtract; - } - - if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - FPU_REG const *d1, *d2; - if ( flags & REV ) - { - d1 = b; - d2 = a; + goto valid_subtract; } - else - { - d1 = a; - d2 = b; + + if ((taga == TW_NaN) || (tagb == TW_NaN)) { + FPU_REG const *d1, *d2; + if (flags & REV) { + d1 = b; + d2 = a; + } else { + d1 = a; + d2 = b; + } + if (flags & LOADED) + return real_2op_NaN(b, tagb, deststnr, d1); + if (flags & DEST_RM) + return real_2op_NaN(a, taga, deststnr, d2); + else + return real_2op_NaN(b, tagb, deststnr, d2); } - if ( flags & LOADED ) - return real_2op_NaN(b, tagb, deststnr, d1); - if ( flags & DEST_RM ) - return real_2op_NaN(a, taga, deststnr, d2); - else - return real_2op_NaN(b, tagb, deststnr, d2); - } - - return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG, - dest, deststnr, control_w); -} + return add_sub_specials(a, taga, signa, b, tagb, signb ^ SIGN_NEG, + dest, deststnr, control_w); +} static int add_sub_specials(FPU_REG const *a, u_char taga, u_char signa, FPU_REG const *b, u_char tagb, u_char signb, - FPU_REG *dest, int deststnr, int control_w) + FPU_REG * dest, int deststnr, int control_w) { - if ( ((taga == TW_Denormal) || (tagb == TW_Denormal)) - && (denormal_operand() < 0) ) - return FPU_Exception; - - if (taga == TAG_Zero) - { - if (tagb == TAG_Zero) - { - /* Both are zero, result will be zero. */ - u_char different_signs = signa ^ signb; - - FPU_copy_to_regi(a, TAG_Zero, deststnr); - if ( different_signs ) - { - /* Signs are different. */ - /* Sign of answer depends upon rounding mode. */ - setsign(dest, ((control_w & CW_RC) != RC_DOWN) - ? SIGN_POS : SIGN_NEG); - } - else - setsign(dest, signa); /* signa may differ from the sign of a. */ - return TAG_Zero; - } - else - { - reg_copy(b, dest); - if ( (tagb == TW_Denormal) && (b->sigh & 0x80000000) ) - { - /* A pseudoDenormal, convert it. */ - addexponent(dest, 1); - tagb = TAG_Valid; - } - else if ( tagb > TAG_Empty ) - tagb = TAG_Special; - setsign(dest, signb); /* signb may differ from the sign of b. */ - FPU_settagi(deststnr, tagb); - return tagb; - } - } - else if (tagb == TAG_Zero) - { - reg_copy(a, dest); - if ( (taga == TW_Denormal) && (a->sigh & 0x80000000) ) - { - /* A pseudoDenormal */ - addexponent(dest, 1); - taga = TAG_Valid; - } - else if ( taga > TAG_Empty ) - taga = TAG_Special; - setsign(dest, signa); /* signa may differ from the sign of a. */ - FPU_settagi(deststnr, taga); - return taga; - } - else if (taga == TW_Infinity) - { - if ( (tagb != TW_Infinity) || (signa == signb) ) - { - FPU_copy_to_regi(a, TAG_Special, deststnr); - setsign(dest, signa); /* signa may differ from the sign of a. */ - return taga; + if (((taga == TW_Denormal) || (tagb == TW_Denormal)) + && (denormal_operand() < 0)) + return FPU_Exception; + + if (taga == TAG_Zero) { + if (tagb == TAG_Zero) { + /* Both are zero, result will be zero. */ + u_char different_signs = signa ^ signb; + + FPU_copy_to_regi(a, TAG_Zero, deststnr); + if (different_signs) { + /* Signs are different. */ + /* Sign of answer depends upon rounding mode. */ + setsign(dest, ((control_w & CW_RC) != RC_DOWN) + ? SIGN_POS : SIGN_NEG); + } else + setsign(dest, signa); /* signa may differ from the sign of a. */ + return TAG_Zero; + } else { + reg_copy(b, dest); + if ((tagb == TW_Denormal) && (b->sigh & 0x80000000)) { + /* A pseudoDenormal, convert it. */ + addexponent(dest, 1); + tagb = TAG_Valid; + } else if (tagb > TAG_Empty) + tagb = TAG_Special; + setsign(dest, signb); /* signb may differ from the sign of b. */ + FPU_settagi(deststnr, tagb); + return tagb; + } + } else if (tagb == TAG_Zero) { + reg_copy(a, dest); + if ((taga == TW_Denormal) && (a->sigh & 0x80000000)) { + /* A pseudoDenormal */ + addexponent(dest, 1); + taga = TAG_Valid; + } else if (taga > TAG_Empty) + taga = TAG_Special; + setsign(dest, signa); /* signa may differ from the sign of a. */ + FPU_settagi(deststnr, taga); + return taga; + } else if (taga == TW_Infinity) { + if ((tagb != TW_Infinity) || (signa == signb)) { + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, signa); /* signa may differ from the sign of a. */ + return taga; + } + /* Infinity-Infinity is undefined. */ + return arith_invalid(deststnr); + } else if (tagb == TW_Infinity) { + FPU_copy_to_regi(b, TAG_Special, deststnr); + setsign(dest, signb); /* signb may differ from the sign of b. */ + return tagb; } - /* Infinity-Infinity is undefined. */ - return arith_invalid(deststnr); - } - else if (tagb == TW_Infinity) - { - FPU_copy_to_regi(b, TAG_Special, deststnr); - setsign(dest, signb); /* signb may differ from the sign of b. */ - return tagb; - } - #ifdef PARANOID - EXCEPTION(EX_INTERNAL|0x101); + EXCEPTION(EX_INTERNAL | 0x101); #endif - return FPU_Exception; + return FPU_Exception; } - diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index f37c5b5a35a..ecce55fc2e2 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -20,362 +20,331 @@ #include "control_w.h" #include "status_w.h" - static int compare(FPU_REG const *b, int tagb) { - int diff, exp0, expb; - u_char st0_tag; - FPU_REG *st0_ptr; - FPU_REG x, y; - u_char st0_sign, signb = getsign(b); - - st0_ptr = &st(0); - st0_tag = FPU_gettag0(); - st0_sign = getsign(st0_ptr); - - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); - if ( st0_tag == TAG_Special ) - st0_tag = FPU_Special(st0_ptr); - - if ( ((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) - || ((tagb != TAG_Valid) && (tagb != TW_Denormal)) ) - { - if ( st0_tag == TAG_Zero ) - { - if ( tagb == TAG_Zero ) return COMP_A_eq_B; - if ( tagb == TAG_Valid ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); - if ( tagb == TW_Denormal ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) - | COMP_Denormal; - } - else if ( tagb == TAG_Zero ) - { - if ( st0_tag == TAG_Valid ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); - if ( st0_tag == TW_Denormal ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | COMP_Denormal; + int diff, exp0, expb; + u_char st0_tag; + FPU_REG *st0_ptr; + FPU_REG x, y; + u_char st0_sign, signb = getsign(b); + + st0_ptr = &st(0); + st0_tag = FPU_gettag0(); + st0_sign = getsign(st0_ptr); + + if (tagb == TAG_Special) + tagb = FPU_Special(b); + if (st0_tag == TAG_Special) + st0_tag = FPU_Special(st0_ptr); + + if (((st0_tag != TAG_Valid) && (st0_tag != TW_Denormal)) + || ((tagb != TAG_Valid) && (tagb != TW_Denormal))) { + if (st0_tag == TAG_Zero) { + if (tagb == TAG_Zero) + return COMP_A_eq_B; + if (tagb == TAG_Valid) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); + if (tagb == TW_Denormal) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | COMP_Denormal; + } else if (tagb == TAG_Zero) { + if (st0_tag == TAG_Valid) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + if (st0_tag == TW_Denormal) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | COMP_Denormal; + } + + if (st0_tag == TW_Infinity) { + if ((tagb == TAG_Valid) || (tagb == TAG_Zero)) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + else if (tagb == TW_Denormal) + return ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | COMP_Denormal; + else if (tagb == TW_Infinity) { + /* The 80486 book says that infinities can be equal! */ + return (st0_sign == signb) ? COMP_A_eq_B : + ((st0_sign == + SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); + } + /* Fall through to the NaN code */ + } else if (tagb == TW_Infinity) { + if ((st0_tag == TAG_Valid) || (st0_tag == TAG_Zero)) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); + if (st0_tag == TW_Denormal) + return ((signb == + SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | COMP_Denormal; + /* Fall through to the NaN code */ + } + + /* The only possibility now should be that one of the arguments + is a NaN */ + if ((st0_tag == TW_NaN) || (tagb == TW_NaN)) { + int signalling = 0, unsupported = 0; + if (st0_tag == TW_NaN) { + signalling = + (st0_ptr->sigh & 0xc0000000) == 0x80000000; + unsupported = !((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr-> + sigh & 0x80000000)); + } + if (tagb == TW_NaN) { + signalling |= + (b->sigh & 0xc0000000) == 0x80000000; + unsupported |= !((exponent(b) == EXP_OVER) + && (b->sigh & 0x80000000)); + } + if (signalling || unsupported) + return COMP_No_Comp | COMP_SNaN | COMP_NaN; + else + /* Neither is a signaling NaN */ + return COMP_No_Comp | COMP_NaN; + } + + EXCEPTION(EX_Invalid); } - if ( st0_tag == TW_Infinity ) - { - if ( (tagb == TAG_Valid) || (tagb == TAG_Zero) ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); - else if ( tagb == TW_Denormal ) - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | COMP_Denormal; - else if ( tagb == TW_Infinity ) - { - /* The 80486 book says that infinities can be equal! */ - return (st0_sign == signb) ? COMP_A_eq_B : - ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B); - } - /* Fall through to the NaN code */ - } - else if ( tagb == TW_Infinity ) - { - if ( (st0_tag == TAG_Valid) || (st0_tag == TAG_Zero) ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B); - if ( st0_tag == TW_Denormal ) - return ((signb == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) - | COMP_Denormal; - /* Fall through to the NaN code */ + if (st0_sign != signb) { + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); } - /* The only possibility now should be that one of the arguments - is a NaN */ - if ( (st0_tag == TW_NaN) || (tagb == TW_NaN) ) - { - int signalling = 0, unsupported = 0; - if ( st0_tag == TW_NaN ) - { - signalling = (st0_ptr->sigh & 0xc0000000) == 0x80000000; - unsupported = !((exponent(st0_ptr) == EXP_OVER) - && (st0_ptr->sigh & 0x80000000)); - } - if ( tagb == TW_NaN ) - { - signalling |= (b->sigh & 0xc0000000) == 0x80000000; - unsupported |= !((exponent(b) == EXP_OVER) - && (b->sigh & 0x80000000)); - } - if ( signalling || unsupported ) - return COMP_No_Comp | COMP_SNaN | COMP_NaN; - else - /* Neither is a signaling NaN */ - return COMP_No_Comp | COMP_NaN; + if ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) { + FPU_to_exp16(st0_ptr, &x); + FPU_to_exp16(b, &y); + st0_ptr = &x; + b = &y; + exp0 = exponent16(st0_ptr); + expb = exponent16(b); + } else { + exp0 = exponent(st0_ptr); + expb = exponent(b); } - - EXCEPTION(EX_Invalid); - } - - if (st0_sign != signb) - { - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); - } - - if ( (st0_tag == TW_Denormal) || (tagb == TW_Denormal) ) - { - FPU_to_exp16(st0_ptr, &x); - FPU_to_exp16(b, &y); - st0_ptr = &x; - b = &y; - exp0 = exponent16(st0_ptr); - expb = exponent16(b); - } - else - { - exp0 = exponent(st0_ptr); - expb = exponent(b); - } #ifdef PARANOID - if (!(st0_ptr->sigh & 0x80000000)) EXCEPTION(EX_Invalid); - if (!(b->sigh & 0x80000000)) EXCEPTION(EX_Invalid); + if (!(st0_ptr->sigh & 0x80000000)) + EXCEPTION(EX_Invalid); + if (!(b->sigh & 0x80000000)) + EXCEPTION(EX_Invalid); #endif /* PARANOID */ - diff = exp0 - expb; - if ( diff == 0 ) - { - diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are - identical */ - if ( diff == 0 ) - { - diff = st0_ptr->sigl > b->sigl; - if ( diff == 0 ) - diff = -(st0_ptr->sigl < b->sigl); + diff = exp0 - expb; + if (diff == 0) { + diff = st0_ptr->sigh - b->sigh; /* Works only if ms bits are + identical */ + if (diff == 0) { + diff = st0_ptr->sigl > b->sigl; + if (diff == 0) + diff = -(st0_ptr->sigl < b->sigl); + } } - } - - if ( diff > 0 ) - { - return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); - } - if ( diff < 0 ) - { - return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); - } - - return COMP_A_eq_B - | ( ((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? - COMP_Denormal : 0); -} + if (diff > 0) { + return ((st0_sign == SIGN_POS) ? COMP_A_gt_B : COMP_A_lt_B) + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + } + if (diff < 0) { + return ((st0_sign == SIGN_POS) ? COMP_A_lt_B : COMP_A_gt_B) + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + } + return COMP_A_eq_B + | (((st0_tag == TW_Denormal) || (tagb == TW_Denormal)) ? + COMP_Denormal : 0); + +} /* This function requires that st(0) is not empty */ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) { - int f = 0, c; - - c = compare(loaded_data, loaded_tag); - - if (c & COMP_NaN) - { - EXCEPTION(EX_Invalid); - f = SW_C3 | SW_C2 | SW_C0; - } - else - switch (c & 7) - { - case COMP_A_lt_B: - f = SW_C0; - break; - case COMP_A_eq_B: - f = SW_C3; - break; - case COMP_A_gt_B: - f = 0; - break; - case COMP_No_Comp: - f = SW_C3 | SW_C2 | SW_C0; - break; + int f = 0, c; + + c = compare(loaded_data, loaded_tag); + + if (c & COMP_NaN) { + EXCEPTION(EX_Invalid); + f = SW_C3 | SW_C2 | SW_C0; + } else + switch (c & 7) { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x121); - f = SW_C3 | SW_C2 | SW_C0; - break; + default: + EXCEPTION(EX_INTERNAL | 0x121); + f = SW_C3 | SW_C2 | SW_C0; + break; #endif /* PARANOID */ - } - setcc(f); - if (c & COMP_Denormal) - { - return denormal_operand() < 0; - } - return 0; + } + setcc(f); + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; } - static int compare_st_st(int nr) { - int f = 0, c; - FPU_REG *st_ptr; - - if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) - { - setcc(SW_C3 | SW_C2 | SW_C0); - /* Stack fault */ - EXCEPTION(EX_StackUnder); - return !(control_word & CW_Invalid); - } - - st_ptr = &st(nr); - c = compare(st_ptr, FPU_gettagi(nr)); - if (c & COMP_NaN) - { - setcc(SW_C3 | SW_C2 | SW_C0); - EXCEPTION(EX_Invalid); - return !(control_word & CW_Invalid); - } - else - switch (c & 7) - { - case COMP_A_lt_B: - f = SW_C0; - break; - case COMP_A_eq_B: - f = SW_C3; - break; - case COMP_A_gt_B: - f = 0; - break; - case COMP_No_Comp: - f = SW_C3 | SW_C2 | SW_C0; - break; + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + setcc(SW_C3 | SW_C2 | SW_C0); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); + } + + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + setcc(SW_C3 | SW_C2 | SW_C0); + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } else + switch (c & 7) { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x122); - f = SW_C3 | SW_C2 | SW_C0; - break; + default: + EXCEPTION(EX_INTERNAL | 0x122); + f = SW_C3 | SW_C2 | SW_C0; + break; #endif /* PARANOID */ - } - setcc(f); - if (c & COMP_Denormal) - { - return denormal_operand() < 0; - } - return 0; + } + setcc(f); + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; } - static int compare_u_st_st(int nr) { - int f = 0, c; - FPU_REG *st_ptr; - - if ( !NOT_EMPTY(0) || !NOT_EMPTY(nr) ) - { - setcc(SW_C3 | SW_C2 | SW_C0); - /* Stack fault */ - EXCEPTION(EX_StackUnder); - return !(control_word & CW_Invalid); - } - - st_ptr = &st(nr); - c = compare(st_ptr, FPU_gettagi(nr)); - if (c & COMP_NaN) - { - setcc(SW_C3 | SW_C2 | SW_C0); - if (c & COMP_SNaN) /* This is the only difference between - un-ordered and ordinary comparisons */ - { - EXCEPTION(EX_Invalid); - return !(control_word & CW_Invalid); + int f = 0, c; + FPU_REG *st_ptr; + + if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { + setcc(SW_C3 | SW_C2 | SW_C0); + /* Stack fault */ + EXCEPTION(EX_StackUnder); + return !(control_word & CW_Invalid); } - return 0; - } - else - switch (c & 7) - { - case COMP_A_lt_B: - f = SW_C0; - break; - case COMP_A_eq_B: - f = SW_C3; - break; - case COMP_A_gt_B: - f = 0; - break; - case COMP_No_Comp: - f = SW_C3 | SW_C2 | SW_C0; - break; + + st_ptr = &st(nr); + c = compare(st_ptr, FPU_gettagi(nr)); + if (c & COMP_NaN) { + setcc(SW_C3 | SW_C2 | SW_C0); + if (c & COMP_SNaN) { /* This is the only difference between + un-ordered and ordinary comparisons */ + EXCEPTION(EX_Invalid); + return !(control_word & CW_Invalid); + } + return 0; + } else + switch (c & 7) { + case COMP_A_lt_B: + f = SW_C0; + break; + case COMP_A_eq_B: + f = SW_C3; + break; + case COMP_A_gt_B: + f = 0; + break; + case COMP_No_Comp: + f = SW_C3 | SW_C2 | SW_C0; + break; #ifdef PARANOID - default: - EXCEPTION(EX_INTERNAL|0x123); - f = SW_C3 | SW_C2 | SW_C0; - break; -#endif /* PARANOID */ - } - setcc(f); - if (c & COMP_Denormal) - { - return denormal_operand() < 0; - } - return 0; + default: + EXCEPTION(EX_INTERNAL | 0x123); + f = SW_C3 | SW_C2 | SW_C0; + break; +#endif /* PARANOID */ + } + setcc(f); + if (c & COMP_Denormal) { + return denormal_operand() < 0; + } + return 0; } /*---------------------------------------------------------------------------*/ void fcom_st(void) { - /* fcom st(i) */ - compare_st_st(FPU_rm); + /* fcom st(i) */ + compare_st_st(FPU_rm); } - void fcompst(void) { - /* fcomp st(i) */ - if ( !compare_st_st(FPU_rm) ) - FPU_pop(); + /* fcomp st(i) */ + if (!compare_st_st(FPU_rm)) + FPU_pop(); } - void fcompp(void) { - /* fcompp */ - if (FPU_rm != 1) - { - FPU_illegal(); - return; - } - if ( !compare_st_st(1) ) - poppop(); + /* fcompp */ + if (FPU_rm != 1) { + FPU_illegal(); + return; + } + if (!compare_st_st(1)) + poppop(); } - void fucom_(void) { - /* fucom st(i) */ - compare_u_st_st(FPU_rm); + /* fucom st(i) */ + compare_u_st_st(FPU_rm); } - void fucomp(void) { - /* fucomp st(i) */ - if ( !compare_u_st_st(FPU_rm) ) - FPU_pop(); + /* fucomp st(i) */ + if (!compare_u_st_st(FPU_rm)) + FPU_pop(); } - void fucompp(void) { - /* fucompp */ - if (FPU_rm == 1) - { - if ( !compare_u_st_st(1) ) - poppop(); - } - else - FPU_illegal(); + /* fucompp */ + if (FPU_rm == 1) { + if (!compare_u_st_st(1)) + poppop(); + } else + FPU_illegal(); } diff --git a/arch/x86/math-emu/reg_constant.c b/arch/x86/math-emu/reg_constant.c index a8501580196..04869e64b18 100644 --- a/arch/x86/math-emu/reg_constant.c +++ b/arch/x86/math-emu/reg_constant.c @@ -16,29 +16,28 @@ #include "reg_constant.h" #include "control_w.h" - #define MAKE_REG(s,e,l,h) { l, h, \ ((EXTENDED_Ebias+(e)) | ((SIGN_##s != 0)*0x8000)) } -FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); +FPU_REG const CONST_1 = MAKE_REG(POS, 0, 0x00000000, 0x80000000); #if 0 -FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); +FPU_REG const CONST_2 = MAKE_REG(POS, 1, 0x00000000, 0x80000000); FPU_REG const CONST_HALF = MAKE_REG(POS, -1, 0x00000000, 0x80000000); -#endif /* 0 */ -static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); -static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); -FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); -FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); -FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); -static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); -static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); +#endif /* 0 */ +static FPU_REG const CONST_L2T = MAKE_REG(POS, 1, 0xcd1b8afe, 0xd49a784b); +static FPU_REG const CONST_L2E = MAKE_REG(POS, 0, 0x5c17f0bc, 0xb8aa3b29); +FPU_REG const CONST_PI = MAKE_REG(POS, 1, 0x2168c235, 0xc90fdaa2); +FPU_REG const CONST_PI2 = MAKE_REG(POS, 0, 0x2168c235, 0xc90fdaa2); +FPU_REG const CONST_PI4 = MAKE_REG(POS, -1, 0x2168c235, 0xc90fdaa2); +static FPU_REG const CONST_LG2 = MAKE_REG(POS, -2, 0xfbcff799, 0x9a209a84); +static FPU_REG const CONST_LN2 = MAKE_REG(POS, -1, 0xd1cf79ac, 0xb17217f7); /* Extra bits to take pi/2 to more than 128 bits precision. */ FPU_REG const CONST_PI2extra = MAKE_REG(NEG, -66, - 0xfc8f8cbb, 0xece675d1); + 0xfc8f8cbb, 0xece675d1); /* Only the sign (and tag) is used in internal zeroes */ -FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); +FPU_REG const CONST_Z = MAKE_REG(POS, EXP_UNDER, 0x0, 0x0); /* Only the sign and significand (and tag) are used in internal NaNs */ /* The 80486 never generates one of these @@ -48,24 +47,22 @@ FPU_REG const CONST_SNAN = MAKE_REG(POS, EXP_OVER, 0x00000001, 0x80000000); FPU_REG const CONST_QNaN = MAKE_REG(NEG, EXP_OVER, 0x00000000, 0xC0000000); /* Only the sign (and tag) is used in internal infinities */ -FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); - +FPU_REG const CONST_INF = MAKE_REG(POS, EXP_OVER, 0x00000000, 0x80000000); static void fld_const(FPU_REG const *c, int adj, u_char tag) { - FPU_REG *st_new_ptr; - - if ( STACK_OVERFLOW ) - { - FPU_stack_overflow(); - return; - } - push(); - reg_copy(c, st_new_ptr); - st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to - borrow or carry. */ - FPU_settag0(tag); - clear_C1(); + FPU_REG *st_new_ptr; + + if (STACK_OVERFLOW) { + FPU_stack_overflow(); + return; + } + push(); + reg_copy(c, st_new_ptr); + st_new_ptr->sigl += adj; /* For all our fldxxx constants, we don't need to + borrow or carry. */ + FPU_settag0(tag); + clear_C1(); } /* A fast way to find out whether x is one of RC_DOWN or RC_CHOP @@ -75,46 +72,46 @@ static void fld_const(FPU_REG const *c, int adj, u_char tag) static void fld1(int rc) { - fld_const(&CONST_1, 0, TAG_Valid); + fld_const(&CONST_1, 0, TAG_Valid); } static void fldl2t(int rc) { - fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); + fld_const(&CONST_L2T, (rc == RC_UP) ? 1 : 0, TAG_Valid); } static void fldl2e(int rc) { - fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_L2E, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldpi(int rc) { - fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_PI, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldlg2(int rc) { - fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_LG2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldln2(int rc) { - fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); + fld_const(&CONST_LN2, DOWN_OR_CHOP(rc) ? -1 : 0, TAG_Valid); } static void fldz(int rc) { - fld_const(&CONST_Z, 0, TAG_Zero); + fld_const(&CONST_Z, 0, TAG_Zero); } -typedef void (*FUNC_RC)(int); +typedef void (*FUNC_RC) (int); static FUNC_RC constants_table[] = { - fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC)FPU_illegal + fld1, fldl2t, fldl2e, fldpi, fldlg2, fldln2, fldz, (FUNC_RC) FPU_illegal }; void fconst(void) { - (constants_table[FPU_rm])(control_word & CW_RC); + (constants_table[FPU_rm]) (control_word & CW_RC); } diff --git a/arch/x86/math-emu/reg_convert.c b/arch/x86/math-emu/reg_convert.c index 45a25875270..10806077997 100644 --- a/arch/x86/math-emu/reg_convert.c +++ b/arch/x86/math-emu/reg_convert.c @@ -13,41 +13,34 @@ #include "exception.h" #include "fpu_emu.h" - int FPU_to_exp16(FPU_REG const *a, FPU_REG *x) { - int sign = getsign(a); - - *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); - - /* Set up the exponent as a 16 bit quantity. */ - setexponent16(x, exponent(a)); - - if ( exponent16(x) == EXP_UNDER ) - { - /* The number is a de-normal or pseudodenormal. */ - /* We only deal with the significand and exponent. */ - - if (x->sigh & 0x80000000) - { - /* Is a pseudodenormal. */ - /* This is non-80486 behaviour because the number - loses its 'denormal' identity. */ - addexponent(x, 1); - } - else - { - /* Is a denormal. */ - addexponent(x, 1); - FPU_normalize_nuo(x); + int sign = getsign(a); + + *(long long *)&(x->sigl) = *(const long long *)&(a->sigl); + + /* Set up the exponent as a 16 bit quantity. */ + setexponent16(x, exponent(a)); + + if (exponent16(x) == EXP_UNDER) { + /* The number is a de-normal or pseudodenormal. */ + /* We only deal with the significand and exponent. */ + + if (x->sigh & 0x80000000) { + /* Is a pseudodenormal. */ + /* This is non-80486 behaviour because the number + loses its 'denormal' identity. */ + addexponent(x, 1); + } else { + /* Is a denormal. */ + addexponent(x, 1); + FPU_normalize_nuo(x); + } } - } - if ( !(x->sigh & 0x80000000) ) - { - EXCEPTION(EX_INTERNAL | 0x180); - } + if (!(x->sigh & 0x80000000)) { + EXCEPTION(EX_INTERNAL | 0x180); + } - return sign; + return sign; } - diff --git a/arch/x86/math-emu/reg_divide.c b/arch/x86/math-emu/reg_divide.c index 5cee7ff920d..6827012db34 100644 --- a/arch/x86/math-emu/reg_divide.c +++ b/arch/x86/math-emu/reg_divide.c @@ -26,182 +26,157 @@ */ int FPU_div(int flags, int rm, int control_w) { - FPU_REG x, y; - FPU_REG const *a, *b, *st0_ptr, *st_ptr; - FPU_REG *dest; - u_char taga, tagb, signa, signb, sign, saved_sign; - int tag, deststnr; - - if ( flags & DEST_RM ) - deststnr = rm; - else - deststnr = 0; - - if ( flags & REV ) - { - b = &st(0); - st0_ptr = b; - tagb = FPU_gettag0(); - if ( flags & LOADED ) - { - a = (FPU_REG *)rm; - taga = flags & 0x0f; + FPU_REG x, y; + FPU_REG const *a, *b, *st0_ptr, *st_ptr; + FPU_REG *dest; + u_char taga, tagb, signa, signb, sign, saved_sign; + int tag, deststnr; + + if (flags & DEST_RM) + deststnr = rm; + else + deststnr = 0; + + if (flags & REV) { + b = &st(0); + st0_ptr = b; + tagb = FPU_gettag0(); + if (flags & LOADED) { + a = (FPU_REG *) rm; + taga = flags & 0x0f; + } else { + a = &st(rm); + st_ptr = a; + taga = FPU_gettagi(rm); + } + } else { + a = &st(0); + st0_ptr = a; + taga = FPU_gettag0(); + if (flags & LOADED) { + b = (FPU_REG *) rm; + tagb = flags & 0x0f; + } else { + b = &st(rm); + st_ptr = b; + tagb = FPU_gettagi(rm); + } } - else - { - a = &st(rm); - st_ptr = a; - taga = FPU_gettagi(rm); - } - } - else - { - a = &st(0); - st0_ptr = a; - taga = FPU_gettag0(); - if ( flags & LOADED ) - { - b = (FPU_REG *)rm; - tagb = flags & 0x0f; - } - else - { - b = &st(rm); - st_ptr = b; - tagb = FPU_gettagi(rm); - } - } - signa = getsign(a); - signb = getsign(b); + signa = getsign(a); + signb = getsign(b); - sign = signa ^ signb; + sign = signa ^ signb; - dest = &st(deststnr); - saved_sign = getsign(dest); + dest = &st(deststnr); + saved_sign = getsign(dest); - if ( !(taga | tagb) ) - { - /* Both regs Valid, this should be the most common case. */ - reg_copy(a, &x); - reg_copy(b, &y); - setpositive(&x); - setpositive(&y); - tag = FPU_u_div(&x, &y, dest, control_w, sign); + if (!(taga | tagb)) { + /* Both regs Valid, this should be the most common case. */ + reg_copy(a, &x); + reg_copy(b, &y); + setpositive(&x); + setpositive(&y); + tag = FPU_u_div(&x, &y, dest, control_w, sign); - if ( tag < 0 ) - return tag; + if (tag < 0) + return tag; - FPU_settagi(deststnr, tag); - return tag; - } + FPU_settagi(deststnr, tag); + return tag; + } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - if ( denormal_operand() < 0 ) - return FPU_Exception; - - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - tag = FPU_u_div(&x, &y, dest, control_w, sign); - if ( tag < 0 ) - return tag; - - FPU_settagi(deststnr, tag); - return tag; - } - else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) - { - if ( tagb != TAG_Zero ) - { - /* Want to find Zero/Valid */ - if ( tagb == TW_Denormal ) - { - if ( denormal_operand() < 0 ) - return FPU_Exception; - } - - /* The result is zero. */ - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - setsign(dest, sign); - return TAG_Zero; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + if (denormal_operand() < 0) + return FPU_Exception; + + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + tag = FPU_u_div(&x, &y, dest, control_w, sign); + if (tag < 0) + return tag; + + FPU_settagi(deststnr, tag); + return tag; + } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) { + if (tagb != TAG_Zero) { + /* Want to find Zero/Valid */ + if (tagb == TW_Denormal) { + if (denormal_operand() < 0) + return FPU_Exception; + } + + /* The result is zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + setsign(dest, sign); + return TAG_Zero; + } + /* We have an exception condition, either 0/0 or Valid/Zero. */ + if (taga == TAG_Zero) { + /* 0/0 */ + return arith_invalid(deststnr); + } + /* Valid/Zero */ + return FPU_divide_by_zero(deststnr, sign); } - /* We have an exception condition, either 0/0 or Valid/Zero. */ - if ( taga == TAG_Zero ) - { - /* 0/0 */ - return arith_invalid(deststnr); + /* Must have infinities, NaNs, etc */ + else if ((taga == TW_NaN) || (tagb == TW_NaN)) { + if (flags & LOADED) + return real_2op_NaN((FPU_REG *) rm, flags & 0x0f, 0, + st0_ptr); + + if (flags & DEST_RM) { + int tag; + tag = FPU_gettag0(); + if (tag == TAG_Special) + tag = FPU_Special(st0_ptr); + return real_2op_NaN(st0_ptr, tag, rm, + (flags & REV) ? st0_ptr : &st(rm)); + } else { + int tag; + tag = FPU_gettagi(rm); + if (tag == TAG_Special) + tag = FPU_Special(&st(rm)); + return real_2op_NaN(&st(rm), tag, 0, + (flags & REV) ? st0_ptr : &st(rm)); + } + } else if (taga == TW_Infinity) { + if (tagb == TW_Infinity) { + /* infinity/infinity */ + return arith_invalid(deststnr); + } else { + /* tagb must be Valid or Zero */ + if ((tagb == TW_Denormal) && (denormal_operand() < 0)) + return FPU_Exception; + + /* Infinity divided by Zero or Valid does + not raise and exception, but returns Infinity */ + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, sign); + return taga; + } + } else if (tagb == TW_Infinity) { + if ((taga == TW_Denormal) && (denormal_operand() < 0)) + return FPU_Exception; + + /* The result is zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + setsign(dest, sign); + return TAG_Zero; } - /* Valid/Zero */ - return FPU_divide_by_zero(deststnr, sign); - } - /* Must have infinities, NaNs, etc */ - else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - if ( flags & LOADED ) - return real_2op_NaN((FPU_REG *)rm, flags & 0x0f, 0, st0_ptr); - - if ( flags & DEST_RM ) - { - int tag; - tag = FPU_gettag0(); - if ( tag == TAG_Special ) - tag = FPU_Special(st0_ptr); - return real_2op_NaN(st0_ptr, tag, rm, (flags & REV) ? st0_ptr : &st(rm)); - } - else - { - int tag; - tag = FPU_gettagi(rm); - if ( tag == TAG_Special ) - tag = FPU_Special(&st(rm)); - return real_2op_NaN(&st(rm), tag, 0, (flags & REV) ? st0_ptr : &st(rm)); - } - } - else if (taga == TW_Infinity) - { - if (tagb == TW_Infinity) - { - /* infinity/infinity */ - return arith_invalid(deststnr); - } - else - { - /* tagb must be Valid or Zero */ - if ( (tagb == TW_Denormal) && (denormal_operand() < 0) ) - return FPU_Exception; - - /* Infinity divided by Zero or Valid does - not raise and exception, but returns Infinity */ - FPU_copy_to_regi(a, TAG_Special, deststnr); - setsign(dest, sign); - return taga; - } - } - else if (tagb == TW_Infinity) - { - if ( (taga == TW_Denormal) && (denormal_operand() < 0) ) - return FPU_Exception; - - /* The result is zero. */ - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - setsign(dest, sign); - return TAG_Zero; - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x102); - return FPU_Exception; - } -#endif /* PARANOID */ + else { + EXCEPTION(EX_INTERNAL | 0x102); + return FPU_Exception; + } +#endif /* PARANOID */ return 0; } diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c index e976caef649..799d4af5be6 100644 --- a/arch/x86/math-emu/reg_ld_str.c +++ b/arch/x86/math-emu/reg_ld_str.c @@ -27,1084 +27,938 @@ #include "control_w.h" #include "status_w.h" - -#define DOUBLE_Emax 1023 /* largest valid exponent */ +#define DOUBLE_Emax 1023 /* largest valid exponent */ #define DOUBLE_Ebias 1023 -#define DOUBLE_Emin (-1022) /* smallest valid exponent */ +#define DOUBLE_Emin (-1022) /* smallest valid exponent */ -#define SINGLE_Emax 127 /* largest valid exponent */ +#define SINGLE_Emax 127 /* largest valid exponent */ #define SINGLE_Ebias 127 -#define SINGLE_Emin (-126) /* smallest valid exponent */ - +#define SINGLE_Emin (-126) /* smallest valid exponent */ static u_char normalize_no_excep(FPU_REG *r, int exp, int sign) { - u_char tag; + u_char tag; - setexponent16(r, exp); + setexponent16(r, exp); - tag = FPU_normalize_nuo(r); - stdexp(r); - if ( sign ) - setnegative(r); + tag = FPU_normalize_nuo(r); + stdexp(r); + if (sign) + setnegative(r); - return tag; + return tag; } - int FPU_tagof(FPU_REG *ptr) { - int exp; - - exp = exponent16(ptr) & 0x7fff; - if ( exp == 0 ) - { - if ( !(ptr->sigh | ptr->sigl) ) - { - return TAG_Zero; + int exp; + + exp = exponent16(ptr) & 0x7fff; + if (exp == 0) { + if (!(ptr->sigh | ptr->sigl)) { + return TAG_Zero; + } + /* The number is a de-normal or pseudodenormal. */ + return TAG_Special; + } + + if (exp == 0x7fff) { + /* Is an Infinity, a NaN, or an unsupported data type. */ + return TAG_Special; } - /* The number is a de-normal or pseudodenormal. */ - return TAG_Special; - } - - if ( exp == 0x7fff ) - { - /* Is an Infinity, a NaN, or an unsupported data type. */ - return TAG_Special; - } - - if ( !(ptr->sigh & 0x80000000) ) - { - /* Unsupported data type. */ - /* Valid numbers have the ms bit set to 1. */ - /* Unnormal. */ - return TAG_Special; - } - - return TAG_Valid; -} + if (!(ptr->sigh & 0x80000000)) { + /* Unsupported data type. */ + /* Valid numbers have the ms bit set to 1. */ + /* Unnormal. */ + return TAG_Special; + } + + return TAG_Valid; +} /* Get a long double from user memory */ int FPU_load_extended(long double __user *s, int stnr) { - FPU_REG *sti_ptr = &st(stnr); + FPU_REG *sti_ptr = &st(stnr); - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 10); - __copy_from_user(sti_ptr, s, 10); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 10); + __copy_from_user(sti_ptr, s, 10); + RE_ENTRANT_CHECK_ON; - return FPU_tagof(sti_ptr); + return FPU_tagof(sti_ptr); } - /* Get a double from user memory */ int FPU_load_double(double __user *dfloat, FPU_REG *loaded_data) { - int exp, tag, negative; - unsigned m64, l64; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, dfloat, 8); - FPU_get_user(m64, 1 + (unsigned long __user *) dfloat); - FPU_get_user(l64, (unsigned long __user *) dfloat); - RE_ENTRANT_CHECK_ON; - - negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive; - exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; - m64 &= 0xfffff; - if ( exp > DOUBLE_Emax + EXTENDED_Ebias ) - { - /* Infinity or NaN */ - if ((m64 == 0) && (l64 == 0)) - { - /* +- infinity */ - loaded_data->sigh = 0x80000000; - loaded_data->sigl = 0x00000000; - exp = EXP_Infinity + EXTENDED_Ebias; - tag = TAG_Special; - } - else - { - /* Must be a signaling or quiet NaN */ - exp = EXP_NaN + EXTENDED_Ebias; - loaded_data->sigh = (m64 << 11) | 0x80000000; - loaded_data->sigh |= l64 >> 21; - loaded_data->sigl = l64 << 11; - tag = TAG_Special; /* The calling function must look for NaNs */ - } - } - else if ( exp < DOUBLE_Emin + EXTENDED_Ebias ) - { - /* Zero or de-normal */ - if ((m64 == 0) && (l64 == 0)) - { - /* Zero */ - reg_copy(&CONST_Z, loaded_data); - exp = 0; - tag = TAG_Zero; - } - else - { - /* De-normal */ - loaded_data->sigh = m64 << 11; - loaded_data->sigh |= l64 >> 21; - loaded_data->sigl = l64 << 11; - - return normalize_no_excep(loaded_data, DOUBLE_Emin, negative) - | (denormal_operand() < 0 ? FPU_Exception : 0); - } - } - else - { - loaded_data->sigh = (m64 << 11) | 0x80000000; - loaded_data->sigh |= l64 >> 21; - loaded_data->sigl = l64 << 11; + int exp, tag, negative; + unsigned m64, l64; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, dfloat, 8); + FPU_get_user(m64, 1 + (unsigned long __user *)dfloat); + FPU_get_user(l64, (unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + + negative = (m64 & 0x80000000) ? SIGN_Negative : SIGN_Positive; + exp = ((m64 & 0x7ff00000) >> 20) - DOUBLE_Ebias + EXTENDED_Ebias; + m64 &= 0xfffff; + if (exp > DOUBLE_Emax + EXTENDED_Ebias) { + /* Infinity or NaN */ + if ((m64 == 0) && (l64 == 0)) { + /* +- infinity */ + loaded_data->sigh = 0x80000000; + loaded_data->sigl = 0x00000000; + exp = EXP_Infinity + EXTENDED_Ebias; + tag = TAG_Special; + } else { + /* Must be a signaling or quiet NaN */ + exp = EXP_NaN + EXTENDED_Ebias; + loaded_data->sigh = (m64 << 11) | 0x80000000; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + tag = TAG_Special; /* The calling function must look for NaNs */ + } + } else if (exp < DOUBLE_Emin + EXTENDED_Ebias) { + /* Zero or de-normal */ + if ((m64 == 0) && (l64 == 0)) { + /* Zero */ + reg_copy(&CONST_Z, loaded_data); + exp = 0; + tag = TAG_Zero; + } else { + /* De-normal */ + loaded_data->sigh = m64 << 11; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; + + return normalize_no_excep(loaded_data, DOUBLE_Emin, + negative) + | (denormal_operand() < 0 ? FPU_Exception : 0); + } + } else { + loaded_data->sigh = (m64 << 11) | 0x80000000; + loaded_data->sigh |= l64 >> 21; + loaded_data->sigl = l64 << 11; - tag = TAG_Valid; - } + tag = TAG_Valid; + } - setexponent16(loaded_data, exp | negative); + setexponent16(loaded_data, exp | negative); - return tag; + return tag; } - /* Get a float from user memory */ int FPU_load_single(float __user *single, FPU_REG *loaded_data) { - unsigned m32; - int exp, tag, negative; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, single, 4); - FPU_get_user(m32, (unsigned long __user *) single); - RE_ENTRANT_CHECK_ON; - - negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive; - - if (!(m32 & 0x7fffffff)) - { - /* Zero */ - reg_copy(&CONST_Z, loaded_data); - addexponent(loaded_data, negative); - return TAG_Zero; - } - exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias; - m32 = (m32 & 0x7fffff) << 8; - if ( exp < SINGLE_Emin + EXTENDED_Ebias ) - { - /* De-normals */ - loaded_data->sigh = m32; - loaded_data->sigl = 0; - - return normalize_no_excep(loaded_data, SINGLE_Emin, negative) - | (denormal_operand() < 0 ? FPU_Exception : 0); - } - else if ( exp > SINGLE_Emax + EXTENDED_Ebias ) - { - /* Infinity or NaN */ - if ( m32 == 0 ) - { - /* +- infinity */ - loaded_data->sigh = 0x80000000; - loaded_data->sigl = 0x00000000; - exp = EXP_Infinity + EXTENDED_Ebias; - tag = TAG_Special; + unsigned m32; + int exp, tag, negative; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, single, 4); + FPU_get_user(m32, (unsigned long __user *)single); + RE_ENTRANT_CHECK_ON; + + negative = (m32 & 0x80000000) ? SIGN_Negative : SIGN_Positive; + + if (!(m32 & 0x7fffffff)) { + /* Zero */ + reg_copy(&CONST_Z, loaded_data); + addexponent(loaded_data, negative); + return TAG_Zero; } - else - { - /* Must be a signaling or quiet NaN */ - exp = EXP_NaN + EXTENDED_Ebias; - loaded_data->sigh = m32 | 0x80000000; - loaded_data->sigl = 0; - tag = TAG_Special; /* The calling function must look for NaNs */ + exp = ((m32 & 0x7f800000) >> 23) - SINGLE_Ebias + EXTENDED_Ebias; + m32 = (m32 & 0x7fffff) << 8; + if (exp < SINGLE_Emin + EXTENDED_Ebias) { + /* De-normals */ + loaded_data->sigh = m32; + loaded_data->sigl = 0; + + return normalize_no_excep(loaded_data, SINGLE_Emin, negative) + | (denormal_operand() < 0 ? FPU_Exception : 0); + } else if (exp > SINGLE_Emax + EXTENDED_Ebias) { + /* Infinity or NaN */ + if (m32 == 0) { + /* +- infinity */ + loaded_data->sigh = 0x80000000; + loaded_data->sigl = 0x00000000; + exp = EXP_Infinity + EXTENDED_Ebias; + tag = TAG_Special; + } else { + /* Must be a signaling or quiet NaN */ + exp = EXP_NaN + EXTENDED_Ebias; + loaded_data->sigh = m32 | 0x80000000; + loaded_data->sigl = 0; + tag = TAG_Special; /* The calling function must look for NaNs */ + } + } else { + loaded_data->sigh = m32 | 0x80000000; + loaded_data->sigl = 0; + tag = TAG_Valid; } - } - else - { - loaded_data->sigh = m32 | 0x80000000; - loaded_data->sigl = 0; - tag = TAG_Valid; - } - setexponent16(loaded_data, exp | negative); /* Set the sign. */ + setexponent16(loaded_data, exp | negative); /* Set the sign. */ - return tag; + return tag; } - /* Get a long long from user memory */ int FPU_load_int64(long long __user *_s) { - long long s; - int sign; - FPU_REG *st0_ptr = &st(0); - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 8); - if (copy_from_user(&s,_s,8)) - FPU_abort; - RE_ENTRANT_CHECK_ON; - - if (s == 0) - { - reg_copy(&CONST_Z, st0_ptr); - return TAG_Zero; - } - - if (s > 0) - sign = SIGN_Positive; - else - { - s = -s; - sign = SIGN_Negative; - } - - significand(st0_ptr) = s; - - return normalize_no_excep(st0_ptr, 63, sign); -} + long long s; + int sign; + FPU_REG *st0_ptr = &st(0); + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 8); + if (copy_from_user(&s, _s, 8)) + FPU_abort; + RE_ENTRANT_CHECK_ON; + + if (s == 0) { + reg_copy(&CONST_Z, st0_ptr); + return TAG_Zero; + } + + if (s > 0) + sign = SIGN_Positive; + else { + s = -s; + sign = SIGN_Negative; + } + significand(st0_ptr) = s; + + return normalize_no_excep(st0_ptr, 63, sign); +} /* Get a long from user memory */ int FPU_load_int32(long __user *_s, FPU_REG *loaded_data) { - long s; - int negative; + long s; + int negative; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 4); - FPU_get_user(s, _s); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 4); + FPU_get_user(s, _s); + RE_ENTRANT_CHECK_ON; - if (s == 0) - { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } + if (s == 0) { + reg_copy(&CONST_Z, loaded_data); + return TAG_Zero; + } - if (s > 0) - negative = SIGN_Positive; - else - { - s = -s; - negative = SIGN_Negative; - } + if (s > 0) + negative = SIGN_Positive; + else { + s = -s; + negative = SIGN_Negative; + } - loaded_data->sigh = s; - loaded_data->sigl = 0; + loaded_data->sigh = s; + loaded_data->sigl = 0; - return normalize_no_excep(loaded_data, 31, negative); + return normalize_no_excep(loaded_data, 31, negative); } - /* Get a short from user memory */ int FPU_load_int16(short __user *_s, FPU_REG *loaded_data) { - int s, negative; + int s, negative; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, _s, 2); - /* Cast as short to get the sign extended. */ - FPU_get_user(s, _s); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, _s, 2); + /* Cast as short to get the sign extended. */ + FPU_get_user(s, _s); + RE_ENTRANT_CHECK_ON; - if (s == 0) - { reg_copy(&CONST_Z, loaded_data); return TAG_Zero; } + if (s == 0) { + reg_copy(&CONST_Z, loaded_data); + return TAG_Zero; + } - if (s > 0) - negative = SIGN_Positive; - else - { - s = -s; - negative = SIGN_Negative; - } + if (s > 0) + negative = SIGN_Positive; + else { + s = -s; + negative = SIGN_Negative; + } - loaded_data->sigh = s << 16; - loaded_data->sigl = 0; + loaded_data->sigh = s << 16; + loaded_data->sigl = 0; - return normalize_no_excep(loaded_data, 15, negative); + return normalize_no_excep(loaded_data, 15, negative); } - /* Get a packed bcd array from user memory */ int FPU_load_bcd(u_char __user *s) { - FPU_REG *st0_ptr = &st(0); - int pos; - u_char bcd; - long long l=0; - int sign; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 10); - RE_ENTRANT_CHECK_ON; - for ( pos = 8; pos >= 0; pos--) - { - l *= 10; - RE_ENTRANT_CHECK_OFF; - FPU_get_user(bcd, s+pos); - RE_ENTRANT_CHECK_ON; - l += bcd >> 4; - l *= 10; - l += bcd & 0x0f; - } - - RE_ENTRANT_CHECK_OFF; - FPU_get_user(sign, s+9); - sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; - RE_ENTRANT_CHECK_ON; - - if ( l == 0 ) - { - reg_copy(&CONST_Z, st0_ptr); - addexponent(st0_ptr, sign); /* Set the sign. */ - return TAG_Zero; - } - else - { - significand(st0_ptr) = l; - return normalize_no_excep(st0_ptr, 63, sign); - } + FPU_REG *st0_ptr = &st(0); + int pos; + u_char bcd; + long long l = 0; + int sign; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 10); + RE_ENTRANT_CHECK_ON; + for (pos = 8; pos >= 0; pos--) { + l *= 10; + RE_ENTRANT_CHECK_OFF; + FPU_get_user(bcd, s + pos); + RE_ENTRANT_CHECK_ON; + l += bcd >> 4; + l *= 10; + l += bcd & 0x0f; + } + + RE_ENTRANT_CHECK_OFF; + FPU_get_user(sign, s + 9); + sign = sign & 0x80 ? SIGN_Negative : SIGN_Positive; + RE_ENTRANT_CHECK_ON; + + if (l == 0) { + reg_copy(&CONST_Z, st0_ptr); + addexponent(st0_ptr, sign); /* Set the sign. */ + return TAG_Zero; + } else { + significand(st0_ptr) = l; + return normalize_no_excep(st0_ptr, 63, sign); + } } /*===========================================================================*/ /* Put a long double into user memory */ -int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, long double __user *d) +int FPU_store_extended(FPU_REG *st0_ptr, u_char st0_tag, + long double __user * d) { - /* - The only exception raised by an attempt to store to an - extended format is the Invalid Stack exception, i.e. - attempting to store from an empty register. - */ - - if ( st0_tag != TAG_Empty ) - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 10); - - FPU_put_user(st0_ptr->sigl, (unsigned long __user *) d); - FPU_put_user(st0_ptr->sigh, (unsigned long __user *) ((u_char __user *)d + 4)); - FPU_put_user(exponent16(st0_ptr), (unsigned short __user *) ((u_char __user *)d + 8)); - RE_ENTRANT_CHECK_ON; - - return 1; - } - - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - /* Put out the QNaN indefinite */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,10); - FPU_put_user(0, (unsigned long __user *) d); - FPU_put_user(0xc0000000, 1 + (unsigned long __user *) d); - FPU_put_user(0xffff, 4 + (short __user *) d); - RE_ENTRANT_CHECK_ON; - return 1; - } - else - return 0; + /* + The only exception raised by an attempt to store to an + extended format is the Invalid Stack exception, i.e. + attempting to store from an empty register. + */ + + if (st0_tag != TAG_Empty) { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + + FPU_put_user(st0_ptr->sigl, (unsigned long __user *)d); + FPU_put_user(st0_ptr->sigh, + (unsigned long __user *)((u_char __user *) d + 4)); + FPU_put_user(exponent16(st0_ptr), + (unsigned short __user *)((u_char __user *) d + + 8)); + RE_ENTRANT_CHECK_ON; + + return 1; + } -} + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if (control_word & CW_Invalid) { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + FPU_put_user(0, (unsigned long __user *)d); + FPU_put_user(0xc0000000, 1 + (unsigned long __user *)d); + FPU_put_user(0xffff, 4 + (short __user *)d); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; +} /* Put a double into user memory */ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat) { - unsigned long l[2]; - unsigned long increment = 0; /* avoid gcc warnings */ - int precision_loss; - int exp; - FPU_REG tmp; + unsigned long l[2]; + unsigned long increment = 0; /* avoid gcc warnings */ + int precision_loss; + int exp; + FPU_REG tmp; - if ( st0_tag == TAG_Valid ) - { - reg_copy(st0_ptr, &tmp); - exp = exponent(&tmp); + if (st0_tag == TAG_Valid) { + reg_copy(st0_ptr, &tmp); + exp = exponent(&tmp); - if ( exp < DOUBLE_Emin ) /* It may be a denormal */ - { - addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */ + if (exp < DOUBLE_Emin) { /* It may be a denormal */ + addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */ - denormal_arg: + denormal_arg: - if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) - { + if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) { #ifdef PECULIAR_486 - /* Did it round to a non-denormal ? */ - /* This behaviour might be regarded as peculiar, it appears - that the 80486 rounds to the dest precision, then - converts to decide underflow. */ - if ( !((tmp.sigh == 0x00100000) && (tmp.sigl == 0) && - (st0_ptr->sigl & 0x000007ff)) ) + /* Did it round to a non-denormal ? */ + /* This behaviour might be regarded as peculiar, it appears + that the 80486 rounds to the dest precision, then + converts to decide underflow. */ + if (! + ((tmp.sigh == 0x00100000) && (tmp.sigl == 0) + && (st0_ptr->sigl & 0x000007ff))) #endif /* PECULIAR_486 */ - { - EXCEPTION(EX_Underflow); - /* This is a special case: see sec 16.2.5.1 of - the 80486 book */ - if ( !(control_word & CW_Underflow) ) - return 0; - } - EXCEPTION(precision_loss); - if ( !(control_word & CW_Precision) ) - return 0; - } - l[0] = tmp.sigl; - l[1] = tmp.sigh; - } - else - { - if ( tmp.sigl & 0x000007ff ) - { - precision_loss = 1; - switch (control_word & CW_RC) - { - case RC_RND: - /* Rounding can get a little messy.. */ - increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */ - ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */ - break; - case RC_DOWN: /* towards -infinity */ - increment = signpositive(&tmp) ? 0 : tmp.sigl & 0x7ff; - break; - case RC_UP: /* towards +infinity */ - increment = signpositive(&tmp) ? tmp.sigl & 0x7ff : 0; - break; - case RC_CHOP: - increment = 0; - break; - } - - /* Truncate the mantissa */ - tmp.sigl &= 0xfffff800; - - if ( increment ) - { - if ( tmp.sigl >= 0xfffff800 ) - { - /* the sigl part overflows */ - if ( tmp.sigh == 0xffffffff ) - { - /* The sigh part overflows */ - tmp.sigh = 0x80000000; - exp++; - if (exp >= EXP_OVER) - goto overflow; + { + EXCEPTION(EX_Underflow); + /* This is a special case: see sec 16.2.5.1 of + the 80486 book */ + if (!(control_word & CW_Underflow)) + return 0; + } + EXCEPTION(precision_loss); + if (!(control_word & CW_Precision)) + return 0; } - else - { - tmp.sigh ++; + l[0] = tmp.sigl; + l[1] = tmp.sigh; + } else { + if (tmp.sigl & 0x000007ff) { + precision_loss = 1; + switch (control_word & CW_RC) { + case RC_RND: + /* Rounding can get a little messy.. */ + increment = ((tmp.sigl & 0x7ff) > 0x400) | /* nearest */ + ((tmp.sigl & 0xc00) == 0xc00); /* odd -> even */ + break; + case RC_DOWN: /* towards -infinity */ + increment = + signpositive(&tmp) ? 0 : tmp. + sigl & 0x7ff; + break; + case RC_UP: /* towards +infinity */ + increment = + signpositive(&tmp) ? tmp. + sigl & 0x7ff : 0; + break; + case RC_CHOP: + increment = 0; + break; + } + + /* Truncate the mantissa */ + tmp.sigl &= 0xfffff800; + + if (increment) { + if (tmp.sigl >= 0xfffff800) { + /* the sigl part overflows */ + if (tmp.sigh == 0xffffffff) { + /* The sigh part overflows */ + tmp.sigh = 0x80000000; + exp++; + if (exp >= EXP_OVER) + goto overflow; + } else { + tmp.sigh++; + } + tmp.sigl = 0x00000000; + } else { + /* We only need to increment sigl */ + tmp.sigl += 0x00000800; + } + } + } else + precision_loss = 0; + + l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21); + l[1] = ((tmp.sigh >> 11) & 0xfffff); + + if (exp > DOUBLE_Emax) { + overflow: + EXCEPTION(EX_Overflow); + if (!(control_word & CW_Overflow)) + return 0; + set_precision_flag_up(); + if (!(control_word & CW_Precision)) + return 0; + + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + /* Overflow to infinity */ + l[0] = 0x00000000; /* Set to */ + l[1] = 0x7ff00000; /* + INF */ + } else { + if (precision_loss) { + if (increment) + set_precision_flag_up(); + else + set_precision_flag_down(); + } + /* Add the exponent */ + l[1] |= (((exp + DOUBLE_Ebias) & 0x7ff) << 20); } - tmp.sigl = 0x00000000; - } - else - { - /* We only need to increment sigl */ - tmp.sigl += 0x00000800; - } - } - } - else - precision_loss = 0; - - l[0] = (tmp.sigl >> 11) | (tmp.sigh << 21); - l[1] = ((tmp.sigh >> 11) & 0xfffff); - - if ( exp > DOUBLE_Emax ) - { - overflow: - EXCEPTION(EX_Overflow); - if ( !(control_word & CW_Overflow) ) - return 0; - set_precision_flag_up(); - if ( !(control_word & CW_Precision) ) - return 0; - - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - /* Overflow to infinity */ - l[0] = 0x00000000; /* Set to */ - l[1] = 0x7ff00000; /* + INF */ - } - else - { - if ( precision_loss ) - { - if ( increment ) - set_precision_flag_up(); - else - set_precision_flag_down(); } - /* Add the exponent */ - l[1] |= (((exp+DOUBLE_Ebias) & 0x7ff) << 20); - } - } - } - else if (st0_tag == TAG_Zero) - { - /* Number is zero */ - l[0] = 0; - l[1] = 0; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( st0_tag == TW_Denormal ) - { - /* A denormal will always underflow. */ + } else if (st0_tag == TAG_Zero) { + /* Number is zero */ + l[0] = 0; + l[1] = 0; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if (st0_tag == TW_Denormal) { + /* A denormal will always underflow. */ #ifndef PECULIAR_486 - /* An 80486 is supposed to be able to generate - a denormal exception here, but... */ - /* Underflow has priority. */ - if ( control_word & CW_Underflow ) - denormal_operand(); + /* An 80486 is supposed to be able to generate + a denormal exception here, but... */ + /* Underflow has priority. */ + if (control_word & CW_Underflow) + denormal_operand(); #endif /* PECULIAR_486 */ - reg_copy(st0_ptr, &tmp); - goto denormal_arg; - } - else if (st0_tag == TW_Infinity) - { - l[0] = 0; - l[1] = 0x7ff00000; - } - else if (st0_tag == TW_NaN) - { - /* Is it really a NaN ? */ - if ( (exponent(st0_ptr) == EXP_OVER) - && (st0_ptr->sigh & 0x80000000) ) - { - /* See if we can get a valid NaN from the FPU_REG */ - l[0] = (st0_ptr->sigl >> 11) | (st0_ptr->sigh << 21); - l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); - if ( !(st0_ptr->sigh & 0x40000000) ) - { - /* It is a signalling NaN */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - l[1] |= (0x40000000 >> 11); + reg_copy(st0_ptr, &tmp); + goto denormal_arg; + } else if (st0_tag == TW_Infinity) { + l[0] = 0; + l[1] = 0x7ff00000; + } else if (st0_tag == TW_NaN) { + /* Is it really a NaN ? */ + if ((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000)) { + /* See if we can get a valid NaN from the FPU_REG */ + l[0] = + (st0_ptr->sigl >> 11) | (st0_ptr-> + sigh << 21); + l[1] = ((st0_ptr->sigh >> 11) & 0xfffff); + if (!(st0_ptr->sigh & 0x40000000)) { + /* It is a signalling NaN */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + l[1] |= (0x40000000 >> 11); + } + l[1] |= 0x7ff00000; + } else { + /* It is an unsupported data type */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + l[0] = 0; + l[1] = 0xfff80000; + } } - l[1] |= 0x7ff00000; - } - else - { - /* It is an unsupported data type */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - l[0] = 0; - l[1] = 0xfff80000; - } + } else if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if (control_word & CW_Invalid) { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, dfloat, 8); + FPU_put_user(0, (unsigned long __user *)dfloat); + FPU_put_user(0xfff80000, + 1 + (unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; } - } - else if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - if ( control_word & CW_Invalid ) - { - /* The masked response */ - /* Put out the QNaN indefinite */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,dfloat,8); - FPU_put_user(0, (unsigned long __user *) dfloat); - FPU_put_user(0xfff80000, 1 + (unsigned long __user *) dfloat); - RE_ENTRANT_CHECK_ON; - return 1; - } - else - return 0; - } - if ( getsign(st0_ptr) ) - l[1] |= 0x80000000; - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,dfloat,8); - FPU_put_user(l[0], (unsigned long __user *)dfloat); - FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat); - RE_ENTRANT_CHECK_ON; - - return 1; -} + if (getsign(st0_ptr)) + l[1] |= 0x80000000; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, dfloat, 8); + FPU_put_user(l[0], (unsigned long __user *)dfloat); + FPU_put_user(l[1], 1 + (unsigned long __user *)dfloat); + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a float into user memory */ int FPU_store_single(FPU_REG *st0_ptr, u_char st0_tag, float __user *single) { - long templ = 0; - unsigned long increment = 0; /* avoid gcc warnings */ - int precision_loss; - int exp; - FPU_REG tmp; + long templ = 0; + unsigned long increment = 0; /* avoid gcc warnings */ + int precision_loss; + int exp; + FPU_REG tmp; - if ( st0_tag == TAG_Valid ) - { + if (st0_tag == TAG_Valid) { - reg_copy(st0_ptr, &tmp); - exp = exponent(&tmp); + reg_copy(st0_ptr, &tmp); + exp = exponent(&tmp); - if ( exp < SINGLE_Emin ) - { - addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */ + if (exp < SINGLE_Emin) { + addexponent(&tmp, -SINGLE_Emin + 23); /* largest exp to be 22 */ - denormal_arg: + denormal_arg: - if ( (precision_loss = FPU_round_to_int(&tmp, st0_tag)) ) - { + if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) { #ifdef PECULIAR_486 - /* Did it round to a non-denormal ? */ - /* This behaviour might be regarded as peculiar, it appears - that the 80486 rounds to the dest precision, then - converts to decide underflow. */ - if ( !((tmp.sigl == 0x00800000) && - ((st0_ptr->sigh & 0x000000ff) || st0_ptr->sigl)) ) + /* Did it round to a non-denormal ? */ + /* This behaviour might be regarded as peculiar, it appears + that the 80486 rounds to the dest precision, then + converts to decide underflow. */ + if (!((tmp.sigl == 0x00800000) && + ((st0_ptr->sigh & 0x000000ff) + || st0_ptr->sigl))) #endif /* PECULIAR_486 */ - { - EXCEPTION(EX_Underflow); - /* This is a special case: see sec 16.2.5.1 of - the 80486 book */ - if ( !(control_word & CW_Underflow) ) - return 0; - } - EXCEPTION(precision_loss); - if ( !(control_word & CW_Precision) ) - return 0; - } - templ = tmp.sigl; - } - else - { - if ( tmp.sigl | (tmp.sigh & 0x000000ff) ) - { - unsigned long sigh = tmp.sigh; - unsigned long sigl = tmp.sigl; - - precision_loss = 1; - switch (control_word & CW_RC) - { - case RC_RND: - increment = ((sigh & 0xff) > 0x80) /* more than half */ - || (((sigh & 0xff) == 0x80) && sigl) /* more than half */ - || ((sigh & 0x180) == 0x180); /* round to even */ - break; - case RC_DOWN: /* towards -infinity */ - increment = signpositive(&tmp) - ? 0 : (sigl | (sigh & 0xff)); - break; - case RC_UP: /* towards +infinity */ - increment = signpositive(&tmp) - ? (sigl | (sigh & 0xff)) : 0; - break; - case RC_CHOP: - increment = 0; - break; - } - - /* Truncate part of the mantissa */ - tmp.sigl = 0; - - if (increment) - { - if ( sigh >= 0xffffff00 ) - { - /* The sigh part overflows */ - tmp.sigh = 0x80000000; - exp++; - if ( exp >= EXP_OVER ) - goto overflow; - } - else - { - tmp.sigh &= 0xffffff00; - tmp.sigh += 0x100; - } - } - else - { - tmp.sigh &= 0xffffff00; /* Finish the truncation */ - } - } - else - precision_loss = 0; - - templ = (tmp.sigh >> 8) & 0x007fffff; - - if ( exp > SINGLE_Emax ) - { - overflow: - EXCEPTION(EX_Overflow); - if ( !(control_word & CW_Overflow) ) - return 0; - set_precision_flag_up(); - if ( !(control_word & CW_Precision) ) - return 0; - - /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ - /* Masked response is overflow to infinity. */ - templ = 0x7f800000; - } - else - { - if ( precision_loss ) - { - if ( increment ) - set_precision_flag_up(); - else - set_precision_flag_down(); + { + EXCEPTION(EX_Underflow); + /* This is a special case: see sec 16.2.5.1 of + the 80486 book */ + if (!(control_word & CW_Underflow)) + return 0; + } + EXCEPTION(precision_loss); + if (!(control_word & CW_Precision)) + return 0; + } + templ = tmp.sigl; + } else { + if (tmp.sigl | (tmp.sigh & 0x000000ff)) { + unsigned long sigh = tmp.sigh; + unsigned long sigl = tmp.sigl; + + precision_loss = 1; + switch (control_word & CW_RC) { + case RC_RND: + increment = ((sigh & 0xff) > 0x80) /* more than half */ + ||(((sigh & 0xff) == 0x80) && sigl) /* more than half */ + ||((sigh & 0x180) == 0x180); /* round to even */ + break; + case RC_DOWN: /* towards -infinity */ + increment = signpositive(&tmp) + ? 0 : (sigl | (sigh & 0xff)); + break; + case RC_UP: /* towards +infinity */ + increment = signpositive(&tmp) + ? (sigl | (sigh & 0xff)) : 0; + break; + case RC_CHOP: + increment = 0; + break; + } + + /* Truncate part of the mantissa */ + tmp.sigl = 0; + + if (increment) { + if (sigh >= 0xffffff00) { + /* The sigh part overflows */ + tmp.sigh = 0x80000000; + exp++; + if (exp >= EXP_OVER) + goto overflow; + } else { + tmp.sigh &= 0xffffff00; + tmp.sigh += 0x100; + } + } else { + tmp.sigh &= 0xffffff00; /* Finish the truncation */ + } + } else + precision_loss = 0; + + templ = (tmp.sigh >> 8) & 0x007fffff; + + if (exp > SINGLE_Emax) { + overflow: + EXCEPTION(EX_Overflow); + if (!(control_word & CW_Overflow)) + return 0; + set_precision_flag_up(); + if (!(control_word & CW_Precision)) + return 0; + + /* This is a special case: see sec 16.2.5.1 of the 80486 book. */ + /* Masked response is overflow to infinity. */ + templ = 0x7f800000; + } else { + if (precision_loss) { + if (increment) + set_precision_flag_up(); + else + set_precision_flag_down(); + } + /* Add the exponent */ + templ |= ((exp + SINGLE_Ebias) & 0xff) << 23; + } } - /* Add the exponent */ - templ |= ((exp+SINGLE_Ebias) & 0xff) << 23; - } - } - } - else if (st0_tag == TAG_Zero) - { - templ = 0; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if (st0_tag == TW_Denormal) - { - reg_copy(st0_ptr, &tmp); - - /* A denormal will always underflow. */ + } else if (st0_tag == TAG_Zero) { + templ = 0; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if (st0_tag == TW_Denormal) { + reg_copy(st0_ptr, &tmp); + + /* A denormal will always underflow. */ #ifndef PECULIAR_486 - /* An 80486 is supposed to be able to generate - a denormal exception here, but... */ - /* Underflow has priority. */ - if ( control_word & CW_Underflow ) - denormal_operand(); -#endif /* PECULIAR_486 */ - goto denormal_arg; - } - else if (st0_tag == TW_Infinity) - { - templ = 0x7f800000; - } - else if (st0_tag == TW_NaN) - { - /* Is it really a NaN ? */ - if ( (exponent(st0_ptr) == EXP_OVER) && (st0_ptr->sigh & 0x80000000) ) - { - /* See if we can get a valid NaN from the FPU_REG */ - templ = st0_ptr->sigh >> 8; - if ( !(st0_ptr->sigh & 0x40000000) ) - { - /* It is a signalling NaN */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - templ |= (0x40000000 >> 8); + /* An 80486 is supposed to be able to generate + a denormal exception here, but... */ + /* Underflow has priority. */ + if (control_word & CW_Underflow) + denormal_operand(); +#endif /* PECULIAR_486 */ + goto denormal_arg; + } else if (st0_tag == TW_Infinity) { + templ = 0x7f800000; + } else if (st0_tag == TW_NaN) { + /* Is it really a NaN ? */ + if ((exponent(st0_ptr) == EXP_OVER) + && (st0_ptr->sigh & 0x80000000)) { + /* See if we can get a valid NaN from the FPU_REG */ + templ = st0_ptr->sigh >> 8; + if (!(st0_ptr->sigh & 0x40000000)) { + /* It is a signalling NaN */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + templ |= (0x40000000 >> 8); + } + templ |= 0x7f800000; + } else { + /* It is an unsupported data type */ + EXCEPTION(EX_Invalid); + if (!(control_word & CW_Invalid)) + return 0; + templ = 0xffc00000; + } } - templ |= 0x7f800000; - } - else - { - /* It is an unsupported data type */ - EXCEPTION(EX_Invalid); - if ( !(control_word & CW_Invalid) ) - return 0; - templ = 0xffc00000; - } - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x164); - return 0; - } + else { + EXCEPTION(EX_INTERNAL | 0x164); + return 0; + } #endif - } - else if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - if ( control_word & EX_Invalid ) - { - /* The masked response */ - /* Put out the QNaN indefinite */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,single,4); - FPU_put_user(0xffc00000, (unsigned long __user *) single); - RE_ENTRANT_CHECK_ON; - return 1; + } else if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + if (control_word & EX_Invalid) { + /* The masked response */ + /* Put out the QNaN indefinite */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, single, 4); + FPU_put_user(0xffc00000, + (unsigned long __user *)single); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; } - else - return 0; - } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x163); - return 0; - } + else { + EXCEPTION(EX_INTERNAL | 0x163); + return 0; + } #endif - if ( getsign(st0_ptr) ) - templ |= 0x80000000; + if (getsign(st0_ptr)) + templ |= 0x80000000; - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,single,4); - FPU_put_user(templ,(unsigned long __user *) single); - RE_ENTRANT_CHECK_ON; + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, single, 4); + FPU_put_user(templ, (unsigned long __user *)single); + RE_ENTRANT_CHECK_ON; - return 1; + return 1; } - /* Put a long long into user memory */ int FPU_store_int64(FPU_REG *st0_ptr, u_char st0_tag, long long __user *d) { - FPU_REG t; - long long tll; - int precision_loss; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + long long tll; + int precision_loss; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - ((long *)&tll)[0] = t.sigl; - ((long *)&tll)[1] = t.sigh; - if ( (precision_loss == 1) || - ((t.sigh & 0x80000000) && - !((t.sigh == 0x80000000) && (t.sigl == 0) && - signnegative(&t))) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & EX_Invalid ) - { - /* Produce something like QNaN "indefinite" */ - tll = 0x8000000000000000LL; + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + ((long *)&tll)[0] = t.sigl; + ((long *)&tll)[1] = t.sigh; + if ((precision_loss == 1) || + ((t.sigh & 0x80000000) && + !((t.sigh == 0x80000000) && (t.sigl == 0) && signnegative(&t)))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & EX_Invalid) { + /* Produce something like QNaN "indefinite" */ + tll = 0x8000000000000000LL; + } else + return 0; + } else { + if (precision_loss) + set_precision_flag(precision_loss); + if (signnegative(&t)) + tll = -tll; } - else - return 0; - } - else - { - if ( precision_loss ) - set_precision_flag(precision_loss); - if ( signnegative(&t) ) - tll = - tll; - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,8); - if (copy_to_user(d, &tll, 8)) - FPU_abort; - RE_ENTRANT_CHECK_ON; - - return 1; -} + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 8); + if (copy_to_user(d, &tll, 8)) + FPU_abort; + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a long into user memory */ int FPU_store_int32(FPU_REG *st0_ptr, u_char st0_tag, long __user *d) { - FPU_REG t; - int precision_loss; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + int precision_loss; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - if (t.sigh || - ((t.sigl & 0x80000000) && - !((t.sigl == 0x80000000) && signnegative(&t))) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & EX_Invalid ) - { - /* Produce something like QNaN "indefinite" */ - t.sigl = 0x80000000; + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + if (t.sigh || + ((t.sigl & 0x80000000) && + !((t.sigl == 0x80000000) && signnegative(&t)))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & EX_Invalid) { + /* Produce something like QNaN "indefinite" */ + t.sigl = 0x80000000; + } else + return 0; + } else { + if (precision_loss) + set_precision_flag(precision_loss); + if (signnegative(&t)) + t.sigl = -(long)t.sigl; } - else - return 0; - } - else - { - if ( precision_loss ) - set_precision_flag(precision_loss); - if ( signnegative(&t) ) - t.sigl = -(long)t.sigl; - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,4); - FPU_put_user(t.sigl, (unsigned long __user *) d); - RE_ENTRANT_CHECK_ON; - - return 1; -} + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 4); + FPU_put_user(t.sigl, (unsigned long __user *)d); + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a short into user memory */ int FPU_store_int16(FPU_REG *st0_ptr, u_char st0_tag, short __user *d) { - FPU_REG t; - int precision_loss; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + int precision_loss; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - if (t.sigh || - ((t.sigl & 0xffff8000) && - !((t.sigl == 0x8000) && signnegative(&t))) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & EX_Invalid ) - { - /* Produce something like QNaN "indefinite" */ - t.sigl = 0x8000; + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + if (t.sigh || + ((t.sigl & 0xffff8000) && + !((t.sigl == 0x8000) && signnegative(&t)))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & EX_Invalid) { + /* Produce something like QNaN "indefinite" */ + t.sigl = 0x8000; + } else + return 0; + } else { + if (precision_loss) + set_precision_flag(precision_loss); + if (signnegative(&t)) + t.sigl = -t.sigl; } - else - return 0; - } - else - { - if ( precision_loss ) - set_precision_flag(precision_loss); - if ( signnegative(&t) ) - t.sigl = -t.sigl; - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,2); - FPU_put_user((short)t.sigl, d); - RE_ENTRANT_CHECK_ON; - - return 1; -} + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 2); + FPU_put_user((short)t.sigl, d); + RE_ENTRANT_CHECK_ON; + + return 1; +} /* Put a packed bcd array into user memory */ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) { - FPU_REG t; - unsigned long long ll; - u_char b; - int i, precision_loss; - u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; - - if ( st0_tag == TAG_Empty ) - { - /* Empty register (stack underflow) */ - EXCEPTION(EX_StackUnder); - goto invalid_operand; - } - else if ( st0_tag == TAG_Special ) - { - st0_tag = FPU_Special(st0_ptr); - if ( (st0_tag == TW_Infinity) || - (st0_tag == TW_NaN) ) - { - EXCEPTION(EX_Invalid); - goto invalid_operand; + FPU_REG t; + unsigned long long ll; + u_char b; + int i, precision_loss; + u_char sign = (getsign(st0_ptr) == SIGN_NEG) ? 0x80 : 0; + + if (st0_tag == TAG_Empty) { + /* Empty register (stack underflow) */ + EXCEPTION(EX_StackUnder); + goto invalid_operand; + } else if (st0_tag == TAG_Special) { + st0_tag = FPU_Special(st0_ptr); + if ((st0_tag == TW_Infinity) || (st0_tag == TW_NaN)) { + EXCEPTION(EX_Invalid); + goto invalid_operand; + } + } + + reg_copy(st0_ptr, &t); + precision_loss = FPU_round_to_int(&t, st0_tag); + ll = significand(&t); + + /* Check for overflow, by comparing with 999999999999999999 decimal. */ + if ((t.sigh > 0x0de0b6b3) || + ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff))) { + EXCEPTION(EX_Invalid); + /* This is a special case: see sec 16.2.5.1 of the 80486 book */ + invalid_operand: + if (control_word & CW_Invalid) { + /* Produce the QNaN "indefinite" */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + for (i = 0; i < 7; i++) + FPU_put_user(0, d + i); /* These bytes "undefined" */ + FPU_put_user(0xc0, d + 7); /* This byte "undefined" */ + FPU_put_user(0xff, d + 8); + FPU_put_user(0xff, d + 9); + RE_ENTRANT_CHECK_ON; + return 1; + } else + return 0; + } else if (precision_loss) { + /* Precision loss doesn't stop the data transfer */ + set_precision_flag(precision_loss); } - } - - reg_copy(st0_ptr, &t); - precision_loss = FPU_round_to_int(&t, st0_tag); - ll = significand(&t); - - /* Check for overflow, by comparing with 999999999999999999 decimal. */ - if ( (t.sigh > 0x0de0b6b3) || - ((t.sigh == 0x0de0b6b3) && (t.sigl > 0xa763ffff)) ) - { - EXCEPTION(EX_Invalid); - /* This is a special case: see sec 16.2.5.1 of the 80486 book */ - invalid_operand: - if ( control_word & CW_Invalid ) - { - /* Produce the QNaN "indefinite" */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,10); - for ( i = 0; i < 7; i++) - FPU_put_user(0, d+i); /* These bytes "undefined" */ - FPU_put_user(0xc0, d+7); /* This byte "undefined" */ - FPU_put_user(0xff, d+8); - FPU_put_user(0xff, d+9); - RE_ENTRANT_CHECK_ON; - return 1; + + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 10); + RE_ENTRANT_CHECK_ON; + for (i = 0; i < 9; i++) { + b = FPU_div_small(&ll, 10); + b |= (FPU_div_small(&ll, 10)) << 4; + RE_ENTRANT_CHECK_OFF; + FPU_put_user(b, d + i); + RE_ENTRANT_CHECK_ON; } - else - return 0; - } - else if ( precision_loss ) - { - /* Precision loss doesn't stop the data transfer */ - set_precision_flag(precision_loss); - } - - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,10); - RE_ENTRANT_CHECK_ON; - for ( i = 0; i < 9; i++) - { - b = FPU_div_small(&ll, 10); - b |= (FPU_div_small(&ll, 10)) << 4; - RE_ENTRANT_CHECK_OFF; - FPU_put_user(b, d+i); - RE_ENTRANT_CHECK_ON; - } - RE_ENTRANT_CHECK_OFF; - FPU_put_user(sign, d+9); - RE_ENTRANT_CHECK_ON; - - return 1; + RE_ENTRANT_CHECK_OFF; + FPU_put_user(sign, d + 9); + RE_ENTRANT_CHECK_ON; + + return 1; } /*===========================================================================*/ @@ -1119,59 +973,56 @@ int FPU_store_bcd(FPU_REG *st0_ptr, u_char st0_tag, u_char __user *d) largest possible value */ int FPU_round_to_int(FPU_REG *r, u_char tag) { - u_char very_big; - unsigned eax; - - if (tag == TAG_Zero) - { - /* Make sure that zero is returned */ - significand(r) = 0; - return 0; /* o.k. */ - } - - if (exponent(r) > 63) - { - r->sigl = r->sigh = ~0; /* The largest representable number */ - return 1; /* overflow */ - } - - eax = FPU_shrxs(&r->sigl, 63 - exponent(r)); - very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */ + u_char very_big; + unsigned eax; + + if (tag == TAG_Zero) { + /* Make sure that zero is returned */ + significand(r) = 0; + return 0; /* o.k. */ + } + + if (exponent(r) > 63) { + r->sigl = r->sigh = ~0; /* The largest representable number */ + return 1; /* overflow */ + } + + eax = FPU_shrxs(&r->sigl, 63 - exponent(r)); + very_big = !(~(r->sigh) | ~(r->sigl)); /* test for 0xfff...fff */ #define half_or_more (eax & 0x80000000) #define frac_part (eax) #define more_than_half ((eax & 0x80000001) == 0x80000001) - switch (control_word & CW_RC) - { - case RC_RND: - if ( more_than_half /* nearest */ - || (half_or_more && (r->sigl & 1)) ) /* odd -> even */ - { - if ( very_big ) return 1; /* overflow */ - significand(r) ++; - return PRECISION_LOST_UP; - } - break; - case RC_DOWN: - if (frac_part && getsign(r)) - { - if ( very_big ) return 1; /* overflow */ - significand(r) ++; - return PRECISION_LOST_UP; - } - break; - case RC_UP: - if (frac_part && !getsign(r)) - { - if ( very_big ) return 1; /* overflow */ - significand(r) ++; - return PRECISION_LOST_UP; + switch (control_word & CW_RC) { + case RC_RND: + if (more_than_half /* nearest */ + || (half_or_more && (r->sigl & 1))) { /* odd -> even */ + if (very_big) + return 1; /* overflow */ + significand(r)++; + return PRECISION_LOST_UP; + } + break; + case RC_DOWN: + if (frac_part && getsign(r)) { + if (very_big) + return 1; /* overflow */ + significand(r)++; + return PRECISION_LOST_UP; + } + break; + case RC_UP: + if (frac_part && !getsign(r)) { + if (very_big) + return 1; /* overflow */ + significand(r)++; + return PRECISION_LOST_UP; + } + break; + case RC_CHOP: + break; } - break; - case RC_CHOP: - break; - } - return eax ? PRECISION_LOST_DOWN : 0; + return eax ? PRECISION_LOST_DOWN : 0; } @@ -1179,197 +1030,195 @@ int FPU_round_to_int(FPU_REG *r, u_char tag) u_char __user *fldenv(fpu_addr_modes addr_modes, u_char __user *s) { - unsigned short tag_word = 0; - u_char tag; - int i; - - if ( (addr_modes.default_mode == VM86) || - ((addr_modes.default_mode == PM16) - ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 0x0e); - FPU_get_user(control_word, (unsigned short __user *) s); - FPU_get_user(partial_status, (unsigned short __user *) (s+2)); - FPU_get_user(tag_word, (unsigned short __user *) (s+4)); - FPU_get_user(instruction_address.offset, (unsigned short __user *) (s+6)); - FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+8)); - FPU_get_user(operand_address.offset, (unsigned short __user *) (s+0x0a)); - FPU_get_user(operand_address.selector, (unsigned short __user *) (s+0x0c)); - RE_ENTRANT_CHECK_ON; - s += 0x0e; - if ( addr_modes.default_mode == VM86 ) - { - instruction_address.offset - += (instruction_address.selector & 0xf000) << 4; - operand_address.offset += (operand_address.selector & 0xf000) << 4; + unsigned short tag_word = 0; + u_char tag; + int i; + + if ((addr_modes.default_mode == VM86) || + ((addr_modes.default_mode == PM16) + ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 0x0e); + FPU_get_user(control_word, (unsigned short __user *)s); + FPU_get_user(partial_status, (unsigned short __user *)(s + 2)); + FPU_get_user(tag_word, (unsigned short __user *)(s + 4)); + FPU_get_user(instruction_address.offset, + (unsigned short __user *)(s + 6)); + FPU_get_user(instruction_address.selector, + (unsigned short __user *)(s + 8)); + FPU_get_user(operand_address.offset, + (unsigned short __user *)(s + 0x0a)); + FPU_get_user(operand_address.selector, + (unsigned short __user *)(s + 0x0c)); + RE_ENTRANT_CHECK_ON; + s += 0x0e; + if (addr_modes.default_mode == VM86) { + instruction_address.offset + += (instruction_address.selector & 0xf000) << 4; + operand_address.offset += + (operand_address.selector & 0xf000) << 4; + } + } else { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 0x1c); + FPU_get_user(control_word, (unsigned short __user *)s); + FPU_get_user(partial_status, (unsigned short __user *)(s + 4)); + FPU_get_user(tag_word, (unsigned short __user *)(s + 8)); + FPU_get_user(instruction_address.offset, + (unsigned long __user *)(s + 0x0c)); + FPU_get_user(instruction_address.selector, + (unsigned short __user *)(s + 0x10)); + FPU_get_user(instruction_address.opcode, + (unsigned short __user *)(s + 0x12)); + FPU_get_user(operand_address.offset, + (unsigned long __user *)(s + 0x14)); + FPU_get_user(operand_address.selector, + (unsigned long __user *)(s + 0x18)); + RE_ENTRANT_CHECK_ON; + s += 0x1c; } - } - else - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ, s, 0x1c); - FPU_get_user(control_word, (unsigned short __user *) s); - FPU_get_user(partial_status, (unsigned short __user *) (s+4)); - FPU_get_user(tag_word, (unsigned short __user *) (s+8)); - FPU_get_user(instruction_address.offset, (unsigned long __user *) (s+0x0c)); - FPU_get_user(instruction_address.selector, (unsigned short __user *) (s+0x10)); - FPU_get_user(instruction_address.opcode, (unsigned short __user *) (s+0x12)); - FPU_get_user(operand_address.offset, (unsigned long __user *) (s+0x14)); - FPU_get_user(operand_address.selector, (unsigned long __user *) (s+0x18)); - RE_ENTRANT_CHECK_ON; - s += 0x1c; - } #ifdef PECULIAR_486 - control_word &= ~0xe080; -#endif /* PECULIAR_486 */ - - top = (partial_status >> SW_Top_Shift) & 7; - - if ( partial_status & ~control_word & CW_Exceptions ) - partial_status |= (SW_Summary | SW_Backward); - else - partial_status &= ~(SW_Summary | SW_Backward); - - for ( i = 0; i < 8; i++ ) - { - tag = tag_word & 3; - tag_word >>= 2; - - if ( tag == TAG_Empty ) - /* New tag is empty. Accept it */ - FPU_settag(i, TAG_Empty); - else if ( FPU_gettag(i) == TAG_Empty ) - { - /* Old tag is empty and new tag is not empty. New tag is determined - by old reg contents */ - if ( exponent(&fpu_register(i)) == - EXTENDED_Ebias ) - { - if ( !(fpu_register(i).sigl | fpu_register(i).sigh) ) - FPU_settag(i, TAG_Zero); - else - FPU_settag(i, TAG_Special); - } - else if ( exponent(&fpu_register(i)) == 0x7fff - EXTENDED_Ebias ) - { - FPU_settag(i, TAG_Special); - } - else if ( fpu_register(i).sigh & 0x80000000 ) - FPU_settag(i, TAG_Valid); - else - FPU_settag(i, TAG_Special); /* An Un-normal */ - } - /* Else old tag is not empty and new tag is not empty. Old tag - remains correct */ - } - - return s; -} + control_word &= ~0xe080; +#endif /* PECULIAR_486 */ + + top = (partial_status >> SW_Top_Shift) & 7; + + if (partial_status & ~control_word & CW_Exceptions) + partial_status |= (SW_Summary | SW_Backward); + else + partial_status &= ~(SW_Summary | SW_Backward); + + for (i = 0; i < 8; i++) { + tag = tag_word & 3; + tag_word >>= 2; + + if (tag == TAG_Empty) + /* New tag is empty. Accept it */ + FPU_settag(i, TAG_Empty); + else if (FPU_gettag(i) == TAG_Empty) { + /* Old tag is empty and new tag is not empty. New tag is determined + by old reg contents */ + if (exponent(&fpu_register(i)) == -EXTENDED_Ebias) { + if (! + (fpu_register(i).sigl | fpu_register(i). + sigh)) + FPU_settag(i, TAG_Zero); + else + FPU_settag(i, TAG_Special); + } else if (exponent(&fpu_register(i)) == + 0x7fff - EXTENDED_Ebias) { + FPU_settag(i, TAG_Special); + } else if (fpu_register(i).sigh & 0x80000000) + FPU_settag(i, TAG_Valid); + else + FPU_settag(i, TAG_Special); /* An Un-normal */ + } + /* Else old tag is not empty and new tag is not empty. Old tag + remains correct */ + } + return s; +} void frstor(fpu_addr_modes addr_modes, u_char __user *data_address) { - int i, regnr; - u_char __user *s = fldenv(addr_modes, data_address); - int offset = (top & 7) * 10, other = 80 - offset; - - /* Copy all registers in stack order. */ - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_READ,s,80); - __copy_from_user(register_base+offset, s, other); - if ( offset ) - __copy_from_user(register_base, s+other, offset); - RE_ENTRANT_CHECK_ON; - - for ( i = 0; i < 8; i++ ) - { - regnr = (i+top) & 7; - if ( FPU_gettag(regnr) != TAG_Empty ) - /* The loaded data over-rides all other cases. */ - FPU_settag(regnr, FPU_tagof(&st(i))); - } + int i, regnr; + u_char __user *s = fldenv(addr_modes, data_address); + int offset = (top & 7) * 10, other = 80 - offset; + + /* Copy all registers in stack order. */ + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_READ, s, 80); + __copy_from_user(register_base + offset, s, other); + if (offset) + __copy_from_user(register_base, s + other, offset); + RE_ENTRANT_CHECK_ON; + + for (i = 0; i < 8; i++) { + regnr = (i + top) & 7; + if (FPU_gettag(regnr) != TAG_Empty) + /* The loaded data over-rides all other cases. */ + FPU_settag(regnr, FPU_tagof(&st(i))); + } } - u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d) { - if ( (addr_modes.default_mode == VM86) || - ((addr_modes.default_mode == PM16) - ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX)) ) - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,14); + if ((addr_modes.default_mode == VM86) || + ((addr_modes.default_mode == PM16) + ^ (addr_modes.override.operand_size == OP_SIZE_PREFIX))) { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 14); #ifdef PECULIAR_486 - FPU_put_user(control_word & ~0xe080, (unsigned long __user *) d); + FPU_put_user(control_word & ~0xe080, (unsigned long __user *)d); #else - FPU_put_user(control_word, (unsigned short __user *) d); + FPU_put_user(control_word, (unsigned short __user *)d); #endif /* PECULIAR_486 */ - FPU_put_user(status_word(), (unsigned short __user *) (d+2)); - FPU_put_user(fpu_tag_word, (unsigned short __user *) (d+4)); - FPU_put_user(instruction_address.offset, (unsigned short __user *) (d+6)); - FPU_put_user(operand_address.offset, (unsigned short __user *) (d+0x0a)); - if ( addr_modes.default_mode == VM86 ) - { - FPU_put_user((instruction_address.offset & 0xf0000) >> 4, - (unsigned short __user *) (d+8)); - FPU_put_user((operand_address.offset & 0xf0000) >> 4, - (unsigned short __user *) (d+0x0c)); - } - else - { - FPU_put_user(instruction_address.selector, (unsigned short __user *) (d+8)); - FPU_put_user(operand_address.selector, (unsigned short __user *) (d+0x0c)); - } - RE_ENTRANT_CHECK_ON; - d += 0x0e; - } - else - { - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE, d, 7*4); + FPU_put_user(status_word(), (unsigned short __user *)(d + 2)); + FPU_put_user(fpu_tag_word, (unsigned short __user *)(d + 4)); + FPU_put_user(instruction_address.offset, + (unsigned short __user *)(d + 6)); + FPU_put_user(operand_address.offset, + (unsigned short __user *)(d + 0x0a)); + if (addr_modes.default_mode == VM86) { + FPU_put_user((instruction_address. + offset & 0xf0000) >> 4, + (unsigned short __user *)(d + 8)); + FPU_put_user((operand_address.offset & 0xf0000) >> 4, + (unsigned short __user *)(d + 0x0c)); + } else { + FPU_put_user(instruction_address.selector, + (unsigned short __user *)(d + 8)); + FPU_put_user(operand_address.selector, + (unsigned short __user *)(d + 0x0c)); + } + RE_ENTRANT_CHECK_ON; + d += 0x0e; + } else { + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 7 * 4); #ifdef PECULIAR_486 - control_word &= ~0xe080; - /* An 80486 sets nearly all of the reserved bits to 1. */ - control_word |= 0xffff0040; - partial_status = status_word() | 0xffff0000; - fpu_tag_word |= 0xffff0000; - I387.soft.fcs &= ~0xf8000000; - I387.soft.fos |= 0xffff0000; + control_word &= ~0xe080; + /* An 80486 sets nearly all of the reserved bits to 1. */ + control_word |= 0xffff0040; + partial_status = status_word() | 0xffff0000; + fpu_tag_word |= 0xffff0000; + I387.soft.fcs &= ~0xf8000000; + I387.soft.fos |= 0xffff0000; #endif /* PECULIAR_486 */ - if (__copy_to_user(d, &control_word, 7*4)) - FPU_abort; - RE_ENTRANT_CHECK_ON; - d += 0x1c; - } - - control_word |= CW_Exceptions; - partial_status &= ~(SW_Summary | SW_Backward); - - return d; -} + if (__copy_to_user(d, &control_word, 7 * 4)) + FPU_abort; + RE_ENTRANT_CHECK_ON; + d += 0x1c; + } + control_word |= CW_Exceptions; + partial_status &= ~(SW_Summary | SW_Backward); + + return d; +} void fsave(fpu_addr_modes addr_modes, u_char __user *data_address) { - u_char __user *d; - int offset = (top & 7) * 10, other = 80 - offset; + u_char __user *d; + int offset = (top & 7) * 10, other = 80 - offset; - d = fstenv(addr_modes, data_address); + d = fstenv(addr_modes, data_address); - RE_ENTRANT_CHECK_OFF; - FPU_access_ok(VERIFY_WRITE,d,80); + RE_ENTRANT_CHECK_OFF; + FPU_access_ok(VERIFY_WRITE, d, 80); - /* Copy all registers in stack order. */ - if (__copy_to_user(d, register_base+offset, other)) - FPU_abort; - if ( offset ) - if (__copy_to_user(d+other, register_base, offset)) - FPU_abort; - RE_ENTRANT_CHECK_ON; + /* Copy all registers in stack order. */ + if (__copy_to_user(d, register_base + offset, other)) + FPU_abort; + if (offset) + if (__copy_to_user(d + other, register_base, offset)) + FPU_abort; + RE_ENTRANT_CHECK_ON; - finit(); + finit(); } /*===========================================================================*/ diff --git a/arch/x86/math-emu/reg_mul.c b/arch/x86/math-emu/reg_mul.c index 40f50b61bc6..36c37f71f71 100644 --- a/arch/x86/math-emu/reg_mul.c +++ b/arch/x86/math-emu/reg_mul.c @@ -20,7 +20,6 @@ #include "reg_constant.h" #include "fpu_system.h" - /* Multiply two registers to give a register result. The sources are st(deststnr) and (b,tagb,signb). @@ -29,104 +28,88 @@ /* This routine must be called with non-empty source registers */ int FPU_mul(FPU_REG const *b, u_char tagb, int deststnr, int control_w) { - FPU_REG *a = &st(deststnr); - FPU_REG *dest = a; - u_char taga = FPU_gettagi(deststnr); - u_char saved_sign = getsign(dest); - u_char sign = (getsign(a) ^ getsign(b)); - int tag; - + FPU_REG *a = &st(deststnr); + FPU_REG *dest = a; + u_char taga = FPU_gettagi(deststnr); + u_char saved_sign = getsign(dest); + u_char sign = (getsign(a) ^ getsign(b)); + int tag; - if ( !(taga | tagb) ) - { - /* Both regs Valid, this should be the most common case. */ + if (!(taga | tagb)) { + /* Both regs Valid, this should be the most common case. */ - tag = FPU_u_mul(a, b, dest, control_w, sign, exponent(a) + exponent(b)); - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; + tag = + FPU_u_mul(a, b, dest, control_w, sign, + exponent(a) + exponent(b)); + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; } - FPU_settagi(deststnr, tag); - return tag; - } - if ( taga == TAG_Special ) - taga = FPU_Special(a); - if ( tagb == TAG_Special ) - tagb = FPU_Special(b); + if (taga == TAG_Special) + taga = FPU_Special(a); + if (tagb == TAG_Special) + tagb = FPU_Special(b); - if ( ((taga == TAG_Valid) && (tagb == TW_Denormal)) + if (((taga == TAG_Valid) && (tagb == TW_Denormal)) || ((taga == TW_Denormal) && (tagb == TAG_Valid)) - || ((taga == TW_Denormal) && (tagb == TW_Denormal)) ) - { - FPU_REG x, y; - if ( denormal_operand() < 0 ) - return FPU_Exception; - - FPU_to_exp16(a, &x); - FPU_to_exp16(b, &y); - tag = FPU_u_mul(&x, &y, dest, control_w, sign, - exponent16(&x) + exponent16(&y)); - if ( tag < 0 ) - { - setsign(dest, saved_sign); - return tag; - } - FPU_settagi(deststnr, tag); - return tag; - } - else if ( (taga <= TW_Denormal) && (tagb <= TW_Denormal) ) - { - if ( ((tagb == TW_Denormal) || (taga == TW_Denormal)) - && (denormal_operand() < 0) ) - return FPU_Exception; + || ((taga == TW_Denormal) && (tagb == TW_Denormal))) { + FPU_REG x, y; + if (denormal_operand() < 0) + return FPU_Exception; - /* Must have either both arguments == zero, or - one valid and the other zero. - The result is therefore zero. */ - FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); - /* The 80486 book says that the answer is +0, but a real - 80486 behaves this way. - IEEE-754 apparently says it should be this way. */ - setsign(dest, sign); - return TAG_Zero; - } - /* Must have infinities, NaNs, etc */ - else if ( (taga == TW_NaN) || (tagb == TW_NaN) ) - { - return real_2op_NaN(b, tagb, deststnr, &st(0)); - } - else if ( ((taga == TW_Infinity) && (tagb == TAG_Zero)) - || ((tagb == TW_Infinity) && (taga == TAG_Zero)) ) - { - return arith_invalid(deststnr); /* Zero*Infinity is invalid */ - } - else if ( ((taga == TW_Denormal) || (tagb == TW_Denormal)) - && (denormal_operand() < 0) ) - { - return FPU_Exception; - } - else if (taga == TW_Infinity) - { - FPU_copy_to_regi(a, TAG_Special, deststnr); - setsign(dest, sign); - return TAG_Special; - } - else if (tagb == TW_Infinity) - { - FPU_copy_to_regi(b, TAG_Special, deststnr); - setsign(dest, sign); - return TAG_Special; - } + FPU_to_exp16(a, &x); + FPU_to_exp16(b, &y); + tag = FPU_u_mul(&x, &y, dest, control_w, sign, + exponent16(&x) + exponent16(&y)); + if (tag < 0) { + setsign(dest, saved_sign); + return tag; + } + FPU_settagi(deststnr, tag); + return tag; + } else if ((taga <= TW_Denormal) && (tagb <= TW_Denormal)) { + if (((tagb == TW_Denormal) || (taga == TW_Denormal)) + && (denormal_operand() < 0)) + return FPU_Exception; + /* Must have either both arguments == zero, or + one valid and the other zero. + The result is therefore zero. */ + FPU_copy_to_regi(&CONST_Z, TAG_Zero, deststnr); + /* The 80486 book says that the answer is +0, but a real + 80486 behaves this way. + IEEE-754 apparently says it should be this way. */ + setsign(dest, sign); + return TAG_Zero; + } + /* Must have infinities, NaNs, etc */ + else if ((taga == TW_NaN) || (tagb == TW_NaN)) { + return real_2op_NaN(b, tagb, deststnr, &st(0)); + } else if (((taga == TW_Infinity) && (tagb == TAG_Zero)) + || ((tagb == TW_Infinity) && (taga == TAG_Zero))) { + return arith_invalid(deststnr); /* Zero*Infinity is invalid */ + } else if (((taga == TW_Denormal) || (tagb == TW_Denormal)) + && (denormal_operand() < 0)) { + return FPU_Exception; + } else if (taga == TW_Infinity) { + FPU_copy_to_regi(a, TAG_Special, deststnr); + setsign(dest, sign); + return TAG_Special; + } else if (tagb == TW_Infinity) { + FPU_copy_to_regi(b, TAG_Special, deststnr); + setsign(dest, sign); + return TAG_Special; + } #ifdef PARANOID - else - { - EXCEPTION(EX_INTERNAL|0x102); - return FPU_Exception; - } -#endif /* PARANOID */ + else { + EXCEPTION(EX_INTERNAL | 0x102); + return FPU_Exception; + } +#endif /* PARANOID */ return 0; } diff --git a/arch/x86/math-emu/status_w.h b/arch/x86/math-emu/status_w.h index 59e73302aa6..54a3f226982 100644 --- a/arch/x86/math-emu/status_w.h +++ b/arch/x86/math-emu/status_w.h @@ -10,7 +10,7 @@ #ifndef _STATUS_H_ #define _STATUS_H_ -#include "fpu_emu.h" /* for definition of PECULIAR_486 */ +#include "fpu_emu.h" /* for definition of PECULIAR_486 */ #ifdef __ASSEMBLY__ #define Const__(x) $##x @@ -34,7 +34,7 @@ #define SW_Denorm_Op Const__(0x0002) /* denormalized operand */ #define SW_Invalid Const__(0x0001) /* invalid operation */ -#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ +#define SW_Exc_Mask Const__(0x27f) /* Status word exception bit mask */ #ifndef __ASSEMBLY__ @@ -50,8 +50,8 @@ ((partial_status & ~SW_Top & 0xffff) | ((top << SW_Top_Shift) & SW_Top)) static inline void setcc(int cc) { - partial_status &= ~(SW_C0|SW_C1|SW_C2|SW_C3); - partial_status |= (cc) & (SW_C0|SW_C1|SW_C2|SW_C3); + partial_status &= ~(SW_C0 | SW_C1 | SW_C2 | SW_C3); + partial_status |= (cc) & (SW_C0 | SW_C1 | SW_C2 | SW_C3); } #ifdef PECULIAR_486 diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32 index 362b4ad082d..c36ae88bb54 100644 --- a/arch/x86/mm/Makefile_32 +++ b/arch/x86/mm/Makefile_32 @@ -2,9 +2,8 @@ # Makefile for the linux i386-specific parts of the memory manager. # -obj-y := init_32.o pgtable_32.o fault_32.o ioremap_32.o extable_32.o pageattr_32.o mmap_32.o +obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_NUMA) += discontig_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap_32.o diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64 index 6bcb47945b8..688c8c28ac8 100644 --- a/arch/x86/mm/Makefile_64 +++ b/arch/x86/mm/Makefile_64 @@ -2,9 +2,8 @@ # Makefile for the linux x86_64-specific parts of the memory manager. # -obj-y := init_64.o fault_64.o ioremap_64.o extable_64.o pageattr_64.o mmap_64.o +obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NUMA) += numa_64.o obj-$(CONFIG_K8_NUMA) += k8topology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_64.o - diff --git a/arch/x86/mm/boot_ioremap_32.c b/arch/x86/mm/boot_ioremap_32.c deleted file mode 100644 index f14da2a53ec..00000000000 --- a/arch/x86/mm/boot_ioremap_32.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * arch/i386/mm/boot_ioremap.c - * - * Re-map functions for early boot-time before paging_init() when the - * boot-time pagetables are still in use - * - * Written by Dave Hansen <haveblue@us.ibm.com> - */ - - -/* - * We need to use the 2-level pagetable functions, but CONFIG_X86_PAE - * keeps that from happening. If anyone has a better way, I'm listening. - * - * boot_pte_t is defined only if this all works correctly - */ - -#undef CONFIG_X86_PAE -#undef CONFIG_PARAVIRT -#include <asm/page.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <linux/init.h> -#include <linux/stddef.h> - -/* - * I'm cheating here. It is known that the two boot PTE pages are - * allocated next to each other. I'm pretending that they're just - * one big array. - */ - -#define BOOT_PTE_PTRS (PTRS_PER_PTE*2) - -static unsigned long boot_pte_index(unsigned long vaddr) -{ - return __pa(vaddr) >> PAGE_SHIFT; -} - -static inline boot_pte_t* boot_vaddr_to_pte(void *address) -{ - boot_pte_t* boot_pg = (boot_pte_t*)pg0; - return &boot_pg[boot_pte_index((unsigned long)address)]; -} - -/* - * This is only for a caller who is clever enough to page-align - * phys_addr and virtual_source, and who also has a preference - * about which virtual address from which to steal ptes - */ -static void __boot_ioremap(unsigned long phys_addr, unsigned long nrpages, - void* virtual_source) -{ - boot_pte_t* pte; - int i; - char *vaddr = virtual_source; - - pte = boot_vaddr_to_pte(virtual_source); - for (i=0; i < nrpages; i++, phys_addr += PAGE_SIZE, pte++) { - set_pte(pte, pfn_pte(phys_addr>>PAGE_SHIFT, PAGE_KERNEL)); - __flush_tlb_one(&vaddr[i*PAGE_SIZE]); - } -} - -/* the virtual space we're going to remap comes from this array */ -#define BOOT_IOREMAP_PAGES 4 -#define BOOT_IOREMAP_SIZE (BOOT_IOREMAP_PAGES*PAGE_SIZE) -static __initdata char boot_ioremap_space[BOOT_IOREMAP_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); - -/* - * This only applies to things which need to ioremap before paging_init() - * bt_ioremap() and plain ioremap() are both useless at this point. - * - * When used, we're still using the boot-time pagetables, which only - * have 2 PTE pages mapping the first 8MB - * - * There is no unmap. The boot-time PTE pages aren't used after boot. - * If you really want the space back, just remap it yourself. - * boot_ioremap(&ioremap_space-PAGE_OFFSET, BOOT_IOREMAP_SIZE) - */ -__init void* boot_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr, offset; - unsigned int nrpages; - - last_addr = phys_addr + size - 1; - - /* page align the requested address */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - nrpages = size >> PAGE_SHIFT; - if (nrpages > BOOT_IOREMAP_PAGES) - return NULL; - - __boot_ioremap(phys_addr, nrpages, boot_ioremap_space); - - return &boot_ioremap_space[offset]; -} diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 13a474d3c6e..04b1d20e261 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -32,6 +32,7 @@ #include <linux/kexec.h> #include <linux/pfn.h> #include <linux/swap.h> +#include <linux/acpi.h> #include <asm/e820.h> #include <asm/setup.h> @@ -103,14 +104,10 @@ extern unsigned long highend_pfn, highstart_pfn; #define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; static void *node_remap_start_vaddr[MAX_NUMNODES]; void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; static unsigned long kva_start_pfn; static unsigned long kva_pages; /* @@ -167,6 +164,22 @@ static void __init allocate_pgdat(int nid) } } +#ifdef CONFIG_DISCONTIGMEM +/* + * In the discontig memory model, a portion of the kernel virtual area (KVA) + * is reserved and portions of nodes are mapped using it. This is to allow + * node-local memory to be allocated for structures that would normally require + * ZONE_NORMAL. The memory is allocated with alloc_remap() and callers + * should be prepared to allocate from the bootmem allocator instead. This KVA + * mechanism is incompatible with SPARSEMEM as it makes assumptions about the + * layout of memory that are broken if alloc_remap() succeeds for some of the + * map and fails for others + */ +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +static unsigned long node_remap_offset[MAX_NUMNODES]; + void *alloc_remap(int nid, unsigned long size) { void *allocation = node_remap_alloc_vaddr[nid]; @@ -263,11 +276,46 @@ static unsigned long calculate_numa_remap_pages(void) return reserve_pages; } +static void init_remap_allocator(int nid) +{ + node_remap_start_vaddr[nid] = pfn_to_kaddr( + kva_start_pfn + node_remap_offset[nid]); + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) pfn_to_kaddr(highstart_pfn + + node_remap_offset[nid] + node_remap_size[nid])); +} +#else +void *alloc_remap(int nid, unsigned long size) +{ + return NULL; +} + +static unsigned long calculate_numa_remap_pages(void) +{ + return 0; +} + +static void init_remap_allocator(int nid) +{ +} + +void __init remap_numa_kva(void) +{ +} +#endif /* CONFIG_DISCONTIGMEM */ + extern void setup_bootmem_allocator(void); unsigned long __init setup_memory(void) { int nid; unsigned long system_start_pfn, system_max_low_pfn; + unsigned long wasted_pages; /* * When mapping a NUMA machine we allocate the node_mem_map arrays @@ -288,11 +336,18 @@ unsigned long __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD /* Numa kva area is below the initrd */ - if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) - kva_start_pfn = PFN_DOWN(boot_params.hdr.ramdisk_image) + if (initrd_start) + kva_start_pfn = PFN_DOWN(initrd_start - PAGE_OFFSET) - kva_pages; #endif - kva_start_pfn -= kva_start_pfn & (PTRS_PER_PTE-1); + + /* + * We waste pages past at the end of the KVA for no good reason other + * than how it is located. This is bad. + */ + wasted_pages = kva_start_pfn & (PTRS_PER_PTE-1); + kva_start_pfn -= wasted_pages; + kva_pages += wasted_pages; system_max_low_pfn = max_low_pfn = find_max_low_pfn(); printk("kva_start_pfn ~ %ld find_max_low_pfn() ~ %ld\n", @@ -318,19 +373,9 @@ unsigned long __init setup_memory(void) printk("Low memory ends at vaddr %08lx\n", (ulong) pfn_to_kaddr(max_low_pfn)); for_each_online_node(nid) { - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - /* Init the node remap allocator */ - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); + init_remap_allocator(nid); allocate_pgdat(nid); - printk ("node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) pfn_to_kaddr(highstart_pfn - + node_remap_offset[nid] + node_remap_size[nid])); } printk("High memory starts at vaddr %08lx\n", (ulong) pfn_to_kaddr(highstart_pfn)); @@ -345,7 +390,8 @@ unsigned long __init setup_memory(void) void __init numa_kva_reserve(void) { - reserve_bootmem(PFN_PHYS(kva_start_pfn),PFN_PHYS(kva_pages)); + if (kva_pages) + reserve_bootmem(PFN_PHYS(kva_start_pfn), PFN_PHYS(kva_pages)); } void __init zone_sizes_init(void) @@ -430,3 +476,29 @@ int memory_add_physaddr_to_nid(u64 addr) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif + +#ifndef CONFIG_HAVE_ARCH_PARSE_SRAT +/* + * XXX FIXME: Make SLIT table parsing available to 32-bit NUMA + * + * These stub functions are needed to compile 32-bit NUMA when SRAT is + * not set. There are functions in srat_64.c for parsing this table + * and it may be possible to make them common functions. + */ +void acpi_numa_slit_init (struct acpi_table_slit *slit) +{ + printk(KERN_INFO "ACPI: No support for parsing SLIT table\n"); +} + +void acpi_numa_processor_affinity_init (struct acpi_srat_cpu_affinity *pa) +{ +} + +void acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma) +{ +} + +void acpi_numa_arch_fixup(void) +{ +} +#endif /* CONFIG_HAVE_ARCH_PARSE_SRAT */ diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c new file mode 100644 index 00000000000..7e8db53528a --- /dev/null +++ b/arch/x86/mm/extable.c @@ -0,0 +1,62 @@ +#include <linux/module.h> +#include <linux/spinlock.h> +#include <asm/uaccess.h> + + +int fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *fixup; + +#ifdef CONFIG_PNPBIOS + if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) { + extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; + extern u32 pnp_bios_is_utter_crap; + pnp_bios_is_utter_crap = 1; + printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); + __asm__ volatile( + "movl %0, %%esp\n\t" + "jmp *%1\n\t" + : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); + panic("do_trap: can't hit this"); + } +#endif + + fixup = search_exception_tables(regs->ip); + if (fixup) { + regs->ip = fixup->fixup; + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_64 +/* + * Need to defined our own search_extable on X86_64 to work around + * a B stepping K8 bug. + */ +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + /* B stepping K8 bug */ + if ((value >> 32) == 0) + value |= 0xffffffffUL << 32; + + while (first <= last) { + const struct exception_table_entry *mid; + long diff; + + mid = (last - first) / 2 + first; + diff = mid->insn - value; + if (diff == 0) + return mid; + else if (diff < 0) + first = mid+1; + else + last = mid-1; + } + return NULL; +} +#endif diff --git a/arch/x86/mm/extable_32.c b/arch/x86/mm/extable_32.c deleted file mode 100644 index 0ce4f22a263..00000000000 --- a/arch/x86/mm/extable_32.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * linux/arch/i386/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <asm/uaccess.h> - -int fixup_exception(struct pt_regs *regs) -{ - const struct exception_table_entry *fixup; - -#ifdef CONFIG_PNPBIOS - if (unlikely(SEGMENT_IS_PNP_CODE(regs->xcs))) - { - extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp; - extern u32 pnp_bios_is_utter_crap; - pnp_bios_is_utter_crap = 1; - printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n"); - __asm__ volatile( - "movl %0, %%esp\n\t" - "jmp *%1\n\t" - : : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip)); - panic("do_trap: can't hit this"); - } -#endif - - fixup = search_exception_tables(regs->eip); - if (fixup) { - regs->eip = fixup->fixup; - return 1; - } - - return 0; -} diff --git a/arch/x86/mm/extable_64.c b/arch/x86/mm/extable_64.c deleted file mode 100644 index 79ac6e7100a..00000000000 --- a/arch/x86/mm/extable_64.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * linux/arch/x86_64/mm/extable.c - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/init.h> -#include <asm/uaccess.h> - -/* Simple binary search */ -const struct exception_table_entry * -search_extable(const struct exception_table_entry *first, - const struct exception_table_entry *last, - unsigned long value) -{ - /* Work around a B stepping K8 bug */ - if ((value >> 32) == 0) - value |= 0xffffffffUL << 32; - - while (first <= last) { - const struct exception_table_entry *mid; - long diff; - - mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) - return mid; - else if (diff < 0) - first = mid+1; - else - last = mid-1; - } - return NULL; -} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c new file mode 100644 index 00000000000..e28cc5277b1 --- /dev/null +++ b/arch/x86/mm/fault.c @@ -0,0 +1,986 @@ +/* + * Copyright (C) 1995 Linus Torvalds + * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. + */ + +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/ptrace.h> +#include <linux/mman.h> +#include <linux/mm.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/init.h> +#include <linux/tty.h> +#include <linux/vt_kern.h> /* For unblank_screen() */ +#include <linux/compiler.h> +#include <linux/highmem.h> +#include <linux/bootmem.h> /* for max_low_pfn */ +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/kdebug.h> + +#include <asm/system.h> +#include <asm/desc.h> +#include <asm/segment.h> +#include <asm/pgalloc.h> +#include <asm/smp.h> +#include <asm/tlbflush.h> +#include <asm/proto.h> +#include <asm-generic/sections.h> + +/* + * Page fault error code bits + * bit 0 == 0 means no page found, 1 means protection fault + * bit 1 == 0 means read, 1 means write + * bit 2 == 0 means kernel, 1 means user-mode + * bit 3 == 1 means use of reserved bit detected + * bit 4 == 1 means fault was an instruction fetch + */ +#define PF_PROT (1<<0) +#define PF_WRITE (1<<1) +#define PF_USER (1<<2) +#define PF_RSVD (1<<3) +#define PF_INSTR (1<<4) + +static inline int notify_page_fault(struct pt_regs *regs) +{ +#ifdef CONFIG_KPROBES + int ret = 0; + + /* kprobe_running() needs smp_processor_id() */ +#ifdef CONFIG_X86_32 + if (!user_mode_vm(regs)) { +#else + if (!user_mode(regs)) { +#endif + preempt_disable(); + if (kprobe_running() && kprobe_fault_handler(regs, 14)) + ret = 1; + preempt_enable(); + } + + return ret; +#else + return 0; +#endif +} + +/* + * X86_32 + * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. + * Check that here and ignore it. + * + * X86_64 + * Sometimes the CPU reports invalid exceptions on prefetch. + * Check that here and ignore it. + * + * Opcode checker based on code by Richard Brunner + */ +static int is_prefetch(struct pt_regs *regs, unsigned long addr, + unsigned long error_code) +{ + unsigned char *instr; + int scan_more = 1; + int prefetch = 0; + unsigned char *max_instr; + +#ifdef CONFIG_X86_32 + if (!(__supported_pte_mask & _PAGE_NX)) + return 0; +#endif + + /* If it was a exec fault on NX page, ignore */ + if (error_code & PF_INSTR) + return 0; + + instr = (unsigned char *)convert_ip_to_linear(current, regs); + max_instr = instr + 15; + + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + return 0; + + while (scan_more && instr < max_instr) { + unsigned char opcode; + unsigned char instr_hi; + unsigned char instr_lo; + + if (probe_kernel_address(instr, opcode)) + break; + + instr_hi = opcode & 0xf0; + instr_lo = opcode & 0x0f; + instr++; + + switch (instr_hi) { + case 0x20: + case 0x30: + /* + * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. + * In X86_64 long mode, the CPU will signal invalid + * opcode if some of these prefixes are present so + * X86_64 will never get here anyway + */ + scan_more = ((instr_lo & 7) == 0x6); + break; +#ifdef CONFIG_X86_64 + case 0x40: + /* + * In AMD64 long mode 0x40..0x4F are valid REX prefixes + * Need to figure out under what instruction mode the + * instruction was issued. Could check the LDT for lm, + * but for now it's good enough to assume that long + * mode only uses well known segments or kernel. + */ + scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); + break; +#endif + case 0x60: + /* 0x64 thru 0x67 are valid prefixes in all modes. */ + scan_more = (instr_lo & 0xC) == 0x4; + break; + case 0xF0: + /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */ + scan_more = !instr_lo || (instr_lo>>1) == 1; + break; + case 0x00: + /* Prefetch instruction is 0x0F0D or 0x0F18 */ + scan_more = 0; + + if (probe_kernel_address(instr, opcode)) + break; + prefetch = (instr_lo == 0xF) && + (opcode == 0x0D || opcode == 0x18); + break; + default: + scan_more = 0; + break; + } + } + return prefetch; +} + +static void force_sig_info_fault(int si_signo, int si_code, + unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)address; + force_sig_info(si_signo, &info, tsk); +} + +#ifdef CONFIG_X86_64 +static int bad_address(void *p) +{ + unsigned long dummy; + return probe_kernel_address((unsigned long *)p, dummy); +} +#endif + +void dump_pagetable(unsigned long address) +{ +#ifdef CONFIG_X86_32 + __typeof__(pte_val(__pte(0))) page; + + page = read_cr3(); + page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; +#ifdef CONFIG_X86_PAE + printk("*pdpt = %016Lx ", page); + if ((page >> PAGE_SHIFT) < max_low_pfn + && page & _PAGE_PRESENT) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) + & (PTRS_PER_PMD - 1)]; + printk(KERN_CONT "*pde = %016Lx ", page); + page &= ~_PAGE_NX; + } +#else + printk("*pde = %08lx ", page); +#endif + + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And let's rather not kmap-atomic the pte, just in case + * it's allocated already. + */ + if ((page >> PAGE_SHIFT) < max_low_pfn + && (page & _PAGE_PRESENT) + && !(page & _PAGE_PSE)) { + page &= PAGE_MASK; + page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) + & (PTRS_PER_PTE - 1)]; + printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); + } + + printk("\n"); +#else /* CONFIG_X86_64 */ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = (pgd_t *)read_cr3(); + + pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); + pgd += pgd_index(address); + if (bad_address(pgd)) goto bad; + printk("PGD %lx ", pgd_val(*pgd)); + if (!pgd_present(*pgd)) goto ret; + + pud = pud_offset(pgd, address); + if (bad_address(pud)) goto bad; + printk("PUD %lx ", pud_val(*pud)); + if (!pud_present(*pud)) goto ret; + + pmd = pmd_offset(pud, address); + if (bad_address(pmd)) goto bad; + printk("PMD %lx ", pmd_val(*pmd)); + if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; + + pte = pte_offset_kernel(pmd, address); + if (bad_address(pte)) goto bad; + printk("PTE %lx", pte_val(*pte)); +ret: + printk("\n"); + return; +bad: + printk("BAD\n"); +#endif +} + +#ifdef CONFIG_X86_32 +static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) +{ + unsigned index = pgd_index(address); + pgd_t *pgd_k; + pud_t *pud, *pud_k; + pmd_t *pmd, *pmd_k; + + pgd += index; + pgd_k = init_mm.pgd + index; + + if (!pgd_present(*pgd_k)) + return NULL; + + /* + * set_pgd(pgd, *pgd_k); here would be useless on PAE + * and redundant with the set_pmd() on non-PAE. As would + * set_pud. + */ + + pud = pud_offset(pgd, address); + pud_k = pud_offset(pgd_k, address); + if (!pud_present(*pud_k)) + return NULL; + + pmd = pmd_offset(pud, address); + pmd_k = pmd_offset(pud_k, address); + if (!pmd_present(*pmd_k)) + return NULL; + if (!pmd_present(*pmd)) { + set_pmd(pmd, *pmd_k); + arch_flush_lazy_mmu_mode(); + } else + BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); + return pmd_k; +} +#endif + +#ifdef CONFIG_X86_64 +static const char errata93_warning[] = +KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" +KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" +KERN_ERR "******* Please consider a BIOS update.\n" +KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; +#endif + +/* Workaround for K8 erratum #93 & buggy BIOS. + BIOS SMM functions are required to use a specific workaround + to avoid corruption of the 64bit RIP register on C stepping K8. + A lot of BIOS that didn't get tested properly miss this. + The OS sees this as a page fault with the upper 32bits of RIP cleared. + Try to work around it here. + Note we only handle faults in kernel here. + Does nothing for X86_32 + */ +static int is_errata93(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + static int warned; + if (address != regs->ip) + return 0; + if ((address >> 32) != 0) + return 0; + address |= 0xffffffffUL << 32; + if ((address >= (u64)_stext && address <= (u64)_etext) || + (address >= MODULES_VADDR && address <= MODULES_END)) { + if (!warned) { + printk(errata93_warning); + warned = 1; + } + regs->ip = address; + return 1; + } +#endif + return 0; +} + +/* + * Work around K8 erratum #100 K8 in compat mode occasionally jumps to illegal + * addresses >4GB. We catch this in the page fault handler because these + * addresses are not reachable. Just detect this case and return. Any code + * segment in LDT is compatibility mode. + */ +static int is_errata100(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_64 + if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && + (address >> 32)) + return 1; +#endif + return 0; +} + +void do_invalid_op(struct pt_regs *, unsigned long); + +static int is_f00f_bug(struct pt_regs *regs, unsigned long address) +{ +#ifdef CONFIG_X86_F00F_BUG + unsigned long nr; + /* + * Pentium F0 0F C7 C8 bug workaround. + */ + if (boot_cpu_data.f00f_bug) { + nr = (address - idt_descr.address) >> 3; + + if (nr == 6) { + do_invalid_op(regs, 0); + return 1; + } + } +#endif + return 0; +} + +static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_X86_32 + if (!oops_may_print()) + return; +#endif + +#ifdef CONFIG_X86_PAE + if (error_code & PF_INSTR) { + int level; + pte_t *pte = lookup_address(address, &level); + + if (pte && pte_present(*pte) && !pte_exec(*pte)) + printk(KERN_CRIT "kernel tried to execute " + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current->uid); + } +#endif + + printk(KERN_ALERT "BUG: unable to handle kernel "); + if (address < PAGE_SIZE) + printk(KERN_CONT "NULL pointer dereference"); + else + printk(KERN_CONT "paging request"); +#ifdef CONFIG_X86_32 + printk(KERN_CONT " at %08lx\n", address); +#else + printk(KERN_CONT " at %016lx\n", address); +#endif + printk(KERN_ALERT "IP:"); + printk_address(regs->ip, 1); + dump_pagetable(address); +} + +#ifdef CONFIG_X86_64 +static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, + unsigned long error_code) +{ + unsigned long flags = oops_begin(); + struct task_struct *tsk; + + printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", + current->comm, address); + dump_pagetable(address); + tsk = current; + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + if (__die("Bad pagetable", regs, error_code)) + regs = NULL; + oops_end(flags, regs, SIGKILL); +} +#endif + +/* + * Handle a spurious fault caused by a stale TLB entry. This allows + * us to lazily refresh the TLB when increasing the permissions of a + * kernel page (RO -> RW or NX -> X). Doing it eagerly is very + * expensive since that implies doing a full cross-processor TLB + * flush, even if no stale TLB entries exist on other processors. + * There are no security implications to leaving a stale TLB when + * increasing the permissions on a page. + */ +static int spurious_fault(unsigned long address, + unsigned long error_code) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* Reserved-bit violation or user access to kernel space? */ + if (error_code & (PF_USER | PF_RSVD)) + return 0; + + pgd = init_mm.pgd + pgd_index(address); + if (!pgd_present(*pgd)) + return 0; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return 0; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return 0; + + pte = pte_offset_kernel(pmd, address); + if (!pte_present(*pte)) + return 0; + + if ((error_code & PF_WRITE) && !pte_write(*pte)) + return 0; + if ((error_code & PF_INSTR) && !pte_exec(*pte)) + return 0; + + return 1; +} + +/* + * X86_32 + * Handle a fault on the vmalloc or module mapping area + * + * X86_64 + * Handle a fault on the vmalloc area + * + * This assumes no large pages in there. + */ +static int vmalloc_fault(unsigned long address) +{ +#ifdef CONFIG_X86_32 + unsigned long pgd_paddr; + pmd_t *pmd_k; + pte_t *pte_k; + /* + * Synchronize this task's top level page-table + * with the 'reference' page table. + * + * Do _not_ use "current" here. We might be inside + * an interrupt in the middle of a task switch.. + */ + pgd_paddr = read_cr3(); + pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); + if (!pmd_k) + return -1; + pte_k = pte_offset_kernel(pmd_k, address); + if (!pte_present(*pte_k)) + return -1; + return 0; +#else + pgd_t *pgd, *pgd_ref; + pud_t *pud, *pud_ref; + pmd_t *pmd, *pmd_ref; + pte_t *pte, *pte_ref; + + /* Copy kernel mappings over when needed. This can also + happen within a race in page table update. In the later + case just flush. */ + + pgd = pgd_offset(current->mm ?: &init_mm, address); + pgd_ref = pgd_offset_k(address); + if (pgd_none(*pgd_ref)) + return -1; + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + /* Below here mismatches are bugs because these lower tables + are shared */ + + pud = pud_offset(pgd, address); + pud_ref = pud_offset(pgd_ref, address); + if (pud_none(*pud_ref)) + return -1; + if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) + BUG(); + pmd = pmd_offset(pud, address); + pmd_ref = pmd_offset(pud_ref, address); + if (pmd_none(*pmd_ref)) + return -1; + if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) + BUG(); + pte_ref = pte_offset_kernel(pmd_ref, address); + if (!pte_present(*pte_ref)) + return -1; + pte = pte_offset_kernel(pmd, address); + /* Don't use pte_page here, because the mappings can point + outside mem_map, and the NUMA hash lookup cannot handle + that. */ + if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) + BUG(); + return 0; +#endif +} + +int show_unhandled_signals = 1; + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +#ifdef CONFIG_X86_64 +asmlinkage +#endif +void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long address; + int write, si_code; + int fault; +#ifdef CONFIG_X86_64 + unsigned long flags; +#endif + + /* + * We can fault from pretty much anywhere, with unknown IRQ state. + */ + trace_hardirqs_fixup(); + + tsk = current; + mm = tsk->mm; + prefetchw(&mm->mmap_sem); + + /* get the address */ + address = read_cr2(); + + si_code = SEGV_MAPERR; + + if (notify_page_fault(regs)) + return; + + /* + * We fault-in kernel-space virtual memory on-demand. The + * 'reference' page table is init_mm.pgd. + * + * NOTE! We MUST NOT take any locks for this case. We may + * be in an interrupt or a critical region, and should + * only copy the information from the master page table, + * nothing more. + * + * This verifies that the fault happens in kernel space + * (error_code & 4) == 0, and that the fault was not a + * protection error (error_code & 9) == 0. + */ +#ifdef CONFIG_X86_32 + if (unlikely(address >= TASK_SIZE)) { + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + vmalloc_fault(address) >= 0) + return; + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + + /* It's safe to allow irq's after cr2 has been saved and the vmalloc + fault has been handled. */ + if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) + local_irq_enable(); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (in_atomic() || !mm) + goto bad_area_nosemaphore; +#else /* CONFIG_X86_64 */ + if (unlikely(address >= TASK_SIZE64)) { + /* + * Don't check for the module range here: its PML4 + * is always initialized because it's shared with the main + * kernel text. Only vmalloc may need PML4 syncups. + */ + if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && + ((address >= VMALLOC_START && address < VMALLOC_END))) { + if (vmalloc_fault(address) >= 0) + return; + } + + /* Can handle a stale RO->RW TLB */ + if (spurious_fault(address, error_code)) + return; + + /* + * Don't take the mm semaphore here. If we fixup a prefetch + * fault we could otherwise deadlock. + */ + goto bad_area_nosemaphore; + } + if (likely(regs->flags & X86_EFLAGS_IF)) + local_irq_enable(); + + if (unlikely(error_code & PF_RSVD)) + pgtable_bad(address, regs, error_code); + + /* + * If we're in an interrupt, have no user context or are running in an + * atomic region then we must not take the fault. + */ + if (unlikely(in_atomic() || !mm)) + goto bad_area_nosemaphore; + + /* + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. + */ + if (user_mode_vm(regs)) + error_code |= PF_USER; +again: +#endif + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an + * erroneous fault occurring in a code path which already holds mmap_sem + * we will deadlock attempting to validate the fault against the + * address space. Luckily the kernel only validly references user + * space from well defined areas of code, which are listed in the + * exceptions table. + * + * As the vast majority of faults will be valid we will only perform + * the source reference check when there is a possibility of a deadlock. + * Attempt to lock the address space, if we cannot we then validate the + * source. If this is invalid we can skip the address space check, + * thus avoiding the deadlock. + */ + if (!down_read_trylock(&mm->mmap_sem)) { + if ((error_code & PF_USER) == 0 && + !search_exception_tables(regs->ip)) + goto bad_area_nosemaphore; + down_read(&mm->mmap_sem); + } + + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (error_code & PF_USER) { + /* + * Accessing the stack below %sp is always a bug. + * The large cushion allows instructions like enter + * and pusha to work. ("enter $65535,$31" pushes + * 32 pointers and then decrements %sp by 65535.) + */ + if (address + 65536 + 32 * sizeof(unsigned long) < regs->sp) + goto bad_area; + } + if (expand_stack(vma, address)) + goto bad_area; +/* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ +good_area: + si_code = SEGV_ACCERR; + write = 0; + switch (error_code & (PF_PROT|PF_WRITE)) { + default: /* 3: write, present */ + /* fall through */ + case PF_WRITE: /* write, not present */ + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + write++; + break; + case PF_PROT: /* read, present */ + goto bad_area; + case 0: /* read, not present */ + if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) + goto bad_area; + } + +#ifdef CONFIG_X86_32 +survive: +#endif + /* + * If for any reason at all we couldn't handle the fault, + * make sure we exit gracefully rather than endlessly redo + * the fault. + */ + fault = handle_mm_fault(mm, vma, address, write); + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + if (fault & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + +#ifdef CONFIG_X86_32 + /* + * Did it hit the DOS screen memory VA from vm86 mode? + */ + if (v8086_mode(regs)) { + unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; + if (bit < 32) + tsk->thread.screen_bitmap |= 1 << bit; + } +#endif + up_read(&mm->mmap_sem); + return; + +/* + * Something tried to access memory that isn't in our memory map.. + * Fix it, but check if it's kernel or user first.. + */ +bad_area: + up_read(&mm->mmap_sem); + +bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { + /* + * It's possible to have interrupts off here. + */ + local_irq_enable(); + + /* + * Valid to do another page fault here because this one came + * from user space. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata100(regs, address)) + return; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk( +#ifdef CONFIG_X86_32 + "%s%s[%d]: segfault at %lx ip %08lx sp %08lx error %lx", +#else + "%s%s[%d]: segfault at %lx ip %lx sp %lx error %lx", +#endif + task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, + tsk->comm, task_pid_nr(tsk), address, regs->ip, + regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + tsk->thread.cr2 = address; + /* Kernel addresses are always protection faults */ + tsk->thread.error_code = error_code | (address >= TASK_SIZE); + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGSEGV, si_code, address, tsk); + return; + } + + if (is_f00f_bug(regs, address)) + return; + +no_context: + /* Are we prepared to handle this kernel fault? */ + if (fixup_exception(regs)) + return; + + /* + * X86_32 + * Valid to do another page fault here, because if this fault + * had been triggered by is_prefetch fixup_exception would have + * handled it. + * + * X86_64 + * Hall of shame of CPU/BIOS bugs. + */ + if (is_prefetch(regs, address, error_code)) + return; + + if (is_errata93(regs, address)) + return; + +/* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. + */ +#ifdef CONFIG_X86_32 + bust_spinlocks(1); +#else + flags = oops_begin(); +#endif + + show_fault_oops(regs, error_code, address); + + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; + +#ifdef CONFIG_X86_32 + die("Oops", regs, error_code); + bust_spinlocks(0); + do_exit(SIGKILL); +#else + if (__die("Oops", regs, error_code)) + regs = NULL; + /* Executive summary in case the body of the oops scrolled away */ + printk(KERN_EMERG "CR2: %016lx\n", address); + oops_end(flags, regs, SIGKILL); +#endif + +/* + * We ran out of memory, or some other thing happened to us that made + * us unable to handle the page fault gracefully. + */ +out_of_memory: + up_read(&mm->mmap_sem); + if (is_global_init(tsk)) { + yield(); +#ifdef CONFIG_X86_32 + down_read(&mm->mmap_sem); + goto survive; +#else + goto again; +#endif + } + + printk("VM: killing process %s\n", tsk->comm); + if (error_code & PF_USER) + do_group_exit(SIGKILL); + goto no_context; + +do_sigbus: + up_read(&mm->mmap_sem); + + /* Kernel mode? Handle exceptions or die */ + if (!(error_code & PF_USER)) + goto no_context; +#ifdef CONFIG_X86_32 + /* User space => ok to do another page fault */ + if (is_prefetch(regs, address, error_code)) + return; +#endif + tsk->thread.cr2 = address; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 14; + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); +} + +DEFINE_SPINLOCK(pgd_lock); +LIST_HEAD(pgd_list); + +void vmalloc_sync_all(void) +{ +#ifdef CONFIG_X86_32 + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = TASK_SIZE; + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); + for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + unsigned long flags; + struct page *page; + + spin_lock_irqsave(&pgd_lock, flags); + list_for_each_entry(page, &pgd_list, lru) { + if (!vmalloc_sync_one(page_address(page), + address)) + break; + } + spin_unlock_irqrestore(&pgd_lock, flags); + if (!page) + set_bit(pgd_index(address), insync); + } + if (address == start && test_bit(pgd_index(address), insync)) + start = address + PGDIR_SIZE; + } +#else /* CONFIG_X86_64 */ + /* + * Note that races in the updates of insync and start aren't + * problematic: insync can only get set bits added, and updates to + * start are only improving performance (without affecting correctness + * if undone). + */ + static DECLARE_BITMAP(insync, PTRS_PER_PGD); + static unsigned long start = VMALLOC_START & PGDIR_MASK; + unsigned long address; + + for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + if (!test_bit(pgd_index(address), insync)) { + const pgd_t *pgd_ref = pgd_offset_k(address); + struct page *page; + + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + else + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + } + spin_unlock(&pgd_lock); + set_bit(pgd_index(address), insync); + } + if (address == start) + start = address + PGDIR_SIZE; + } + /* Check that there is no need to do the same for the modules area. */ + BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); + BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == + (__START_KERNEL & PGDIR_MASK))); +#endif +} diff --git a/arch/x86/mm/fault_32.c b/arch/x86/mm/fault_32.c deleted file mode 100644 index a2273d44aa2..00000000000 --- a/arch/x86/mm/fault_32.c +++ /dev/null @@ -1,659 +0,0 @@ -/* - * linux/arch/i386/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/highmem.h> -#include <linux/bootmem.h> /* for max_low_pfn */ -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/desc.h> -#include <asm/segment.h> - -extern void die(const char *,struct pt_regs *,long); - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode_vm(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* - * Return EIP plus the CS segment base. The segment limit is also - * adjusted, clamped to the kernel/user address space (whichever is - * appropriate), and returned in *eip_limit. - * - * The segment is checked, because it might have been changed by another - * task between the original faulting instruction and here. - * - * If CS is no longer a valid code segment, or if EIP is beyond the - * limit, or if it is a kernel address when CS is not a kernel segment, - * then the returned value will be greater than *eip_limit. - * - * This is slow, but is very rarely executed. - */ -static inline unsigned long get_segment_eip(struct pt_regs *regs, - unsigned long *eip_limit) -{ - unsigned long eip = regs->eip; - unsigned seg = regs->xcs & 0xffff; - u32 seg_ar, seg_limit, base, *desc; - - /* Unlikely, but must come before segment checks. */ - if (unlikely(regs->eflags & VM_MASK)) { - base = seg << 4; - *eip_limit = base + 0xffff; - return base + (eip & 0xffff); - } - - /* The standard kernel/user address space limit. */ - *eip_limit = user_mode(regs) ? USER_DS.seg : KERNEL_DS.seg; - - /* By far the most common cases. */ - if (likely(SEGMENT_IS_FLAT_CODE(seg))) - return eip; - - /* Check the segment exists, is within the current LDT/GDT size, - that kernel/user (ring 0..3) has the appropriate privilege, - that it's a code segment, and get the limit. */ - __asm__ ("larl %3,%0; lsll %3,%1" - : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg)); - if ((~seg_ar & 0x9800) || eip > seg_limit) { - *eip_limit = 0; - return 1; /* So that returned eip > *eip_limit. */ - } - - /* Get the GDT/LDT descriptor base. - When you look for races in this code remember that - LDT and other horrors are only used in user space. */ - if (seg & (1<<2)) { - /* Must lock the LDT while reading it. */ - mutex_lock(¤t->mm->context.lock); - desc = current->mm->context.ldt; - desc = (void *)desc + (seg & ~7); - } else { - /* Must disable preemption while reading the GDT. */ - desc = (u32 *)get_cpu_gdt_table(get_cpu()); - desc = (void *)desc + (seg & ~7); - } - - /* Decode the code segment base from the descriptor */ - base = get_desc_base((unsigned long *)desc); - - if (seg & (1<<2)) { - mutex_unlock(¤t->mm->context.lock); - } else - put_cpu(); - - /* Adjust EIP and segment limit, and clamp at the kernel limit. - It's legitimate for segments to wrap at 0xffffffff. */ - seg_limit += base; - if (seg_limit < *eip_limit && seg_limit >= base) - *eip_limit = seg_limit; - return eip + base; -} - -/* - * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch. - * Check that here and ignore it. - */ -static int __is_prefetch(struct pt_regs *regs, unsigned long addr) -{ - unsigned long limit; - unsigned char *instr = (unsigned char *)get_segment_eip (regs, &limit); - int scan_more = 1; - int prefetch = 0; - int i; - - for (i = 0; scan_more && i < 15; i++) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (instr > (unsigned char *)limit) - break; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static inline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 6)) { - /* Catch an obscure case of prefetch inside an NX page. */ - if (nx_enabled && (error_code & 16)) - return 0; - return __is_prefetch(regs, addr); - } - return 0; -} - -static noinline void force_sig_info_fault(int si_signo, int si_code, - unsigned long address, struct task_struct *tsk) -{ - siginfo_t info; - - info.si_signo = si_signo; - info.si_errno = 0; - info.si_code = si_code; - info.si_addr = (void __user *)address; - force_sig_info(si_signo, &info, tsk); -} - -fastcall void do_invalid_op(struct pt_regs *, unsigned long); - -static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) -{ - unsigned index = pgd_index(address); - pgd_t *pgd_k; - pud_t *pud, *pud_k; - pmd_t *pmd, *pmd_k; - - pgd += index; - pgd_k = init_mm.pgd + index; - - if (!pgd_present(*pgd_k)) - return NULL; - - /* - * set_pgd(pgd, *pgd_k); here would be useless on PAE - * and redundant with the set_pmd() on non-PAE. As would - * set_pud. - */ - - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); - if (!pud_present(*pud_k)) - return NULL; - - pmd = pmd_offset(pud, address); - pmd_k = pmd_offset(pud_k, address); - if (!pmd_present(*pmd_k)) - return NULL; - if (!pmd_present(*pmd)) { - set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else - BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - return pmd_k; -} - -/* - * Handle a fault on the vmalloc or module mapping area - * - * This assumes no large pages in there. - */ -static inline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - * - * error_code: - * bit 0 == 0 means no page found, 1 means protection fault - * bit 1 == 0 means read, 1 means write - * bit 2 == 0 means kernel, 1 means user-mode - * bit 3 == 1 means use of reserved bit detected - * bit 4 == 1 means fault was an instruction fetch - */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - int write, si_code; - int fault; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - /* get the address */ - address = read_cr2(); - - tsk = current; - - si_code = SEGV_MAPERR; - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE)) { - if (!(error_code & 0x0000000d) && vmalloc_fault(address) >= 0) - return; - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->eflags & (X86_EFLAGS_IF|VM_MASK)) - local_irq_enable(); - - mm = tsk->mm; - - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault.. - */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; - - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & 4) == 0 && - !search_exception_tables(regs->eip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* - * Accessing the stack below %esp is always a bug. - * The large cushion allows instructions like enter - * and pusha to work. ("enter $65535,$31" pushes - * 32 pointers and then decrements %esp by 65535.) - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->esp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; - write = 0; - switch (error_code & 3) { - default: /* 3: write, present */ - /* fall through */ - case 2: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case 1: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - survive: - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - - /* - * Did it hit the DOS screen memory VA from vm86 mode? - */ - if (regs->eflags & VM_MASK) { - unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; - if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; - } - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & 4) { - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - /* - * Valid to do another page fault here because this one came - * from user space. - */ - if (is_prefetch(regs, address, error_code)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk("%s%s[%d]: segfault at %08lx eip %08lx " - "esp %08lx error %lx\n", - task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, task_pid_nr(tsk), address, regs->eip, - regs->esp, error_code); - } - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); - return; - } - -#ifdef CONFIG_X86_F00F_BUG - /* - * Pentium F0 0F C7 C8 bug workaround. - */ - if (boot_cpu_data.f00f_bug) { - unsigned long nr; - - nr = (address - idt_descr.address) >> 3; - - if (nr == 6) { - do_invalid_op(regs, 0); - return; - } - } -#endif - -no_context: - /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) - return; - - /* - * Valid to do another page fault here, because if this fault - * had been triggered by is_prefetch fixup_exception would have - * handled it. - */ - if (is_prefetch(regs, address, error_code)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - bust_spinlocks(1); - - if (oops_may_print()) { - __typeof__(pte_val(__pte(0))) page; - -#ifdef CONFIG_X86_PAE - if (error_code & 16) { - pte_t *pte = lookup_address(address); - - if (pte && pte_present(*pte) && !pte_exec_kernel(*pte)) - printk(KERN_CRIT "kernel tried to execute " - "NX-protected page - exploit attempt? " - "(uid: %d)\n", current->uid); - } -#endif - if (address < PAGE_SIZE) - printk(KERN_ALERT "BUG: unable to handle kernel NULL " - "pointer dereference"); - else - printk(KERN_ALERT "BUG: unable to handle kernel paging" - " request"); - printk(" at virtual address %08lx\n",address); - printk(KERN_ALERT "printing eip: %08lx ", regs->eip); - - page = read_cr3(); - page = ((__typeof__(page) *) __va(page))[address >> PGDIR_SHIFT]; -#ifdef CONFIG_X86_PAE - printk("*pdpt = %016Lx ", page); - if ((page >> PAGE_SHIFT) < max_low_pfn - && page & _PAGE_PRESENT) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PMD_SHIFT) - & (PTRS_PER_PMD - 1)]; - printk(KERN_CONT "*pde = %016Lx ", page); - page &= ~_PAGE_NX; - } -#else - printk("*pde = %08lx ", page); -#endif - - /* - * We must not directly access the pte in the highpte - * case if the page table is located in highmem. - * And let's rather not kmap-atomic the pte, just in case - * it's allocated already. - */ - if ((page >> PAGE_SHIFT) < max_low_pfn - && (page & _PAGE_PRESENT) - && !(page & _PAGE_PSE)) { - page &= PAGE_MASK; - page = ((__typeof__(page) *) __va(page))[(address >> PAGE_SHIFT) - & (PTRS_PER_PTE - 1)]; - printk("*pte = %0*Lx ", sizeof(page)*2, (u64)page); - } - - printk("\n"); - } - - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - die("Oops", regs, error_code); - bust_spinlocks(0); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(tsk)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & 4)) - goto no_context; - - /* User space => ok to do another page fault */ - if (is_prefetch(regs, address, error_code)) - return; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); -} - -void vmalloc_sync_all(void) -{ - /* - * Note that races in the updates of insync and start aren't - * problematic: insync can only get set bits added, and updates to - * start are only improving performance (without affecting correctness - * if undone). - */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = TASK_SIZE; - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; - - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = - (struct page *)page->index) - if (!vmalloc_sync_one(page_address(page), - address)) { - BUG_ON(page != pgd_list); - break; - } - spin_unlock_irqrestore(&pgd_lock, flags); - if (!page) - set_bit(pgd_index(address), insync); - } - if (address == start && test_bit(pgd_index(address), insync)) - start = address + PGDIR_SIZE; - } -} diff --git a/arch/x86/mm/fault_64.c b/arch/x86/mm/fault_64.c deleted file mode 100644 index 0e26230669c..00000000000 --- a/arch/x86/mm/fault_64.c +++ /dev/null @@ -1,623 +0,0 @@ -/* - * linux/arch/x86-64/mm/fault.c - * - * Copyright (C) 1995 Linus Torvalds - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs. - */ - -#include <linux/signal.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/ptrace.h> -#include <linux/mman.h> -#include <linux/mm.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/init.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> /* For unblank_screen() */ -#include <linux/compiler.h> -#include <linux/vmalloc.h> -#include <linux/module.h> -#include <linux/kprobes.h> -#include <linux/uaccess.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> - -#include <asm/system.h> -#include <asm/pgalloc.h> -#include <asm/smp.h> -#include <asm/tlbflush.h> -#include <asm/proto.h> -#include <asm-generic/sections.h> - -/* Page fault error code bits */ -#define PF_PROT (1<<0) /* or no page found */ -#define PF_WRITE (1<<1) -#define PF_USER (1<<2) -#define PF_RSVD (1<<3) -#define PF_INSTR (1<<4) - -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs) -{ - int ret = 0; - - /* kprobe_running() needs smp_processor_id() */ - if (!user_mode(regs)) { - preempt_disable(); - if (kprobe_running() && kprobe_fault_handler(regs, 14)) - ret = 1; - preempt_enable(); - } - - return ret; -} -#else -static inline int notify_page_fault(struct pt_regs *regs) -{ - return 0; -} -#endif - -/* Sometimes the CPU reports invalid exceptions on prefetch. - Check that here and ignore. - Opcode checker based on code by Richard Brunner */ -static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, - unsigned long error_code) -{ - unsigned char *instr; - int scan_more = 1; - int prefetch = 0; - unsigned char *max_instr; - - /* If it was a exec fault ignore */ - if (error_code & PF_INSTR) - return 0; - - instr = (unsigned char __user *)convert_rip_to_linear(current, regs); - max_instr = instr + 15; - - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) - return 0; - - while (scan_more && instr < max_instr) { - unsigned char opcode; - unsigned char instr_hi; - unsigned char instr_lo; - - if (probe_kernel_address(instr, opcode)) - break; - - instr_hi = opcode & 0xf0; - instr_lo = opcode & 0x0f; - instr++; - - switch (instr_hi) { - case 0x20: - case 0x30: - /* Values 0x26,0x2E,0x36,0x3E are valid x86 - prefixes. In long mode, the CPU will signal - invalid opcode if some of these prefixes are - present so we will never get here anyway */ - scan_more = ((instr_lo & 7) == 0x6); - break; - - case 0x40: - /* In AMD64 long mode, 0x40 to 0x4F are valid REX prefixes - Need to figure out under what instruction mode the - instruction was issued ... */ - /* Could check the LDT for lm, but for now it's good - enough to assume that long mode only uses well known - segments or kernel. */ - scan_more = (!user_mode(regs)) || (regs->cs == __USER_CS); - break; - - case 0x60: - /* 0x64 thru 0x67 are valid prefixes in all modes. */ - scan_more = (instr_lo & 0xC) == 0x4; - break; - case 0xF0: - /* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */ - scan_more = !instr_lo || (instr_lo>>1) == 1; - break; - case 0x00: - /* Prefetch instruction is 0x0F0D or 0x0F18 */ - scan_more = 0; - if (probe_kernel_address(instr, opcode)) - break; - prefetch = (instr_lo == 0xF) && - (opcode == 0x0D || opcode == 0x18); - break; - default: - scan_more = 0; - break; - } - } - return prefetch; -} - -static int bad_address(void *p) -{ - unsigned long dummy; - return probe_kernel_address((unsigned long *)p, dummy); -} - -void dump_pagetable(unsigned long address) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = (pgd_t *)read_cr3(); - - pgd = __va((unsigned long)pgd & PHYSICAL_PAGE_MASK); - pgd += pgd_index(address); - if (bad_address(pgd)) goto bad; - printk("PGD %lx ", pgd_val(*pgd)); - if (!pgd_present(*pgd)) goto ret; - - pud = pud_offset(pgd, address); - if (bad_address(pud)) goto bad; - printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; - - pmd = pmd_offset(pud, address); - if (bad_address(pmd)) goto bad; - printk("PMD %lx ", pmd_val(*pmd)); - if (!pmd_present(*pmd) || pmd_large(*pmd)) goto ret; - - pte = pte_offset_kernel(pmd, address); - if (bad_address(pte)) goto bad; - printk("PTE %lx", pte_val(*pte)); -ret: - printk("\n"); - return; -bad: - printk("BAD\n"); -} - -static const char errata93_warning[] = -KERN_ERR "******* Your BIOS seems to not contain a fix for K8 errata #93\n" -KERN_ERR "******* Working around it, but it may cause SEGVs or burn power.\n" -KERN_ERR "******* Please consider a BIOS update.\n" -KERN_ERR "******* Disabling USB legacy in the BIOS may also help.\n"; - -/* Workaround for K8 erratum #93 & buggy BIOS. - BIOS SMM functions are required to use a specific workaround - to avoid corruption of the 64bit RIP register on C stepping K8. - A lot of BIOS that didn't get tested properly miss this. - The OS sees this as a page fault with the upper 32bits of RIP cleared. - Try to work around it here. - Note we only handle faults in kernel here. */ - -static int is_errata93(struct pt_regs *regs, unsigned long address) -{ - static int warned; - if (address != regs->rip) - return 0; - if ((address >> 32) != 0) - return 0; - address |= 0xffffffffUL << 32; - if ((address >= (u64)_stext && address <= (u64)_etext) || - (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!warned) { - printk(errata93_warning); - warned = 1; - } - regs->rip = address; - return 1; - } - return 0; -} - -static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs, - unsigned long error_code) -{ - unsigned long flags = oops_begin(); - struct task_struct *tsk; - - printk(KERN_ALERT "%s: Corrupted page table at address %lx\n", - current->comm, address); - dump_pagetable(address); - tsk = current; - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Bad pagetable", regs, error_code); - oops_end(flags); - do_exit(SIGKILL); -} - -/* - * Handle a fault on the vmalloc area - * - * This assumes no large pages in there. - */ -static int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_ref; - pud_t *pud, *pud_ref; - pmd_t *pmd, *pmd_ref; - pte_t *pte, *pte_ref; - - /* Copy kernel mappings over when needed. This can also - happen within a race in page table update. In the later - case just flush. */ - - pgd = pgd_offset(current->mm ?: &init_mm, address); - pgd_ref = pgd_offset_k(address); - if (pgd_none(*pgd_ref)) - return -1; - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - - /* Below here mismatches are bugs because these lower tables - are shared */ - - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); - if (pud_none(*pud_ref)) - return -1; - if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref)) - BUG(); - pmd = pmd_offset(pud, address); - pmd_ref = pmd_offset(pud_ref, address); - if (pmd_none(*pmd_ref)) - return -1; - if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref)) - BUG(); - pte_ref = pte_offset_kernel(pmd_ref, address); - if (!pte_present(*pte_ref)) - return -1; - pte = pte_offset_kernel(pmd, address); - /* Don't use pte_page here, because the mappings can point - outside mem_map, and the NUMA hash lookup cannot handle - that. */ - if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref)) - BUG(); - return 0; -} - -int show_unhandled_signals = 1; - -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) -{ - struct task_struct *tsk; - struct mm_struct *mm; - struct vm_area_struct * vma; - unsigned long address; - const struct exception_table_entry *fixup; - int write, fault; - unsigned long flags; - siginfo_t info; - - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - - tsk = current; - mm = tsk->mm; - prefetchw(&mm->mmap_sem); - - /* get the address */ - address = read_cr2(); - - info.si_code = SEGV_MAPERR; - - - /* - * We fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * This verifies that the fault happens in kernel space - * (error_code & 4) == 0, and that the fault was not a - * protection error (error_code & 9) == 0. - */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - if (notify_page_fault(regs)) - return; - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } - - if (notify_page_fault(regs)) - return; - - if (likely(regs->eflags & X86_EFLAGS_IF)) - local_irq_enable(); - - if (unlikely(error_code & PF_RSVD)) - pgtable_bad(address, regs, error_code); - - /* - * If we're in an interrupt or have no user - * context, we must not take the fault.. - */ - if (unlikely(in_atomic() || !mm)) - goto bad_area_nosemaphore; - - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; - - again: - /* When running in the kernel we expect faults to occur only to - * addresses in user space. All other faults represent errors in the - * kernel and should generate an OOPS. Unfortunately, in the case of an - * erroneous fault occurring in a code path which already holds mmap_sem - * we will deadlock attempting to validate the fault against the - * address space. Luckily the kernel only validly references user - * space from well defined areas of code, which are listed in the - * exceptions table. - * - * As the vast majority of faults will be valid we will only perform - * the source reference check when there is a possibility of a deadlock. - * Attempt to lock the address space, if we cannot we then validate the - * source. If this is invalid we can skip the address space check, - * thus avoiding the deadlock. - */ - if (!down_read_trylock(&mm->mmap_sem)) { - if ((error_code & PF_USER) == 0 && - !search_exception_tables(regs->rip)) - goto bad_area_nosemaphore; - down_read(&mm->mmap_sem); - } - - vma = find_vma(mm, address); - if (!vma) - goto bad_area; - if (likely(vma->vm_start <= address)) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (error_code & 4) { - /* Allow userspace just enough access below the stack pointer - * to let the 'enter' instruction work. - */ - if (address + 65536 + 32 * sizeof(unsigned long) < regs->rsp) - goto bad_area; - } - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - info.si_code = SEGV_ACCERR; - write = 0; - switch (error_code & (PF_PROT|PF_WRITE)) { - default: /* 3: write, present */ - /* fall through */ - case PF_WRITE: /* write, not present */ - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; - write++; - break; - case PF_PROT: /* read, present */ - goto bad_area; - case 0: /* read, not present */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } - - /* - * If for any reason at all we couldn't handle the fault, - * make sure we exit gracefully rather than endlessly redo - * the fault. - */ - fault = handle_mm_fault(mm, vma, address, write); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto do_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - up_read(&mm->mmap_sem); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: - up_read(&mm->mmap_sem); - -bad_area_nosemaphore: - /* User mode accesses just cause a SIGSEGV */ - if (error_code & PF_USER) { - - /* - * It's possible to have interrupts off here. - */ - local_irq_enable(); - - if (is_prefetch(regs, address, error_code)) - return; - - /* Work around K8 erratum #100 K8 in compat mode - occasionally jumps to illegal addresses >4GB. We - catch this here in the page fault handler because - these addresses are not reachable. Just detect this - case and return. Any code segment in LDT is - compatibility mode. */ - if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && - (address >> 32)) - return; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk( - "%s%s[%d]: segfault at %lx rip %lx rsp %lx error %lx\n", - tsk->pid > 1 ? KERN_INFO : KERN_EMERG, - tsk->comm, tsk->pid, address, regs->rip, - regs->rsp, error_code); - } - - tsk->thread.cr2 = address; - /* Kernel addresses are always protection faults */ - tsk->thread.error_code = error_code | (address >= TASK_SIZE); - tsk->thread.trap_no = 14; - info.si_signo = SIGSEGV; - info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void __user *)address; - force_sig_info(SIGSEGV, &info, tsk); - return; - } - -no_context: - - /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->rip); - if (fixup) { - regs->rip = fixup->fixup; - return; - } - - /* - * Hall of shame of CPU/BIOS bugs. - */ - - if (is_prefetch(regs, address, error_code)) - return; - - if (is_errata93(regs, address)) - return; - -/* - * Oops. The kernel tried to access some bad page. We'll have to - * terminate things with extreme prejudice. - */ - - flags = oops_begin(); - - if (address < PAGE_SIZE) - printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference"); - else - printk(KERN_ALERT "Unable to handle kernel paging request"); - printk(" at %016lx RIP: \n" KERN_ALERT,address); - printk_address(regs->rip); - dump_pagetable(address); - tsk->thread.cr2 = address; - tsk->thread.trap_no = 14; - tsk->thread.error_code = error_code; - __die("Oops", regs, error_code); - /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); - oops_end(flags); - do_exit(SIGKILL); - -/* - * We ran out of memory, or some other thing happened to us that made - * us unable to handle the page fault gracefully. - */ -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - goto again; - } - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_group_exit(SIGKILL); - goto no_context; - -do_sigbus: - up_read(&mm->mmap_sem); - - /* Kernel mode? Handle exceptions or die */ - if (!(error_code & PF_USER)) - goto no_context; - - tsk->thread.cr2 = address; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 14; - info.si_signo = SIGBUS; - info.si_errno = 0; - info.si_code = BUS_ADRERR; - info.si_addr = (void __user *)address; - force_sig_info(SIGBUS, &info, tsk); - return; -} - -DEFINE_SPINLOCK(pgd_lock); -LIST_HEAD(pgd_list); - -void vmalloc_sync_all(void) -{ - /* Note that races in the updates of insync and start aren't - problematic: - insync can only get set bits added, and updates to start are only - improving performance (without affecting correctness if undone). */ - static DECLARE_BITMAP(insync, PTRS_PER_PGD); - static unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { - if (!test_bit(pgd_index(address), insync)) { - const pgd_t *pgd_ref = pgd_offset_k(address); - struct page *page; - - if (pgd_none(*pgd_ref)) - continue; - spin_lock(&pgd_lock); - list_for_each_entry(page, &pgd_list, lru) { - pgd_t *pgd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - } - spin_unlock(&pgd_lock); - set_bit(pgd_index(address), insync); - } - if (address == start) - start = address + PGDIR_SIZE; - } - /* Check that there is no need to do the same for the modules area. */ - BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); - BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == - (__START_KERNEL & PGDIR_MASK))); -} diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 1c3bf95f735..3d936f23270 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -18,6 +18,49 @@ void kunmap(struct page *page) kunmap_high(page); } +static void debug_kmap_atomic_prot(enum km_type type) +{ +#ifdef CONFIG_DEBUG_HIGHMEM + static unsigned warn_count = 10; + + if (unlikely(warn_count == 0)) + return; + + if (unlikely(in_interrupt())) { + if (in_irq()) { + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } else if (!irqs_disabled()) { /* softirq */ + if (type != KM_IRQ0 && type != KM_IRQ1 && + type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && + type != KM_SKB_SUNRPC_DATA && + type != KM_SKB_DATA_SOFTIRQ && + type != KM_BOUNCE_READ) { + WARN_ON(1); + warn_count--; + } + } + } + + if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || + type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { + if (!irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { + if (irq_count() == 0 && !irqs_disabled()) { + WARN_ON(1); + warn_count--; + } + } +#endif +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -30,8 +73,10 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; - /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ + + debug_kmap_atomic_prot(type); + pagefault_disable(); if (!PageHighMem(page)) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 6c06d9c0488..4fbafb4bc2f 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -15,6 +15,7 @@ #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> static unsigned long page_table_shareable(struct vm_area_struct *svma, struct vm_area_struct *vma, @@ -88,7 +89,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_lock(&mm->page_table_lock); if (pud_none(*pud)) - pud_populate(mm, pud, (unsigned long) spte & PAGE_MASK); + pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3c76d194fd2..da524fb2242 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -27,7 +27,6 @@ #include <linux/bootmem.h> #include <linux/slab.h> #include <linux/proc_fs.h> -#include <linux/efi.h> #include <linux/memory_hotplug.h> #include <linux/initrd.h> #include <linux/cpumask.h> @@ -40,8 +39,10 @@ #include <asm/fixmap.h> #include <asm/e820.h> #include <asm/apic.h> +#include <asm/bugs.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgalloc.h> #include <asm/sections.h> #include <asm/paravirt.h> @@ -50,7 +51,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; -static int noinline do_test_wp_bit(void); +static noinline int do_test_wp_bit(void); /* * Creates a middle page table and puts a pointer to it in the @@ -61,26 +62,26 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) { pud_t *pud; pmd_t *pmd_table; - + #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); - if (pmd_table != pmd_offset(pud, 0)) - BUG(); + BUG_ON(pmd_table != pmd_offset(pud, 0)); } #endif pud = pud_offset(pgd, 0); pmd_table = pmd_offset(pud, 0); + return pmd_table; } /* * Create a page table and place a pointer to it in a middle page - * directory entry. + * directory entry: */ static pte_t * __init one_page_table_init(pmd_t *pmd) { @@ -90,9 +91,10 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #ifdef CONFIG_DEBUG_PAGEALLOC page_table = (pte_t *) alloc_bootmem_pages(PAGE_SIZE); #endif - if (!page_table) + if (!page_table) { page_table = (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + } paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); @@ -103,22 +105,21 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) } /* - * This function initializes a certain range of kernel virtual memory + * This function initializes a certain range of kernel virtual memory * with new bootmem page tables, everywhere page tables are missing in * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without + * + * NOTE: The pagetables are allocated contiguous on the physical space + * so we can cache the place of the first one and move around without * checking the pgd every time. */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) +static void __init +page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base) { - pgd_t *pgd; - pmd_t *pmd; int pgd_idx, pmd_idx; unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; vaddr = start; pgd_idx = pgd_index(vaddr); @@ -128,7 +129,8 @@ static void __init page_table_range_init (unsigned long start, unsigned long end for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { pmd = one_md_table_init(pgd); pmd = pmd + pmd_index(vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); + pmd++, pmd_idx++) { one_page_table_init(pmd); vaddr += PMD_SIZE; @@ -145,17 +147,17 @@ static inline int is_kernel_text(unsigned long addr) } /* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. + * This maps the physical memory to kernel virtual address space, a total + * of max_low_pfn pages, by creating page tables starting from address + * PAGE_OFFSET: */ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) { + int pgd_idx, pmd_idx, pte_ofs; unsigned long pfn; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; pgd_idx = pgd_index(PAGE_OFFSET); pgd = pgd_base + pgd_idx; @@ -165,29 +167,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base) pmd = one_md_table_init(pgd); if (pfn >= max_low_pfn) continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET; - /* Map with big pages if possible, otherwise create normal page tables. */ + for (pmd_idx = 0; + pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; + pmd++, pmd_idx++) { + unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET; + + /* + * Map with big pages if possible, otherwise + * create normal page tables: + */ if (cpu_has_pse) { - unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; - if (is_kernel_text(address) || is_kernel_text(address2)) - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); - else - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); + unsigned int addr2; + pgprot_t prot = PAGE_KERNEL_LARGE; + + addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + + PAGE_OFFSET + PAGE_SIZE-1; + + if (is_kernel_text(addr) || + is_kernel_text(addr2)) + prot = PAGE_KERNEL_LARGE_EXEC; + + set_pmd(pmd, pfn_pmd(pfn, prot)); pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; - pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; - pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { - if (is_kernel_text(address)) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); - else - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } + continue; + } + pte = one_page_table_init(pmd); + + for (pte_ofs = 0; + pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; + pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { + pgprot_t prot = PAGE_KERNEL; + + if (is_kernel_text(addr)) + prot = PAGE_KERNEL_EXEC; + + set_pte(pte, pfn_pte(pfn, prot)); } } } @@ -200,57 +216,23 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } -int page_is_ram(unsigned long pagenr) -{ - int i; - unsigned long addr, end; - - if (efi_enabled) { - efi_memory_desc_t *md; - void *p; - - for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { - md = p; - if (!is_available_memory(md)) - continue; - addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT; - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; - } - - for (i = 0; i < e820.nr_map; i++) { - - if (e820.map[i].type != E820_RAM) /* not usable memory */ - continue; - /* - * !!!FIXME!!! Some BIOSen report areas as RAM that - * are not. Notably the 640->1Mb area. We need a sanity - * check here. - */ - addr = (e820.map[i].addr+PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr+e820.map[i].size) >> PAGE_SHIFT; - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; -#define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), (vaddr)), (vaddr)) +static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) +{ + return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), + vaddr), vaddr), vaddr); +} static void __init kmap_init(void) { unsigned long kmap_vstart; - /* cache the first kmap pte */ + /* + * Cache the first kmap pte: + */ kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); kmap_pte = kmap_get_fixmap_pte(kmap_vstart); @@ -259,11 +241,11 @@ static void __init kmap_init(void) static void __init permanent_kmaps_init(pgd_t *pgd_base) { + unsigned long vaddr; pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - unsigned long vaddr; vaddr = PKMAP_BASE; page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); @@ -272,7 +254,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pud = pud_offset(pgd, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + pkmap_page_table = pte; } static void __meminit free_new_highpage(struct page *page) @@ -291,7 +273,8 @@ void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) SetPageReserved(page); } -static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long pfn) +static int __meminit +add_one_highpage_hotplug(struct page *page, unsigned long pfn) { free_new_highpage(page); totalram_pages++; @@ -299,6 +282,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p max_mapnr = max(pfn, max_mapnr); #endif num_physpages++; + return 0; } @@ -306,7 +290,7 @@ static int __meminit add_one_highpage_hotplug(struct page *page, unsigned long p * Not currently handling the NUMA case. * Assuming single node and all memory that * has been added dynamically that would be - * onlined here is in HIGHMEM + * onlined here is in HIGHMEM. */ void __meminit online_page(struct page *page) { @@ -314,13 +298,11 @@ void __meminit online_page(struct page *page) add_one_highpage_hotplug(page, page_to_pfn(page)); } - -#ifdef CONFIG_NUMA -extern void set_highmem_pages_init(int); -#else +#ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { int pfn; + for (pfn = highstart_pfn; pfn < highend_pfn; pfn++) { /* * Holes under sparsemem might not have no mem_map[]: @@ -330,23 +312,18 @@ static void __init set_highmem_pages_init(int bad_ppro) } totalram_pages += totalhigh_pages; } -#endif /* CONFIG_FLATMEM */ +#endif /* !CONFIG_NUMA */ #else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) +# define kmap_init() do { } while (0) +# define permanent_kmaps_init(pgd_base) do { } while (0) +# define set_highmem_pages_init(bad_ppro) do { } while (0) #endif /* CONFIG_HIGHMEM */ -unsigned long long __PAGE_KERNEL = _PAGE_KERNEL; +pteval_t __PAGE_KERNEL = _PAGE_KERNEL; EXPORT_SYMBOL(__PAGE_KERNEL); -unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; -#ifdef CONFIG_NUMA -extern void __init remap_numa_kva(void); -#else -#define remap_numa_kva() do {} while (0) -#endif +pteval_t __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC; void __init native_pagetable_setup_start(pgd_t *base) { @@ -372,7 +349,7 @@ void __init native_pagetable_setup_start(pgd_t *base) memset(&base[USER_PTRS_PER_PGD], 0, KERNEL_PGD_PTRS * sizeof(pgd_t)); #else - paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT); + paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); #endif } @@ -410,10 +387,10 @@ void __init native_pagetable_setup_done(pgd_t *base) * be partially populated, and so it avoids stomping on any existing * mappings. */ -static void __init pagetable_init (void) +static void __init pagetable_init(void) { - unsigned long vaddr, end; pgd_t *pgd_base = swapper_pg_dir; + unsigned long vaddr, end; paravirt_pagetable_setup_start(pgd_base); @@ -435,9 +412,11 @@ static void __init pagetable_init (void) * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ + early_ioremap_clear(); vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; page_table_range_init(vaddr, end, pgd_base); + early_ioremap_reset(); permanent_kmaps_init(pgd_base); @@ -450,7 +429,7 @@ static void __init pagetable_init (void) * driver might have split up a kernel 4MB mapping. */ char __nosavedata swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned (PAGE_SIZE))); + __attribute__ ((aligned(PAGE_SIZE))); static inline void save_pg_dir(void) { @@ -462,7 +441,7 @@ static inline void save_pg_dir(void) } #endif -void zap_low_mappings (void) +void zap_low_mappings(void) { int i; @@ -474,22 +453,24 @@ void zap_low_mappings (void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) + for (i = 0; i < USER_PTRS_PER_PGD; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else set_pgd(swapper_pg_dir+i, __pgd(0)); #endif + } flush_tlb_all(); } -int nx_enabled = 0; +int nx_enabled; + +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_NX; +EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE -static int disable_nx __initdata = 0; -u64 __supported_pte_mask __read_mostly = ~_PAGE_NX; -EXPORT_SYMBOL_GPL(__supported_pte_mask); +static int disable_nx __initdata; /* * noexec = on|off @@ -506,11 +487,14 @@ static int __init noexec_setup(char *str) __supported_pte_mask |= _PAGE_NX; disable_nx = 0; } - } else if (!strcmp(str,"off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else - return -EINVAL; + } else { + if (!strcmp(str, "off")) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } else { + return -EINVAL; + } + } return 0; } @@ -522,6 +506,7 @@ static void __init set_nx(void) if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + if ((v[3] & (1 << 20)) && !disable_nx) { rdmsr(MSR_EFER, l, h); l |= EFER_NX; @@ -531,35 +516,6 @@ static void __init set_nx(void) } } } - -/* - * Enables/disables executability of a given kernel page and - * returns the previous setting. - */ -int __init set_kernel_exec(unsigned long vaddr, int enable) -{ - pte_t *pte; - int ret = 1; - - if (!nx_enabled) - goto out; - - pte = lookup_address(vaddr); - BUG_ON(!pte); - - if (!pte_exec_kernel(*pte)) - ret = 0; - - if (enable) - pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32)); - else - pte->pte_high |= 1 << (_PAGE_BIT_NX - 32); - pte_update_defer(&init_mm, vaddr, pte); - __flush_tlb_all(); -out: - return ret; -} - #endif /* @@ -574,9 +530,8 @@ void __init paging_init(void) #ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) - printk("NX (Execute Disable) protection: active\n"); + printk(KERN_INFO "NX (Execute Disable) protection: active\n"); #endif - pagetable_init(); load_cr3(swapper_pg_dir); @@ -600,10 +555,10 @@ void __init paging_init(void) * used to involve black magic jumps to work around some nasty CPU bugs, * but fortunately the switch to using exceptions got rid of all that. */ - static void __init test_wp_bit(void) { - printk("Checking if this processor honours the WP bit even in supervisor mode... "); + printk(KERN_INFO + "Checking if this processor honours the WP bit even in supervisor mode..."); /* Any page-aligned address will do, the test is non-destructive */ __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); @@ -611,47 +566,46 @@ static void __init test_wp_bit(void) clear_fixmap(FIX_WP_TEST); if (!boot_cpu_data.wp_works_ok) { - printk("No.\n"); + printk(KERN_CONT "No.\n"); #ifdef CONFIG_X86_WP_WORKS_OK - panic("This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); + panic( + "This kernel doesn't support CPU's with broken WP. Recompile it for a 386!"); #endif } else { - printk("Ok.\n"); + printk(KERN_CONT "Ok.\n"); } } -static struct kcore_list kcore_mem, kcore_vmalloc; +static struct kcore_list kcore_mem, kcore_vmalloc; void __init mem_init(void) { - extern int ppro_with_ram_bug(void); int codesize, reservedpages, datasize, initsize; - int tmp; - int bad_ppro; + int tmp, bad_ppro; #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - bad_ppro = ppro_with_ram_bug(); #ifdef CONFIG_HIGHMEM /* check that fixmap and pkmap do not overlap */ - if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { - printk(KERN_ERR "fixmap and kmap areas overlap - this will crash\n"); + if (PKMAP_BASE + LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) { + printk(KERN_ERR + "fixmap and kmap areas overlap - this will crash\n"); printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n", - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, FIXADDR_START); + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP*PAGE_SIZE, + FIXADDR_START); BUG(); } #endif - /* this will put all low memory onto the freelists */ totalram_pages += free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) /* - * Only count reserved RAM pages + * Only count reserved RAM pages: */ if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp))) reservedpages++; @@ -662,11 +616,12 @@ void __init mem_init(void) datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); - printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk reserved, %dk data, %dk init, %ldk highmem)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, " + "%dk reserved, %dk data, %dk init, %ldk highmem)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), num_physpages << (PAGE_SHIFT-10), codesize >> 10, @@ -677,45 +632,46 @@ void __init mem_init(void) ); #if 1 /* double-sanity-check paranoia */ - printk("virtual kernel memory layout:\n" - " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + printk(KERN_INFO "virtual kernel memory layout:\n" + " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif - " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" - " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" - " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" - " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", - FIXADDR_START, FIXADDR_TOP, - (FIXADDR_TOP - FIXADDR_START) >> 10, + " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" + " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n" + " .init : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .data : 0x%08lx - 0x%08lx (%4ld kB)\n" + " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", + FIXADDR_START, FIXADDR_TOP, + (FIXADDR_TOP - FIXADDR_START) >> 10, #ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, + PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, #endif - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, - (unsigned long)__va(0), (unsigned long)high_memory, - ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, + (unsigned long)__va(0), (unsigned long)high_memory, + ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20, - (unsigned long)&__init_begin, (unsigned long)&__init_end, - ((unsigned long)&__init_end - (unsigned long)&__init_begin) >> 10, + (unsigned long)&__init_begin, (unsigned long)&__init_end, + ((unsigned long)&__init_end - + (unsigned long)&__init_begin) >> 10, - (unsigned long)&_etext, (unsigned long)&_edata, - ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, + (unsigned long)&_etext, (unsigned long)&_edata, + ((unsigned long)&_edata - (unsigned long)&_etext) >> 10, - (unsigned long)&_text, (unsigned long)&_etext, - ((unsigned long)&_etext - (unsigned long)&_text) >> 10); + (unsigned long)&_text, (unsigned long)&_etext, + ((unsigned long)&_etext - (unsigned long)&_text) >> 10); #ifdef CONFIG_HIGHMEM - BUG_ON(PKMAP_BASE+LAST_PKMAP*PAGE_SIZE > FIXADDR_START); - BUG_ON(VMALLOC_END > PKMAP_BASE); + BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START); + BUG_ON(VMALLOC_END > PKMAP_BASE); #endif - BUG_ON(VMALLOC_START > VMALLOC_END); - BUG_ON((unsigned long)high_memory > VMALLOC_START); + BUG_ON(VMALLOC_START > VMALLOC_END); + BUG_ON((unsigned long)high_memory > VMALLOC_START); #endif /* double-sanity-check paranoia */ #ifdef CONFIG_X86_PAE @@ -746,49 +702,38 @@ int arch_add_memory(int nid, u64 start, u64 size) return __add_pages(zone, start_pfn, nr_pages); } - #endif -struct kmem_cache *pmd_cache; - -void __init pgtable_cache_init(void) -{ - if (PTRS_PER_PMD > 1) - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - PTRS_PER_PMD*sizeof(pmd_t), - SLAB_PANIC, - pmd_ctor); -} - /* * This function cannot be __init, since exceptions don't work in that * section. Put this after the callers, so that it cannot be inlined. */ -static int noinline do_test_wp_bit(void) +static noinline int do_test_wp_bit(void) { char tmp_reg; int flag; __asm__ __volatile__( - " movb %0,%1 \n" - "1: movb %1,%0 \n" - " xorl %2,%2 \n" + " movb %0, %1 \n" + "1: movb %1, %0 \n" + " xorl %2, %2 \n" "2: \n" - ".section __ex_table,\"a\"\n" + ".section __ex_table, \"a\"\n" " .align 4 \n" - " .long 1b,2b \n" + " .long 1b, 2b \n" ".previous \n" :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) :"2" (1) :"memory"); - + return flag; } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -801,32 +746,58 @@ void mark_rodata_ro(void) if (num_possible_cpus() <= 1) #endif { - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RX); - printk("Write protecting the kernel text: %luk\n", size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); + set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); +#endif } #endif start += size; size = (unsigned long)__end_rodata - start; - change_page_attr(virt_to_page(start), - size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk("Write protecting the kernel read-only data: %luk\n", - size >> 10); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); + rodata_test(); - /* - * change_page_attr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, start + size); + set_pages_rw(virt_to_page(start), size >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: write protecting again\n"); + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); +#endif } #endif void free_init_pages(char *what, unsigned long begin, unsigned long end) { +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else unsigned long addr; + /* + * We just marked the kernel text read only above, now that + * we are going to free part of that, we need to make that + * writeable first. + */ + set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); @@ -835,6 +806,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) totalram_pages++; } printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); +#endif } void free_initmem(void) @@ -850,4 +822,3 @@ void free_initrd_mem(unsigned long start, unsigned long end) free_init_pages("initrd memory", start, end); } #endif - diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0f9c8c89065..cc50a13ce8d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -43,12 +43,10 @@ #include <asm/proto.h> #include <asm/smp.h> #include <asm/sections.h> +#include <asm/kdebug.h> +#include <asm/numa.h> -#ifndef Dprintk -#define Dprintk(x...) -#endif - -const struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; @@ -65,22 +63,26 @@ void show_mem(void) { long i, total = 0, reserved = 0; long shared = 0, cached = 0; - pg_data_t *pgdat; struct page *page; + pg_data_t *pgdat; printk(KERN_INFO "Mem-info:\n"); show_free_areas(); - printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10)); + printk(KERN_INFO "Free swap: %6ldkB\n", + nr_swap_pages << (PAGE_SHIFT-10)); for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* this loop can take a while with 256 GB and 4k pages - so update the NMI watchdog */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { + /* + * This loop can take a while with 256 GB and + * 4k pages so defer the NMI watchdog: + */ + if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) touch_nmi_watchdog(); - } + if (!pfn_valid(pgdat->node_start_pfn + i)) continue; + page = pfn_to_page(pgdat->node_start_pfn + i); total++; if (PageReserved(page)) @@ -89,51 +91,58 @@ void show_mem(void) cached++; else if (page_count(page)) shared += page_count(page) - 1; - } + } } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n",reserved); - printk(KERN_INFO "%lu pages shared\n",shared); - printk(KERN_INFO "%lu pages swap cached\n",cached); + printk(KERN_INFO "%lu pages of RAM\n", total); + printk(KERN_INFO "%lu reserved pages\n", reserved); + printk(KERN_INFO "%lu pages shared\n", shared); + printk(KERN_INFO "%lu pages swap cached\n", cached); } int after_bootmem; static __init void *spp_getpage(void) -{ +{ void *ptr; + if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); - if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) - panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":""); - Dprintk("spp_getpage %p\n", ptr); + if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) { + panic("set_pte_phys: cannot allocate page data %s\n", + after_bootmem ? "after bootmem" : ""); + } + + pr_debug("spp_getpage %p\n", ptr); + return ptr; -} +} -static __init void set_pte_phys(unsigned long vaddr, - unsigned long phys, pgprot_t prot) +static __init void +set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte, new_pte; - Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys); + pr_debug("set_pte_phys %lx to %lx\n", vaddr, phys); pgd = pgd_offset_k(vaddr); if (pgd_none(*pgd)) { - printk("PGD FIXMAP MISSING, it should be setup in head.S!\n"); + printk(KERN_ERR + "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } pud = pud_offset(pgd, vaddr); if (pud_none(*pud)) { - pmd = (pmd_t *) spp_getpage(); + pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER)); if (pmd != pmd_offset(pud, 0)) { - printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0)); + printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); return; } } @@ -142,7 +151,7 @@ static __init void set_pte_phys(unsigned long vaddr, pte = (pte_t *) spp_getpage(); set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER)); if (pte != pte_offset_kernel(pmd, 0)) { - printk("PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #02!\n"); return; } } @@ -162,33 +171,35 @@ static __init void set_pte_phys(unsigned long vaddr, } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __init +__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); if (idx >= __end_of_fixed_addresses) { - printk("Invalid __set_fixmap\n"); + printk(KERN_ERR "Invalid __set_fixmap\n"); return; } set_pte_phys(address, phys, prot); } -unsigned long __meminitdata table_start, table_end; +static unsigned long __initdata table_start; +static unsigned long __meminitdata table_end; static __meminit void *alloc_low_page(unsigned long *phys) -{ +{ unsigned long pfn = table_end++; void *adr; if (after_bootmem) { adr = (void *)get_zeroed_page(GFP_ATOMIC); *phys = __pa(adr); + return adr; } - if (pfn >= end_pfn) - panic("alloc_low_page: ran out of memory"); + if (pfn >= end_pfn) + panic("alloc_low_page: ran out of memory"); adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); @@ -197,44 +208,49 @@ static __meminit void *alloc_low_page(unsigned long *phys) } static __meminit void unmap_low_page(void *adr) -{ - +{ if (after_bootmem) return; early_iounmap(adr, PAGE_SIZE); -} +} /* Must run before zap_low_mappings */ __meminit void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long vaddr; pmd_t *pmd, *last_pmd; + unsigned long vaddr; int i, pmds; pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; vaddr = __START_KERNEL_map; pmd = level2_kernel_pgt; last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { for (i = 0; i < pmds; i++) { if (pmd_present(pmd[i])) - goto next; + goto continue_outer_loop; } vaddr += addr & ~PMD_MASK; addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) - set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); + set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC)); + __flush_tlb_all(); + return (void *)vaddr; - next: +continue_outer_loop: ; } - printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + printk(KERN_ERR "early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } -/* To avoid virtual aliases later */ +/* + * To avoid virtual aliases later: + */ __meminit void early_iounmap(void *addr, unsigned long size) { unsigned long vaddr; @@ -244,9 +260,11 @@ __meminit void early_iounmap(void *addr, unsigned long size) vaddr = (unsigned long)addr; pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) pmd_clear(pmd + i); - __flush_tlb(); + + __flush_tlb_all(); } static void __meminit @@ -259,16 +277,17 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { - if (!after_bootmem) + if (!after_bootmem) { for (; i < PTRS_PER_PMD; i++, pmd++) set_pmd(pmd, __pmd(0)); + } break; } if (pmd_val(*pmd)) continue; - entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address; + entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; entry &= __supported_pte_mask; set_pmd(pmd, __pmd(entry)); } @@ -277,19 +296,19 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) static void __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end) { - pmd_t *pmd = pmd_offset(pud,0); + pmd_t *pmd = pmd_offset(pud, 0); spin_lock(&init_mm.page_table_lock); phys_pmd_init(pmd, address, end); spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); } -static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) -{ +static void __meminit +phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end) +{ int i = pud_index(addr); - - for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { + for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) { unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -297,10 +316,11 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne if (addr >= end) break; - if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) { - set_pud(pud, __pud(0)); + if (!after_bootmem && + !e820_any_mapped(addr, addr+PUD_SIZE, 0)) { + set_pud(pud, __pud(0)); continue; - } + } if (pud_val(*pud)) { phys_pmd_update(pud, addr, end); @@ -308,14 +328,16 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne } pmd = alloc_low_page(&pmd_phys); + spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); + unmap_low_page(pmd); } - __flush_tlb(); -} + __flush_tlb_all(); +} static void __init find_early_table_space(unsigned long end) { @@ -326,14 +348,23 @@ static void __init find_early_table_space(unsigned long end) tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) + round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - /* RED-PEN putting page tables only on node 0 could - cause a hotspot and fill up ZONE_DMA. The page tables - need roughly 0.5KB per GB. */ - start = 0x8000; - table_start = find_e820_area(start, end, tables); + /* + * RED-PEN putting page tables only on node 0 could + * cause a hotspot and fill up ZONE_DMA. The page tables + * need roughly 0.5KB per GB. + */ + start = 0x8000; + table_start = find_e820_area(start, end, tables); if (table_start == -1UL) panic("Cannot find space for the kernel page tables"); + /* + * When you have a lot of RAM like 256GB, early_table will not fit + * into 0x8000 range, find_e820_area() will find area after kernel + * bss but the table_start is not page aligned, so need to round it + * up to avoid overlap with bss: + */ + table_start = round_up(table_start, PAGE_SIZE); table_start >>= PAGE_SHIFT; table_end = table_start; @@ -342,20 +373,23 @@ static void __init find_early_table_space(unsigned long end) (table_start << PAGE_SHIFT) + tables); } -/* Setup the direct mapping of the physical memory at PAGE_OFFSET. - This runs before bootmem is initialized and gets pages directly from the - physical memory. To access them they are temporarily mapped. */ +/* + * Setup the direct mapping of the physical memory at PAGE_OFFSET. + * This runs before bootmem is initialized and gets pages directly from + * the physical memory. To access them they are temporarily mapped. + */ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) -{ - unsigned long next; +{ + unsigned long next; - Dprintk("init_memory_mapping\n"); + pr_debug("init_memory_mapping\n"); - /* + /* * Find space for the kernel direct mapping tables. - * Later we should allocate these tables in the local node of the memory - * mapped. Unfortunately this is done currently before the nodes are - * discovered. + * + * Later we should allocate these tables in the local node of the + * memory mapped. Unfortunately this is done currently before the + * nodes are discovered. */ if (!after_bootmem) find_early_table_space(end); @@ -364,8 +398,8 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); + unsigned long pud_phys; pud_t *pud; if (after_bootmem) @@ -374,23 +408,26 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end) pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; - if (next > end) - next = end; + if (next > end) + next = end; phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); unmap_low_page(pud); - } + } if (!after_bootmem) mmu_cr4_features = read_cr4(); __flush_tlb_all(); + + reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT); } #ifndef CONFIG_NUMA void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -402,39 +439,48 @@ void __init paging_init(void) } #endif -/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches - from the CPU leading to inconsistent cache lines. address and size - must be aligned to 2MB boundaries. - Does nothing when the mapping doesn't exist. */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) +/* + * Unmap a kernel mapping if it exists. This is useful to avoid + * prefetches from the CPU leading to inconsistent cache lines. + * address and size must be aligned to 2MB boundaries. + * Does nothing when the mapping doesn't exist. + */ +void __init clear_kernel_mapping(unsigned long address, unsigned long size) { unsigned long end = address + size; BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); - - for (; address < end; address += LARGE_PAGE_SIZE) { + BUG_ON(size & ~LARGE_PAGE_MASK); + + for (; address < end; address += LARGE_PAGE_SIZE) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; + if (pgd_none(*pgd)) continue; + pud = pud_offset(pgd, address); if (pud_none(*pud)) - continue; + continue; + pmd = pmd_offset(pud, address); if (!pmd || pmd_none(*pmd)) - continue; - if (0 == (pmd_val(*pmd) & _PAGE_PSE)) { - /* Could handle this, but it should not happen currently. */ - printk(KERN_ERR - "clear_kernel_mapping: mapping has been split. will leak memory\n"); - pmd_ERROR(*pmd); + continue; + + if (!(pmd_val(*pmd) & _PAGE_PSE)) { + /* + * Could handle this, but it should not happen + * currently: + */ + printk(KERN_ERR "clear_kernel_mapping: " + "mapping has been split. will leak memory\n"); + pmd_ERROR(*pmd); } - set_pmd(pmd, __pmd(0)); + set_pmd(pmd, __pmd(0)); } __flush_tlb_all(); -} +} /* * Memory hotplug specific functions @@ -461,16 +507,12 @@ int arch_add_memory(int nid, u64 start, u64 size) unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - init_memory_mapping(start, (start + size -1)); + init_memory_mapping(start, start + size-1); ret = __add_pages(zone, start_pfn, nr_pages); - if (ret) - goto error; + WARN_ON(1); return ret; -error: - printk("%s: Problem encountered in __add_pages!\n", __func__); - return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -484,36 +526,8 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Memory Hotadd without sparsemem. The mem_maps have been allocated in advance, - * just online the pages. - */ -int __add_pages(struct zone *z, unsigned long start_pfn, unsigned long nr_pages) -{ - int err = -EIO; - unsigned long pfn; - unsigned long total = 0, mem = 0; - for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) { - if (pfn_valid(pfn)) { - online_page(pfn_to_page(pfn)); - err = 0; - mem++; - } - total++; - } - if (!err) { - z->spanned_pages += total; - z->present_pages += mem; - z->zone_pgdat->node_spanned_pages += total; - z->zone_pgdat->node_present_pages += mem; - } - return err; -} -#endif - -static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, - kcore_vsyscall; +static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, + kcore_modules, kcore_vsyscall; void __init mem_init(void) { @@ -521,8 +535,15 @@ void __init mem_init(void) pci_iommu_alloc(); - /* clear the zero-page */ - memset(empty_zero_page, 0, PAGE_SIZE); + /* clear_bss() already clear the empty_zero_page */ + + /* temporary debugging - double check it's true: */ + { + int i; + + for (i = 0; i < 1024; i++) + WARN_ON_ONCE(empty_zero_page[i]); + } reservedpages = 0; @@ -534,7 +555,6 @@ void __init mem_init(void) #endif reservedpages = end_pfn - totalram_pages - absent_pages_in_range(0, end_pfn); - after_bootmem = 1; codesize = (unsigned long) &_etext - (unsigned long) &_text; @@ -542,15 +562,16 @@ void __init mem_init(void) initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); - kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, + kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); + kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START); kclist_add(&kcore_kernel, &_stext, _end - _stext); kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN); - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, VSYSCALL_END - VSYSCALL_START); - printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n", + printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, " + "%ldk reserved, %ldk data, %ldk init)\n", (unsigned long) nr_free_pages() << (PAGE_SHIFT-10), end_pfn << (PAGE_SHIFT-10), codesize >> 10, @@ -566,19 +587,27 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; + /* + * If debugging page accesses then do not free this memory but + * mark them not present - any buggy init-section access will + * create a kernel page fault: + */ +#ifdef CONFIG_DEBUG_PAGEALLOC + printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", + begin, PAGE_ALIGN(end)); + set_memory_np(begin, (end - begin) >> PAGE_SHIFT); +#else printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), POISON_FREE_INITMEM, PAGE_SIZE); - if (addr >= __START_KERNEL_map) - change_page_attr_addr(addr, 1, __pgprot(0)); free_page(addr); totalram_pages++; } - if (addr > __START_KERNEL_map) - global_flush_tlb(); +#endif } void free_initmem(void) @@ -589,6 +618,8 @@ void free_initmem(void) } #ifdef CONFIG_DEBUG_RODATA +const int rodata_test_data = 0xC3; +EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { @@ -603,25 +634,27 @@ void mark_rodata_ro(void) #ifdef CONFIG_KPROBES start = (unsigned long)__start_rodata; #endif - + end = (unsigned long)__end_rodata; start = (start + PAGE_SIZE - 1) & PAGE_MASK; end &= PAGE_MASK; if (end <= start) return; - change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); + set_memory_ro(start, (end - start) >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); - /* - * change_page_attr_addr() requires a global_flush_tlb() call after it. - * We do this after the printk so that if something went wrong in the - * change, the printk gets out at least to give a better debug hint - * of who is the culprit. - */ - global_flush_tlb(); + rodata_test(); + +#ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); + set_memory_rw(start, (end-start) >> PAGE_SHIFT); + + printk(KERN_INFO "Testing CPA: again\n"); + set_memory_ro(start, (end-start) >> PAGE_SHIFT); +#endif } #endif @@ -632,17 +665,21 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void __init reserve_bootmem_generic(unsigned long phys, unsigned len) -{ +void __init reserve_bootmem_generic(unsigned long phys, unsigned len) +{ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); #endif unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { - /* This can happen with kdump kernels when accessing firmware - tables. */ + /* + * This can happen with kdump kernels when accessing + * firmware tables: + */ if (pfn < end_pfn_map) return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", phys, len); return; @@ -650,9 +687,9 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len); -#else - reserve_bootmem(phys, len); + reserve_bootmem_node(NODE_DATA(nid), phys, len); +#else + reserve_bootmem(phys, len); #endif if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; @@ -660,46 +697,49 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) } } -int kern_addr_valid(unsigned long addr) -{ +int kern_addr_valid(unsigned long addr) +{ unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; if (above != 0 && above != -1UL) - return 0; - + return 0; + pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) return 0; pud = pud_offset(pgd, addr); if (pud_none(*pud)) - return 0; + return 0; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return 0; + if (pmd_large(*pmd)) return pfn_valid(pmd_pfn(*pmd)); pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) return 0; + return pfn_valid(pte_pfn(*pte)); } -/* A pseudo VMA to allow ptrace access for the vsyscall page. This only - covers the 64bit vsyscall page now. 32bit has a real VMA now and does - not need special handling anymore. */ - +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_start = VSYSCALL_START, + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC }; struct vm_area_struct *get_gate_vma(struct task_struct *tsk) @@ -714,14 +754,17 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk) int in_gate_area(struct task_struct *task, unsigned long addr) { struct vm_area_struct *vma = get_gate_vma(task); + if (!vma) return 0; + return (addr >= vma->vm_start) && (addr < vma->vm_end); } -/* Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives. +/* + * Use this when you have no reliable task/vma, typically from interrupt + * context. It is less reliable than using the task's vma and may give + * false positives: */ int in_gate_area_no_task(unsigned long addr) { @@ -741,8 +784,8 @@ const char *arch_vma_name(struct vm_area_struct *vma) /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ -int __meminit vmemmap_populate(struct page *start_page, - unsigned long size, int node) +int __meminit +vmemmap_populate(struct page *start_page, unsigned long size, int node) { unsigned long addr = (unsigned long)start_page; unsigned long end = (unsigned long)(start_page + size); @@ -757,6 +800,7 @@ int __meminit vmemmap_populate(struct page *start_page, pgd = vmemmap_pgd_populate(addr, node); if (!pgd) return -ENOMEM; + pud = vmemmap_pud_populate(pgd, addr, node); if (!pud) return -ENOMEM; @@ -764,20 +808,22 @@ int __meminit vmemmap_populate(struct page *start_page, pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { pte_t entry; - void *p = vmemmap_alloc_block(PMD_SIZE, node); + void *p; + + p = vmemmap_alloc_block(PMD_SIZE, node); if (!p) return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); - mk_pte_huge(entry); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", addr, addr + PMD_SIZE - 1, p, node); - } else + } else { vmemmap_verify((pte_t *)pmd, node, addr, next); + } } - return 0; } #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c new file mode 100644 index 00000000000..ed795721ca8 --- /dev/null +++ b/arch/x86/mm/ioremap.c @@ -0,0 +1,501 @@ +/* + * Re-map IO memory to kernel address space so that we can access it. + * This is needed for high PCI addresses that aren't mapped in the + * 640k-1MB IO memory area on PC's + * + * (C) Copyright 1995 1996 Linus Torvalds + */ + +#include <linux/bootmem.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/e820.h> +#include <asm/fixmap.h> +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> + +enum ioremap_mode { + IOR_MODE_UNCACHED, + IOR_MODE_CACHED, +}; + +#ifdef CONFIG_X86_64 + +unsigned long __phys_addr(unsigned long x) +{ + if (x >= __START_KERNEL_map) + return x - __START_KERNEL_map + phys_base; + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + +#endif + +int page_is_ram(unsigned long pagenr) +{ + unsigned long addr, end; + int i; + + for (i = 0; i < e820.nr_map; i++) { + /* + * Not usable memory: + */ + if (e820.map[i].type != E820_RAM) + continue; + addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; + end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; + + /* + * Sanity check: Some BIOSen report areas as RAM that + * are not. Notably the 640->1Mb area, which is the + * PCI BIOS area. + */ + if (addr >= (BIOS_BEGIN >> PAGE_SHIFT) && + end < (BIOS_END >> PAGE_SHIFT)) + continue; + + if ((pagenr >= addr) && (pagenr < end)) + return 1; + } + return 0; +} + +/* + * Fix up the linear direct mapping of the kernel to avoid cache attribute + * conflicts. + */ +static int ioremap_change_attr(unsigned long paddr, unsigned long size, + enum ioremap_mode mode) +{ + unsigned long vaddr = (unsigned long)__va(paddr); + unsigned long nrpages = size >> PAGE_SHIFT; + int err, level; + + /* No change for pages after the last mapping */ + if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT)) + return 0; + + /* + * If there is no identity map for this address, + * change_page_attr_addr is unnecessary + */ + if (!lookup_address(vaddr, &level)) + return 0; + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + err = set_memory_uc(vaddr, nrpages); + break; + case IOR_MODE_CACHED: + err = set_memory_wb(vaddr, nrpages); + break; + } + + return err; +} + +/* + * Remap an arbitrary physical address space into the kernel virtual + * address space. Needed when the kernel wants to access high addresses + * directly. + * + * NOTE! We need to allow non-page-aligned mappings too: we will obviously + * have to convert them into an offset in a page-aligned mapping, but the + * caller shouldn't need to know that small detail. + */ +static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, + enum ioremap_mode mode) +{ + void __iomem *addr; + struct vm_struct *area; + unsigned long offset, last_addr; + pgprot_t prot; + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) + return NULL; + + /* + * Don't remap the low PCI/ISA area, it's always mapped.. + */ + if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) + return (__force void __iomem *)phys_to_virt(phys_addr); + + /* + * Don't allow anybody to remap normal RAM that we're using.. + */ + for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && + (offset << PAGE_SHIFT) < last_addr; offset++) { + if (page_is_ram(offset)) + return NULL; + } + + switch (mode) { + case IOR_MODE_UNCACHED: + default: + prot = PAGE_KERNEL_NOCACHE; + break; + case IOR_MODE_CACHED: + prot = PAGE_KERNEL; + break; + } + + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr+1) - phys_addr; + + /* + * Ok, go for it.. + */ + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + area->phys_addr = phys_addr; + addr = (void __iomem *) area->addr; + if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, + phys_addr, prot)) { + remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + return NULL; + } + + if (ioremap_change_attr(phys_addr, size, mode) < 0) { + vunmap(addr); + return NULL; + } + + return (void __iomem *) (offset + (char __iomem *)addr); +} + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. + */ +void __iomem *ioremap_nocache(unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_UNCACHED); +} +EXPORT_SYMBOL(ioremap_nocache); + +void __iomem *ioremap_cache(unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr, size, IOR_MODE_CACHED); +} +EXPORT_SYMBOL(ioremap_cache); + +/** + * iounmap - Free a IO remapping + * @addr: virtual address from ioremap_* + * + * Caller must ensure there is only one unmapping for the same pointer. + */ +void iounmap(volatile void __iomem *addr) +{ + struct vm_struct *p, *o; + + if ((void __force *)addr <= high_memory) + return; + + /* + * __ioremap special-cases the PCI/ISA range by not instantiating a + * vm_area and by simply returning an address into the kernel mapping + * of ISA space. So handle that here. + */ + if (addr >= phys_to_virt(ISA_START_ADDRESS) && + addr < phys_to_virt(ISA_END_ADDRESS)) + return; + + addr = (volatile void __iomem *) + (PAGE_MASK & (unsigned long __force)addr); + + /* Use the vm area unlocked, assuming the caller + ensures there isn't another iounmap for the same address + in parallel. Reuse of the virtual address is prevented by + leaving it in the global lists until we're done with it. + cpa takes care of the direct mappings. */ + read_lock(&vmlist_lock); + for (p = vmlist; p; p = p->next) { + if (p->addr == addr) + break; + } + read_unlock(&vmlist_lock); + + if (!p) { + printk(KERN_ERR "iounmap: bad address %p\n", addr); + dump_stack(); + return; + } + + /* Reset the direct mapping. Can block */ + ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED); + + /* Finally remove it */ + o = remove_vm_area((void *)addr); + BUG_ON(p != o || o == NULL); + kfree(p); +} +EXPORT_SYMBOL(iounmap); + +#ifdef CONFIG_X86_32 + +int __initdata early_ioremap_debug; + +static int __init early_ioremap_debug_setup(char *str) +{ + early_ioremap_debug = 1; + + return 0; +} +early_param("early_ioremap_debug", early_ioremap_debug_setup); + +static __initdata int after_paging_init; +static __initdata unsigned long bm_pte[1024] + __attribute__((aligned(PAGE_SIZE))); + +static inline unsigned long * __init early_ioremap_pgd(unsigned long addr) +{ + return (unsigned long *)swapper_pg_dir + ((addr >> 22) & 1023); +} + +static inline unsigned long * __init early_ioremap_pte(unsigned long addr) +{ + return bm_pte + ((addr >> PAGE_SHIFT) & 1023); +} + +void __init early_ioremap_init(void) +{ + unsigned long *pgd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_init()\n"); + + pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); + *pgd = __pa(bm_pte) | _PAGE_TABLE; + memset(bm_pte, 0, sizeof(bm_pte)); + /* + * The boot-ioremap range spans multiple pgds, for which + * we are not prepared: + */ + if (pgd != early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))) { + WARN_ON(1); + printk(KERN_WARNING "pgd %p != %p\n", + pgd, early_ioremap_pgd(fix_to_virt(FIX_BTMAP_END))); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", + fix_to_virt(FIX_BTMAP_BEGIN)); + printk(KERN_WARNING "fix_to_virt(FIX_BTMAP_END): %08lx\n", + fix_to_virt(FIX_BTMAP_END)); + + printk(KERN_WARNING "FIX_BTMAP_END: %d\n", FIX_BTMAP_END); + printk(KERN_WARNING "FIX_BTMAP_BEGIN: %d\n", + FIX_BTMAP_BEGIN); + } +} + +void __init early_ioremap_clear(void) +{ + unsigned long *pgd; + + if (early_ioremap_debug) + printk(KERN_INFO "early_ioremap_clear()\n"); + + pgd = early_ioremap_pgd(fix_to_virt(FIX_BTMAP_BEGIN)); + *pgd = 0; + paravirt_release_pt(__pa(pgd) >> PAGE_SHIFT); + __flush_tlb_all(); +} + +void __init early_ioremap_reset(void) +{ + enum fixed_addresses idx; + unsigned long *pte, phys, addr; + + after_paging_init = 1; + for (idx = FIX_BTMAP_BEGIN; idx >= FIX_BTMAP_END; idx--) { + addr = fix_to_virt(idx); + pte = early_ioremap_pte(addr); + if (!*pte & _PAGE_PRESENT) { + phys = *pte & PAGE_MASK; + set_fixmap(idx, phys); + } + } +} + +static void __init __early_set_fixmap(enum fixed_addresses idx, + unsigned long phys, pgprot_t flags) +{ + unsigned long *pte, addr = __fix_to_virt(idx); + + if (idx >= __end_of_fixed_addresses) { + BUG(); + return; + } + pte = early_ioremap_pte(addr); + if (pgprot_val(flags)) + *pte = (phys & PAGE_MASK) | pgprot_val(flags); + else + *pte = 0; + __flush_tlb_one(addr); +} + +static inline void __init early_set_fixmap(enum fixed_addresses idx, + unsigned long phys) +{ + if (after_paging_init) + set_fixmap(idx, phys); + else + __early_set_fixmap(idx, phys, PAGE_KERNEL); +} + +static inline void __init early_clear_fixmap(enum fixed_addresses idx) +{ + if (after_paging_init) + clear_fixmap(idx); + else + __early_set_fixmap(idx, 0, __pgprot(0)); +} + + +int __initdata early_ioremap_nested; + +static int __init check_early_ioremap_leak(void) +{ + if (!early_ioremap_nested) + return 0; + + printk(KERN_WARNING + "Debug warning: early ioremap leak of %d areas detected.\n", + early_ioremap_nested); + printk(KERN_WARNING + "please boot with early_ioremap_debug and report the dmesg.\n"); + WARN_ON(1); + + return 1; +} +late_initcall(check_early_ioremap_leak); + +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + unsigned long offset, last_addr; + unsigned int nrpages, nesting; + enum fixed_addresses idx0, idx; + + WARN_ON(system_state != SYSTEM_BOOTING); + + nesting = early_ioremap_nested; + if (early_ioremap_debug) { + printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", + phys_addr, size, nesting); + dump_stack(); + } + + /* Don't allow wraparound or zero size */ + last_addr = phys_addr + size - 1; + if (!size || last_addr < phys_addr) { + WARN_ON(1); + return NULL; + } + + if (nesting >= FIX_BTMAPS_NESTING) { + WARN_ON(1); + return NULL; + } + early_ioremap_nested++; + /* + * Mappings have to be page-aligned + */ + offset = phys_addr & ~PAGE_MASK; + phys_addr &= PAGE_MASK; + size = PAGE_ALIGN(last_addr) - phys_addr; + + /* + * Mappings have to fit in the FIX_BTMAP area. + */ + nrpages = size >> PAGE_SHIFT; + if (nrpages > NR_FIX_BTMAPS) { + WARN_ON(1); + return NULL; + } + + /* + * Ok, go for it.. + */ + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = idx0; + while (nrpages > 0) { + early_set_fixmap(idx, phys_addr); + phys_addr += PAGE_SIZE; + --idx; + --nrpages; + } + if (early_ioremap_debug) + printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); + + return (void *) (offset + fix_to_virt(idx0)); +} + +void __init early_iounmap(void *addr, unsigned long size) +{ + unsigned long virt_addr; + unsigned long offset; + unsigned int nrpages; + enum fixed_addresses idx; + unsigned int nesting; + + nesting = --early_ioremap_nested; + WARN_ON(nesting < 0); + + if (early_ioremap_debug) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, + size, nesting); + dump_stack(); + } + + virt_addr = (unsigned long)addr; + if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) { + WARN_ON(1); + return; + } + offset = virt_addr & ~PAGE_MASK; + nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; + + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + while (nrpages > 0) { + early_clear_fixmap(idx); + --idx; + --nrpages; + } +} + +void __this_fixmap_does_not_exist(void) +{ + WARN_ON(1); +} + +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/mm/ioremap_32.c b/arch/x86/mm/ioremap_32.c deleted file mode 100644 index 0b278315d73..00000000000 --- a/arch/x86/mm/ioremap_32.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * arch/i386/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/pgtable.h> - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void __iomem * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t prot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (void __iomem *) phys_to_virt(phys_addr); - - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (phys_addr <= virt_to_phys(high_memory - 1)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } - - prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY - | _PAGE_ACCESSED | flags); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long) addr, - (unsigned long) addr + size, phys_addr, prot)) { - vunmap((void __force *) addr); - return NULL; - } - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr; - void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD); - if (!p) - return p; - - /* Guaranteed to be > phys_addr, as per __ioremap() */ - last_addr = phys_addr + size - 1; - - if (last_addr < virt_to_phys(high_memory) - 1) { - struct page *ppage = virt_to_page(__va(phys_addr)); - unsigned long npages; - - phys_addr &= PAGE_MASK; - - /* This might overflow and become zero.. */ - last_addr = PAGE_ALIGN(last_addr); - - /* .. but that's ok, because modulo-2**n arithmetic will make - * the page-aligned "last - first" come out right. - */ - npages = (last_addr - phys_addr) >> PAGE_SHIFT; - - if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { - iounmap(p); - p = NULL; - } - global_flush_tlb(); - } - - return p; -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if ((void __force *)addr <= high_memory) - return; - - /* - * __ioremap special-cases the PCI/ISA range by not instantiating a - * vm_area and by simply returning an address into the kernel mapping - * of ISA space. So handle that here. - */ - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) { - change_page_attr(virt_to_page(__va(p->phys_addr)), - get_vm_area_size(p) >> PAGE_SHIFT, - PAGE_KERNEL); - global_flush_tlb(); - } - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - -void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long offset, last_addr; - unsigned int nrpages; - enum fixed_addresses idx; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return phys_to_virt(phys_addr); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; - - /* - * Mappings have to fit in the FIX_BTMAP area. - */ - nrpages = size >> PAGE_SHIFT; - if (nrpages > NR_FIX_BTMAPS) - return NULL; - - /* - * Ok, go for it.. - */ - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - set_fixmap(idx, phys_addr); - phys_addr += PAGE_SIZE; - --idx; - --nrpages; - } - return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN)); -} - -void __init bt_iounmap(void *addr, unsigned long size) -{ - unsigned long virt_addr; - unsigned long offset; - unsigned int nrpages; - enum fixed_addresses idx; - - virt_addr = (unsigned long)addr; - if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)) - return; - offset = virt_addr & ~PAGE_MASK; - nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - - idx = FIX_BTMAP_BEGIN; - while (nrpages > 0) { - clear_fixmap(idx); - --idx; - --nrpages; - } -} diff --git a/arch/x86/mm/ioremap_64.c b/arch/x86/mm/ioremap_64.c deleted file mode 100644 index 6cac90aa503..00000000000 --- a/arch/x86/mm/ioremap_64.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * arch/x86_64/mm/ioremap.c - * - * Re-map IO memory to kernel address space so that we can access it. - * This is needed for high PCI addresses that aren't mapped in the - * 640k-1MB IO memory area on PC's - * - * (C) Copyright 1995 1996 Linus Torvalds - */ - -#include <linux/vmalloc.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/io.h> - -#include <asm/pgalloc.h> -#include <asm/fixmap.h> -#include <asm/tlbflush.h> -#include <asm/cacheflush.h> -#include <asm/proto.h> - -unsigned long __phys_addr(unsigned long x) -{ - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; -} -EXPORT_SYMBOL(__phys_addr); - -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 - -/* - * Fix up the linear direct mapping of the kernel to avoid cache attribute - * conflicts. - */ -static int -ioremap_change_attr(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - int err = 0; - if (phys_addr + size - 1 < (end_pfn_map << PAGE_SHIFT)) { - unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long vaddr = (unsigned long) __va(phys_addr); - - /* - * Must use a address here and not struct page because the phys addr - * can be a in hole between nodes and not have an memmap entry. - */ - err = change_page_attr_addr(vaddr,npages,__pgprot(__PAGE_KERNEL|flags)); - if (!err) - global_flush_tlb(); - } - return err; -} - -/* - * Generic mapping function - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags) -{ - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t pgprot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - /* - * Don't remap the low PCI/ISA area, it's always mapped.. - */ - if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS) - return (__force void __iomem *)phys_to_virt(phys_addr); - -#ifdef CONFIG_FLATMEM - /* - * Don't allow anybody to remap normal RAM that we're using.. - */ - if (last_addr < virt_to_phys(high_memory)) { - char *t_addr, *t_end; - struct page *page; - - t_addr = __va(phys_addr); - t_end = t_addr + (size - 1); - - for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++) - if(!PageReserved(page)) - return NULL; - } -#endif - - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_GLOBAL - | _PAGE_DIRTY | _PAGE_ACCESSED | flags); - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP | (flags << 20)); - if (!area) - return NULL; - area->phys_addr = phys_addr; - addr = area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); - return NULL; - } - if (flags && ioremap_change_attr(phys_addr, size, flags) < 0) { - area->flags &= 0xffffff; - vunmap(addr); - return NULL; - } - return (__force void __iomem *) (offset + (char *)addr); -} -EXPORT_SYMBOL(__ioremap); - -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In particular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - * - * Must be freed with iounmap. - */ - -void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size) -{ - return __ioremap(phys_addr, size, _PAGE_PCD); -} -EXPORT_SYMBOL(ioremap_nocache); - -/** - * iounmap - Free a IO remapping - * @addr: virtual address from ioremap_* - * - * Caller must ensure there is only one unmapping for the same pointer. - */ -void iounmap(volatile void __iomem *addr) -{ - struct vm_struct *p, *o; - - if (addr <= high_memory) - return; - if (addr >= phys_to_virt(ISA_START_ADDRESS) && - addr < phys_to_virt(ISA_END_ADDRESS)) - return; - - addr = (volatile void __iomem *)(PAGE_MASK & (unsigned long __force)addr); - /* Use the vm area unlocked, assuming the caller - ensures there isn't another iounmap for the same address - in parallel. Reuse of the virtual address is prevented by - leaving it in the global lists until we're done with it. - cpa takes care of the direct mappings. */ - read_lock(&vmlist_lock); - for (p = vmlist; p; p = p->next) { - if (p->addr == addr) - break; - } - read_unlock(&vmlist_lock); - - if (!p) { - printk("iounmap: bad address %p\n", addr); - dump_stack(); - return; - } - - /* Reset the direct mapping. Can block */ - if (p->flags >> 20) - ioremap_change_attr(p->phys_addr, p->size, 0); - - /* Finally remove it */ - o = remove_vm_area((void *)addr); - BUG_ON(p != o || o == NULL); - kfree(p); -} -EXPORT_SYMBOL(iounmap); - diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c index a96006f7ae0..7a2ebce87df 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/k8topology_64.c @@ -1,9 +1,9 @@ -/* +/* * AMD K8 NUMA support. * Discover the memory map and associated nodes. - * + * * This version reads it directly from the K8 northbridge. - * + * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ #include <linux/kernel.h> @@ -22,132 +22,135 @@ static __init int find_northbridge(void) { - int num; + int num; - for (num = 0; num < 32; num++) { + for (num = 0; num < 32; num++) { u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16))) - continue; - return num; - } - - return -1; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -1; } int __init k8_scan_nodes(unsigned long start, unsigned long end) -{ +{ unsigned long prevbase; struct bootnode nodes[8]; - int nodeid, i, j, nb; + int nodeid, i, nb; unsigned char nodeids[8]; int found = 0; u32 reg; unsigned numnodes; - unsigned num_cores; + unsigned cores; + unsigned bits; + int j; if (!early_pci_allowed()) return -1; - nb = find_northbridge(); - if (nb < 0) + nb = find_northbridge(); + if (nb < 0) return nb; - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - - num_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; - printk(KERN_INFO "CPU has %d num_cores\n", num_cores); + printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - reg = read_pci_config(0, nb, 0, 0x60); + reg = read_pci_config(0, nb, 0, 0x60); numnodes = ((reg >> 4) & 0xF) + 1; if (numnodes <= 1) return -1; printk(KERN_INFO "Number of nodes %d\n", numnodes); - memset(&nodes,0,sizeof(nodes)); + memset(&nodes, 0, sizeof(nodes)); prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base,limit; + for (i = 0; i < 8; i++) { + unsigned long base, limit; u32 nodeid; - + base = read_pci_config(0, nb, 1, 0x40 + i*8); limit = read_pci_config(0, nb, 1, 0x44 + i*8); - nodeid = limit & 7; + nodeid = limit & 7; nodeids[i] = nodeid; - if ((base & 3) == 0) { + if ((base & 3) == 0) { if (i < numnodes) - printk("Skipping disabled node %d\n", i); + printk("Skipping disabled node %d\n", i); continue; - } + } if (nodeid >= numnodes) { printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); + base, limit); continue; - } + } - if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", i, - base); + if (!limit) { + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", + i, base); continue; } if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); - return -1; - } + printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", + nodeid, (base>>8)&3, (limit>>8) & 3); + return -1; + } if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", + printk(KERN_INFO "Node %d already present. Skipping\n", nodeid); continue; } - limit >>= 16; - limit <<= 24; + limit >>= 16; + limit <<= 24; limit |= (1<<24)-1; limit++; if (limit > end_pfn << PAGE_SHIFT) limit = end_pfn << PAGE_SHIFT; if (limit <= base) - continue; - + continue; + base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); - continue; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + printk(KERN_ERR "Empty node %d\n", nodeid); + continue; } - if (limit < base) { + if (limit < base) { printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); + nodeid, base, limit); continue; - } - + } + /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { + if (prevbase > base) { printk(KERN_ERR "Node map not sorted %lx,%lx\n", - prevbase,base); + prevbase, base); return -1; } - - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - + + printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + found++; - - nodes[nodeid].start = base; + + nodes[nodeid].start = base; nodes[nodeid].end = limit; e820_register_active_regions(nodeid, nodes[nodeid].start >> PAGE_SHIFT, @@ -156,27 +159,31 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) prevbase = base; node_set(nodeid, node_possible_map); - } + } if (!found) - return -1; + return -1; memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + /* use the coreid bits from early_identify_cpu */ + bits = boot_cpu_data.x86_coreid_bits; + cores = (1<<bits); for (i = 0; i < 8; i++) { - if (nodes[i].start != nodes[i].end) { + if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; - for (j = 0; j < num_cores; j++) - apicid_to_node[(nodeid * num_cores) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } + for (j = 0; j < cores; j++) + apicid_to_node[(nodeid << bits) + j] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } } numa_init_array(); return 0; -} +} diff --git a/arch/x86/mm/mmap_32.c b/arch/x86/mm/mmap.c index 552e0847375..56fe7124fbe 100644 --- a/arch/x86/mm/mmap_32.c +++ b/arch/x86/mm/mmap.c @@ -1,10 +1,13 @@ /* - * linux/arch/i386/mm/mmap.c + * Flexible mmap layout support * - * flexible mmap layout support + * Based on code by Ingo Molnar and Andi Kleen, copyrighted + * as follows: * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. + * Copyright 2005 Andi Kleen, SUSE Labs. + * Copyright 2007 Jiri Kosina, SUSE Labs. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,14 +22,12 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * - * Started by Ingo Molnar <mingo@elte.hu> */ #include <linux/personality.h> #include <linux/mm.h> #include <linux/random.h> +#include <linux/limits.h> #include <linux/sched.h> /* @@ -37,20 +38,71 @@ #define MIN_GAP (128*1024*1024) #define MAX_GAP (TASK_SIZE/6*5) -static inline unsigned long mmap_base(struct mm_struct *mm) +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static int mmap_is_ia32(void) { - unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; - unsigned long random_factor = 0; +#ifdef CONFIG_X86_32 + return 1; +#endif +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + return 1; +#endif + return 0; +} - if (current->flags & PF_RANDOMIZE) - random_factor = get_random_int() % (1024*1024); +static int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + return 1; + + return sysctl_legacy_va_layout; +} + +static unsigned long mmap_rnd(void) +{ + unsigned long rnd = 0; + + /* + * 8 bits of randomness in 32bit mmaps, 20 address space bits + * 28 bits of randomness in 64bit mmaps, 40 address space bits + */ + if (current->flags & PF_RANDOMIZE) { + if (mmap_is_ia32()) + rnd = (long)get_random_int() % (1<<8); + else + rnd = (long)(get_random_int() % (1<<28)); + } + return rnd << PAGE_SHIFT; +} + +static unsigned long mmap_base(void) +{ + unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; if (gap < MIN_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - random_factor); + return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); +} + +/* + * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 + * does, but not when emulating X86_32 + */ +static unsigned long mmap_legacy_base(void) +{ + if (mmap_is_ia32()) + return TASK_UNMAPPED_BASE; + else + return TASK_UNMAPPED_BASE + mmap_rnd(); } /* @@ -59,18 +111,12 @@ static inline unsigned long mmap_base(struct mm_struct *mm) */ void arch_pick_mmap_layout(struct mm_struct *mm) { - /* - * Fall back to the standard layout if the personality - * bit is set, or if the expected stack growth is unlimited: - */ - if (sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) { - mm->mmap_base = TASK_UNMAPPED_BASE; + if (mmap_is_legacy()) { + mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; } else { - mm->mmap_base = mmap_base(mm); + mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; mm->unmap_area = arch_unmap_area_topdown; } diff --git a/arch/x86/mm/mmap_64.c b/arch/x86/mm/mmap_64.c deleted file mode 100644 index 80bba0dc000..00000000000 --- a/arch/x86/mm/mmap_64.c +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright 2005 Andi Kleen, SuSE Labs. - * Licensed under GPL, v.2 - */ -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/random.h> -#include <asm/ia32.h> - -/* Notebook: move the mmap code from sys_x86_64.c over here. */ - -void arch_pick_mmap_layout(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->flags & _TIF_IA32) - return ia32_pick_mmap_layout(mm); -#endif - mm->mmap_base = TASK_UNMAPPED_BASE; - if (current->flags & PF_RANDOMIZE) { - /* Add 28bit randomness which is about 40bits of address space - because mmap base has to be page aligned. - or ~1/128 of the total user VM - (total user address space is 47bits) */ - unsigned rnd = get_random_int() & 0xfffffff; - mm->mmap_base += ((unsigned long)rnd) << PAGE_SHIFT; - } - mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; -} - diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 3d6926ba899..dc3b1f7e145 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -1,7 +1,7 @@ -/* +/* * Generic VM initialization for x86-64 NUMA setups. * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ + */ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> @@ -11,35 +11,45 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/nodemask.h> +#include <linux/sched.h> #include <asm/e820.h> #include <asm/proto.h> #include <asm/dma.h> #include <asm/numa.h> #include <asm/acpi.h> +#include <asm/k8.h> #ifndef Dprintk #define Dprintk(x...) #endif struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + bootmem_data_t plat_node_bdata[MAX_NUMNODES]; struct memnode memnode; -unsigned char cpu_to_node[NR_CPUS] __read_mostly = { +int x86_cpu_to_node_map_init[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE }; -unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +void *x86_cpu_to_node_map_early_ptr; +DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE; +EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map); +EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr); + +s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; -cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; + +cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_to_cpumask_map); int numa_off __initdata; unsigned long __initdata nodemap_addr; unsigned long __initdata nodemap_size; - /* * Given a shift value, try to populate memnodemap[] * Returns : @@ -47,14 +57,13 @@ unsigned long __initdata nodemap_size; * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +static int __init populate_memnodemap(const struct bootnode *nodes, + int numnodes, int shift) { - int i; - int res = -1; unsigned long addr, end; + int i, res = -1; - memset(memnodemap, 0xff, memnodemapsize); + memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize); for (i = 0; i < numnodes; i++) { addr = nodes[i].start; end = nodes[i].end; @@ -63,13 +72,13 @@ populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) if ((end >> shift) >= memnodemapsize) return 0; do { - if (memnodemap[addr >> shift] != 0xff) + if (memnodemap[addr >> shift] != NUMA_NO_NODE) return -1; memnodemap[addr >> shift] = i; addr += (1UL << shift); } while (addr < end); res = 1; - } + } return res; } @@ -78,12 +87,12 @@ static int __init allocate_cachealigned_memnodemap(void) unsigned long pad, pad_addr; memnodemap = memnode.embedded_map; - if (memnodemapsize <= 48) + if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map)) return 0; pad = L1_CACHE_BYTES - 1; pad_addr = 0x8000; - nodemap_size = pad + memnodemapsize; + nodemap_size = pad + sizeof(s16) * memnodemapsize; nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT, nodemap_size); if (nodemap_addr == -1UL) { @@ -94,6 +103,7 @@ static int __init allocate_cachealigned_memnodemap(void) } pad_addr = (nodemap_addr + pad) & ~pad; memnodemap = phys_to_virt(pad_addr); + reserve_early(nodemap_addr, nodemap_addr + nodemap_size); printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n", nodemap_addr, nodemap_addr + nodemap_size); @@ -104,8 +114,8 @@ static int __init allocate_cachealigned_memnodemap(void) * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. */ -static int __init -extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +static int __init extract_lsb_from_nodes(const struct bootnode *nodes, + int numnodes) { int i, nodes_used = 0; unsigned long start, end; @@ -140,51 +150,50 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes) shift); if (populate_memnodemap(nodes, numnodes, shift) != 1) { - printk(KERN_INFO - "Your memory is not aligned you need to rebuild your kernel " - "with a bigger NODEMAPSIZE shift=%d\n", - shift); + printk(KERN_INFO "Your memory is not aligned you need to " + "rebuild your kernel with a bigger NODEMAPSIZE " + "shift=%d\n", shift); return -1; } return shift; } -#ifdef CONFIG_SPARSEMEM int early_pfn_to_nid(unsigned long pfn) { return phys_to_nid(pfn << PAGE_SHIFT); } -#endif -static void * __init -early_node_mem(int nodeid, unsigned long start, unsigned long end, - unsigned long size) +static void * __init early_node_mem(int nodeid, unsigned long start, + unsigned long end, unsigned long size) { unsigned long mem = find_e820_area(start, end, size); void *ptr; + if (mem != -1L) return __va(mem); ptr = __alloc_bootmem_nopanic(size, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS)); if (ptr == NULL) { printk(KERN_ERR "Cannot find %lu bytes in node %d\n", - size, nodeid); + size, nodeid); return NULL; } return ptr; } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) -{ - unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start; - unsigned long nodedata_phys; +void __init setup_node_bootmem(int nodeid, unsigned long start, + unsigned long end) +{ + unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size; + unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); - start = round_up(start, ZONE_ALIGN); + start = round_up(start, ZONE_ALIGN); - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); + printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, + start, end); start_pfn = start >> PAGE_SHIFT; end_pfn = end >> PAGE_SHIFT; @@ -200,75 +209,55 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; /* Find a place for the bootmem map */ - bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); + bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<<PAGE_SHIFT); if (bootmap == NULL) { if (nodedata_phys < start || nodedata_phys >= end) - free_bootmem((unsigned long)node_data[nodeid],pgdat_size); + free_bootmem((unsigned long)node_data[nodeid], + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); - + Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); + bootmap_size = init_bootmem_node(NODE_DATA(nodeid), - bootmap_start >> PAGE_SHIFT, - start_pfn, end_pfn); + bootmap_start >> PAGE_SHIFT, + start_pfn, end_pfn); free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT); + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, + bootmap_pages<<PAGE_SHIFT); #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif node_set_online(nodeid); -} - -/* Initialize final allocator for a zone */ -void __init setup_node_zones(int nodeid) -{ - unsigned long start_pfn, end_pfn, memmapsize, limit; - - start_pfn = node_start_pfn(nodeid); - end_pfn = node_end_pfn(nodeid); - - Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n", - nodeid, start_pfn, end_pfn); - - /* Try to allocate mem_map at end to not fill up precious <4GB - memory. */ - memmapsize = sizeof(struct page) * (end_pfn-start_pfn); - limit = end_pfn << PAGE_SHIFT; -#ifdef CONFIG_FLAT_NODE_MEM_MAP - NODE_DATA(nodeid)->node_mem_map = - __alloc_bootmem_core(NODE_DATA(nodeid)->bdata, - memmapsize, SMP_CACHE_BYTES, - round_down(limit - memmapsize, PAGE_SIZE), - limit); -#endif -} +} +/* + * There are unfortunately some poorly designed mainboards around that + * only connect memory to a single CPU. This breaks the 1:1 cpu->node + * mapping. To avoid this fill in the mapping for all possible CPUs, + * as the number of CPUs is not known yet. We round robin the existing + * nodes. + */ void __init numa_init_array(void) { int rr, i; - /* There are unfortunately some poorly designed mainboards around - that only connect memory to a single CPU. This breaks the 1:1 cpu->node - mapping. To avoid this fill in the mapping for all possible - CPUs, as the number of CPUs is not known yet. - We round robin the existing nodes. */ + rr = first_node(node_online_map); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node(i) != NUMA_NO_NODE) + if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; - numa_set_node(i, rr); + numa_set_node(i, rr); rr = next_node(rr, node_online_map); if (rr == MAX_NUMNODES) rr = first_node(node_online_map); } - } #ifdef CONFIG_NUMA_EMU @@ -276,15 +265,17 @@ void __init numa_init_array(void) char *cmdline __initdata; /* - * Setups up nid to range from addr to addr + size. If the end boundary is - * greater than max_addr, then max_addr is used instead. The return value is 0 - * if there is additional memory left for allocation past addr and -1 otherwise. - * addr is adjusted to be at the end of the node. + * Setups up nid to range from addr to addr + size. If the end + * boundary is greater than max_addr, then max_addr is used instead. + * The return value is 0 if there is additional memory left for + * allocation past addr and -1 otherwise. addr is adjusted to be at + * the end of the node. */ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, u64 size, u64 max_addr) { int ret = 0; + nodes[nid].start = *addr; *addr += size; if (*addr >= max_addr) { @@ -335,6 +326,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, for (i = node_start; i < num_nodes + node_start; i++) { u64 end = *addr + size; + if (i < big) end += FAKE_NODE_MIN_SIZE; /* @@ -380,14 +372,9 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { struct bootnode nodes[MAX_NUMNODES]; - u64 addr = start_pfn << PAGE_SHIFT; + u64 size, addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - int num_nodes = 0; - int coeff_flag; - int coeff = -1; - int num = 0; - u64 size; - int i; + int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; memset(&nodes, 0, sizeof(nodes)); /* @@ -395,8 +382,9 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) * system RAM into N fake nodes. */ if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, - simple_strtol(cmdline, NULL, 0)); + long n = simple_strtol(cmdline, NULL, 0); + + num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); if (num_nodes < 0) return num_nodes; goto out; @@ -483,46 +471,47 @@ out: for_each_node_mask(i, node_possible_map) { e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, nodes[i].end >> PAGE_SHIFT); - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + setup_node_bootmem(i, nodes[i].start, nodes[i].end); } acpi_fake_nodes(nodes, num_nodes); - numa_init_array(); - return 0; + numa_init_array(); + return 0; } #endif /* CONFIG_NUMA_EMU */ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) -{ +{ int i; nodes_clear(node_possible_map); #ifdef CONFIG_NUMA_EMU if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_ACPI_NUMA if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT)) - return; + return; nodes_clear(node_possible_map); #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT)) + if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, + end_pfn<<PAGE_SHIFT)) return; nodes_clear(node_possible_map); #endif printk(KERN_INFO "%s\n", numa_off ? "NUMA turned off" : "No NUMA configuration found"); - printk(KERN_INFO "Faking a node at %016lx-%016lx\n", + printk(KERN_INFO "Faking a node at %016lx-%016lx\n", start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT); - /* setup dummy node covering all memory */ - memnode_shift = 63; + end_pfn << PAGE_SHIFT); + /* setup dummy node covering all memory */ + memnode_shift = 63; memnodemap = memnode.embedded_map; memnodemap[0] = 0; nodes_clear(node_online_map); @@ -530,36 +519,48 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) node_set(0, node_possible_map); for (i = 0; i < NR_CPUS; i++) numa_set_node(i, 0); - node_to_cpumask[0] = cpumask_of_cpu(0); + /* cpumask_of_cpu() may not be available during early startup */ + memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0])); + cpu_set(0, node_to_cpumask_map[0]); e820_register_active_regions(0, start_pfn, end_pfn); setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); } __cpuinit void numa_add_cpu(int cpu) { - set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]); -} + set_bit(cpu, + (unsigned long *)&node_to_cpumask_map[early_cpu_to_node(cpu)]); +} void __cpuinit numa_set_node(int cpu, int node) { + int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr; + cpu_pda(cpu)->nodenumber = node; - cpu_to_node(cpu) = node; + + if(cpu_to_node_map) + cpu_to_node_map[cpu] = node; + else if(per_cpu_offset(cpu)) + per_cpu(x86_cpu_to_node_map, cpu) = node; + else + Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); } -unsigned long __init numa_free_all_bootmem(void) -{ - int i; +unsigned long __init numa_free_all_bootmem(void) +{ unsigned long pages = 0; - for_each_online_node(i) { + int i; + + for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - } + return pages; -} +} void __init paging_init(void) -{ - int i; +{ unsigned long max_zone_pfns[MAX_NR_ZONES]; + memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; @@ -568,32 +569,27 @@ void __init paging_init(void) sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); - for_each_online_node(i) { - setup_node_zones(i); - } - free_area_init_nodes(max_zone_pfns); -} +} static __init int numa_setup(char *opt) -{ +{ if (!opt) return -EINVAL; - if (!strncmp(opt,"off",3)) + if (!strncmp(opt, "off", 3)) numa_off = 1; #ifdef CONFIG_NUMA_EMU if (!strncmp(opt, "fake=", 5)) cmdline = opt + 5; #endif #ifdef CONFIG_ACPI_NUMA - if (!strncmp(opt,"noacpi",6)) - acpi_numa = -1; - if (!strncmp(opt,"hotadd=", 7)) + if (!strncmp(opt, "noacpi", 6)) + acpi_numa = -1; + if (!strncmp(opt, "hotadd=", 7)) hotadd_percent = simple_strtoul(opt+7, NULL, 10); #endif return 0; -} - +} early_param("numa", numa_setup); /* @@ -611,38 +607,16 @@ early_param("numa", numa_setup); void __init init_cpu_to_node(void) { int i; - for (i = 0; i < NR_CPUS; i++) { - u8 apicid = x86_cpu_to_apicid_init[i]; + + for (i = 0; i < NR_CPUS; i++) { + u16 apicid = x86_cpu_to_apicid_init[i]; + if (apicid == BAD_APICID) continue; if (apicid_to_node[apicid] == NUMA_NO_NODE) continue; - numa_set_node(i,apicid_to_node[apicid]); + numa_set_node(i, apicid_to_node[apicid]); } } -EXPORT_SYMBOL(cpu_to_node); -EXPORT_SYMBOL(node_to_cpumask); -EXPORT_SYMBOL(memnode); -EXPORT_SYMBOL(node_data); - -#ifdef CONFIG_DISCONTIGMEM -/* - * Functions to convert PFNs from/to per node page addresses. - * These are out of line because they are quite big. - * They could be all tuned by pre caching more state. - * Should do that. - */ -int pfn_valid(unsigned long pfn) -{ - unsigned nid; - if (pfn >= num_physpages) - return 0; - nid = pfn_to_nid(pfn); - if (nid == 0xff) - return 0; - return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid); -} -EXPORT_SYMBOL(pfn_valid); -#endif diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c new file mode 100644 index 00000000000..06353d43f72 --- /dev/null +++ b/arch/x86/mm/pageattr-test.c @@ -0,0 +1,224 @@ +/* + * self test for change_page_attr. + * + * Clears the global bit on random pages in the direct mapping, then reverts + * and compares page tables forwards and afterwards. + */ +#include <linux/bootmem.h> +#include <linux/random.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/mm.h> + +#include <asm/cacheflush.h> +#include <asm/pgtable.h> +#include <asm/kdebug.h> + +enum { + NTEST = 4000, +#ifdef CONFIG_X86_64 + LPS = (1 << PMD_SHIFT), +#elif defined(CONFIG_X86_PAE) + LPS = (1 << PMD_SHIFT), +#else + LPS = (1 << 22), +#endif + GPS = (1<<30) +}; + +struct split_state { + long lpg, gpg, spg, exec; + long min_exec, max_exec; +}; + +static __init int print_split(struct split_state *s) +{ + long i, expected, missed = 0; + int printed = 0; + int err = 0; + + s->lpg = s->gpg = s->spg = s->exec = 0; + s->min_exec = ~0UL; + s->max_exec = 0; + for (i = 0; i < max_pfn_mapped; ) { + unsigned long addr = (unsigned long)__va(i << PAGE_SHIFT); + int level; + pte_t *pte; + + pte = lookup_address(addr, &level); + if (!pte) { + if (!printed) { + dump_pagetable(addr); + printk(KERN_INFO "CPA %lx no pte level %d\n", + addr, level); + printed = 1; + } + missed++; + i++; + continue; + } + + if (level == PG_LEVEL_1G && sizeof(long) == 8) { + s->gpg++; + i += GPS/PAGE_SIZE; + } else if (level == PG_LEVEL_2M) { + if (!(pte_val(*pte) & _PAGE_PSE)) { + printk(KERN_ERR + "%lx level %d but not PSE %Lx\n", + addr, level, (u64)pte_val(*pte)); + err = 1; + } + s->lpg++; + i += LPS/PAGE_SIZE; + } else { + s->spg++; + i++; + } + if (!(pte_val(*pte) & _PAGE_NX)) { + s->exec++; + if (addr < s->min_exec) + s->min_exec = addr; + if (addr > s->max_exec) + s->max_exec = addr; + } + } + printk(KERN_INFO + "CPA mapping 4k %lu large %lu gb %lu x %lu[%lx-%lx] miss %lu\n", + s->spg, s->lpg, s->gpg, s->exec, + s->min_exec != ~0UL ? s->min_exec : 0, s->max_exec, missed); + + expected = (s->gpg*GPS + s->lpg*LPS)/PAGE_SIZE + s->spg + missed; + if (expected != i) { + printk(KERN_ERR "CPA max_pfn_mapped %lu but expected %lu\n", + max_pfn_mapped, expected); + return 1; + } + return err; +} + +static unsigned long __initdata addr[NTEST]; +static unsigned int __initdata len[NTEST]; + +/* Change the global bit on random pages in the direct mapping */ +static __init int exercise_pageattr(void) +{ + struct split_state sa, sb, sc; + unsigned long *bm; + pte_t *pte, pte0; + int failed = 0; + int level; + int i, k; + int err; + + printk(KERN_INFO "CPA exercising pageattr\n"); + + bm = vmalloc((max_pfn_mapped + 7) / 8); + if (!bm) { + printk(KERN_ERR "CPA Cannot vmalloc bitmap\n"); + return -ENOMEM; + } + memset(bm, 0, (max_pfn_mapped + 7) / 8); + + failed += print_split(&sa); + srandom32(100); + + for (i = 0; i < NTEST; i++) { + unsigned long pfn = random32() % max_pfn_mapped; + + addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); + len[i] = random32() % 100; + len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); + + if (len[i] == 0) + len[i] = 1; + + pte = NULL; + pte0 = pfn_pte(0, __pgprot(0)); /* shut gcc up */ + + for (k = 0; k < len[i]; k++) { + pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { + addr[i] = 0; + break; + } + if (k == 0) { + pte0 = *pte; + } else { + if (pgprot_val(pte_pgprot(*pte)) != + pgprot_val(pte_pgprot(pte0))) { + len[i] = k; + break; + } + } + if (test_bit(pfn + k, bm)) { + len[i] = k; + break; + } + __set_bit(pfn + k, bm); + } + if (!addr[i] || !pte || !k) { + addr[i] = 0; + continue; + } + + err = change_page_attr_clear(addr[i], len[i], + __pgprot(_PAGE_GLOBAL)); + if (err < 0) { + printk(KERN_ERR "CPA %d failed %d\n", i, err); + failed++; + } + + pte = lookup_address(addr[i], &level); + if (!pte || pte_global(*pte) || pte_huge(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte %Lx\n", addr[i], + pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + if (level != PG_LEVEL_4K) { + printk(KERN_ERR "CPA %lx: unexpected level %d\n", + addr[i], level); + failed++; + } + + } + vfree(bm); + + failed += print_split(&sb); + + printk(KERN_INFO "CPA reverting everything\n"); + for (i = 0; i < NTEST; i++) { + if (!addr[i]) + continue; + pte = lookup_address(addr[i], &level); + if (!pte) { + printk(KERN_ERR "CPA lookup of %lx failed\n", addr[i]); + failed++; + continue; + } + err = change_page_attr_set(addr[i], len[i], + __pgprot(_PAGE_GLOBAL)); + if (err < 0) { + printk(KERN_ERR "CPA reverting failed: %d\n", err); + failed++; + } + pte = lookup_address(addr[i], &level); + if (!pte || !pte_global(*pte)) { + printk(KERN_ERR "CPA %lx: bad pte after revert %Lx\n", + addr[i], pte ? (u64)pte_val(*pte) : 0ULL); + failed++; + } + + } + + failed += print_split(&sc); + + if (failed) { + printk(KERN_ERR "CPA selftests NOT PASSED. Please report.\n"); + WARN_ON(1); + } else { + printk(KERN_INFO "CPA selftests PASSED\n"); + } + + return 0; +} +module_init(exercise_pageattr); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c new file mode 100644 index 00000000000..1cc6607eacb --- /dev/null +++ b/arch/x86/mm/pageattr.c @@ -0,0 +1,564 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ +#include <linux/highmem.h> +#include <linux/bootmem.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/mm.h> + +#include <asm/e820.h> +#include <asm/processor.h> +#include <asm/tlbflush.h> +#include <asm/sections.h> +#include <asm/uaccess.h> +#include <asm/pgalloc.h> + +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + +/* + * Flushing functions + */ + +/** + * clflush_cache_range - flush a cache range with clflush + * @addr: virtual start address + * @size: number of bytes to flush + * + * clflush is an unordered instruction which needs fencing with mfence + * to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + void *vend = vaddr + size - 1; + + mb(); + + for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) + clflush(vaddr); + /* + * Flush any possible final partial cacheline: + */ + clflush(vend); + + mb(); +} + +static void __cpa_flush_all(void *arg) +{ + /* + * Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); + + if (boot_cpu_data.x86_model >= 4) + wbinvd(); +} + +static void cpa_flush_all(void) +{ + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_all, NULL, 1, 1); +} + +static void __cpa_flush_range(void *arg) +{ + /* + * We could optimize that further and do individual per page + * tlb invalidates for a low number of pages. Caveat: we must + * flush the high aliases on 64bit as well. + */ + __flush_tlb_all(); +} + +static void cpa_flush_range(unsigned long start, int numpages) +{ + unsigned int i, level; + unsigned long addr; + + BUG_ON(irqs_disabled()); + WARN_ON(PAGE_ALIGN(start) != start); + + on_each_cpu(__cpa_flush_range, NULL, 1, 1); + + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { + pte_t *pte = lookup_address(addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && pte_present(*pte)) + clflush_cache_range((void *) addr, PAGE_SIZE); + } +} + +/* + * Certain areas of memory on x86 require very specific protection flags, + * for example the BIOS area or kernel text. Callers don't always get this + * right (again, ioremap() on BIOS memory is not uncommon) so this function + * checks and fixes these known static required protection bits. + */ +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) +{ + pgprot_t forbidden = __pgprot(0); + + /* + * The BIOS area between 640k and 1Mb needs to be executable for + * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. + */ + if (within(__pa(address), BIOS_BEGIN, BIOS_END)) + pgprot_val(forbidden) |= _PAGE_NX; + + /* + * The kernel text needs to be executable for obvious reasons + * Does not cover __inittext since that is gone later on + */ + if (within(address, (unsigned long)_text, (unsigned long)_etext)) + pgprot_val(forbidden) |= _PAGE_NX; + +#ifdef CONFIG_DEBUG_RODATA + /* The .rodata section needs to be read-only */ + if (within(address, (unsigned long)__start_rodata, + (unsigned long)__end_rodata)) + pgprot_val(forbidden) |= _PAGE_RW; +#endif + + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); + + return prot; +} + +pte_t *lookup_address(unsigned long address, int *level) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud; + pmd_t *pmd; + + *level = PG_LEVEL_NONE; + + if (pgd_none(*pgd)) + return NULL; + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return NULL; + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return NULL; + + *level = PG_LEVEL_2M; + if (pmd_large(*pmd)) + return (pte_t *)pmd; + + *level = PG_LEVEL_4K; + return pte_offset_kernel(pmd, address); +} + +static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + /* change init_mm */ + set_pte_atomic(kpte, pte); +#ifdef CONFIG_X86_32 + if (!SHARED_KERNEL_PMD) { + struct page *page; + + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + pgd = (pgd_t *)page_address(page) + pgd_index(address); + pud = pud_offset(pgd, address); + pmd = pmd_offset(pud, address); + set_pte_atomic((pte_t *)pmd, pte); + } + } +#endif +} + +static int split_large_page(pte_t *kpte, unsigned long address) +{ + pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + gfp_t gfp_flags = GFP_KERNEL; + unsigned long flags; + unsigned long addr; + pte_t *pbase, *tmp; + struct page *base; + unsigned int i, level; + +#ifdef CONFIG_DEBUG_PAGEALLOC + gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN; + gfp_flags = GFP_ATOMIC | __GFP_NOWARN; +#endif + base = alloc_pages(gfp_flags, 0); + if (!base) + return -ENOMEM; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up for us already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) { + WARN_ON_ONCE(1); + goto out_unlock; + } + + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); +#ifdef CONFIG_X86_32 + paravirt_alloc_pt(&init_mm, page_to_pfn(base)); +#endif + + pgprot_val(ref_prot) &= ~_PAGE_NX; + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) + set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); + + /* + * Install the new, split up pagetable. Important detail here: + * + * On Intel the NX bit of all levels must be cleared to make a + * page executable. See section 4.13.2 of Intel 64 and IA-32 + * Architectures Software Developer's Manual). + */ + ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); + base = NULL; + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + + if (base) + __free_pages(base, 0); + + return 0; +} + +static int +__change_page_attr(unsigned long address, unsigned long pfn, + pgprot_t mask_set, pgprot_t mask_clr) +{ + struct page *kpte_page; + int level, err = 0; + pte_t *kpte; + +#ifdef CONFIG_X86_32 + BUG_ON(pfn > max_low_pfn); +#endif + +repeat: + kpte = lookup_address(address, &level); + if (!kpte) + return -EINVAL; + + kpte_page = virt_to_page(kpte); + BUG_ON(PageLRU(kpte_page)); + BUG_ON(PageCompound(kpte_page)); + + if (level == PG_LEVEL_4K) { + pgprot_t new_prot = pte_pgprot(*kpte); + pte_t new_pte, old_pte = *kpte; + + pgprot_val(new_prot) &= ~pgprot_val(mask_clr); + pgprot_val(new_prot) |= pgprot_val(mask_set); + + new_prot = static_protections(new_prot, address); + + new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); + + set_pte_atomic(kpte, new_pte); + } else { + err = split_large_page(kpte, address); + if (!err) + goto repeat; + } + return err; +} + +/** + * change_page_attr_addr - Change page table attributes in linear mapping + * @address: Virtual address in linear mapping. + * @prot: New page table attribute (PAGE_*) + * + * Change page attributes of a page in the direct mapping. This is a variant + * of change_page_attr() that also works on memory holes that do not have + * mem_map entry (pfn_valid() is false). + * + * See change_page_attr() documentation for more details. + * + * Modules and drivers should use the set_memory_* APIs instead. + */ + +#define HIGH_MAP_START __START_KERNEL_map +#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) + +static int +change_page_attr_addr(unsigned long address, pgprot_t mask_set, + pgprot_t mask_clr) +{ + unsigned long phys_addr = __pa(address); + unsigned long pfn = phys_addr >> PAGE_SHIFT; + int err; + +#ifdef CONFIG_X86_64 + /* + * If we are inside the high mapped kernel range, then we + * fixup the low mapping first. __va() returns the virtual + * address in the linear mapping: + */ + if (within(address, HIGH_MAP_START, HIGH_MAP_END)) + address = (unsigned long) __va(phys_addr); +#endif + + err = __change_page_attr(address, pfn, mask_set, mask_clr); + if (err) + return err; + +#ifdef CONFIG_X86_64 + /* + * If the physical address is inside the kernel map, we need + * to touch the high mapped kernel as well: + */ + if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) { + /* + * Calc the high mapping address. See __phys_addr() + * for the non obvious details. + */ + address = phys_addr + HIGH_MAP_START - phys_base; + /* Make sure the kernel mappings stay executable */ + pgprot_val(mask_clr) |= _PAGE_NX; + + /* + * Our high aliases are imprecise, because we check + * everything between 0 and KERNEL_TEXT_SIZE, so do + * not propagate lookup failures back to users: + */ + __change_page_attr(address, pfn, mask_set, mask_clr); + } +#endif + return err; +} + +static int __change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + unsigned int i; + int ret; + + for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { + ret = change_page_attr_addr(addr, mask_set, mask_clr); + if (ret) + return ret; + } + + return 0; +} + +static int change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr) +{ + int ret = __change_page_attr_set_clr(addr, numpages, mask_set, + mask_clr); + + /* + * On success we use clflush, when the CPU supports it to + * avoid the wbindv. If the CPU does not support it and in the + * error case we fall back to cpa_flush_all (which uses + * wbindv): + */ + if (!ret && cpu_has_clflush) + cpa_flush_range(addr, numpages); + else + cpa_flush_all(); + + return ret; +} + +static inline int change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_uc); + +int set_memory_wb(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PCD | _PAGE_PWT)); +} +EXPORT_SYMBOL(set_memory_wb); + +int set_memory_x(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_x); + +int set_memory_nx(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); +} + +int set_memory_np(unsigned long addr, int numpages) +{ + return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); +} + +int set_pages_uc(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc); + +int set_pages_wb(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_wb(addr, numpages); +} +EXPORT_SYMBOL(set_pages_wb); + +int set_pages_x(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_x(addr, numpages); +} +EXPORT_SYMBOL(set_pages_x); + +int set_pages_nx(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_nx(addr, numpages); +} +EXPORT_SYMBOL(set_pages_nx); + +int set_pages_ro(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_ro(addr, numpages); +} + +int set_pages_rw(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_rw(addr, numpages); +} + + +#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG) +static inline int __change_page_attr_set(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); +} + +static inline int __change_page_attr_clear(unsigned long addr, int numpages, + pgprot_t mask) +{ + return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); +} +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC + +static int __set_pages_p(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return __change_page_attr_set(addr, numpages, + __pgprot(_PAGE_PRESENT | _PAGE_RW)); +} + +static int __set_pages_np(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return __change_page_attr_clear(addr, numpages, + __pgprot(_PAGE_PRESENT)); +} + +void kernel_map_pages(struct page *page, int numpages, int enable) +{ + if (PageHighMem(page)) + return; + if (!enable) { + debug_check_no_locks_freed(page_address(page), + numpages * PAGE_SIZE); + } + + /* + * If page allocator is not up yet then do not call c_p_a(): + */ + if (!debug_pagealloc_enabled) + return; + + /* + * The return value is ignored - the calls cannot fail, + * large pages are disabled at boot time: + */ + if (enable) + __set_pages_p(page, numpages); + else + __set_pages_np(page, numpages); + + /* + * We should perform an IPI and flush all tlbs, + * but that can deadlock->flush only current cpu: + */ + __flush_tlb_all(); +} +#endif + +/* + * The testcases use internal knowledge of the implementation that shouldn't + * be exposed to the rest of the kernel. Include these directly here. + */ +#ifdef CONFIG_CPA_DEBUG +#include "pageattr-test.c" +#endif diff --git a/arch/x86/mm/pageattr_32.c b/arch/x86/mm/pageattr_32.c deleted file mode 100644 index 260073c0760..00000000000 --- a/arch/x86/mm/pageattr_32.c +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/pgalloc.h> -#include <asm/sections.h> - -static DEFINE_SPINLOCK(cpa_lock); -static struct list_head df_list = LIST_HEAD_INIT(df_list); - - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - return pte_offset_kernel(pmd, address); -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base; - pte_t *pbase; - - spin_unlock_irq(&cpa_lock); - base = alloc_pages(GFP_KERNEL, 0); - spin_lock_irq(&cpa_lock); - if (!base) - return NULL; - - /* - * page_private is used to track the number of entries in - * the page table page that have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot)); - } - return base; -} - -static void cache_flush_page(struct page *p) -{ - void *adr = page_address(p); - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *lh = (struct list_head *)arg; - struct page *p; - - /* High level code is not ready for clflush yet */ - if (0 && cpu_has_clflush) { - list_for_each_entry (p, lh, lru) - cache_flush_page(p); - } else if (boot_cpu_data.x86_model >= 4) - wbinvd(); - - /* Flush all to work around Errata in early athlons regarding - * large page flushing. - */ - __flush_tlb_all(); -} - -static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) -{ - struct page *page; - unsigned long flags; - - set_pte_atomic(kpte, pte); /* change init_mm */ - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - for (page = pgd_list; page; page = (struct page *)page->index) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); - pmd = pmd_offset(pud, address); - set_pte_atomic((pte_t *)pmd, pte); - } - spin_unlock_irqrestore(&pgd_lock, flags); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static inline void revert_page(struct page *kpte_page, unsigned long address) -{ - pgprot_t ref_prot; - pte_t *linear; - - ref_prot = - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) - ? PAGE_KERNEL_LARGE_EXEC : PAGE_KERNEL_LARGE; - - linear = (pte_t *) - pmd_offset(pud_offset(pgd_offset_k(address), address), address); - set_pmd_pte(linear, address, - pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, - ref_prot)); -} - -static inline void save_page(struct page *kpte_page) -{ - if (!test_and_set_bit(PG_arch_1, &kpte_page->flags)) - list_add(&kpte_page->lru, &df_list); -} - -static int -__change_page_attr(struct page *page, pgprot_t prot) -{ - pte_t *kpte; - unsigned long address; - struct page *kpte_page; - - BUG_ON(PageHighMem(page)); - address = (unsigned long)page_address(page); - - kpte = lookup_address(address); - if (!kpte) - return -EINVAL; - kpte_page = virt_to_page(kpte); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - - if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { - if (!pte_huge(*kpte)) { - set_pte_atomic(kpte, mk_pte(page, prot)); - } else { - pgprot_t ref_prot; - struct page *split; - - ref_prot = - ((address & LARGE_PAGE_MASK) < (unsigned long)&_etext) - ? PAGE_KERNEL_EXEC : PAGE_KERNEL; - split = split_large_page(address, prot, ref_prot); - if (!split) - return -ENOMEM; - set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* - * If the pte was reserved, it means it was created at boot - * time (not via split_large_page) and in turn we must not - * replace it with a largepage. - */ - - save_page(kpte_page); - if (!PageReserved(kpte_page)) { - if (cpu_has_pse && (page_private(kpte_page) == 0)) { - paravirt_release_pt(page_to_pfn(kpte_page)); - revert_page(kpte_page, address); - } - } - return 0; -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - int err = 0; - int i; - unsigned long flags; - - spin_lock_irqsave(&cpa_lock, flags); - for (i = 0; i < numpages; i++, page++) { - err = __change_page_attr(page, prot); - if (err) - break; - } - spin_unlock_irqrestore(&cpa_lock, flags); - return err; -} - -void global_flush_tlb(void) -{ - struct list_head l; - struct page *pg, *next; - - BUG_ON(irqs_disabled()); - - spin_lock_irq(&cpa_lock); - list_replace_init(&df_list, &l); - spin_unlock_irq(&cpa_lock); - flush_map(&l); - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (PageReserved(pg) || !cpu_has_pse || page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -#ifdef CONFIG_DEBUG_PAGEALLOC -void kernel_map_pages(struct page *page, int numpages, int enable) -{ - if (PageHighMem(page)) - return; - if (!enable) - debug_check_no_locks_freed(page_address(page), - numpages * PAGE_SIZE); - - /* the return value is ignored - the calls cannot fail, - * large pages are disabled at boot time. - */ - change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); - /* we should perform an IPI and flush all tlbs, - * but that can deadlock->flush only current cpu. - */ - __flush_tlb_all(); -} -#endif - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/pageattr_64.c b/arch/x86/mm/pageattr_64.c deleted file mode 100644 index c40afbaaf93..00000000000 --- a/arch/x86/mm/pageattr_64.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * Copyright 2002 Andi Kleen, SuSE Labs. - * Thanks to Ben LaHaise for precious feedback. - */ - -#include <linux/mm.h> -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <asm/uaccess.h> -#include <asm/processor.h> -#include <asm/tlbflush.h> -#include <asm/io.h> - -pte_t *lookup_address(unsigned long address) -{ - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - if (pgd_none(*pgd)) - return NULL; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; - if (pmd_large(*pmd)) - return (pte_t *)pmd; - pte = pte_offset_kernel(pmd, address); - if (pte && !pte_present(*pte)) - pte = NULL; - return pte; -} - -static struct page *split_large_page(unsigned long address, pgprot_t prot, - pgprot_t ref_prot) -{ - int i; - unsigned long addr; - struct page *base = alloc_pages(GFP_KERNEL, 0); - pte_t *pbase; - if (!base) - return NULL; - /* - * page_private is used to track the number of entries in - * the page table page have non standard attributes. - */ - SetPagePrivate(base); - page_private(base) = 0; - - address = __pa(address); - addr = address & LARGE_PAGE_MASK; - pbase = (pte_t *)page_address(base); - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { - pbase[i] = pfn_pte(addr >> PAGE_SHIFT, - addr == address ? prot : ref_prot); - } - return base; -} - -void clflush_cache_range(void *adr, int size) -{ - int i; - for (i = 0; i < size; i += boot_cpu_data.x86_clflush_size) - clflush(adr+i); -} - -static void flush_kernel_map(void *arg) -{ - struct list_head *l = (struct list_head *)arg; - struct page *pg; - - /* When clflush is available always use it because it is - much cheaper than WBINVD. */ - /* clflush is still broken. Disable for now. */ - if (1 || !cpu_has_clflush) - asm volatile("wbinvd" ::: "memory"); - else list_for_each_entry(pg, l, lru) { - void *adr = page_address(pg); - clflush_cache_range(adr, PAGE_SIZE); - } - __flush_tlb_all(); -} - -static inline void flush_map(struct list_head *l) -{ - on_each_cpu(flush_kernel_map, l, 1, 1); -} - -static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ - -static inline void save_page(struct page *fpage) -{ - if (!test_and_set_bit(PG_arch_1, &fpage->flags)) - list_add(&fpage->lru, &deferred_pages); -} - -/* - * No more special protections in this 2/4MB area - revert to a - * large page again. - */ -static void revert_page(unsigned long address, pgprot_t ref_prot) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t large_pte; - unsigned long pfn; - - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd,address); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, address); - BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; - large_pte = pfn_pte(pfn, ref_prot); - large_pte = pte_mkhuge(large_pte); - set_pte((pte_t *)pmd, large_pte); -} - -static int -__change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, - pgprot_t ref_prot) -{ - pte_t *kpte; - struct page *kpte_page; - pgprot_t ref_prot2; - - kpte = lookup_address(address); - if (!kpte) return 0; - kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); - if (pgprot_val(prot) != pgprot_val(ref_prot)) { - if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, prot)); - } else { - /* - * split_large_page will take the reference for this - * change_page_attr on the split page. - */ - struct page *split; - ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); - if (!split) - return -ENOMEM; - pgprot_val(ref_prot2) &= ~_PAGE_NX; - set_pte(kpte, mk_pte(split, ref_prot2)); - kpte_page = split; - } - page_private(kpte_page)++; - } else if (!pte_huge(*kpte)) { - set_pte(kpte, pfn_pte(pfn, ref_prot)); - BUG_ON(page_private(kpte_page) == 0); - page_private(kpte_page)--; - } else - BUG(); - - /* on x86-64 the direct mapping set at boot is not using 4k pages */ - BUG_ON(PageReserved(kpte_page)); - - save_page(kpte_page); - if (page_private(kpte_page) == 0) - revert_page(address, ref_prot); - return 0; -} - -/* - * Change the page attributes of an page in the linear mapping. - * - * This should be used when a page is mapped with a different caching policy - * than write-back somewhere - some CPUs do not like it when mappings with - * different caching policies exist. This changes the page attributes of the - * in kernel linear mapping too. - * - * The caller needs to ensure that there are no conflicting mappings elsewhere. - * This function only deals with the kernel linear map. - * - * Caller must call global_flush_tlb() after this. - */ -int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) -{ - int err = 0, kernel_map = 0; - int i; - - if (address >= __START_KERNEL_map - && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { - address = (unsigned long)__va(__pa(address)); - kernel_map = 1; - } - - down_write(&init_mm.mmap_sem); - for (i = 0; i < numpages; i++, address += PAGE_SIZE) { - unsigned long pfn = __pa(address) >> PAGE_SHIFT; - - if (!kernel_map || pte_present(pfn_pte(0, prot))) { - err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); - if (err) - break; - } - /* Handle kernel mapping too which aliases part of the - * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { - unsigned long addr2; - pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); - /* Make sure the kernel mappings stay executable */ - prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); - err = __change_page_attr(addr2, pfn, prot2, - PAGE_KERNEL_EXEC); - } - } - up_write(&init_mm.mmap_sem); - return err; -} - -/* Don't call this for MMIO areas that may not have a mem_map entry */ -int change_page_attr(struct page *page, int numpages, pgprot_t prot) -{ - unsigned long addr = (unsigned long)page_address(page); - return change_page_attr_addr(addr, numpages, prot); -} - -void global_flush_tlb(void) -{ - struct page *pg, *next; - struct list_head l; - - /* - * Write-protect the semaphore, to exclude two contexts - * doing a list_replace_init() call in parallel and to - * exclude new additions to the deferred_pages list: - */ - down_write(&init_mm.mmap_sem); - list_replace_init(&deferred_pages, &l); - up_write(&init_mm.mmap_sem); - - flush_map(&l); - - list_for_each_entry_safe(pg, next, &l, lru) { - list_del(&pg->lru); - clear_bit(PG_arch_1, &pg->flags); - if (page_private(pg) != 0) - continue; - ClearPagePrivate(pg); - __free_page(pg); - } -} - -EXPORT_SYMBOL(change_page_attr); -EXPORT_SYMBOL(global_flush_tlb); diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index be61a1d845a..2ae5999a795 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -195,11 +195,6 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void pmd_ctor(struct kmem_cache *cache, void *pmd) -{ - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); -} - /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the @@ -210,27 +205,18 @@ void pmd_ctor(struct kmem_cache *cache, void *pmd) * vmalloc faults work because attached pagetables are never freed. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); -struct page *pgd_list; - static inline void pgd_list_add(pgd_t *pgd) { struct page *page = virt_to_page(pgd); - page->index = (unsigned long)pgd_list; - if (pgd_list) - set_page_private(pgd_list, (unsigned long)&page->index); - pgd_list = page; - set_page_private(page, (unsigned long)&pgd_list); + + list_add(&page->lru, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *next, **pprev, *page = virt_to_page(pgd); - next = (struct page *)page->index; - pprev = (struct page **)page_private(page); - *pprev = next; - if (next) - set_page_private(next, (unsigned long)pprev); + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); } @@ -285,7 +271,6 @@ static void pgd_dtor(void *pgd) if (SHARED_KERNEL_PMD) return; - paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); @@ -294,77 +279,96 @@ static void pgd_dtor(void *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) -/* If we allocate a pmd for part of the kernel address space, then - make sure its initialized with the appropriate kernel mappings. - Otherwise use a cached zeroed pmd. */ -static pmd_t *pmd_cache_alloc(int idx) +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(pgd_t *pgdp) { - pmd_t *pmd; + int i; - if (idx >= USER_PTRS_PER_PGD) { - pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; - if (pmd) - memcpy(pmd, - (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(pgd); + return 0; + } + + if (i >= USER_PTRS_PER_PGD) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); - } else - pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); - return pmd; + pud_populate(mm, pud, pmd); + } + + return 1; +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; } -static void pmd_cache_free(pmd_t *pmd, int idx) +static void pgd_mop_up_pmds(pgd_t *pgd) { - if (idx >= USER_PTRS_PER_PGD) - free_page((unsigned long)pmd); - else - kmem_cache_free(pmd_cache, pmd); } +#endif /* CONFIG_X86_PAE */ pgd_t *pgd_alloc(struct mm_struct *mm) { - int i; pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); - if (PTRS_PER_PMD == 1 || !pgd) - return pgd; + mm->pgd = pgd; /* so that alloc_pd can use it */ - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pmd_t *pmd = pmd_cache_alloc(i); - - if (!pmd) - goto out_oom; - - paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); - set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + quicklist_free(0, pgd_dtor, pgd); + pgd = NULL; } - return pgd; -out_oom: - for (i--; i >= 0; i--) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - quicklist_free(0, pgd_dtor, pgd); - return NULL; + return pgd; } void pgd_free(pgd_t *pgd) { - int i; - - /* in the PAE case user pgd entries are overwritten before usage */ - if (PTRS_PER_PMD > 1) - for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { - pgd_t pgdent = pgd[i]; - void* pmd = (void *)__va(pgd_val(pgdent)-1); - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - pmd_cache_free(pmd, i); - } - /* in the non-PAE case, free_pgtables() clears user pgd entries */ + pgd_mop_up_pmds(pgd); quicklist_free(0, pgd_dtor, pgd); } @@ -372,4 +376,3 @@ void check_pgt_cache(void) { quicklist_trim(0, pgd_dtor, 25, 16); } - diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index ea85172fc0c..65416f843e5 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -130,6 +130,9 @@ void __init acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) { int pxm, node; + int apic_id; + + apic_id = pa->apic_id; if (srat_disabled()) return; if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) { @@ -145,68 +148,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) bad_srat(); return; } - apicid_to_node[pa->apic_id] = node; + apicid_to_node[apic_id] = node; acpi_numa = 1; printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", - pxm, pa->apic_id, node); -} - -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -/* - * Protect against too large hotadd areas that would fill up memory. - */ -static int hotadd_enough_memory(struct bootnode *nd) -{ - static unsigned long allocated; - static unsigned long last_area_end; - unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT; - long mem = pages * sizeof(struct page); - unsigned long addr; - unsigned long allowed; - unsigned long oldpages = pages; - - if (mem < 0) - return 0; - allowed = (end_pfn - absent_pages_in_range(0, end_pfn)) * PAGE_SIZE; - allowed = (allowed / 100) * hotadd_percent; - if (allocated + mem > allowed) { - unsigned long range; - /* Give them at least part of their hotadd memory upto hotadd_percent - It would be better to spread the limit out - over multiple hotplug areas, but that is too complicated - right now */ - if (allocated >= allowed) - return 0; - range = allowed - allocated; - pages = (range / PAGE_SIZE); - mem = pages * sizeof(struct page); - nd->end = nd->start + range; - } - /* Not completely fool proof, but a good sanity check */ - addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem); - if (addr == -1UL) - return 0; - if (pages != oldpages) - printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n", - pages << PAGE_SHIFT); - last_area_end = addr + mem; - allocated += mem; - return 1; -} - -static int update_end_of_memory(unsigned long end) -{ - found_add_area = 1; - if ((end >> PAGE_SHIFT) > end_pfn) - end_pfn = end >> PAGE_SHIFT; - return 1; + pxm, apic_id, node); } -static inline int save_add_info(void) -{ - return hotadd_percent > 0; -} -#else int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE @@ -214,10 +161,9 @@ static inline int save_add_info(void) {return 1;} #else static inline int save_add_info(void) {return 0;} #endif -#endif /* * Update nodes_add and decide if to include add are in the zone. - * Both SPARSE and RESERVE need nodes_add infomation. + * Both SPARSE and RESERVE need nodes_add information. * This code supports one contiguous hot add area per node. */ static int reserve_hotadd(int node, unsigned long start, unsigned long end) @@ -377,7 +323,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void unparse_node(int node) +static void __init unparse_node(int node) { int i; node_clear(node, nodes_parsed); @@ -400,7 +346,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) /* First clean up the node list */ for (i = 0; i < MAX_NUMNODES; i++) { cutoff_node(i, start, end); - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { + /* + * don't confuse VM with a node that doesn't have the + * minimum memory. + */ + if (nodes[i].end && + (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { unparse_node(i); node_set_offline(i); } @@ -431,9 +382,11 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) setup_node_bootmem(i, nodes[i].start, nodes[i].end); for (i = 0; i < NR_CPUS; i++) { - if (cpu_to_node(i) == NUMA_NO_NODE) + int node = early_cpu_to_node(i); + + if (node == NUMA_NO_NODE) continue; - if (!node_isset(cpu_to_node(i), node_possible_map)) + if (!node_isset(node, node_possible_map)) numa_set_node(i, NUMA_NO_NODE); } numa_init_array(); @@ -441,6 +394,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) } #ifdef CONFIG_NUMA_EMU +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = { + [0 ... MAX_NUMNODES-1] = PXM_INVAL +}; +static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = { + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE +}; static int __init find_node_by_addr(unsigned long addr) { int ret = NUMA_NO_NODE; @@ -457,7 +416,7 @@ static int __init find_node_by_addr(unsigned long addr) break; } } - return i; + return ret; } /* @@ -471,12 +430,6 @@ static int __init find_node_by_addr(unsigned long addr) void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes) { int i, j; - int fake_node_to_pxm_map[MAX_NUMNODES] = { - [0 ... MAX_NUMNODES-1] = PXM_INVAL - }; - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = { - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE - }; printk(KERN_INFO "Faking PXM affinity for fake nodes on real " "topology.\n"); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 0ed046a187f..e2095cba409 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -32,7 +32,7 @@ static int backtrace_stack(void *data, char *name) return 0; } -static void backtrace_address(void *data, unsigned long addr) +static void backtrace_address(void *data, unsigned long addr, int reliable) { unsigned int *depth = data; @@ -48,7 +48,7 @@ static struct stacktrace_ops backtrace_ops = { }; struct frame_head { - struct frame_head *ebp; + struct frame_head *bp; unsigned long ret; } __attribute__((packed)); @@ -67,21 +67,21 @@ dump_user_backtrace(struct frame_head * head) /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].ebp) + if (head >= bufhead[0].bp) return NULL; - return bufhead[0].ebp; + return bufhead[0].bp; } void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { struct frame_head *head = (struct frame_head *)frame_pointer(regs); - unsigned long stack = stack_pointer(regs); + unsigned long stack = kernel_trap_sp(regs); if (!user_mode_vm(regs)) { if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, + dump_trace(NULL, regs, (unsigned long *)stack, 0, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index c8ab79ef427..1f11cf0a307 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -18,11 +18,11 @@ #include <asm/nmi.h> #include <asm/msr.h> #include <asm/apic.h> - + #include "op_counter.h" #include "op_x86_model.h" -static struct op_x86_model_spec const * model; +static struct op_x86_model_spec const *model; static struct op_msrs cpu_msrs[NR_CPUS]; static unsigned long saved_lvtpc[NR_CPUS]; @@ -41,7 +41,6 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state) return 0; } - static int nmi_resume(struct sys_device *dev) { if (nmi_enabled == 1) @@ -49,29 +48,27 @@ static int nmi_resume(struct sys_device *dev) return 0; } - static struct sysdev_class oprofile_sysclass = { .name = "oprofile", .resume = nmi_resume, .suspend = nmi_suspend, }; - static struct sys_device device_oprofile = { .id = 0, .cls = &oprofile_sysclass, }; - static int __init init_sysfs(void) { int error; - if (!(error = sysdev_class_register(&oprofile_sysclass))) + + error = sysdev_class_register(&oprofile_sysclass); + if (!error) error = sysdev_register(&device_oprofile); return error; } - static void exit_sysfs(void) { sysdev_unregister(&device_oprofile); @@ -90,7 +87,7 @@ static int profile_exceptions_notify(struct notifier_block *self, int ret = NOTIFY_DONE; int cpu = smp_processor_id(); - switch(val) { + switch (val) { case DIE_NMI: if (model->check_ctrs(args->regs, &cpu_msrs[cpu])) ret = NOTIFY_STOP; @@ -101,24 +98,24 @@ static int profile_exceptions_notify(struct notifier_block *self, return ret; } -static void nmi_cpu_save_registers(struct op_msrs * msrs) +static void nmi_cpu_save_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; - struct op_msr * counters = msrs->counters; - struct op_msr * controls = msrs->controls; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; unsigned int i; for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr){ + if (counters[i].addr) { rdmsr(counters[i].addr, counters[i].saved.low, counters[i].saved.high); } } - + for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr){ + if (controls[i].addr) { rdmsr(controls[i].addr, controls[i].saved.low, controls[i].saved.high); @@ -126,15 +123,13 @@ static void nmi_cpu_save_registers(struct op_msrs * msrs) } } - -static void nmi_save_registers(void * dummy) +static void nmi_save_registers(void *dummy) { int cpu = smp_processor_id(); - struct op_msrs * msrs = &cpu_msrs[cpu]; + struct op_msrs *msrs = &cpu_msrs[cpu]; nmi_cpu_save_registers(msrs); } - static void free_msrs(void) { int i; @@ -146,7 +141,6 @@ static void free_msrs(void) } } - static int allocate_msrs(void) { int success = 1; @@ -173,11 +167,10 @@ static int allocate_msrs(void) return success; } - -static void nmi_cpu_setup(void * dummy) +static void nmi_cpu_setup(void *dummy) { int cpu = smp_processor_id(); - struct op_msrs * msrs = &cpu_msrs[cpu]; + struct op_msrs *msrs = &cpu_msrs[cpu]; spin_lock(&oprofilefs_lock); model->setup_ctrs(msrs); spin_unlock(&oprofilefs_lock); @@ -193,13 +186,14 @@ static struct notifier_block profile_exceptions_nb = { static int nmi_setup(void) { - int err=0; + int err = 0; int cpu; if (!allocate_msrs()) return -ENOMEM; - if ((err = register_die_notifier(&profile_exceptions_nb))){ + err = register_die_notifier(&profile_exceptions_nb); + if (err) { free_msrs(); return err; } @@ -210,7 +204,7 @@ static int nmi_setup(void) /* Assume saved/restored counters are the same on all CPUs */ model->fill_in_addresses(&cpu_msrs[0]); - for_each_possible_cpu (cpu) { + for_each_possible_cpu(cpu) { if (cpu != 0) { memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters, sizeof(struct op_msr) * model->num_counters); @@ -226,39 +220,37 @@ static int nmi_setup(void) return 0; } - -static void nmi_restore_registers(struct op_msrs * msrs) +static void nmi_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; - unsigned int const nr_ctrls = model->num_controls; - struct op_msr * counters = msrs->counters; - struct op_msr * controls = msrs->controls; + unsigned int const nr_ctrls = model->num_controls; + struct op_msr *counters = msrs->counters; + struct op_msr *controls = msrs->controls; unsigned int i; for (i = 0; i < nr_ctrls; ++i) { - if (controls[i].addr){ + if (controls[i].addr) { wrmsr(controls[i].addr, controls[i].saved.low, controls[i].saved.high); } } - + for (i = 0; i < nr_ctrs; ++i) { - if (counters[i].addr){ + if (counters[i].addr) { wrmsr(counters[i].addr, counters[i].saved.low, counters[i].saved.high); } } } - -static void nmi_cpu_shutdown(void * dummy) +static void nmi_cpu_shutdown(void *dummy) { unsigned int v; int cpu = smp_processor_id(); - struct op_msrs * msrs = &cpu_msrs[cpu]; - + struct op_msrs *msrs = &cpu_msrs[cpu]; + /* restoring APIC_LVTPC can trigger an apic error because the delivery * mode and vector nr combination can be illegal. That's by design: on * power on apic lvt contain a zero vector nr which are legal only for @@ -271,7 +263,6 @@ static void nmi_cpu_shutdown(void * dummy) nmi_restore_registers(msrs); } - static void nmi_shutdown(void) { nmi_enabled = 0; @@ -281,45 +272,40 @@ static void nmi_shutdown(void) free_msrs(); } - -static void nmi_cpu_start(void * dummy) +static void nmi_cpu_start(void *dummy) { - struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; + struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; model->start(msrs); } - static int nmi_start(void) { on_each_cpu(nmi_cpu_start, NULL, 0, 1); return 0; } - - -static void nmi_cpu_stop(void * dummy) + +static void nmi_cpu_stop(void *dummy) { - struct op_msrs const * msrs = &cpu_msrs[smp_processor_id()]; + struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()]; model->stop(msrs); } - - + static void nmi_stop(void) { on_each_cpu(nmi_cpu_stop, NULL, 0, 1); } - struct op_counter_config counter_config[OP_MAX_COUNTER]; -static int nmi_create_files(struct super_block * sb, struct dentry * root) +static int nmi_create_files(struct super_block *sb, struct dentry *root) { unsigned int i; for (i = 0; i < model->num_counters; ++i) { - struct dentry * dir; + struct dentry *dir; char buf[4]; - - /* quick little hack to _not_ expose a counter if it is not + + /* quick little hack to _not_ expose a counter if it is not * available for use. This should protect userspace app. * NOTE: assumes 1:1 mapping here (that counters are organized * sequentially in their struct assignment). @@ -329,21 +315,21 @@ static int nmi_create_files(struct super_block * sb, struct dentry * root) snprintf(buf, sizeof(buf), "%d", i); dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); - oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); - oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); - oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); - oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); - oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(sb, dir, "enabled", &counter_config[i].enabled); + oprofilefs_create_ulong(sb, dir, "event", &counter_config[i].event); + oprofilefs_create_ulong(sb, dir, "count", &counter_config[i].count); + oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); + oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); + oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); } return 0; } - + static int p4force; module_param(p4force, int, 0); - -static int __init p4_init(char ** cpu_type) + +static int __init p4_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; @@ -356,15 +342,15 @@ static int __init p4_init(char ** cpu_type) return 1; #else switch (smp_num_siblings) { - case 1: - *cpu_type = "i386/p4"; - model = &op_p4_spec; - return 1; - - case 2: - *cpu_type = "i386/p4-ht"; - model = &op_p4_ht2_spec; - return 1; + case 1: + *cpu_type = "i386/p4"; + model = &op_p4_spec; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + model = &op_p4_ht2_spec; + return 1; } #endif @@ -373,8 +359,7 @@ static int __init p4_init(char ** cpu_type) return 0; } - -static int __init ppro_init(char ** cpu_type) +static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; @@ -409,52 +394,52 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!cpu_has_apic) return -ENODEV; - + switch (vendor) { - case X86_VENDOR_AMD: - /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ - switch (family) { - default: + switch (family) { + default: + return -ENODEV; + case 6: + model = &op_athlon_spec; + cpu_type = "i386/athlon"; + break; + case 0xf: + model = &op_athlon_spec; + /* Actually it could be i386/hammer too, but give + user space an consistent name. */ + cpu_type = "x86-64/hammer"; + break; + case 0x10: + model = &op_athlon_spec; + cpu_type = "x86-64/family10"; + break; + } + break; + + case X86_VENDOR_INTEL: + switch (family) { + /* Pentium IV */ + case 0xf: + if (!p4_init(&cpu_type)) return -ENODEV; - case 6: - model = &op_athlon_spec; - cpu_type = "i386/athlon"; - break; - case 0xf: - model = &op_athlon_spec; - /* Actually it could be i386/hammer too, but give - user space an consistent name. */ - cpu_type = "x86-64/hammer"; - break; - case 0x10: - model = &op_athlon_spec; - cpu_type = "x86-64/family10"; - break; - } break; - - case X86_VENDOR_INTEL: - switch (family) { - /* Pentium IV */ - case 0xf: - if (!p4_init(&cpu_type)) - return -ENODEV; - break; - - /* A P6-class processor */ - case 6: - if (!ppro_init(&cpu_type)) - return -ENODEV; - break; - - default: - return -ENODEV; - } + + /* A P6-class processor */ + case 6: + if (!ppro_init(&cpu_type)) + return -ENODEV; break; default: return -ENODEV; + } + break; + + default: + return -ENODEV; } init_sysfs(); @@ -469,7 +454,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) return 0; } - void op_nmi_exit(void) { if (using_nmi) diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 86274639066..52deabc72a6 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -109,6 +109,19 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b) } } +static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) +{ + struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + + if (rom_r->parent) + return; + if (rom_r->start) + /* we deal with BIOS assigned ROM later */ + return; + if (!(pci_probe & PCI_ASSIGN_ROMS)) + rom_r->start = rom_r->end = rom_r->flags = 0; +} + /* * Called after each bus is probed, but before its children * are examined. @@ -116,8 +129,12 @@ static void __devinit pcibios_fixup_ghosts(struct pci_bus *b) void __devinit pcibios_fixup_bus(struct pci_bus *b) { + struct pci_dev *dev; + pcibios_fixup_ghosts(b); pci_read_bridge_bases(b); + list_for_each_entry(dev, &b->devices, bus_list) + pcibios_fixup_device_resources(dev); } /* diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6cff66dd0c9..cb63007e20b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -19,7 +19,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) printk(KERN_WARNING "PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); reg = 0xd0; - for(pxb=0; pxb<2; pxb++) { + for(pxb = 0; pxb < 2; pxb++) { pci_read_config_byte(d, reg++, &busno); pci_read_config_byte(d, reg++, &suba); pci_read_config_byte(d, reg++, &subb); @@ -56,7 +56,7 @@ static void __devinit pci_fixup_umc_ide(struct pci_dev *d) int i; printk(KERN_WARNING "PCI: Fixing base address flags for device %s\n", pci_name(d)); - for(i=0; i<4; i++) + for(i = 0; i < 4; i++) d->resource[i].flags |= PCI_BASE_ADDRESS_SPACE_IO; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_UMC, PCI_DEVICE_ID_UMC_UM8886BF, pci_fixup_umc_ide); @@ -127,7 +127,7 @@ static void pci_fixup_via_northbridge_bug(struct pci_dev *d) NB latency to zero */ pci_write_config_byte(d, PCI_LATENCY_TIMER, 0); - where = 0x95; /* the memory write queue timer register is + where = 0x95; /* the memory write queue timer register is different for the KT266x's: 0x95 not 0x55 */ } else if (d->device == PCI_DEVICE_ID_VIA_8363_0 && (d->revision == VIA_8363_KL133_REVISION_ID || @@ -230,7 +230,7 @@ static int quirk_pcie_aspm_write(struct pci_bus *bus, unsigned int devfn, int wh if ((offset) && (where == offset)) value = value & 0xfffffffc; - + return raw_pci_ops->write(0, bus->number, devfn, where, size, value); } @@ -271,8 +271,8 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev) * after hot-remove, the pbus->devices is empty and this code * will set the offsets to zero and the bus ops to parent's bus * ops, which is unmodified. - */ - for (i= GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) + */ + for (i = GET_INDEX(pdev->device, 0); i <= GET_INDEX(pdev->device, 7); ++i) quirk_aspm_offset[i] = 0; pbus->ops = pbus->parent->ops; @@ -286,17 +286,17 @@ static void pcie_rootport_aspm_quirk(struct pci_dev *pdev) list_for_each_entry(dev, &pbus->devices, bus_list) { /* There are 0 to 8 devices attached to this bus */ cap_base = pci_find_capability(dev, PCI_CAP_ID_EXP); - quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)]= cap_base + 0x10; + quirk_aspm_offset[GET_INDEX(pdev->device, dev->devfn)] = cap_base + 0x10; } pbus->ops = &quirk_pcie_aspm_ops; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk ); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk ); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PA1, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PB1, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC, pcie_rootport_aspm_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_MCH_PC1, pcie_rootport_aspm_quirk); /* * Fixup to mark boot BIOS video selected by BIOS before it changes @@ -336,8 +336,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) * PCI header type NORMAL. */ if (bridge - &&((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) - ||(bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { + && ((bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE) + || (bridge->hdr_type == PCI_HEADER_TYPE_CARDBUS))) { pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &config); if (!(config & PCI_BRIDGE_CTL_VGA)) diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 88d8f5c0ecb..ed07ce6c171 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -200,6 +200,7 @@ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + WARN_ON_ONCE(pirq >= 16); return irqmap[read_config_nybble(router, 0x48, pirq-1)]; } @@ -207,7 +208,8 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i { static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; unsigned int val = irqmap[irq]; - + + WARN_ON_ONCE(pirq >= 16); if (val) { write_config_nybble(router, 0x48, pirq-1, val); return 1; @@ -257,12 +259,16 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + + WARN_ON_ONCE(pirq >= 5); return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; + + WARN_ON_ONCE(pirq >= 5); write_config_nybble(router, 0x55, pirqmap[pirq-1], irq); return 1; } @@ -275,12 +281,16 @@ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + + WARN_ON_ONCE(pirq >= 4); return read_config_nybble(router,0x43, pirqmap[pirq-1]); } static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; + + WARN_ON_ONCE(pirq >= 4); write_config_nybble(router, 0x43, pirqmap[pirq-1], irq); return 1; } @@ -419,6 +429,7 @@ static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { + WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); return 0; @@ -428,6 +439,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { + WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); return 0; @@ -449,14 +461,14 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, */ static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { - outb_p(pirq, 0xc00); + outb(pirq, 0xc00); return inb(0xc01) & 0xf; } static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - outb_p(pirq, 0xc00); - outb_p(irq, 0xc01); + outb(pirq, 0xc00); + outb(irq, 0xc01); return 1; } diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 998fd3ec0d6..efcf620d143 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -19,7 +19,7 @@ unsigned long saved_context_esp, saved_context_ebp; unsigned long saved_context_esi, saved_context_edi; unsigned long saved_context_eflags; -void __save_processor_state(struct saved_context *ctxt) +static void __save_processor_state(struct saved_context *ctxt) { mtrr_save_fixed_ranges(NULL); kernel_fpu_begin(); @@ -74,19 +74,19 @@ static void fix_processor_context(void) /* * Now maybe reload the debug registers */ - if (current->thread.debugreg[7]){ - set_debugreg(current->thread.debugreg[0], 0); - set_debugreg(current->thread.debugreg[1], 1); - set_debugreg(current->thread.debugreg[2], 2); - set_debugreg(current->thread.debugreg[3], 3); + if (current->thread.debugreg7) { + set_debugreg(current->thread.debugreg0, 0); + set_debugreg(current->thread.debugreg1, 1); + set_debugreg(current->thread.debugreg2, 2); + set_debugreg(current->thread.debugreg3, 3); /* no 4 and 5 */ - set_debugreg(current->thread.debugreg[6], 6); - set_debugreg(current->thread.debugreg[7], 7); + set_debugreg(current->thread.debugreg6, 6); + set_debugreg(current->thread.debugreg7, 7); } } -void __restore_processor_state(struct saved_context *ctxt) +static void __restore_processor_state(struct saved_context *ctxt) { /* * control registers diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore index f8b69d84238..60274d5746e 100644 --- a/arch/x86/vdso/.gitignore +++ b/arch/x86/vdso/.gitignore @@ -1 +1,6 @@ vdso.lds +vdso-syms.lds +vdso32-syms.lds +vdso32-syscall-syms.lds +vdso32-sysenter-syms.lds +vdso32-int80-syms.lds diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index e7bff0fbac2..d28dda57470 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -1,39 +1,37 @@ # -# x86-64 vDSO. +# Building vDSO images for x86. # +VDSO64-$(CONFIG_X86_64) := y +VDSO32-$(CONFIG_X86_32) := y +VDSO32-$(CONFIG_COMPAT) := y + +vdso-install-$(VDSO64-y) += vdso.so +vdso-install-$(VDSO32-y) += $(vdso32-y:=.so) + + # files to link into the vdso -# vdso-start.o has to be first -vobjs-y := vdso-start.o vdso-note.o vclock_gettime.o vgetcpu.o vvar.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o # files to link into kernel -obj-y := vma.o vdso.o vdso-syms.o +obj-$(VDSO64-y) += vma.o vdso.o +obj-$(VDSO32-y) += vdso32.o vdso32-setup.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so -targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) vdso-syms.o - -# The DSO images are built using a special linker script. -quiet_cmd_syscall = SYSCALL $@ - cmd_syscall = $(CC) -m elf_x86_64 -nostdlib $(SYSCFLAGS_$(@F)) \ - -Wl,-T,$(filter-out FORCE,$^) -o $@ +targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C -vdso-flags = -fPIC -shared -Wl,-soname=linux-vdso.so.1 \ - $(call ld-option, -Wl$(comma)--hash-style=sysv) \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 -SYSCFLAGS_vdso.so = $(vdso-flags) -SYSCFLAGS_vdso.so.dbg = $(vdso-flags) +VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so -$(obj)/vdso.so: $(src)/vdso.lds $(vobjs) FORCE - $(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,syscall) + $(call if_changed,vdso) $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE @@ -41,24 +39,96 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64 -$(obj)/vclock_gettime.o: KBUILD_CFLAGS = $(CFL) -$(obj)/vgetcpu.o: KBUILD_CFLAGS = $(CFL) +$(vobjs): KBUILD_CFLAGS = $(CFL) + +targets += vdso-syms.lds +obj-$(VDSO64-y) += vdso-syms.lds + +# +# Match symbols in the DSO that look like VDSO*; produce a file of constants. +# +sed-vdsosym := -e 's/^00*/0/' \ + -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ + +$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE + $(call if_changed,vdsosym) + +# +# Build multiple 32-bit vDSO images to choose from at boot time. +# +obj-$(VDSO32-y) += vdso32-syms.lds +vdso32.so-$(CONFIG_X86_32) += int80 +vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(VDSO32-y) += sysenter + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 + +# This makes sure the $(obj) subdirectory exists even though vdso32/ +# is not a kbuild sub-make subdirectory. +override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ -# We also create a special relocatable object that should mirror the symbol -# table and layout of the linked DSO. With ld -R we can then refer to -# these symbols in the kernel code rather than hand-coded addresses. -extra-y += vdso-syms.o -$(obj)/built-in.o: $(obj)/vdso-syms.o -$(obj)/built-in.o: ld_flags += -R $(obj)/vdso-syms.o +targets += vdso32/vdso32.lds +targets += $(vdso32.so-y:%=vdso32-%.so.dbg) $(vdso32.so-y:%=vdso32-%.so) +targets += vdso32/note.o $(vdso32.so-y:%=vdso32/%.o) -SYSCFLAGS_vdso-syms.o = -r -d -$(obj)/vdso-syms.o: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,syscall) +extra-y += $(vdso32.so-y:%=vdso32-%.so) +$(obj)/vdso32.o: $(vdso32.so-y:%=$(obj)/vdso32-%.so) + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): asflags-$(CONFIG_X86_64) += -m32 + +$(vdso32.so-y:%=$(obj)/vdso32-%.so.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/%.o + $(call if_changed,vdso) + +# Make vdso32-*-syms.lds from each image, and then make sure they match. +# The only difference should be that some do not define VDSO32_SYSENTER_RETURN. + +targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds) + +quiet_cmd_vdso32sym = VDSOSYM $@ +define cmd_vdso32sym + if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \ + $(foreach H,$(filter-out FORCE,$^),\ + if grep -q VDSO32_SYSENTER_RETURN $H; \ + then diff -u $(@D)/.tmp_$(@F) $H; \ + else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \ + diff -u - $H; fi &&) : ;\ + then mv -f $(@D)/.tmp_$(@F) $@; \ + else rm -f $(@D)/.tmp_$(@F); exit 1; \ + fi +endef + +$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE + $(call if_changed,vdso32sym) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + +VDSO_LDFLAGS = -fPIC -shared $(call ld-option, -Wl$(comma)--hash-style=sysv) + +# +# Install the unstripped copy of vdso*.so listed in $(vdso-install-y). +# quiet_cmd_vdso_install = INSTALL $@ cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ -vdso.so: +$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE @mkdir -p $(MODLIB)/vdso $(call cmd,vdso_install) -vdso_install: vdso.so +PHONY += vdso_install $(vdso-install-y) +vdso_install: $(vdso-install-y) + +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 5b54cdfb2b0..23476c2ebfc 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -19,7 +19,6 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> -#include <asm/vgtod.h> #include "vextern.h" #define gtod vdso_vsyscall_gtod_data diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S new file mode 100644 index 00000000000..634a2cf6204 --- /dev/null +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -0,0 +1,64 @@ +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + } + + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + + /* + * Align the actual code well away from the non-instruction data. + * This is the best thing for the I-cache. + */ + . = ALIGN(0x100); + + .text : { *(.text*) } :text =0x90909090 +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/vdso/vdso-start.S b/arch/x86/vdso/vdso-start.S deleted file mode 100644 index 2dc2cdb84d6..00000000000 --- a/arch/x86/vdso/vdso-start.S +++ /dev/null @@ -1,2 +0,0 @@ - .globl vdso_kernel_start -vdso_kernel_start: diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index 667d3245d97..4e5dd3b4de7 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -1,79 +1,37 @@ /* - * Linker script for vsyscall DSO. The vsyscall page is an ELF shared - * object prelinked to its virtual address, and with only one read-only - * segment (that fits in one page). This script controls its layout. + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. */ -#include <asm/asm-offsets.h> -#include "voffset.h" #define VDSO_PRELINK 0xffffffffff700000 - -SECTIONS -{ - . = VDSO_PRELINK + SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - /* This linker script is used both with -r and with -shared. - For the layouts to match, we need to skip more than enough - space for the dynamic symbol table et al. If this amount - is insufficient, ld -shared will barf. Just increase it here. */ - . = VDSO_PRELINK + VDSO_TEXT_OFFSET; - - .text : { *(.text*) } :text - .rodata : { *(.rodata*) } :text - .data : { - *(.data*) - *(.sdata*) - *(.bss*) - *(.dynbss*) - } :text - - .altinstructions : { *(.altinstructions) } :text - .altinstr_replacement : { *(.altinstr_replacement) } :text - - .note : { *(.note.*) } :text :note - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic - .useless : { - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.gnu.linkonce.b.*) - } :text -} +#include "vdso-layout.lds.S" /* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. + * This controls what userland symbols we export from the vDSO. */ -PHDRS -{ - text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr 0x6474e550; /* PT_GNU_EH_FRAME, but ld doesn't match the name */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + local: *; + }; } +VDSO64_PRELINK = VDSO_PRELINK; + /* - * This controls what symbols we export from the DSO. + * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL. */ -VERSION -{ - LINUX_2.6 { - global: - clock_gettime; - __vdso_clock_gettime; - gettimeofday; - __vdso_gettimeofday; - getcpu; - __vdso_getcpu; - local: *; - }; -} +#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; +#include "vextern.h" +#undef VEXTERN diff --git a/arch/x86/kernel/sysenter_32.c b/arch/x86/vdso/vdso32-setup.c index 5a2d951e260..348f1341e1c 100644 --- a/arch/x86/kernel/sysenter_32.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -23,6 +23,8 @@ #include <asm/unistd.h> #include <asm/elf.h> #include <asm/tlbflush.h> +#include <asm/vdso.h> +#include <asm/proto.h> enum { VDSO_DISABLED = 0, @@ -36,14 +38,24 @@ enum { #define VDSO_DEFAULT VDSO_ENABLED #endif +#ifdef CONFIG_X86_64 +#define vdso_enabled sysctl_vsyscall32 +#define arch_setup_additional_pages syscall32_setup_pages +#endif + +/* + * This is the difference between the prelinked addresses in the vDSO images + * and the VDSO_HIGH_BASE address where CONFIG_COMPAT_VDSO places the vDSO + * in the user address space. + */ +#define VDSO_ADDR_ADJUST (VDSO_HIGH_BASE - (unsigned long)VDSO32_PRELINK) + /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; -EXPORT_SYMBOL_GPL(vdso_enabled); - static int __init vdso_setup(char *s) { vdso_enabled = simple_strtoul(s, NULL, 0); @@ -51,9 +63,18 @@ static int __init vdso_setup(char *s) return 1; } -__setup("vdso=", vdso_setup); +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso32_setup, vdso_setup, 0); -extern asmlinkage void sysenter_entry(void); +EXPORT_SYMBOL_GPL(vdso_enabled); +#endif static __init void reloc_symtab(Elf32_Ehdr *ehdr, unsigned offset, unsigned size) @@ -78,7 +99,7 @@ static __init void reloc_symtab(Elf32_Ehdr *ehdr, case STT_FUNC: case STT_SECTION: case STT_FILE: - sym->st_value += VDSO_HIGH_BASE; + sym->st_value += VDSO_ADDR_ADJUST; } } } @@ -104,7 +125,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) case DT_VERNEED: case DT_ADDRRNGLO ... DT_ADDRRNGHI: /* definitely pointers needing relocation */ - dyn->d_un.d_ptr += VDSO_HIGH_BASE; + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; break; case DT_ENCODING ... OLD_DT_LOOS-1: @@ -113,7 +134,7 @@ static __init void reloc_dyn(Elf32_Ehdr *ehdr, unsigned offset) they're even */ if (dyn->d_tag >= DT_ENCODING && (dyn->d_tag & 1) == 0) - dyn->d_un.d_ptr += VDSO_HIGH_BASE; + dyn->d_un.d_ptr += VDSO_ADDR_ADJUST; break; case DT_VERDEFNUM: @@ -142,15 +163,15 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) int i; BUG_ON(memcmp(ehdr->e_ident, ELFMAG, 4) != 0 || - !elf_check_arch(ehdr) || + !elf_check_arch_ia32(ehdr) || ehdr->e_type != ET_DYN); - ehdr->e_entry += VDSO_HIGH_BASE; + ehdr->e_entry += VDSO_ADDR_ADJUST; /* rebase phdrs */ phdr = (void *)ehdr + ehdr->e_phoff; for (i = 0; i < ehdr->e_phnum; i++) { - phdr[i].p_vaddr += VDSO_HIGH_BASE; + phdr[i].p_vaddr += VDSO_ADDR_ADJUST; /* relocate dynamic stuff */ if (phdr[i].p_type == PT_DYNAMIC) @@ -163,7 +184,7 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) if (!(shdr[i].sh_flags & SHF_ALLOC)) continue; - shdr[i].sh_addr += VDSO_HIGH_BASE; + shdr[i].sh_addr += VDSO_ADDR_ADJUST; if (shdr[i].sh_type == SHT_SYMTAB || shdr[i].sh_type == SHT_DYNSYM) @@ -172,6 +193,45 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) } } +/* + * These symbols are defined by vdso32.S to mark the bounds + * of the ELF DSO images included therein. + */ +extern const char vdso32_default_start, vdso32_default_end; +extern const char vdso32_sysenter_start, vdso32_sysenter_end; +static struct page *vdso32_pages[1]; + +#ifdef CONFIG_X86_64 + +static int use_sysenter __read_mostly = -1; + +#define vdso32_sysenter() (use_sysenter > 0) + +/* May not be __init: called during resume */ +void syscall32_cpu_init(void) +{ + if (use_sysenter < 0) + use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL); + + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); + checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} + +#define compat_uses_vma 1 + +static inline void map_compat_vdso(int map) +{ +} + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) + void enable_sep_cpu(void) { int cpu = get_cpu(); @@ -183,10 +243,10 @@ void enable_sep_cpu(void) } tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.esp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.esp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); put_cpu(); } @@ -209,13 +269,7 @@ static int __init gate_vma_init(void) return 0; } -/* - * These symbols are defined by vsyscall.o to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vsyscall_int80_start, vsyscall_int80_end; -extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; -static struct page *syscall_pages[1]; +#define compat_uses_vma 0 static void map_compat_vdso(int map) { @@ -226,31 +280,35 @@ static void map_compat_vdso(int map) vdso_mapped = map; - __set_fixmap(FIX_VDSO, page_to_pfn(syscall_pages[0]) << PAGE_SHIFT, + __set_fixmap(FIX_VDSO, page_to_pfn(vdso32_pages[0]) << PAGE_SHIFT, map ? PAGE_READONLY_EXEC : PAGE_NONE); /* flush stray tlbs */ flush_tlb_all(); } +#endif /* CONFIG_X86_64 */ + int __init sysenter_setup(void) { void *syscall_page = (void *)get_zeroed_page(GFP_ATOMIC); const void *vsyscall; size_t vsyscall_len; - syscall_pages[0] = virt_to_page(syscall_page); + vdso32_pages[0] = virt_to_page(syscall_page); +#ifdef CONFIG_X86_32 gate_vma_init(); printk("Compat vDSO mapped to %08lx.\n", __fix_to_virt(FIX_VDSO)); +#endif - if (!boot_cpu_has(X86_FEATURE_SEP)) { - vsyscall = &vsyscall_int80_start; - vsyscall_len = &vsyscall_int80_end - &vsyscall_int80_start; + if (!vdso32_sysenter()) { + vsyscall = &vdso32_default_start; + vsyscall_len = &vdso32_default_end - &vdso32_default_start; } else { - vsyscall = &vsyscall_sysenter_start; - vsyscall_len = &vsyscall_sysenter_end - &vsyscall_sysenter_start; + vsyscall = &vdso32_sysenter_start; + vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; } memcpy(syscall_page, vsyscall, vsyscall_len); @@ -259,9 +317,6 @@ int __init sysenter_setup(void) return 0; } -/* Defined in vsyscall-sysenter.S */ -extern void SYSENTER_RETURN; - /* Setup a VMA at program startup for the vsyscall page */ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) { @@ -286,7 +341,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) ret = addr; goto up_fail; } + } + if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints * @@ -300,7 +357,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, - syscall_pages); + vdso32_pages); if (ret) goto up_fail; @@ -308,7 +365,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) current->mm->context.vdso = (void *)addr; current_thread_info()->sysenter_return = - (void *)VDSO_SYM(&SYSENTER_RETURN); + VDSO32_SYMBOL(addr, SYSENTER_RETURN); up_fail: up_write(&mm->mmap_sem); @@ -316,6 +373,45 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) return ret; } +#ifdef CONFIG_X86_64 + +__initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include <linux/sysctl.h> + +static ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &sysctl_vsyscall32, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static ctl_table abi_root_table2[] = { + { + .ctl_name = CTL_ABI, + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif + +#else /* CONFIG_X86_32 */ + const char *arch_vma_name(struct vm_area_struct *vma) { if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) @@ -344,3 +440,5 @@ int in_gate_area_no_task(unsigned long addr) { return 0; } + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S new file mode 100644 index 00000000000..1e36f72cab8 --- /dev/null +++ b/arch/x86/vdso/vdso32.S @@ -0,0 +1,19 @@ +#include <linux/init.h> + +__INITDATA + + .globl vdso32_default_start, vdso32_default_end +vdso32_default_start: +#ifdef CONFIG_X86_32 + .incbin "arch/x86/vdso/vdso32-int80.so" +#else + .incbin "arch/x86/vdso/vdso32-syscall.so" +#endif +vdso32_default_end: + + .globl vdso32_sysenter_start, vdso32_sysenter_end +vdso32_sysenter_start: + .incbin "arch/x86/vdso/vdso32-sysenter.so" +vdso32_sysenter_end: + +__FINIT diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore new file mode 100644 index 00000000000..e45fba9d0ce --- /dev/null +++ b/arch/x86/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/x86/kernel/vsyscall-int80_32.S b/arch/x86/vdso/vdso32/int80.S index 103cab6aa7c..b15b7c01aed 100644 --- a/arch/x86/kernel/vsyscall-int80_32.S +++ b/arch/x86/vdso/vdso32/int80.S @@ -1,15 +1,15 @@ /* - * Code for the vsyscall page. This version uses the old int $0x80 method. + * Code for the vDSO. This version uses the old int $0x80 method. * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. + * First get the common code for the sigreturn entry points. + * This must come first. */ +#include "sigreturn.S" .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function + ALIGN __kernel_vsyscall: .LSTART_vsyscall: int $0x80 @@ -47,7 +47,10 @@ __kernel_vsyscall: .LENDFDEDLSI: .previous -/* - * Get the common code for the sigreturn entry points. - */ -#include "vsyscall-sigreturn_32.S" + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/kernel/vsyscall-note_32.S b/arch/x86/vdso/vdso32/note.S index fcf376a37f7..c83f2573469 100644 --- a/arch/x86/kernel/vsyscall-note_32.S +++ b/arch/x86/vdso/vdso32/note.S @@ -33,12 +33,11 @@ ELFNOTE_END * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. */ -#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ +#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ - .globl VDSO_NOTE_MASK ELFNOTE_START(GNU, 2, "a") .long 1 /* ncaps */ -VDSO_NOTE_MASK: +VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ .long 0 /* mask */ .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ ELFNOTE_END diff --git a/arch/x86/kernel/vsyscall-sigreturn_32.S b/arch/x86/vdso/vdso32/sigreturn.S index a92262f4165..31776d0efc8 100644 --- a/arch/x86/kernel/vsyscall-sigreturn_32.S +++ b/arch/x86/vdso/vdso32/sigreturn.S @@ -1,41 +1,42 @@ /* - * Common code for the sigreturn entry points on the vsyscall page. + * Common code for the sigreturn entry points in vDSO images. * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by vsyscall-*.S to define them after the - * vsyscall entry point. The kernel assumes that the addresses of these - * routines are constant for all vsyscall implementations. + * This file is #include'd by int80.S et al to define them first thing. + * The kernel assumes that the addresses of these routines are constant + * for all vDSO implementations. */ -#include <asm/unistd.h> +#include <linux/linkage.h> +#include <asm/unistd_32.h> #include <asm/asm-offsets.h> - -/* XXX - Should these be named "_sigtramp" or something? -*/ +#ifndef SYSCALL_ENTER_KERNEL +#define SYSCALL_ENTER_KERNEL int $0x80 +#endif .text - .org __kernel_vsyscall+32,0x90 .globl __kernel_sigreturn .type __kernel_sigreturn,@function + ALIGN __kernel_sigreturn: .LSTART_sigreturn: popl %eax /* XXX does this mean it needs unwind info? */ movl $__NR_sigreturn, %eax - int $0x80 + SYSCALL_ENTER_KERNEL .LEND_sigreturn: + nop .size __kernel_sigreturn,.-.LSTART_sigreturn - .balign 32 .globl __kernel_rt_sigreturn .type __kernel_rt_sigreturn,@function + ALIGN __kernel_rt_sigreturn: .LSTART_rt_sigreturn: movl $__NR_rt_sigreturn, %eax - int $0x80 + SYSCALL_ENTER_KERNEL .LEND_rt_sigreturn: + nop .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - .balign 32 .previous .section .eh_frame,"a",@progbits @@ -70,9 +71,9 @@ __kernel_rt_sigreturn: be the value of the stack pointer in the caller. This means that we must define the CFA of this body of code to be the saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other + also means that there is no fixed relation to the other saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we + to compute their addresses. It also means that when we adjust the stack with the popl, we have to do it all over again. */ #define do_cfa_expr(offset) \ @@ -91,27 +92,27 @@ __kernel_rt_sigreturn: .sleb128 offset; /* offset */ \ 1: - do_cfa_expr(SIGCONTEXT_esp+4) - do_expr(0, SIGCONTEXT_eax+4) - do_expr(1, SIGCONTEXT_ecx+4) - do_expr(2, SIGCONTEXT_edx+4) - do_expr(3, SIGCONTEXT_ebx+4) - do_expr(5, SIGCONTEXT_ebp+4) - do_expr(6, SIGCONTEXT_esi+4) - do_expr(7, SIGCONTEXT_edi+4) - do_expr(8, SIGCONTEXT_eip+4) + do_cfa_expr(IA32_SIGCONTEXT_sp+4) + do_expr(0, IA32_SIGCONTEXT_ax+4) + do_expr(1, IA32_SIGCONTEXT_cx+4) + do_expr(2, IA32_SIGCONTEXT_dx+4) + do_expr(3, IA32_SIGCONTEXT_bx+4) + do_expr(5, IA32_SIGCONTEXT_bp+4) + do_expr(6, IA32_SIGCONTEXT_si+4) + do_expr(7, IA32_SIGCONTEXT_di+4) + do_expr(8, IA32_SIGCONTEXT_ip+4) .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - do_cfa_expr(SIGCONTEXT_esp) - do_expr(0, SIGCONTEXT_eax) - do_expr(1, SIGCONTEXT_ecx) - do_expr(2, SIGCONTEXT_edx) - do_expr(3, SIGCONTEXT_ebx) - do_expr(5, SIGCONTEXT_ebp) - do_expr(6, SIGCONTEXT_esi) - do_expr(7, SIGCONTEXT_edi) - do_expr(8, SIGCONTEXT_eip) + do_cfa_expr(IA32_SIGCONTEXT_sp) + do_expr(0, IA32_SIGCONTEXT_ax) + do_expr(1, IA32_SIGCONTEXT_cx) + do_expr(2, IA32_SIGCONTEXT_dx) + do_expr(3, IA32_SIGCONTEXT_bx) + do_expr(5, IA32_SIGCONTEXT_bp) + do_expr(6, IA32_SIGCONTEXT_si) + do_expr(7, IA32_SIGCONTEXT_di) + do_expr(8, IA32_SIGCONTEXT_ip) .align 4 .LENDFDEDLSI1: @@ -128,15 +129,15 @@ __kernel_rt_sigreturn: slightly less complicated than the above, since we don't modify the stack pointer in the process. */ - do_cfa_expr(RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esp) - do_expr(0, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eax) - do_expr(1, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ecx) - do_expr(2, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edx) - do_expr(3, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebx) - do_expr(5, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_ebp) - do_expr(6, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_esi) - do_expr(7, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_edi) - do_expr(8, RT_SIGFRAME_sigcontext-4 + SIGCONTEXT_eip) + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) .align 4 .LENDFDEDLSI2: diff --git a/arch/x86/ia32/vsyscall-syscall.S b/arch/x86/vdso/vdso32/syscall.S index cf9ef678de3..5415b5613d5 100644 --- a/arch/x86/ia32/vsyscall-syscall.S +++ b/arch/x86/vdso/vdso32/syscall.S @@ -1,16 +1,18 @@ /* - * Code for the vsyscall page. This version uses the syscall instruction. + * Code for the vDSO. This version uses the syscall instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. */ +#define SYSCALL_ENTER_KERNEL syscall +#include "sigreturn.S" -#include <asm/ia32_unistd.h> -#include <asm/asm-offsets.h> #include <asm/segment.h> - .code32 .text - .section .text.vsyscall,"ax" .globl __kernel_vsyscall .type __kernel_vsyscall,@function + ALIGN __kernel_vsyscall: .LSTART_vsyscall: push %ebp @@ -64,6 +66,12 @@ __kernel_vsyscall: .uleb128 4 .align 4 .LENDFDE1: + .previous -#define SYSCALL_ENTER_KERNEL syscall -#include "vsyscall-sigreturn.S" + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 + .previous diff --git a/arch/x86/kernel/vsyscall-sysenter_32.S b/arch/x86/vdso/vdso32/sysenter.S index ed879bf4299..e2800affa75 100644 --- a/arch/x86/kernel/vsyscall-sysenter_32.S +++ b/arch/x86/vdso/vdso32/sysenter.S @@ -1,11 +1,10 @@ /* - * Code for the vsyscall page. This version uses the sysenter instruction. + * Code for the vDSO. This version uses the sysenter instruction. * - * NOTE: - * 1) __kernel_vsyscall _must_ be first in this page. - * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S - * for details. + * First get the common code for the sigreturn entry points. + * This must come first. */ +#include "sigreturn.S" /* * The caller puts arg2 in %ecx, which gets pushed. The kernel will use @@ -23,11 +22,12 @@ * arg6 from the stack. * * You can not use this vsyscall for the clone() syscall because the - * three dwords on the parent stack do not get copied to the child. + * three words on the parent stack do not get copied to the child. */ .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function + ALIGN __kernel_vsyscall: .LSTART_vsyscall: push %ecx @@ -45,8 +45,7 @@ __kernel_vsyscall: /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel /* 16: System call normal return point is here! */ - .globl SYSENTER_RETURN /* Symbol used by sysenter.c */ -SYSENTER_RETURN: +VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ pop %ebp .Lpop_ebp: pop %edx @@ -85,38 +84,33 @@ SYSENTER_RETURN: .uleb128 0 /* What follows are the instructions for the table generation. We have to record all changes of the stack pointer. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_ecx-.LSTART_vsyscall + .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpush_edx-.Lpush_ecx + .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x0c /* RA at offset 12 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lenter_kernel-.Lpush_edx + .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x10 /* RA at offset 16 now */ .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ /* Finally the epilogue. */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ebp-.Lenter_kernel + .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x0c /* RA at offset 12 now */ .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_edx-.Lpop_ebp + .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x08 /* RA at offset 8 now */ - .byte 0x04 /* DW_CFA_advance_loc4 */ - .long .Lpop_ecx-.Lpop_edx + .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ .byte 0x0e /* DW_CFA_def_cfa_offset */ .byte 0x04 /* RA at offset 4 now */ .align 4 .LENDFDEDLSI: .previous -/* - * Get the common code for the sigreturn entry points. - */ -#include "vsyscall-sigreturn_32.S" + /* + * Emit a symbol with the size of this .eh_frame data, + * to verify it matches the other versions. + */ +VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S new file mode 100644 index 00000000000..976124bb5f9 --- /dev/null +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,37 @@ +/* + * Linker script for 32-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. + */ + +#define VDSO_PRELINK 0 +#include "../vdso-layout.lds.S" + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; +} + +/* + * Symbols we define here called VDSO* get their values into vdso32-syms.h. + */ +VDSO32_PRELINK = VDSO_PRELINK; +VDSO32_vsyscall = __kernel_vsyscall; +VDSO32_sigreturn = __kernel_sigreturn; +VDSO32_rt_sigreturn = __kernel_rt_sigreturn; diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 3b1ae1abfba..c8097f17f8a 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -15,11 +15,11 @@ long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { - unsigned int dummy, p; + unsigned int p; if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ - rdtscp(dummy, dummy, p); + native_read_tscp(&p); } else { /* Load per CPU data from GDT */ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index ff9333e5fb0..3fdd51497a8 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -11,23 +11,20 @@ #include <asm/vsyscall.h> #include <asm/vgtod.h> #include <asm/proto.h> -#include "voffset.h" +#include <asm/vdso.h> -int vdso_enabled = 1; - -#define VEXTERN(x) extern typeof(__ ## x) *vdso_ ## x; -#include "vextern.h" +#include "vextern.h" /* Just for VMAGIC. */ #undef VEXTERN -extern char vdso_kernel_start[], vdso_start[], vdso_end[]; +int vdso_enabled = 1; + +extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; struct page **vdso_pages; -static inline void *var_ref(void *vbase, char *var, char *name) +static inline void *var_ref(void *p, char *name) { - unsigned offset = var - &vdso_kernel_start[0] + VDSO_TEXT_OFFSET; - void *p = vbase + offset; if (*(void **)p != (void *)VMAGIC) { printk("VDSO: variable %s broken\n", name); vdso_enabled = 0; @@ -62,9 +59,8 @@ static int __init init_vdso_vars(void) vdso_enabled = 0; } -#define V(x) *(typeof(x) *) var_ref(vbase, (char *)RELOC_HIDE(&x, 0), #x) #define VEXTERN(x) \ - V(vdso_ ## x) = &__ ## x; + *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; #include "vextern.h" #undef VEXTERN return 0; diff --git a/arch/x86/vdso/voffset.h b/arch/x86/vdso/voffset.h deleted file mode 100644 index 4af67c79085..00000000000 --- a/arch/x86/vdso/voffset.h +++ /dev/null @@ -1 +0,0 @@ -#define VDSO_TEXT_OFFSET 0x600 diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index fbfa55ce0d5..4d5f2649bee 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -5,6 +5,7 @@ config XEN bool "Xen guest support" select PARAVIRT + depends on X86_32 depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 79ad1525215..de647bc6e74 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -141,8 +141,8 @@ static void __init xen_banner(void) printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); } -static void xen_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) +static void xen_cpuid(unsigned int *ax, unsigned int *bx, + unsigned int *cx, unsigned int *dx) { unsigned maskedx = ~0; @@ -150,18 +150,18 @@ static void xen_cpuid(unsigned int *eax, unsigned int *ebx, * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*eax == 1) + if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); - *edx &= maskedx; + : "=a" (*ax), + "=b" (*bx), + "=c" (*cx), + "=d" (*dx) + : "0" (*ax), "2" (*cx)); + *dx &= maskedx; } static void xen_set_debugreg(int reg, unsigned long val) @@ -275,19 +275,12 @@ static unsigned long xen_store_tr(void) static void xen_set_ldt(const void *addr, unsigned entries) { - unsigned long linear_addr = (unsigned long)addr; struct mmuext_op *op; struct multicall_space mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_SET_LDT; - if (linear_addr) { - /* ldt my be vmalloced, use arbitrary_virt_to_machine */ - xmaddr_t maddr; - maddr = arbitrary_virt_to_machine((unsigned long)addr); - linear_addr = (unsigned long)maddr.maddr; - } - op->arg1.linear_addr = linear_addr; + op->arg1.linear_addr = (unsigned long)addr; op->arg2.nr_ents = entries; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); @@ -295,7 +288,7 @@ static void xen_set_ldt(const void *addr, unsigned entries) xen_mc_issue(PARAVIRT_LAZY_CPU); } -static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long *frames; unsigned long va = dtr->address; @@ -357,11 +350,11 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) } static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - u32 low, u32 high) + const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; xmaddr_t mach_lp = virt_to_machine(lp); - u64 entry = (u64)high << 32 | low; + u64 entry = *(u64 *)ptr; preempt_disable(); @@ -395,12 +388,11 @@ static int cvt_gate_to_trap(int vector, u32 low, u32 high, } /* Locations of each CPU's IDT */ -static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); +static DEFINE_PER_CPU(struct desc_ptr, idt_desc); /* Set an IDT entry. If the entry is part of the current IDT, then also update Xen. */ -static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, - u32 low, u32 high) +static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { unsigned long p = (unsigned long)&dt[entrynum]; unsigned long start, end; @@ -412,14 +404,15 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, xen_mc_flush(); - write_dt_entry(dt, entrynum, low, high); + native_write_idt_entry(dt, entrynum, g); if (p >= start && (p + 8) <= end) { struct trap_info info[2]; + u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -427,7 +420,7 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, +static void xen_convert_trap_info(const struct desc_ptr *desc, struct trap_info *traps) { unsigned in, out, count; @@ -446,7 +439,7 @@ static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, void xen_copy_trap_info(struct trap_info *traps) { - const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); + const struct desc_ptr *desc = &__get_cpu_var(idt_desc); xen_convert_trap_info(desc, traps); } @@ -454,7 +447,7 @@ void xen_copy_trap_info(struct trap_info *traps) /* Load a new IDT into Xen. In principle this can be per-CPU, so we hold a spinlock to protect the static traps[] array (static because it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct Xgt_desc_struct *desc) +static void xen_load_idt(const struct desc_ptr *desc) { static DEFINE_SPINLOCK(lock); static struct trap_info traps[257]; @@ -475,22 +468,21 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc) /* Write a GDT descriptor entry. Ignore LDT descriptors, since they're handled differently. */ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, - u32 low, u32 high) + const void *desc, int type) { preempt_disable(); - switch ((high >> 8) & 0xff) { - case DESCTYPE_LDT: - case DESCTYPE_TSS: + switch (type) { + case DESC_LDT: + case DESC_TSS: /* ignore */ break; default: { xmaddr_t maddr = virt_to_machine(&dt[entry]); - u64 desc = (u64)high << 32 | low; xen_mc_flush(); - if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) BUG(); } @@ -499,11 +491,11 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, preempt_enable(); } -static void xen_load_esp0(struct tss_struct *tss, +static void xen_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); } @@ -521,12 +513,12 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static unsigned long xen_apic_read(unsigned long reg) +static u32 xen_apic_read(unsigned long reg) { return 0; } -static void xen_apic_write(unsigned long reg, unsigned long val) +static void xen_apic_write(unsigned long reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); @@ -666,6 +658,13 @@ static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } +/* Early release_pt assumes that all pts are pinned, since there's + only init_mm and anything attached to that is pinned. */ +static void xen_release_pt_init(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + static void pin_pagetable_pfn(unsigned level, unsigned long pfn) { struct mmuext_op op; @@ -677,7 +676,7 @@ static void pin_pagetable_pfn(unsigned level, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -686,7 +685,7 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); + pin_pagetable_pfn(level, pfn); } else /* make sure there are no stray mappings of this page */ @@ -694,6 +693,16 @@ static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) } } +static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE); +} + +static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE); +} + /* This should never happen until we're OK to use struct page */ static void xen_release_pt(u32 pfn) { @@ -796,6 +805,9 @@ static __init void xen_pagetable_setup_done(pgd_t *base) /* This will work as long as patching hasn't happened yet (which it hasn't) */ pv_mmu_ops.alloc_pt = xen_alloc_pt; + pv_mmu_ops.alloc_pd = xen_alloc_pd; + pv_mmu_ops.release_pt = xen_release_pt; + pv_mmu_ops.release_pd = xen_release_pt; pv_mmu_ops.set_pte = xen_set_pte; if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -953,7 +965,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = (void *)&hypercall_page[__HYPERVISOR_iret], - .irq_enable_sysexit = NULL, /* never called */ + .irq_enable_syscall_ret = NULL, /* never called */ .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -968,7 +980,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .write_ldt_entry = xen_write_ldt_entry, .write_gdt_entry = xen_write_gdt_entry, .write_idt_entry = xen_write_idt_entry, - .load_esp0 = xen_load_esp0, + .load_sp0 = xen_load_sp0, .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, @@ -1019,10 +1031,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update_defer = paravirt_nop, .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt, - .alloc_pd = paravirt_nop, + .release_pt = xen_release_pt_init, + .alloc_pd = xen_alloc_pt_init, .alloc_pd_clone = paravirt_nop, - .release_pd = paravirt_nop, + .release_pd = xen_release_pt_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c index 6d1da5809e6..dcf613e1758 100644 --- a/arch/x86/xen/events.c +++ b/arch/x86/xen/events.c @@ -465,7 +465,7 @@ void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) * a bitset of words which contain pending event bits. The second * level is a bitset of pending events themselves. */ -fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +void xen_evtchn_do_upcall(struct pt_regs *regs) { int cpu = get_cpu(); struct shared_info *s = HYPERVISOR_shared_info; @@ -487,7 +487,7 @@ fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) int irq = evtchn_to_irq[port]; if (irq != -1) { - regs->orig_eax = ~irq; + regs->orig_ax = ~irq; do_IRQ(regs); } } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0ac6c5dc49b..45aa771e73a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,7 +58,8 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) { - pte_t *pte = lookup_address(address); + int level; + pte_t *pte = lookup_address(address, &level); unsigned offset = address & PAGE_MASK; BUG_ON(pte == NULL); @@ -70,8 +71,9 @@ void make_lowmem_page_readonly(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; + int level; - pte = lookup_address(address); + pte = lookup_address(address, &level); BUG_ON(pte == NULL); ptev = pte_wrprotect(*pte); @@ -84,8 +86,9 @@ void make_lowmem_page_readwrite(void *vaddr) { pte_t *pte, ptev; unsigned long address = (unsigned long)vaddr; + int level; - pte = lookup_address(address); + pte = lookup_address(address, &level); BUG_ON(pte == NULL); ptev = pte_mkwrite(*pte); @@ -241,12 +244,12 @@ unsigned long long xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(unsigned long long pte) { - if (pte & 1) + if (pte & _PAGE_PRESENT) { pte = phys_to_machine(XPADDR(pte)).maddr; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } - pte &= ~_PAGE_PCD; - - return (pte_t){ pte, pte >> 32 }; + return (pte_t){ .pte = pte }; } pmd_t xen_make_pmd(unsigned long long pmd) @@ -290,10 +293,10 @@ unsigned long xen_pgd_val(pgd_t pgd) pte_t xen_make_pte(unsigned long pte) { - if (pte & _PAGE_PRESENT) + if (pte & _PAGE_PRESENT) { pte = phys_to_machine(XPADDR(pte)).maddr; - - pte &= ~_PAGE_PCD; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } return (pte_t){ pte }; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f84e7722664..3bad4773a2f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -10,6 +10,7 @@ #include <linux/pm.h> #include <asm/elf.h> +#include <asm/vdso.h> #include <asm/e820.h> #include <asm/setup.h> #include <asm/xen/hypervisor.h> @@ -59,12 +60,10 @@ static void xen_idle(void) /* * Set the bit indicating "nosegneg" library variants should be used. */ -static void fiddle_vdso(void) +static void __init fiddle_vdso(void) { - extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ - extern char vsyscall_int80_start; - u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + - &vsyscall_int80_start); + extern const char vdso32_default_start; + u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index c1b131bcdcb..aafc5443740 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -146,7 +146,7 @@ void __init xen_smp_prepare_boot_cpu(void) old memory can be recycled */ make_lowmem_page_readwrite(&per_cpu__gdt_page); - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); /* * cpu_core_map lives in a per cpu area that is cleared @@ -163,7 +163,7 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { cpus_clear(per_cpu(cpu_sibling_map, cpu)); /* * cpu_core_ map will be zeroed when the per @@ -239,10 +239,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); ctxt->user_regs.cs = __KERNEL_CS; - ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.esp0; + ctxt->kernel_sp = idle->thread.sp0; ctxt->event_callback_cs = __KERNEL_CS; ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index d083ff5ef08..b3721fd6877 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -592,7 +592,7 @@ __init void xen_time_init(void) set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - tsc_disable = 0; + setup_force_cpu_cap(X86_FEATURE_TSC); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index f8d6937db2e..288d587ce73 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -4,16 +4,18 @@ #ifdef CONFIG_XEN #include <linux/elfnote.h> +#include <linux/init.h> #include <asm/boot.h> #include <xen/interface/elfnote.h> -.pushsection .init.text + __INIT ENTRY(startup_xen) movl %esi,xen_start_info cld movl $(init_thread_union+THREAD_SIZE),%esp jmp xen_start_kernel -.popsection + + __FINIT .pushsection .bss.page_aligned .align PAGE_SIZE_asm |